From 84a1ed5e67565b09b8fd22a26754d2897de55ce0 Mon Sep 17 00:00:00 2001
From: Francois Dugast <francois.dugast@intel.com>
Date: Thu, 22 Feb 2024 18:23:56 -0500
Subject: drm/xe/uapi: Remove unused flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Those cases missed in previous uAPI cleanups were mostly accidentally
brought in from i915 or created to exercise the possibilities of gpuvm
but they are not used by userspace yet, so let's remove them. They can
still be brought back later if needed.

v2:
- Fix XE_VM_FLAG_FAULT_MODE support in xe_lrc.c (Brian Welty)
- Leave DRM_XE_VM_BIND_OP_UNMAP_ALL (José Roberto de Souza)
- Ensure invalid flag values are rejected (Rodrigo Vivi)

v3: Rebase after removal of persistent exec_queues (Francois Dugast)

v4: Rodrigo: Rebase after the new dumpable flag.

Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs")
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240222232356.175431-1-rodrigo.vivi@intel.com
---
 include/uapi/drm/xe_drm.h | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'include')

diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 2fefec9c0e94..538a3ac95c54 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -862,10 +862,6 @@ struct drm_xe_vm_destroy {
  *  - %DRM_XE_VM_BIND_OP_PREFETCH
  *
  * and the @flags can be:
- *  - %DRM_XE_VM_BIND_FLAG_READONLY
- *  - %DRM_XE_VM_BIND_FLAG_IMMEDIATE - Valid on a faulting VM only, do the
- *    MAP operation immediately rather than deferring the MAP to the page
- *    fault handler.
  *  - %DRM_XE_VM_BIND_FLAG_NULL - When the NULL flag is set, the page
  *    tables are setup with a special bit which indicates writes are
  *    dropped and all reads return zero. In the future, the NULL flags
@@ -958,8 +954,6 @@ struct drm_xe_vm_bind_op {
 	/** @op: Bind operation to perform */
 	__u32 op;
 
-#define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
-#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
 #define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
 #define DRM_XE_VM_BIND_FLAG_DUMPABLE	(1 << 3)
 	/** @flags: Bind flags */
@@ -1076,19 +1070,6 @@ struct drm_xe_exec_queue_create {
 #define DRM_XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY		0
 #define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_PRIORITY		0
 #define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_TIMESLICE		1
-#define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_PREEMPTION_TIMEOUT	2
-#define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_JOB_TIMEOUT		4
-#define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_TRIGGER		5
-#define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_NOTIFY		6
-#define   DRM_XE_EXEC_QUEUE_SET_PROPERTY_ACC_GRANULARITY	7
-/* Monitor 128KB contiguous region with 4K sub-granularity */
-#define     DRM_XE_ACC_GRANULARITY_128K				0
-/* Monitor 2MB contiguous region with 64KB sub-granularity */
-#define     DRM_XE_ACC_GRANULARITY_2M				1
-/* Monitor 16MB contiguous region with 512KB sub-granularity */
-#define     DRM_XE_ACC_GRANULARITY_16M				2
-/* Monitor 64MB contiguous region with 2M sub-granularity */
-#define     DRM_XE_ACC_GRANULARITY_64M				3
 
 	/** @extensions: Pointer to the first extension struct, if any */
 	__u64 extensions;
-- 
cgit v1.2.3


From b2121f2bd2232cd0556b2182078d159d81497885 Mon Sep 17 00:00:00 2001
From: Francois Dugast <francois.dugast@intel.com>
Date: Thu, 8 Feb 2024 10:35:39 -0800
Subject: drm/xe: Extend uAPI to query HuC micro-controler firmware version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The infrastructure to query GuC firmware version is already in place. It
is extended with a new micro-controller type to query the HuC firmware
version. It can be used from user space to know if HuC is running.

Cc: John Harrison <John.C.Harrison@Intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240208183539.185095-2-jose.souza@intel.com
---
 drivers/gpu/drm/xe/xe_query.c | 38 ++++++++++++++++++++++++++++++++++----
 include/uapi/drm/xe_drm.h     |  1 +
 2 files changed, 35 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index f1876b556ab4..a6a20a6dd360 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -540,14 +540,44 @@ query_uc_fw_version(struct xe_device *xe, struct drm_xe_device_query *query)
 		version = &guc->fw.versions.found[XE_UC_FW_VER_COMPATIBILITY];
 		break;
 	}
+	case XE_QUERY_UC_TYPE_HUC: {
+		struct xe_gt *media_gt = NULL;
+		struct xe_huc *huc;
+
+		if (MEDIA_VER(xe) >= 13) {
+			struct xe_tile *tile;
+			u8 gt_id;
+
+			for_each_tile(tile, xe, gt_id) {
+				if (tile->media_gt) {
+					media_gt = tile->media_gt;
+					break;
+				}
+			}
+		} else {
+			media_gt = xe->tiles[0].primary_gt;
+		}
+
+		if (!media_gt)
+			break;
+
+		huc = &media_gt->uc.huc;
+		if (huc->fw.status == XE_UC_FIRMWARE_RUNNING)
+			version = &huc->fw.versions.found[XE_UC_FW_VER_RELEASE];
+		break;
+	}
 	default:
 		return -EINVAL;
 	}
 
-	resp.branch_ver = 0;
-	resp.major_ver = version->major;
-	resp.minor_ver = version->minor;
-	resp.patch_ver = version->patch;
+	if (version) {
+		resp.branch_ver = 0;
+		resp.major_ver = version->major;
+		resp.minor_ver = version->minor;
+		resp.patch_ver = version->patch;
+	} else {
+		return -ENODEV;
+	}
 
 	if (copy_to_user(query_ptr, &resp, size))
 		return -EFAULT;
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 538a3ac95c54..2fc19177d2b0 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -583,6 +583,7 @@ struct drm_xe_query_engine_cycles {
 struct drm_xe_query_uc_fw_version {
 	/** @uc_type: The micro-controller type to query firmware version */
 #define XE_QUERY_UC_TYPE_GUC_SUBMISSION 0
+#define XE_QUERY_UC_TYPE_HUC 1
 	__u16 uc_type;
 
 	/** @pad: MBZ */
-- 
cgit v1.2.3


From 0f25e493a2462dbdd2e34f4e100405380cc0201a Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Thu, 29 Feb 2024 17:22:15 +0100
Subject: drm/panthor: Add uAPI

Panthor follows the lead of other recently submitted drivers with
ioctls allowing us to support modern Vulkan features, like sparse memory
binding:

- Pretty standard GEM management ioctls (BO_CREATE and BO_MMAP_OFFSET),
  with the 'exclusive-VM' bit to speed-up BO reservation on job submission
- VM management ioctls (VM_CREATE, VM_DESTROY and VM_BIND). The VM_BIND
  ioctl is loosely based on the Xe model, and can handle both
  asynchronous and synchronous requests
- GPU execution context creation/destruction, tiler heap context creation
  and job submission. Those ioctls reflect how the hardware/scheduler
  works and are thus driver specific.

We also have a way to expose IO regions, such that the usermode driver
can directly access specific/well-isolate registers, like the
LATEST_FLUSH register used to implement cache-flush reduction.

This uAPI intentionally keeps usermode queues out of the scope, which
explains why doorbell registers and command stream ring-buffers are not
directly exposed to userspace.

v6:
- Add Maxime's and Heiko's acks

v5:
- Fix typo
- Add Liviu's R-b

v4:
- Add a VM_GET_STATE ioctl
- Fix doc
- Expose the CORE_FEATURES register so we can deal with variants in the
  UMD
- Add Steve's R-b

v3:
- Add the concept of sync-only VM operation
- Fix support for 32-bit userspace
- Rework drm_panthor_vm_create to pass the user VA size instead of
  the kernel VA size (suggested by Robin Murphy)
- Typo fixes
- Explicitly cast enums with top bit set to avoid compiler warnings in
  -pedantic mode.
- Drop property core_group_count as it can be easily calculated by the
  number of bits set in l2_present.

Co-developed-by: Steven Price <steven.price@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Acked-by: Maxime Ripard <mripard@kernel.org>
Acked-by: Heiko Stuebner <heiko@sntech.de>
Link: https://patchwork.freedesktop.org/patch/msgid/20240229162230.2634044-2-boris.brezillon@collabora.com
---
 Documentation/gpu/driver-uapi.rst |   5 +
 include/uapi/drm/panthor_drm.h    | 945 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 950 insertions(+)
 create mode 100644 include/uapi/drm/panthor_drm.h

(limited to 'include')

diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
index e5070a0e95ab..971cdb4816fc 100644
--- a/Documentation/gpu/driver-uapi.rst
+++ b/Documentation/gpu/driver-uapi.rst
@@ -18,6 +18,11 @@ VM_BIND / EXEC uAPI
 
 .. kernel-doc:: include/uapi/drm/nouveau_drm.h
 
+drm/panthor uAPI
+================
+
+.. kernel-doc:: include/uapi/drm/panthor_drm.h
+
 drm/xe uAPI
 ===========
 
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
new file mode 100644
index 000000000000..373df80f41ed
--- /dev/null
+++ b/include/uapi/drm/panthor_drm.h
@@ -0,0 +1,945 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright (C) 2023 Collabora ltd. */
+#ifndef _PANTHOR_DRM_H_
+#define _PANTHOR_DRM_H_
+
+#include "drm.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * DOC: Introduction
+ *
+ * This documentation describes the Panthor IOCTLs.
+ *
+ * Just a few generic rules about the data passed to the Panthor IOCTLs:
+ *
+ * - Structures must be aligned on 64-bit/8-byte. If the object is not
+ *   naturally aligned, a padding field must be added.
+ * - Fields must be explicitly aligned to their natural type alignment with
+ *   pad[0..N] fields.
+ * - All padding fields will be checked by the driver to make sure they are
+ *   zeroed.
+ * - Flags can be added, but not removed/replaced.
+ * - New fields can be added to the main structures (the structures
+ *   directly passed to the ioctl). Those fields can be added at the end of
+ *   the structure, or replace existing padding fields. Any new field being
+ *   added must preserve the behavior that existed before those fields were
+ *   added when a value of zero is passed.
+ * - New fields can be added to indirect objects (objects pointed by the
+ *   main structure), iff those objects are passed a size to reflect the
+ *   size known by the userspace driver (see drm_panthor_obj_array::stride
+ *   or drm_panthor_dev_query::size).
+ * - If the kernel driver is too old to know some fields, those will be
+ *   ignored if zero, and otherwise rejected (and so will be zero on output).
+ * - If userspace is too old to know some fields, those will be zeroed
+ *   (input) before the structure is parsed by the kernel driver.
+ * - Each new flag/field addition must come with a driver version update so
+ *   the userspace driver doesn't have to trial and error to know which
+ *   flags are supported.
+ * - Structures should not contain unions, as this would defeat the
+ *   extensibility of such structures.
+ * - IOCTLs can't be removed or replaced. New IOCTL IDs should be placed
+ *   at the end of the drm_panthor_ioctl_id enum.
+ */
+
+/**
+ * DOC: MMIO regions exposed to userspace.
+ *
+ * .. c:macro:: DRM_PANTHOR_USER_MMIO_OFFSET
+ *
+ * File offset for all MMIO regions being exposed to userspace. Don't use
+ * this value directly, use DRM_PANTHOR_USER_<name>_OFFSET values instead.
+ * pgoffset passed to mmap2() is an unsigned long, which forces us to use a
+ * different offset on 32-bit and 64-bit systems.
+ *
+ * .. c:macro:: DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET
+ *
+ * File offset for the LATEST_FLUSH_ID register. The Userspace driver controls
+ * GPU cache flushing through CS instructions, but the flush reduction
+ * mechanism requires a flush_id. This flush_id could be queried with an
+ * ioctl, but Arm provides a well-isolated register page containing only this
+ * read-only register, so let's expose this page through a static mmap offset
+ * and allow direct mapping of this MMIO region so we can avoid the
+ * user <-> kernel round-trip.
+ */
+#define DRM_PANTHOR_USER_MMIO_OFFSET_32BIT	(1ull << 43)
+#define DRM_PANTHOR_USER_MMIO_OFFSET_64BIT	(1ull << 56)
+#define DRM_PANTHOR_USER_MMIO_OFFSET		(sizeof(unsigned long) < 8 ? \
+						 DRM_PANTHOR_USER_MMIO_OFFSET_32BIT : \
+						 DRM_PANTHOR_USER_MMIO_OFFSET_64BIT)
+#define DRM_PANTHOR_USER_FLUSH_ID_MMIO_OFFSET	(DRM_PANTHOR_USER_MMIO_OFFSET | 0)
+
+/**
+ * DOC: IOCTL IDs
+ *
+ * enum drm_panthor_ioctl_id - IOCTL IDs
+ *
+ * Place new ioctls at the end, don't re-order, don't replace or remove entries.
+ *
+ * These IDs are not meant to be used directly. Use the DRM_IOCTL_PANTHOR_xxx
+ * definitions instead.
+ */
+enum drm_panthor_ioctl_id {
+	/** @DRM_PANTHOR_DEV_QUERY: Query device information. */
+	DRM_PANTHOR_DEV_QUERY = 0,
+
+	/** @DRM_PANTHOR_VM_CREATE: Create a VM. */
+	DRM_PANTHOR_VM_CREATE,
+
+	/** @DRM_PANTHOR_VM_DESTROY: Destroy a VM. */
+	DRM_PANTHOR_VM_DESTROY,
+
+	/** @DRM_PANTHOR_VM_BIND: Bind/unbind memory to a VM. */
+	DRM_PANTHOR_VM_BIND,
+
+	/** @DRM_PANTHOR_VM_GET_STATE: Get VM state. */
+	DRM_PANTHOR_VM_GET_STATE,
+
+	/** @DRM_PANTHOR_BO_CREATE: Create a buffer object. */
+	DRM_PANTHOR_BO_CREATE,
+
+	/**
+	 * @DRM_PANTHOR_BO_MMAP_OFFSET: Get the file offset to pass to
+	 * mmap to map a GEM object.
+	 */
+	DRM_PANTHOR_BO_MMAP_OFFSET,
+
+	/** @DRM_PANTHOR_GROUP_CREATE: Create a scheduling group. */
+	DRM_PANTHOR_GROUP_CREATE,
+
+	/** @DRM_PANTHOR_GROUP_DESTROY: Destroy a scheduling group. */
+	DRM_PANTHOR_GROUP_DESTROY,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_SUBMIT: Submit jobs to queues belonging
+	 * to a specific scheduling group.
+	 */
+	DRM_PANTHOR_GROUP_SUBMIT,
+
+	/** @DRM_PANTHOR_GROUP_GET_STATE: Get the state of a scheduling group. */
+	DRM_PANTHOR_GROUP_GET_STATE,
+
+	/** @DRM_PANTHOR_TILER_HEAP_CREATE: Create a tiler heap. */
+	DRM_PANTHOR_TILER_HEAP_CREATE,
+
+	/** @DRM_PANTHOR_TILER_HEAP_DESTROY: Destroy a tiler heap. */
+	DRM_PANTHOR_TILER_HEAP_DESTROY,
+};
+
+/**
+ * DRM_IOCTL_PANTHOR() - Build a Panthor IOCTL number
+ * @__access: Access type. Must be R, W or RW.
+ * @__id: One of the DRM_PANTHOR_xxx id.
+ * @__type: Suffix of the type being passed to the IOCTL.
+ *
+ * Don't use this macro directly, use the DRM_IOCTL_PANTHOR_xxx
+ * values instead.
+ *
+ * Return: An IOCTL number to be passed to ioctl() from userspace.
+ */
+#define DRM_IOCTL_PANTHOR(__access, __id, __type) \
+	DRM_IO ## __access(DRM_COMMAND_BASE + DRM_PANTHOR_ ## __id, \
+			   struct drm_panthor_ ## __type)
+
+#define DRM_IOCTL_PANTHOR_DEV_QUERY \
+	DRM_IOCTL_PANTHOR(WR, DEV_QUERY, dev_query)
+#define DRM_IOCTL_PANTHOR_VM_CREATE \
+	DRM_IOCTL_PANTHOR(WR, VM_CREATE, vm_create)
+#define DRM_IOCTL_PANTHOR_VM_DESTROY \
+	DRM_IOCTL_PANTHOR(WR, VM_DESTROY, vm_destroy)
+#define DRM_IOCTL_PANTHOR_VM_BIND \
+	DRM_IOCTL_PANTHOR(WR, VM_BIND, vm_bind)
+#define DRM_IOCTL_PANTHOR_VM_GET_STATE \
+	DRM_IOCTL_PANTHOR(WR, VM_GET_STATE, vm_get_state)
+#define DRM_IOCTL_PANTHOR_BO_CREATE \
+	DRM_IOCTL_PANTHOR(WR, BO_CREATE, bo_create)
+#define DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET \
+	DRM_IOCTL_PANTHOR(WR, BO_MMAP_OFFSET, bo_mmap_offset)
+#define DRM_IOCTL_PANTHOR_GROUP_CREATE \
+	DRM_IOCTL_PANTHOR(WR, GROUP_CREATE, group_create)
+#define DRM_IOCTL_PANTHOR_GROUP_DESTROY \
+	DRM_IOCTL_PANTHOR(WR, GROUP_DESTROY, group_destroy)
+#define DRM_IOCTL_PANTHOR_GROUP_SUBMIT \
+	DRM_IOCTL_PANTHOR(WR, GROUP_SUBMIT, group_submit)
+#define DRM_IOCTL_PANTHOR_GROUP_GET_STATE \
+	DRM_IOCTL_PANTHOR(WR, GROUP_GET_STATE, group_get_state)
+#define DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE \
+	DRM_IOCTL_PANTHOR(WR, TILER_HEAP_CREATE, tiler_heap_create)
+#define DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY \
+	DRM_IOCTL_PANTHOR(WR, TILER_HEAP_DESTROY, tiler_heap_destroy)
+
+/**
+ * DOC: IOCTL arguments
+ */
+
+/**
+ * struct drm_panthor_obj_array - Object array.
+ *
+ * This object is used to pass an array of objects whose size is subject to changes in
+ * future versions of the driver. In order to support this mutability, we pass a stride
+ * describing the size of the object as known by userspace.
+ *
+ * You shouldn't fill drm_panthor_obj_array fields directly. You should instead use
+ * the DRM_PANTHOR_OBJ_ARRAY() macro that takes care of initializing the stride to
+ * the object size.
+ */
+struct drm_panthor_obj_array {
+	/** @stride: Stride of object struct. Used for versioning. */
+	__u32 stride;
+
+	/** @count: Number of objects in the array. */
+	__u32 count;
+
+	/** @array: User pointer to an array of objects. */
+	__u64 array;
+};
+
+/**
+ * DRM_PANTHOR_OBJ_ARRAY() - Initialize a drm_panthor_obj_array field.
+ * @cnt: Number of elements in the array.
+ * @ptr: Pointer to the array to pass to the kernel.
+ *
+ * Macro initializing a drm_panthor_obj_array based on the object size as known
+ * by userspace.
+ */
+#define DRM_PANTHOR_OBJ_ARRAY(cnt, ptr) \
+	{ .stride = sizeof((ptr)[0]), .count = (cnt), .array = (__u64)(uintptr_t)(ptr) }
+
+/**
+ * enum drm_panthor_sync_op_flags - Synchronization operation flags.
+ */
+enum drm_panthor_sync_op_flags {
+	/** @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK: Synchronization handle type mask. */
+	DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK = 0xff,
+
+	/** @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ: Synchronization object type. */
+	DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ = 0,
+
+	/**
+	 * @DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ: Timeline synchronization
+	 * object type.
+	 */
+	DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ = 1,
+
+	/** @DRM_PANTHOR_SYNC_OP_WAIT: Wait operation. */
+	DRM_PANTHOR_SYNC_OP_WAIT = 0 << 31,
+
+	/** @DRM_PANTHOR_SYNC_OP_SIGNAL: Signal operation. */
+	DRM_PANTHOR_SYNC_OP_SIGNAL = (int)(1u << 31),
+};
+
+/**
+ * struct drm_panthor_sync_op - Synchronization operation.
+ */
+struct drm_panthor_sync_op {
+	/** @flags: Synchronization operation flags. Combination of DRM_PANTHOR_SYNC_OP values. */
+	__u32 flags;
+
+	/** @handle: Sync handle. */
+	__u32 handle;
+
+	/**
+	 * @timeline_value: MBZ if
+	 * (flags & DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_MASK) !=
+	 * DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ.
+	 */
+	__u64 timeline_value;
+};
+
+/**
+ * enum drm_panthor_dev_query_type - Query type
+ *
+ * Place new types at the end, don't re-order, don't remove or replace.
+ */
+enum drm_panthor_dev_query_type {
+	/** @DRM_PANTHOR_DEV_QUERY_GPU_INFO: Query GPU information. */
+	DRM_PANTHOR_DEV_QUERY_GPU_INFO = 0,
+
+	/** @DRM_PANTHOR_DEV_QUERY_CSIF_INFO: Query command-stream interface information. */
+	DRM_PANTHOR_DEV_QUERY_CSIF_INFO,
+};
+
+/**
+ * struct drm_panthor_gpu_info - GPU information
+ *
+ * Structure grouping all queryable information relating to the GPU.
+ */
+struct drm_panthor_gpu_info {
+	/** @gpu_id : GPU ID. */
+	__u32 gpu_id;
+#define DRM_PANTHOR_ARCH_MAJOR(x)		((x) >> 28)
+#define DRM_PANTHOR_ARCH_MINOR(x)		(((x) >> 24) & 0xf)
+#define DRM_PANTHOR_ARCH_REV(x)			(((x) >> 20) & 0xf)
+#define DRM_PANTHOR_PRODUCT_MAJOR(x)		(((x) >> 16) & 0xf)
+#define DRM_PANTHOR_VERSION_MAJOR(x)		(((x) >> 12) & 0xf)
+#define DRM_PANTHOR_VERSION_MINOR(x)		(((x) >> 4) & 0xff)
+#define DRM_PANTHOR_VERSION_STATUS(x)		((x) & 0xf)
+
+	/** @gpu_rev: GPU revision. */
+	__u32 gpu_rev;
+
+	/** @csf_id: Command stream frontend ID. */
+	__u32 csf_id;
+#define DRM_PANTHOR_CSHW_MAJOR(x)		(((x) >> 26) & 0x3f)
+#define DRM_PANTHOR_CSHW_MINOR(x)		(((x) >> 20) & 0x3f)
+#define DRM_PANTHOR_CSHW_REV(x)			(((x) >> 16) & 0xf)
+#define DRM_PANTHOR_MCU_MAJOR(x)		(((x) >> 10) & 0x3f)
+#define DRM_PANTHOR_MCU_MINOR(x)		(((x) >> 4) & 0x3f)
+#define DRM_PANTHOR_MCU_REV(x)			((x) & 0xf)
+
+	/** @l2_features: L2-cache features. */
+	__u32 l2_features;
+
+	/** @tiler_features: Tiler features. */
+	__u32 tiler_features;
+
+	/** @mem_features: Memory features. */
+	__u32 mem_features;
+
+	/** @mmu_features: MMU features. */
+	__u32 mmu_features;
+#define DRM_PANTHOR_MMU_VA_BITS(x)		((x) & 0xff)
+
+	/** @thread_features: Thread features. */
+	__u32 thread_features;
+
+	/** @max_threads: Maximum number of threads. */
+	__u32 max_threads;
+
+	/** @thread_max_workgroup_size: Maximum workgroup size. */
+	__u32 thread_max_workgroup_size;
+
+	/**
+	 * @thread_max_barrier_size: Maximum number of threads that can wait
+	 * simultaneously on a barrier.
+	 */
+	__u32 thread_max_barrier_size;
+
+	/** @coherency_features: Coherency features. */
+	__u32 coherency_features;
+
+	/** @texture_features: Texture features. */
+	__u32 texture_features[4];
+
+	/** @as_present: Bitmask encoding the number of address-space exposed by the MMU. */
+	__u32 as_present;
+
+	/** @shader_present: Bitmask encoding the shader cores exposed by the GPU. */
+	__u64 shader_present;
+
+	/** @l2_present: Bitmask encoding the L2 caches exposed by the GPU. */
+	__u64 l2_present;
+
+	/** @tiler_present: Bitmask encoding the tiler units exposed by the GPU. */
+	__u64 tiler_present;
+
+	/* @core_features: Used to discriminate core variants when they exist. */
+	__u32 core_features;
+
+	/* @pad: MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_csif_info - Command stream interface information
+ *
+ * Structure grouping all queryable information relating to the command stream interface.
+ */
+struct drm_panthor_csif_info {
+	/** @csg_slot_count: Number of command stream group slots exposed by the firmware. */
+	__u32 csg_slot_count;
+
+	/** @cs_slot_count: Number of command stream slots per group. */
+	__u32 cs_slot_count;
+
+	/** @cs_reg_count: Number of command stream registers. */
+	__u32 cs_reg_count;
+
+	/** @scoreboard_slot_count: Number of scoreboard slots. */
+	__u32 scoreboard_slot_count;
+
+	/**
+	 * @unpreserved_cs_reg_count: Number of command stream registers reserved by
+	 * the kernel driver to call a userspace command stream.
+	 *
+	 * All registers can be used by a userspace command stream, but the
+	 * [cs_slot_count - unpreserved_cs_reg_count .. cs_slot_count] registers are
+	 * used by the kernel when DRM_PANTHOR_IOCTL_GROUP_SUBMIT is called.
+	 */
+	__u32 unpreserved_cs_reg_count;
+
+	/**
+	 * @pad: Padding field, set to zero.
+	 */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_dev_query - Arguments passed to DRM_PANTHOR_IOCTL_DEV_QUERY
+ */
+struct drm_panthor_dev_query {
+	/** @type: the query type (see drm_panthor_dev_query_type). */
+	__u32 type;
+
+	/**
+	 * @size: size of the type being queried.
+	 *
+	 * If pointer is NULL, size is updated by the driver to provide the
+	 * output structure size. If pointer is not NULL, the driver will
+	 * only copy min(size, actual_structure_size) bytes to the pointer,
+	 * and update the size accordingly. This allows us to extend query
+	 * types without breaking userspace.
+	 */
+	__u32 size;
+
+	/**
+	 * @pointer: user pointer to a query type struct.
+	 *
+	 * Pointer can be NULL, in which case, nothing is copied, but the
+	 * actual structure size is returned. If not NULL, it must point to
+	 * a location that's large enough to hold size bytes.
+	 */
+	__u64 pointer;
+};
+
+/**
+ * struct drm_panthor_vm_create - Arguments passed to DRM_PANTHOR_IOCTL_VM_CREATE
+ */
+struct drm_panthor_vm_create {
+	/** @flags: VM flags, MBZ. */
+	__u32 flags;
+
+	/** @id: Returned VM ID. */
+	__u32 id;
+
+	/**
+	 * @user_va_range: Size of the VA space reserved for user objects.
+	 *
+	 * The kernel will pick the remaining space to map kernel-only objects to the
+	 * VM (heap chunks, heap context, ring buffers, kernel synchronization objects,
+	 * ...). If the space left for kernel objects is too small, kernel object
+	 * allocation will fail further down the road. One can use
+	 * drm_panthor_gpu_info::mmu_features to extract the total virtual address
+	 * range, and chose a user_va_range that leaves some space to the kernel.
+	 *
+	 * If user_va_range is zero, the kernel will pick a sensible value based on
+	 * TASK_SIZE and the virtual range supported by the GPU MMU (the kernel/user
+	 * split should leave enough VA space for userspace processes to support SVM,
+	 * while still allowing the kernel to map some amount of kernel objects in
+	 * the kernel VA range). The value chosen by the driver will be returned in
+	 * @user_va_range.
+	 *
+	 * User VA space always starts at 0x0, kernel VA space is always placed after
+	 * the user VA range.
+	 */
+	__u64 user_va_range;
+};
+
+/**
+ * struct drm_panthor_vm_destroy - Arguments passed to DRM_PANTHOR_IOCTL_VM_DESTROY
+ */
+struct drm_panthor_vm_destroy {
+	/** @id: ID of the VM to destroy. */
+	__u32 id;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+};
+
+/**
+ * enum drm_panthor_vm_bind_op_flags - VM bind operation flags
+ */
+enum drm_panthor_vm_bind_op_flags {
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_READONLY: Map the memory read-only.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_READONLY = 1 << 0,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC: Map the memory not-executable.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_NOEXEC = 1 << 1,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED: Map the memory uncached.
+	 *
+	 * Only valid with DRM_PANTHOR_VM_BIND_OP_TYPE_MAP.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_MAP_UNCACHED = 1 << 2,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_TYPE_MASK: Mask used to determine the type of operation.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_MASK = (int)(0xfu << 28),
+
+	/** @DRM_PANTHOR_VM_BIND_OP_TYPE_MAP: Map operation. */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_MAP = 0 << 28,
+
+	/** @DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP: Unmap operation. */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_UNMAP = 1 << 28,
+
+	/**
+	 * @DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY: No VM operation.
+	 *
+	 * Just serves as a synchronization point on a VM queue.
+	 *
+	 * Only valid if %DRM_PANTHOR_VM_BIND_ASYNC is set in drm_panthor_vm_bind::flags,
+	 * and drm_panthor_vm_bind_op::syncs contains at least one element.
+	 */
+	DRM_PANTHOR_VM_BIND_OP_TYPE_SYNC_ONLY = 2 << 28,
+};
+
+/**
+ * struct drm_panthor_vm_bind_op - VM bind operation
+ */
+struct drm_panthor_vm_bind_op {
+	/** @flags: Combination of drm_panthor_vm_bind_op_flags flags. */
+	__u32 flags;
+
+	/**
+	 * @bo_handle: Handle of the buffer object to map.
+	 * MBZ for unmap or sync-only operations.
+	 */
+	__u32 bo_handle;
+
+	/**
+	 * @bo_offset: Buffer object offset.
+	 * MBZ for unmap or sync-only operations.
+	 */
+	__u64 bo_offset;
+
+	/**
+	 * @va: Virtual address to map/unmap.
+	 * MBZ for sync-only operations.
+	 */
+	__u64 va;
+
+	/**
+	 * @size: Size to map/unmap.
+	 * MBZ for sync-only operations.
+	 */
+	__u64 size;
+
+	/**
+	 * @syncs: Array of struct drm_panthor_sync_op synchronization
+	 * operations.
+	 *
+	 * This array must be empty if %DRM_PANTHOR_VM_BIND_ASYNC is not set on
+	 * the drm_panthor_vm_bind object containing this VM bind operation.
+	 *
+	 * This array shall not be empty for sync-only operations.
+	 */
+	struct drm_panthor_obj_array syncs;
+
+};
+
+/**
+ * enum drm_panthor_vm_bind_flags - VM bind flags
+ */
+enum drm_panthor_vm_bind_flags {
+	/**
+	 * @DRM_PANTHOR_VM_BIND_ASYNC: VM bind operations are queued to the VM
+	 * queue instead of being executed synchronously.
+	 */
+	DRM_PANTHOR_VM_BIND_ASYNC = 1 << 0,
+};
+
+/**
+ * struct drm_panthor_vm_bind - Arguments passed to DRM_IOCTL_PANTHOR_VM_BIND
+ */
+struct drm_panthor_vm_bind {
+	/** @vm_id: VM targeted by the bind request. */
+	__u32 vm_id;
+
+	/** @flags: Combination of drm_panthor_vm_bind_flags flags. */
+	__u32 flags;
+
+	/** @ops: Array of struct drm_panthor_vm_bind_op bind operations. */
+	struct drm_panthor_obj_array ops;
+};
+
+/**
+ * enum drm_panthor_vm_state - VM states.
+ */
+enum drm_panthor_vm_state {
+	/**
+	 * @DRM_PANTHOR_VM_STATE_USABLE: VM is usable.
+	 *
+	 * New VM operations will be accepted on this VM.
+	 */
+	DRM_PANTHOR_VM_STATE_USABLE,
+
+	/**
+	 * @DRM_PANTHOR_VM_STATE_UNUSABLE: VM is unusable.
+	 *
+	 * Something put the VM in an unusable state (like an asynchronous
+	 * VM_BIND request failing for any reason).
+	 *
+	 * Once the VM is in this state, all new MAP operations will be
+	 * rejected, and any GPU job targeting this VM will fail.
+	 * UNMAP operations are still accepted.
+	 *
+	 * The only way to recover from an unusable VM is to create a new
+	 * VM, and destroy the old one.
+	 */
+	DRM_PANTHOR_VM_STATE_UNUSABLE,
+};
+
+/**
+ * struct drm_panthor_vm_get_state - Get VM state.
+ */
+struct drm_panthor_vm_get_state {
+	/** @vm_id: VM targeted by the get_state request. */
+	__u32 vm_id;
+
+	/**
+	 * @state: state returned by the driver.
+	 *
+	 * Must be one of the enum drm_panthor_vm_state values.
+	 */
+	__u32 state;
+};
+
+/**
+ * enum drm_panthor_bo_flags - Buffer object flags, passed at creation time.
+ */
+enum drm_panthor_bo_flags {
+	/** @DRM_PANTHOR_BO_NO_MMAP: The buffer object will never be CPU-mapped in userspace. */
+	DRM_PANTHOR_BO_NO_MMAP = (1 << 0),
+};
+
+/**
+ * struct drm_panthor_bo_create - Arguments passed to DRM_IOCTL_PANTHOR_BO_CREATE.
+ */
+struct drm_panthor_bo_create {
+	/**
+	 * @size: Requested size for the object
+	 *
+	 * The (page-aligned) allocated size for the object will be returned.
+	 */
+	__u64 size;
+
+	/**
+	 * @flags: Flags. Must be a combination of drm_panthor_bo_flags flags.
+	 */
+	__u32 flags;
+
+	/**
+	 * @exclusive_vm_id: Exclusive VM this buffer object will be mapped to.
+	 *
+	 * If not zero, the field must refer to a valid VM ID, and implies that:
+	 *  - the buffer object will only ever be bound to that VM
+	 *  - cannot be exported as a PRIME fd
+	 */
+	__u32 exclusive_vm_id;
+
+	/**
+	 * @handle: Returned handle for the object.
+	 *
+	 * Object handles are nonzero.
+	 */
+	__u32 handle;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_bo_mmap_offset - Arguments passed to DRM_IOCTL_PANTHOR_BO_MMAP_OFFSET.
+ */
+struct drm_panthor_bo_mmap_offset {
+	/** @handle: Handle of the object we want an mmap offset for. */
+	__u32 handle;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+
+	/** @offset: The fake offset to use for subsequent mmap calls. */
+	__u64 offset;
+};
+
+/**
+ * struct drm_panthor_queue_create - Queue creation arguments.
+ */
+struct drm_panthor_queue_create {
+	/**
+	 * @priority: Defines the priority of queues inside a group. Goes from 0 to 15,
+	 * 15 being the highest priority.
+	 */
+	__u8 priority;
+
+	/** @pad: Padding fields, MBZ. */
+	__u8 pad[3];
+
+	/** @ringbuf_size: Size of the ring buffer to allocate to this queue. */
+	__u32 ringbuf_size;
+};
+
+/**
+ * enum drm_panthor_group_priority - Scheduling group priority
+ */
+enum drm_panthor_group_priority {
+	/** @PANTHOR_GROUP_PRIORITY_LOW: Low priority group. */
+	PANTHOR_GROUP_PRIORITY_LOW = 0,
+
+	/** @PANTHOR_GROUP_PRIORITY_MEDIUM: Medium priority group. */
+	PANTHOR_GROUP_PRIORITY_MEDIUM,
+
+	/** @PANTHOR_GROUP_PRIORITY_HIGH: High priority group. */
+	PANTHOR_GROUP_PRIORITY_HIGH,
+};
+
+/**
+ * struct drm_panthor_group_create - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_CREATE
+ */
+struct drm_panthor_group_create {
+	/** @queues: Array of drm_panthor_queue_create elements. */
+	struct drm_panthor_obj_array queues;
+
+	/**
+	 * @max_compute_cores: Maximum number of cores that can be used by compute
+	 * jobs across CS queues bound to this group.
+	 *
+	 * Must be less or equal to the number of bits set in @compute_core_mask.
+	 */
+	__u8 max_compute_cores;
+
+	/**
+	 * @max_fragment_cores: Maximum number of cores that can be used by fragment
+	 * jobs across CS queues bound to this group.
+	 *
+	 * Must be less or equal to the number of bits set in @fragment_core_mask.
+	 */
+	__u8 max_fragment_cores;
+
+	/**
+	 * @max_tiler_cores: Maximum number of tilers that can be used by tiler jobs
+	 * across CS queues bound to this group.
+	 *
+	 * Must be less or equal to the number of bits set in @tiler_core_mask.
+	 */
+	__u8 max_tiler_cores;
+
+	/** @priority: Group priority (see enum drm_panthor_group_priority). */
+	__u8 priority;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+
+	/**
+	 * @compute_core_mask: Mask encoding cores that can be used for compute jobs.
+	 *
+	 * This field must have at least @max_compute_cores bits set.
+	 *
+	 * The bits set here should also be set in drm_panthor_gpu_info::shader_present.
+	 */
+	__u64 compute_core_mask;
+
+	/**
+	 * @fragment_core_mask: Mask encoding cores that can be used for fragment jobs.
+	 *
+	 * This field must have at least @max_fragment_cores bits set.
+	 *
+	 * The bits set here should also be set in drm_panthor_gpu_info::shader_present.
+	 */
+	__u64 fragment_core_mask;
+
+	/**
+	 * @tiler_core_mask: Mask encoding cores that can be used for tiler jobs.
+	 *
+	 * This field must have at least @max_tiler_cores bits set.
+	 *
+	 * The bits set here should also be set in drm_panthor_gpu_info::tiler_present.
+	 */
+	__u64 tiler_core_mask;
+
+	/**
+	 * @vm_id: VM ID to bind this group to.
+	 *
+	 * All submission to queues bound to this group will use this VM.
+	 */
+	__u32 vm_id;
+
+	/**
+	 * @group_handle: Returned group handle. Passed back when submitting jobs or
+	 * destroying a group.
+	 */
+	__u32 group_handle;
+};
+
+/**
+ * struct drm_panthor_group_destroy - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_DESTROY
+ */
+struct drm_panthor_group_destroy {
+	/** @group_handle: Group to destroy */
+	__u32 group_handle;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_queue_submit - Job submission arguments.
+ *
+ * This is describing the userspace command stream to call from the kernel
+ * command stream ring-buffer. Queue submission is always part of a group
+ * submission, taking one or more jobs to submit to the underlying queues.
+ */
+struct drm_panthor_queue_submit {
+	/** @queue_index: Index of the queue inside a group. */
+	__u32 queue_index;
+
+	/**
+	 * @stream_size: Size of the command stream to execute.
+	 *
+	 * Must be 64-bit/8-byte aligned (the size of a CS instruction)
+	 *
+	 * Can be zero if stream_addr is zero too.
+	 */
+	__u32 stream_size;
+
+	/**
+	 * @stream_addr: GPU address of the command stream to execute.
+	 *
+	 * Must be aligned on 64-byte.
+	 *
+	 * Can be zero is stream_size is zero too.
+	 */
+	__u64 stream_addr;
+
+	/**
+	 * @latest_flush: FLUSH_ID read at the time the stream was built.
+	 *
+	 * This allows cache flush elimination for the automatic
+	 * flush+invalidate(all) done at submission time, which is needed to
+	 * ensure the GPU doesn't get garbage when reading the indirect command
+	 * stream buffers. If you want the cache flush to happen
+	 * unconditionally, pass a zero here.
+	 */
+	__u32 latest_flush;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+
+	/** @syncs: Array of struct drm_panthor_sync_op sync operations. */
+	struct drm_panthor_obj_array syncs;
+};
+
+/**
+ * struct drm_panthor_group_submit - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_SUBMIT
+ */
+struct drm_panthor_group_submit {
+	/** @group_handle: Handle of the group to queue jobs to. */
+	__u32 group_handle;
+
+	/** @pad: MBZ. */
+	__u32 pad;
+
+	/** @queue_submits: Array of drm_panthor_queue_submit objects. */
+	struct drm_panthor_obj_array queue_submits;
+};
+
+/**
+ * enum drm_panthor_group_state_flags - Group state flags
+ */
+enum drm_panthor_group_state_flags {
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_TIMEDOUT: Group had unfinished jobs.
+	 *
+	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
+	 */
+	DRM_PANTHOR_GROUP_STATE_TIMEDOUT = 1 << 0,
+
+	/**
+	 * @DRM_PANTHOR_GROUP_STATE_FATAL_FAULT: Group had fatal faults.
+	 *
+	 * When a group ends up with this flag set, no jobs can be submitted to its queues.
+	 */
+	DRM_PANTHOR_GROUP_STATE_FATAL_FAULT = 1 << 1,
+};
+
+/**
+ * struct drm_panthor_group_get_state - Arguments passed to DRM_IOCTL_PANTHOR_GROUP_GET_STATE
+ *
+ * Used to query the state of a group and decide whether a new group should be created to
+ * replace it.
+ */
+struct drm_panthor_group_get_state {
+	/** @group_handle: Handle of the group to query state on */
+	__u32 group_handle;
+
+	/**
+	 * @state: Combination of DRM_PANTHOR_GROUP_STATE_* flags encoding the
+	 * group state.
+	 */
+	__u32 state;
+
+	/** @fatal_queues: Bitmask of queues that faced fatal faults. */
+	__u32 fatal_queues;
+
+	/** @pad: MBZ */
+	__u32 pad;
+};
+
+/**
+ * struct drm_panthor_tiler_heap_create - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE
+ */
+struct drm_panthor_tiler_heap_create {
+	/** @vm_id: VM ID the tiler heap should be mapped to */
+	__u32 vm_id;
+
+	/** @initial_chunk_count: Initial number of chunks to allocate. */
+	__u32 initial_chunk_count;
+
+	/** @chunk_size: Chunk size. Must be a power of two at least 256KB large. */
+	__u32 chunk_size;
+
+	/** @max_chunks: Maximum number of chunks that can be allocated. */
+	__u32 max_chunks;
+
+	/**
+	 * @target_in_flight: Maximum number of in-flight render passes.
+	 *
+	 * If the heap has more than tiler jobs in-flight, the FW will wait for render
+	 * passes to finish before queuing new tiler jobs.
+	 */
+	__u32 target_in_flight;
+
+	/** @handle: Returned heap handle. Passed back to DESTROY_TILER_HEAP. */
+	__u32 handle;
+
+	/** @tiler_heap_ctx_gpu_va: Returned heap GPU virtual address returned */
+	__u64 tiler_heap_ctx_gpu_va;
+
+	/**
+	 * @first_heap_chunk_gpu_va: First heap chunk.
+	 *
+	 * The tiler heap is formed of heap chunks forming a single-link list. This
+	 * is the first element in the list.
+	 */
+	__u64 first_heap_chunk_gpu_va;
+};
+
+/**
+ * struct drm_panthor_tiler_heap_destroy - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY
+ */
+struct drm_panthor_tiler_heap_destroy {
+	/** @handle: Handle of the tiler heap to destroy */
+	__u32 handle;
+
+	/** @pad: Padding field, MBZ. */
+	__u32 pad;
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* _PANTHOR_DRM_H_ */
-- 
cgit v1.2.3


From d10612f8303fab350d82f8b7d5793683af50ee3c Mon Sep 17 00:00:00 2001
From: Nirmoy Das <nirmoy.das@intel.com>
Date: Thu, 29 Feb 2024 14:29:18 +0100
Subject: drm/i915: Add missing doc for drm_i915_reset_stats

Add missing doc for struct drm_i915_reset_stats.

Cc: Andi Shyti <andi.shyti@linux.intel.com>
Signed-off-by: Nirmoy Das <nirmoy.das@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240229132918.10205-1-nirmoy.das@intel.com
---
 include/uapi/drm/i915_drm.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index bd87386a8243..fb195d3b3e9f 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2623,19 +2623,29 @@ struct drm_i915_reg_read {
  *
  */
 
+/*
+ * struct drm_i915_reset_stats - Return global reset and other context stats
+ *
+ * Driver keeps few stats for each contexts and also global reset count.
+ * This struct can be used to query those stats.
+ */
 struct drm_i915_reset_stats {
+	/** @ctx_id: ID of the requested context */
 	__u32 ctx_id;
+
+	/** @flags: MBZ */
 	__u32 flags;
 
-	/* All resets since boot/module reload, for all contexts */
+	/** @reset_count: All resets since boot/module reload, for all contexts */
 	__u32 reset_count;
 
-	/* Number of batches lost when active in GPU, for this context */
+	/** @batch_active: Number of batches lost when active in GPU, for this context */
 	__u32 batch_active;
 
-	/* Number of batches lost pending for execution, for this context */
+	/** @batch_pending: Number of batches lost pending for execution, for this context */
 	__u32 batch_pending;
 
+	/** @pad: MBZ */
 	__u32 pad;
 };
 
-- 
cgit v1.2.3


From cc941c70df3927be89dfb875a9b3cec3ef5cf2c8 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Wed, 6 Dec 2023 16:37:29 +0100
Subject: drm/ttm: improve idle/busy handling v5
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously we would never try to move a BO into the preferred placements
when it ever landed in a busy placement since those were considered
compatible.

Rework the whole handling and finally unify the idle and busy handling.
ttm_bo_validate() is now responsible to try idle placement first and then
use the busy placement if that didn't worked.

Drawback is that we now always try the idle placement first for each
validation which might cause some additional CPU overhead on overcommit.

v2: fix kerneldoc warning and coding style
v3: take care of XE as well
v4: keep the ttm_bo_mem_space functionality as it is for now, only add
    new handling for ttm_bo_validate as suggested by Thomas
v5: fix bug pointed out by Matthew

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Zack Rusin <zack.rusin@broadcom.com> v3
Link: https://patchwork.freedesktop.org/patch/msgid/20240229134003.3688-1-christian.koenig@amd.com
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 drivers/gpu/drm/ttm/ttm_bo.c       | 231 +++++++++++++++++--------------------
 drivers/gpu/drm/ttm/ttm_resource.c |  16 ++-
 include/drm/ttm/ttm_resource.h     |   3 +-
 3 files changed, 121 insertions(+), 129 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 96a724e8f3ff..e059b1e1b13b 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -724,64 +724,36 @@ static int ttm_bo_add_move_fence(struct ttm_buffer_object *bo,
 	return ret;
 }
 
-/*
- * Repeatedly evict memory from the LRU for @mem_type until we create enough
- * space, or we've evicted everything and there isn't enough space.
- */
-static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo,
-				  const struct ttm_place *place,
-				  struct ttm_resource **mem,
-				  struct ttm_operation_ctx *ctx)
-{
-	struct ttm_device *bdev = bo->bdev;
-	struct ttm_resource_manager *man;
-	struct ww_acquire_ctx *ticket;
-	int ret;
-
-	man = ttm_manager_type(bdev, place->mem_type);
-	ticket = dma_resv_locking_ctx(bo->base.resv);
-	do {
-		ret = ttm_resource_alloc(bo, place, mem);
-		if (likely(!ret))
-			break;
-		if (unlikely(ret != -ENOSPC))
-			return ret;
-		ret = ttm_mem_evict_first(bdev, man, place, ctx,
-					  ticket);
-		if (unlikely(ret != 0))
-			return ret;
-	} while (1);
-
-	return ttm_bo_add_move_fence(bo, man, *mem, ctx->no_wait_gpu);
-}
-
 /**
- * ttm_bo_mem_space
+ * ttm_bo_alloc_resource - Allocate backing store for a BO
  *
- * @bo: Pointer to a struct ttm_buffer_object. the data of which
- * we want to allocate space for.
- * @placement: Proposed new placement for the buffer object.
- * @mem: A struct ttm_resource.
+ * @bo: Pointer to a struct ttm_buffer_object of which we want a resource for
+ * @placement: Proposed new placement for the buffer object
  * @ctx: if and how to sleep, lock buffers and alloc memory
+ * @force_space: If we should evict buffers to force space
+ * @res: The resulting struct ttm_resource.
  *
- * Allocate memory space for the buffer object pointed to by @bo, using
- * the placement flags in @placement, potentially evicting other idle buffer objects.
- * This function may sleep while waiting for space to become available.
+ * Allocates a resource for the buffer object pointed to by @bo, using the
+ * placement flags in @placement, potentially evicting other buffer objects when
+ * @force_space is true.
+ * This function may sleep while waiting for resources to become available.
  * Returns:
- * -EBUSY: No space available (only if no_wait == 1).
+ * -EBUSY: No space available (only if no_wait == true).
  * -ENOSPC: Could not allocate space for the buffer object, either due to
  * fragmentation or concurrent allocators.
  * -ERESTARTSYS: An interruptible sleep was interrupted by a signal.
  */
-int ttm_bo_mem_space(struct ttm_buffer_object *bo,
-			struct ttm_placement *placement,
-			struct ttm_resource **mem,
-			struct ttm_operation_ctx *ctx)
+static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo,
+				 struct ttm_placement *placement,
+				 struct ttm_operation_ctx *ctx,
+				 bool force_space,
+				 struct ttm_resource **res)
 {
 	struct ttm_device *bdev = bo->bdev;
-	bool type_found = false;
+	struct ww_acquire_ctx *ticket;
 	int i, ret;
 
+	ticket = dma_resv_locking_ctx(bo->base.resv);
 	ret = dma_resv_reserve_fences(bo->base.resv, 1);
 	if (unlikely(ret))
 		return ret;
@@ -790,98 +762,73 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
 		const struct ttm_place *place = &placement->placement[i];
 		struct ttm_resource_manager *man;
 
-		if (place->flags & TTM_PL_FLAG_FALLBACK)
-			continue;
-
 		man = ttm_manager_type(bdev, place->mem_type);
 		if (!man || !ttm_resource_manager_used(man))
 			continue;
 
-		type_found = true;
-		ret = ttm_resource_alloc(bo, place, mem);
-		if (ret == -ENOSPC)
+		if (place->flags & (force_space ? TTM_PL_FLAG_DESIRED :
+				    TTM_PL_FLAG_FALLBACK))
+			continue;
+
+		do {
+			ret = ttm_resource_alloc(bo, place, res);
+			if (unlikely(ret && ret != -ENOSPC))
+				return ret;
+			if (likely(!ret) || !force_space)
+				break;
+
+			ret = ttm_mem_evict_first(bdev, man, place, ctx,
+						  ticket);
+			if (unlikely(ret == -EBUSY))
+				break;
+			if (unlikely(ret))
+				return ret;
+		} while (1);
+		if (ret)
 			continue;
-		if (unlikely(ret))
-			goto error;
 
-		ret = ttm_bo_add_move_fence(bo, man, *mem, ctx->no_wait_gpu);
+		ret = ttm_bo_add_move_fence(bo, man, *res, ctx->no_wait_gpu);
 		if (unlikely(ret)) {
-			ttm_resource_free(bo, mem);
+			ttm_resource_free(bo, res);
 			if (ret == -EBUSY)
 				continue;
 
-			goto error;
+			return ret;
 		}
 		return 0;
 	}
 
-	for (i = 0; i < placement->num_placement; ++i) {
-		const struct ttm_place *place = &placement->placement[i];
-		struct ttm_resource_manager *man;
-
-		if (place->flags & TTM_PL_FLAG_DESIRED)
-			continue;
-
-		man = ttm_manager_type(bdev, place->mem_type);
-		if (!man || !ttm_resource_manager_used(man))
-			continue;
-
-		type_found = true;
-		ret = ttm_bo_mem_force_space(bo, place, mem, ctx);
-		if (likely(!ret))
-			return 0;
-
-		if (ret && ret != -EBUSY)
-			goto error;
-	}
-
-	ret = -ENOSPC;
-	if (!type_found) {
-		pr_err(TTM_PFX "No compatible memory type found\n");
-		ret = -EINVAL;
-	}
-
-error:
-	return ret;
+	return -ENOSPC;
 }
-EXPORT_SYMBOL(ttm_bo_mem_space);
 
-static int ttm_bo_move_buffer(struct ttm_buffer_object *bo,
-			      struct ttm_placement *placement,
-			      struct ttm_operation_ctx *ctx)
+/*
+ * ttm_bo_mem_space - Wrapper around ttm_bo_alloc_resource
+ *
+ * @bo: Pointer to a struct ttm_buffer_object of which we want a resource for
+ * @placement: Proposed new placement for the buffer object
+ * @res: The resulting struct ttm_resource.
+ * @ctx: if and how to sleep, lock buffers and alloc memory
+ *
+ * Tries both idle allocation and forcefully eviction of buffers. See
+ * ttm_bo_alloc_resource for details.
+ */
+int ttm_bo_mem_space(struct ttm_buffer_object *bo,
+		     struct ttm_placement *placement,
+		     struct ttm_resource **res,
+		     struct ttm_operation_ctx *ctx)
 {
-	struct ttm_resource *mem;
-	struct ttm_place hop;
+	bool force_space = false;
 	int ret;
 
-	dma_resv_assert_held(bo->base.resv);
+	do {
+		ret = ttm_bo_alloc_resource(bo, placement, ctx,
+					    force_space, res);
+		force_space = !force_space;
+	} while (ret == -ENOSPC && force_space);
 
-	/*
-	 * Determine where to move the buffer.
-	 *
-	 * If driver determines move is going to need
-	 * an extra step then it will return -EMULTIHOP
-	 * and the buffer will be moved to the temporary
-	 * stop and the driver will be called to make
-	 * the second hop.
-	 */
-	ret = ttm_bo_mem_space(bo, placement, &mem, ctx);
-	if (ret)
-		return ret;
-bounce:
-	ret = ttm_bo_handle_move_mem(bo, mem, false, ctx, &hop);
-	if (ret == -EMULTIHOP) {
-		ret = ttm_bo_bounce_temp_buffer(bo, &mem, ctx, &hop);
-		if (ret)
-			goto out;
-		/* try and move to final place now. */
-		goto bounce;
-	}
-out:
-	if (ret)
-		ttm_resource_free(bo, &mem);
 	return ret;
 }
+EXPORT_SYMBOL(ttm_bo_mem_space);
 
 /**
  * ttm_bo_validate
@@ -902,6 +849,9 @@ int ttm_bo_validate(struct ttm_buffer_object *bo,
 		    struct ttm_placement *placement,
 		    struct ttm_operation_ctx *ctx)
 {
+	struct ttm_resource *res;
+	struct ttm_place hop;
+	bool force_space;
 	int ret;
 
 	dma_resv_assert_held(bo->base.resv);
@@ -912,20 +862,53 @@ int ttm_bo_validate(struct ttm_buffer_object *bo,
 	if (!placement->num_placement)
 		return ttm_bo_pipeline_gutting(bo);
 
-	/* Check whether we need to move buffer. */
-	if (bo->resource && ttm_resource_compatible(bo->resource, placement))
-		return 0;
+	force_space = false;
+	do {
+		/* Check whether we need to move buffer. */
+		if (bo->resource &&
+		    ttm_resource_compatible(bo->resource, placement,
+					    force_space))
+			return 0;
 
-	/* Moving of pinned BOs is forbidden */
-	if (bo->pin_count)
-		return -EINVAL;
+		/* Moving of pinned BOs is forbidden */
+		if (bo->pin_count)
+			return -EINVAL;
+
+		/*
+		 * Determine where to move the buffer.
+		 *
+		 * If driver determines move is going to need
+		 * an extra step then it will return -EMULTIHOP
+		 * and the buffer will be moved to the temporary
+		 * stop and the driver will be called to make
+		 * the second hop.
+		 */
+		ret = ttm_bo_alloc_resource(bo, placement, ctx, force_space,
+					    &res);
+		force_space = !force_space;
+		if (ret == -ENOSPC)
+			continue;
+		if (ret)
+			return ret;
+
+bounce:
+		ret = ttm_bo_handle_move_mem(bo, res, false, ctx, &hop);
+		if (ret == -EMULTIHOP) {
+			ret = ttm_bo_bounce_temp_buffer(bo, &res, ctx, &hop);
+			/* try and move to final place now. */
+			if (!ret)
+				goto bounce;
+		}
+		if (ret) {
+			ttm_resource_free(bo, &res);
+			return ret;
+		}
+
+	} while (ret && force_space);
 
-	ret = ttm_bo_move_buffer(bo, placement, ctx);
 	/* For backward compatibility with userspace */
 	if (ret == -ENOSPC)
 		return -ENOMEM;
-	if (ret)
-		return ret;
 
 	/*
 	 * We might need to add a TTM.
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
index fb14f7716cf8..65155f2013ca 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -295,11 +295,13 @@ bool ttm_resource_intersects(struct ttm_device *bdev,
  *
  * @res: the resource to check
  * @placement: the placement to check against
+ * @evicting: true if the caller is doing evictions
  *
  * Returns true if the placement is compatible.
  */
 bool ttm_resource_compatible(struct ttm_resource *res,
-			     struct ttm_placement *placement)
+			     struct ttm_placement *placement,
+			     bool evicting)
 {
 	struct ttm_buffer_object *bo = res->bo;
 	struct ttm_device *bdev = bo->bdev;
@@ -315,14 +317,20 @@ bool ttm_resource_compatible(struct ttm_resource *res,
 		if (res->mem_type != place->mem_type)
 			continue;
 
+		if (place->flags & (evicting ? TTM_PL_FLAG_DESIRED :
+				    TTM_PL_FLAG_FALLBACK))
+			continue;
+
+		if (place->flags & TTM_PL_FLAG_CONTIGUOUS &&
+		    !(res->placement & TTM_PL_FLAG_CONTIGUOUS))
+			continue;
+
 		man = ttm_manager_type(bdev, res->mem_type);
 		if (man->func->compatible &&
 		    !man->func->compatible(man, res, place, bo->base.size))
 			continue;
 
-		if ((!(place->flags & TTM_PL_FLAG_CONTIGUOUS) ||
-		     (res->placement & TTM_PL_FLAG_CONTIGUOUS)))
-			return true;
+		return true;
 	}
 	return false;
 }
diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
index 1afa13f0c22b..7561023db43d 100644
--- a/include/drm/ttm/ttm_resource.h
+++ b/include/drm/ttm/ttm_resource.h
@@ -366,7 +366,8 @@ bool ttm_resource_intersects(struct ttm_device *bdev,
 			     const struct ttm_place *place,
 			     size_t size);
 bool ttm_resource_compatible(struct ttm_resource *res,
-			     struct ttm_placement *placement);
+			     struct ttm_placement *placement,
+			     bool evicting);
 void ttm_resource_set_bo(struct ttm_resource *res,
 			 struct ttm_buffer_object *bo);
 
-- 
cgit v1.2.3


From 460be1d527a8e296d85301e8b14923299508d4fc Mon Sep 17 00:00:00 2001
From: Karol Herbst <kherbst@redhat.com>
Date: Tue, 5 Mar 2024 14:38:53 +0100
Subject: drm/nouveau: move more missing UAPI bits

Those are already de-facto UAPI, so let's just move it into the uapi
header.

Signed-off-by: Karol Herbst <kherbst@redhat.com>
Reviewed-by: Lyude Paul <lyude@redhat.com>
Reviewed-by: Danilo Krummrich <dakr@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240305133853.2214268-2-kherbst@redhat.com
---
 drivers/gpu/drm/nouveau/nouveau_abi16.c | 20 +++++++++++++++-----
 drivers/gpu/drm/nouveau/nouveau_abi16.h | 12 ------------
 include/uapi/drm/nouveau_drm.h          | 22 ++++++++++++++++++++++
 3 files changed, 37 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c
index cd14f993bdd1..92f9127b284a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
+++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
@@ -312,11 +312,21 @@ nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS)
 	if (device->info.family >= NV_DEVICE_INFO_V0_KEPLER) {
 		if (init->fb_ctxdma_handle == ~0) {
 			switch (init->tt_ctxdma_handle) {
-			case 0x01: engine = NV_DEVICE_HOST_RUNLIST_ENGINES_GR    ; break;
-			case 0x02: engine = NV_DEVICE_HOST_RUNLIST_ENGINES_MSPDEC; break;
-			case 0x04: engine = NV_DEVICE_HOST_RUNLIST_ENGINES_MSPPP ; break;
-			case 0x08: engine = NV_DEVICE_HOST_RUNLIST_ENGINES_MSVLD ; break;
-			case 0x30: engine = NV_DEVICE_HOST_RUNLIST_ENGINES_CE    ; break;
+			case NOUVEAU_FIFO_ENGINE_GR:
+				engine = NV_DEVICE_HOST_RUNLIST_ENGINES_GR;
+				break;
+			case NOUVEAU_FIFO_ENGINE_VP:
+				engine = NV_DEVICE_HOST_RUNLIST_ENGINES_MSPDEC;
+				break;
+			case NOUVEAU_FIFO_ENGINE_PPP:
+				engine = NV_DEVICE_HOST_RUNLIST_ENGINES_MSPPP;
+				break;
+			case NOUVEAU_FIFO_ENGINE_BSP:
+				engine = NV_DEVICE_HOST_RUNLIST_ENGINES_MSVLD;
+				break;
+			case NOUVEAU_FIFO_ENGINE_CE:
+				engine = NV_DEVICE_HOST_RUNLIST_ENGINES_CE;
+				break;
 			default:
 				return nouveau_abi16_put(abi16, -ENOSYS);
 			}
diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h b/drivers/gpu/drm/nouveau/nouveau_abi16.h
index 11c8c4a80079..661b901d8ecc 100644
--- a/drivers/gpu/drm/nouveau/nouveau_abi16.h
+++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h
@@ -50,18 +50,6 @@ struct drm_nouveau_grobj_alloc {
 	int      class;
 };
 
-struct drm_nouveau_notifierobj_alloc {
-	uint32_t channel;
-	uint32_t handle;
-	uint32_t size;
-	uint32_t offset;
-};
-
-struct drm_nouveau_gpuobj_free {
-	int      channel;
-	uint32_t handle;
-};
-
 struct drm_nouveau_setparam {
 	uint64_t param;
 	uint64_t value;
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index cd84227f1b42..8ad8d1cd1566 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -73,6 +73,16 @@ struct drm_nouveau_getparam {
 	__u64 value;
 };
 
+/*
+ * Those are used to support selecting the main engine used on Kepler.
+ * This goes into drm_nouveau_channel_alloc::tt_ctxdma_handle
+ */
+#define NOUVEAU_FIFO_ENGINE_GR  0x01
+#define NOUVEAU_FIFO_ENGINE_VP  0x02
+#define NOUVEAU_FIFO_ENGINE_PPP 0x04
+#define NOUVEAU_FIFO_ENGINE_BSP 0x08
+#define NOUVEAU_FIFO_ENGINE_CE  0x30
+
 struct drm_nouveau_channel_alloc {
 	__u32     fb_ctxdma_handle;
 	__u32     tt_ctxdma_handle;
@@ -95,6 +105,18 @@ struct drm_nouveau_channel_free {
 	__s32 channel;
 };
 
+struct drm_nouveau_notifierobj_alloc {
+	__u32 channel;
+	__u32 handle;
+	__u32 size;
+	__u32 offset;
+};
+
+struct drm_nouveau_gpuobj_free {
+	__s32 channel;
+	__u32 handle;
+};
+
 #define NOUVEAU_GEM_DOMAIN_CPU       (1 << 0)
 #define NOUVEAU_GEM_DOMAIN_VRAM      (1 << 1)
 #define NOUVEAU_GEM_DOMAIN_GART      (1 << 2)
-- 
cgit v1.2.3


From 6a9b38cc68a03b948df1f3fcb907c7557cfc315c Mon Sep 17 00:00:00 2001
From: Matt Roper <matthew.d.roper@intel.com>
Date: Tue, 5 Mar 2024 16:40:49 -0800
Subject: drm/xe/arl: Add Arrow Lake H support

ARL-H uses the same media and display IP as MTL, and a version 12.74
graphics IP (referred to as Xe_LPG+). From a driver point of view, we
should be able to just treat the whole platform as MTL and rely on
GRAPHICS_VERx100 checks to handle any spots where ARL's Xe_LPG+ needs
different handling from MTL's Xe_LPG (i.e., workarounds).

v2: Resolve conflict and Reorder PCI ids in sorted order
v3: Append signed-off-by commiter to this commit

Bspec: 55420
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Dnyaneshwar Bhadane <dnyaneshwar.bhadane@intel.com>
Reviewed-by: Matt Atwood <matthew.s.atwood@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240229070806.3402641-4-dnyaneshwar.bhadane@intel.com
---
 include/drm/xe_pciids.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/drm/xe_pciids.h b/include/drm/xe_pciids.h
index de1a344737bc..bc7cbef6e9d8 100644
--- a/include/drm/xe_pciids.h
+++ b/include/drm/xe_pciids.h
@@ -176,10 +176,13 @@
 /* MTL / ARL */
 #define XE_MTL_IDS(MACRO__, ...)		\
 	MACRO__(0x7D40, ## __VA_ARGS__),	\
+	MACRO__(0x7D41, ## __VA_ARGS__),	\
 	MACRO__(0x7D45, ## __VA_ARGS__),	\
+	MACRO__(0x7D51, ## __VA_ARGS__),        \
 	MACRO__(0x7D55, ## __VA_ARGS__),	\
 	MACRO__(0x7D60, ## __VA_ARGS__),	\
 	MACRO__(0x7D67, ## __VA_ARGS__),	\
+	MACRO__(0x7DD1, ## __VA_ARGS__),        \
 	MACRO__(0x7DD5, ## __VA_ARGS__)
 
 #define XE_LNL_IDS(MACRO__, ...) \
-- 
cgit v1.2.3


From 2cddf770be0cebb663af3d72c049b9e24928f335 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:08 +0200
Subject: drm/kunit: fix drm_kunit_helpers.h kernel-doc

s/_features/_feat/ to match code.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/f909224abc8d542a89b66b957a32f152f42e9bba.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_kunit_helpers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_kunit_helpers.h b/include/drm/drm_kunit_helpers.h
index 6e99627edf45..e7cc17ee4934 100644
--- a/include/drm/drm_kunit_helpers.h
+++ b/include/drm/drm_kunit_helpers.h
@@ -75,7 +75,7 @@ __drm_kunit_helper_alloc_drm_device(struct kunit *test,
  * @_dev: The parent device object
  * @_type: the type of the struct which contains struct &drm_device
  * @_member: the name of the &drm_device within @_type.
- * @_features: Mocked DRM device driver features
+ * @_feat: Mocked DRM device driver features
  *
  * This function creates a struct &drm_driver and will create a struct
  * &drm_device from @_dev and that driver.
-- 
cgit v1.2.3


From eddb24a800ce89be9a01587f5572816213634ddc Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:09 +0200
Subject: drm/amdgpu: make amd_asic_type.h self-contained

Include <linux/types.h> for u8.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/115327b880b69b1c8ad157e5ff7f6b419868fab0.1709749576.git.jani.nikula@intel.com
---
 include/drm/amd_asic_type.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h
index 724c45e3e9a7..9be85b821aa6 100644
--- a/include/drm/amd_asic_type.h
+++ b/include/drm/amd_asic_type.h
@@ -22,6 +22,9 @@
 
 #ifndef __AMD_ASIC_TYPE_H__
 #define __AMD_ASIC_TYPE_H__
+
+#include <linux/types.h>
+
 /*
  * Supported ASIC types
  */
-- 
cgit v1.2.3


From 9811a99cddbafcdde06d9a7f15f073fb452f23e7 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:10 +0200
Subject: drm: bridge: samsung-dsim: make samsung-dsim.h self-contained

Include <drm/drm_bridge.h> and forward declare struct platform device.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/9534ca7dfba96580950e116c84ce0fa68fdf93a4.1709749576.git.jani.nikula@intel.com
---
 include/drm/bridge/samsung-dsim.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/bridge/samsung-dsim.h b/include/drm/bridge/samsung-dsim.h
index e0c105051246..9764d6eb5beb 100644
--- a/include/drm/bridge/samsung-dsim.h
+++ b/include/drm/bridge/samsung-dsim.h
@@ -11,9 +11,11 @@
 #include <linux/regulator/consumer.h>
 
 #include <drm/drm_atomic_helper.h>
-#include <drm/drm_of.h>
+#include <drm/drm_bridge.h>
 #include <drm/drm_mipi_dsi.h>
+#include <drm/drm_of.h>
 
+struct platform_device;
 struct samsung_dsim;
 
 #define DSIM_STATE_ENABLED		BIT(0)
-- 
cgit v1.2.3


From ca892057e03ed935113b3fcf23cf2f609cf6b26e Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:11 +0200
Subject: drm/dp_mst: fix drm_dp_mst_helper.h kernel-doc

Drop excess vcpi member documentation.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/606eee280718ba372093fdebbda42c7581dbd827.1709749576.git.jani.nikula@intel.com
---
 include/drm/display/drm_dp_mst_helper.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 9b19d8bd520a..3ae88a383a41 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -83,7 +83,6 @@ struct drm_dp_mst_branch;
  * @passthrough_aux: parent aux to which DSC pass-through requests should be
  * sent, only set if DSC pass-through is possible.
  * @parent: branch device parent of this port
- * @vcpi: Virtual Channel Payload info for this port.
  * @connector: DRM connector this port is connected to. Protected by
  * &drm_dp_mst_topology_mgr.base.lock.
  * @mgr: topology manager this port lives under.
-- 
cgit v1.2.3


From b0da0d9e86df27162ced208465de2e5caec1f95a Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:12 +0200
Subject: drm/crc: make drm_debugfs_crc.h self-contained and fix kernel-doc

Add a number of require includes and forward declare struct
drm_crtc. s/crc/crcs/ kernel-doc to match code.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/290b006bb348a03bd7c4c062d337df21fdaced53.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_debugfs_crc.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_debugfs_crc.h b/include/drm/drm_debugfs_crc.h
index b225eeb30d05..1b4c98c2f838 100644
--- a/include/drm/drm_debugfs_crc.h
+++ b/include/drm/drm_debugfs_crc.h
@@ -22,13 +22,19 @@
 #ifndef __DRM_DEBUGFS_CRC_H__
 #define __DRM_DEBUGFS_CRC_H__
 
+#include <linux/spinlock_types.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+struct drm_crtc;
+
 #define DRM_MAX_CRC_NR		10
 
 /**
  * struct drm_crtc_crc_entry - entry describing a frame's content
  * @has_frame_counter: whether the source was able to provide a frame number
  * @frame: number of the frame this CRC is about, if @has_frame_counter is true
- * @crc: array of values that characterize the frame
+ * @crcs: array of values that characterize the frame
  */
 struct drm_crtc_crc_entry {
 	bool has_frame_counter;
-- 
cgit v1.2.3


From efc86a8769b96e0ee8d842fa8fde909678a998f8 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:14 +0200
Subject: drm: fix drm_format_helper.h kernel-doc warnings

As the documentation says, all the fields are considered private. Mark
them private also for kernel-doc to silence warnings.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/6dc5b1df34abc10d416c2db5b390440cf56e21ce.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_format_helper.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/drm_format_helper.h b/include/drm/drm_format_helper.h
index f13b34e0b752..428d81afe215 100644
--- a/include/drm/drm_format_helper.h
+++ b/include/drm/drm_format_helper.h
@@ -25,6 +25,7 @@ struct iosys_map;
  * All fields are considered private.
  */
 struct drm_format_conv_state {
+	/* private: */
 	struct {
 		void *mem;
 		size_t size;
-- 
cgit v1.2.3


From dbae67ed6ac6a565d4b2075fcecdc122dff66752 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:15 +0200
Subject: drm/lease: make drm_lease.h self-contained

Include <linux/types.h> for types used.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/3a38688e90bb775380aad5b6aad5dce3d9d87ca7.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_lease.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/drm_lease.h b/include/drm/drm_lease.h
index 5c9ef6a2aeae..53545b4ca9ef 100644
--- a/include/drm/drm_lease.h
+++ b/include/drm/drm_lease.h
@@ -6,6 +6,8 @@
 #ifndef _DRM_LEASE_H_
 #define _DRM_LEASE_H_
 
+#include <linux/types.h>
+
 struct drm_file;
 struct drm_device;
 struct drm_master;
-- 
cgit v1.2.3


From d0589a3a6c36f3fbe1caa2300a8fd527aef6301a Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:16 +0200
Subject: drm: fix drm_gem_vram_helper.h kernel-doc

Remove excess funcs kernel-doc.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/02000be3455de639f32bde88f632ae8744142083.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_gem_vram_helper.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_gem_vram_helper.h b/include/drm/drm_gem_vram_helper.h
index e18429f09e53..c89c9bafeb44 100644
--- a/include/drm/drm_gem_vram_helper.h
+++ b/include/drm/drm_gem_vram_helper.h
@@ -170,7 +170,6 @@ void drm_gem_vram_simple_display_pipe_cleanup_fb(
  * @vram_base:	Base address of the managed video memory
  * @vram_size:	Size of the managed video memory in bytes
  * @bdev:	The TTM BO device.
- * @funcs:	TTM BO functions
  *
  * The fields &struct drm_vram_mm.vram_base and
  * &struct drm_vram_mm.vrm_size are managed by VRAM MM, but are
-- 
cgit v1.2.3


From 08179fe18a67cf7909411705dc68c549b18fa574 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:17 +0200
Subject: drm/of: make drm_of.h self-contained

Include <linux/err.h> for ERR_PTR.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/e9b3ea30984b913f6e32ed35e0575438dff8278c.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_of.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/drm/drm_of.h b/include/drm/drm_of.h
index 082a6e980d01..02d1cdd7f798 100644
--- a/include/drm/drm_of.h
+++ b/include/drm/drm_of.h
@@ -2,6 +2,7 @@
 #ifndef __DRM_OF_H__
 #define __DRM_OF_H__
 
+#include <linux/err.h>
 #include <linux/of_graph.h>
 #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_DRM_PANEL_BRIDGE)
 #include <drm/drm_bridge.h>
-- 
cgit v1.2.3


From 6a91585edc70009a8075639f12d5d1e1597b8544 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 6 Mar 2024 20:31:19 +0200
Subject: drm/suballoc: fix drm_suballoc.h kernel-doc

Rename dma_fence to fence to match code.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/36d18b315fd9a6788484667f644b3fea347fb007.1709749576.git.jani.nikula@intel.com
---
 include/drm/drm_suballoc.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_suballoc.h b/include/drm/drm_suballoc.h
index c2188bb0b157..7ba72a81a808 100644
--- a/include/drm/drm_suballoc.h
+++ b/include/drm/drm_suballoc.h
@@ -37,7 +37,7 @@ struct drm_suballoc_manager {
  * @manager: The drm_suballoc_manager.
  * @soffset: Start offset.
  * @eoffset: End offset + 1 so that @eoffset - @soffset = size.
- * @dma_fence: The fence protecting the allocation.
+ * @fence: The fence protecting the allocation.
  */
 struct drm_suballoc {
 	struct list_head olist;
-- 
cgit v1.2.3


From cec82816d0d018f178b9b7f88fe4bf80d66954e9 Mon Sep 17 00:00:00 2001
From: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Date: Tue, 5 Mar 2024 17:27:59 -0800
Subject: drm/i915/guc: Use context hints for GT frequency

Allow user to provide a low latency context hint. When set, KMD
sends a hint to GuC which results in special handling for this
context. SLPC will ramp the GT frequency aggressively every time
it switches to this context. The down freq threshold will also be
lower so GuC will ramp down the GT freq for this context more slowly.
We also disable waitboost for this context as that will interfere with
the strategy.

We need to enable the use of SLPC Compute strategy during init, but
it will apply only to contexts that set this bit during context
creation.

Userland can check whether this feature is supported using a new param-
I915_PARAM_HAS_CONTEXT_FREQ_HINT. This flag is true for all guc submission
enabled platforms as they use SLPC for frequency management.

The Mesa usage model for this flag is here -
https://gitlab.freedesktop.org/sushmave/mesa/-/commits/compute_hint

v2: Rename flags as per review suggestions (Rodrigo, Tvrtko).
Also, use flag bits in intel_context as it allows finer control for
toggling per engine if needed (Tvrtko).

v3: Minor review comments (Tvrtko)

v4: Update comment (Sushma)

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Acked-by: Ivan Briano <ivan.briano@intel.com>
Signed-off-by: Vinay Belgaumkar <vinay.belgaumkar@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240306012759.204938-1-vinay.belgaumkar@intel.com
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c         | 16 ++++++++++++++--
 drivers/gpu/drm/i915/gem/i915_gem_context_types.h   |  1 +
 drivers/gpu/drm/i915/gt/intel_context_types.h       |  1 +
 drivers/gpu/drm/i915/gt/intel_rps.c                 |  4 ++++
 .../gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h   | 21 +++++++++++++++++++++
 drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c         | 17 +++++++++++++++++
 drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h         |  1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c   |  6 ++++++
 drivers/gpu/drm/i915/i915_getparam.c                |  6 ++++++
 include/uapi/drm/i915_drm.h                         | 15 +++++++++++++++
 10 files changed, 86 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index dcbfe32fd30c..81f65cab1330 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -879,6 +879,7 @@ static int set_proto_ctx_param(struct drm_i915_file_private *fpriv,
 			       struct i915_gem_proto_context *pc,
 			       struct drm_i915_gem_context_param *args)
 {
+	struct drm_i915_private *i915 = fpriv->i915;
 	int ret = 0;
 
 	switch (args->param) {
@@ -904,6 +905,13 @@ static int set_proto_ctx_param(struct drm_i915_file_private *fpriv,
 			pc->user_flags &= ~BIT(UCONTEXT_BANNABLE);
 		break;
 
+	case I915_CONTEXT_PARAM_LOW_LATENCY:
+		if (intel_uc_uses_guc_submission(&to_gt(i915)->uc))
+			pc->user_flags |= BIT(UCONTEXT_LOW_LATENCY);
+		else
+			ret = -EINVAL;
+		break;
+
 	case I915_CONTEXT_PARAM_RECOVERABLE:
 		if (args->size)
 			ret = -EINVAL;
@@ -992,6 +1000,9 @@ static int intel_context_set_gem(struct intel_context *ce,
 	if (sseu.slice_mask && !WARN_ON(ce->engine->class != RENDER_CLASS))
 		ret = intel_context_reconfigure_sseu(ce, sseu);
 
+	if (test_bit(UCONTEXT_LOW_LATENCY, &ctx->user_flags))
+		__set_bit(CONTEXT_LOW_LATENCY, &ce->flags);
+
 	return ret;
 }
 
@@ -1630,6 +1641,9 @@ i915_gem_create_context(struct drm_i915_private *i915,
 	if (vm)
 		ctx->vm = vm;
 
+	/* Assign early so intel_context_set_gem can access these flags */
+	ctx->user_flags = pc->user_flags;
+
 	mutex_init(&ctx->engines_mutex);
 	if (pc->num_user_engines >= 0) {
 		i915_gem_context_set_user_engines(ctx);
@@ -1652,8 +1666,6 @@ i915_gem_create_context(struct drm_i915_private *i915,
 	 * is no remap info, it will be a NOP. */
 	ctx->remap_slice = ALL_L3_SLICES(i915);
 
-	ctx->user_flags = pc->user_flags;
-
 	for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp); i++)
 		ctx->hang_timestamp[i] = jiffies - CONTEXT_FAST_HANG_JIFFIES;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 03bc7f9d191b..b6d97da63d1f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -338,6 +338,7 @@ struct i915_gem_context {
 #define UCONTEXT_BANNABLE		2
 #define UCONTEXT_RECOVERABLE		3
 #define UCONTEXT_PERSISTENCE		4
+#define UCONTEXT_LOW_LATENCY		5
 
 	/**
 	 * @flags: small set of booleans
diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h
index 7eccbd70d89f..ed95a7b57cbb 100644
--- a/drivers/gpu/drm/i915/gt/intel_context_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_context_types.h
@@ -130,6 +130,7 @@ struct intel_context {
 #define CONTEXT_PERMA_PIN		11
 #define CONTEXT_IS_PARKING		12
 #define CONTEXT_EXITING			13
+#define CONTEXT_LOW_LATENCY		14
 
 	struct {
 		u64 timeout_us;
diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c
index 9c6812257ac2..a929aa6e3c85 100644
--- a/drivers/gpu/drm/i915/gt/intel_rps.c
+++ b/drivers/gpu/drm/i915/gt/intel_rps.c
@@ -1013,6 +1013,10 @@ void intel_rps_boost(struct i915_request *rq)
 	if (i915_request_signaled(rq) || i915_request_has_waitboost(rq))
 		return;
 
+	/* Waitboost is not needed for contexts marked with a Freq hint */
+	if (test_bit(CONTEXT_LOW_LATENCY, &rq->context->flags))
+		return;
+
 	/* Serializes with i915_request_retire() */
 	if (!test_and_set_bit(I915_FENCE_FLAG_BOOST, &rq->fence.flags)) {
 		struct intel_rps *rps = &READ_ONCE(rq->engine)->gt->rps;
diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
index 811add10c30d..c34674e797c6 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_slpc_abi.h
@@ -207,6 +207,27 @@ struct slpc_shared_data {
 	u8 reserved_mode_definition[4096];
 } __packed;
 
+struct slpc_context_frequency_request {
+	u32 frequency_request:16;
+	u32 reserved:12;
+	u32 is_compute:1;
+	u32 ignore_busyness:1;
+	u32 is_minimum:1;
+	u32 is_predefined:1;
+} __packed;
+
+#define SLPC_CTX_FREQ_REQ_IS_COMPUTE		REG_BIT(28)
+
+struct slpc_optimized_strategies {
+	u32 compute:1;
+	u32 async_flip:1;
+	u32 media:1;
+	u32 vsync_flip:1;
+	u32 reserved:28;
+} __packed;
+
+#define SLPC_OPTIMIZED_STRATEGY_COMPUTE		REG_BIT(0)
+
 /**
  * DOC: SLPC H2G MESSAGE FORMAT
  *
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
index 3e681ab6fbf9..706fffca698b 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.c
@@ -537,6 +537,20 @@ int intel_guc_slpc_get_min_freq(struct intel_guc_slpc *slpc, u32 *val)
 	return ret;
 }
 
+int intel_guc_slpc_set_strategy(struct intel_guc_slpc *slpc, u32 val)
+{
+	struct drm_i915_private *i915 = slpc_to_i915(slpc);
+	intel_wakeref_t wakeref;
+	int ret = 0;
+
+	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
+		ret = slpc_set_param(slpc,
+				     SLPC_PARAM_STRATEGIES,
+				     val);
+
+	return ret;
+}
+
 int intel_guc_slpc_set_media_ratio_mode(struct intel_guc_slpc *slpc, u32 val)
 {
 	struct drm_i915_private *i915 = slpc_to_i915(slpc);
@@ -711,6 +725,9 @@ int intel_guc_slpc_enable(struct intel_guc_slpc *slpc)
 	/* Set cached media freq ratio mode */
 	intel_guc_slpc_set_media_ratio_mode(slpc, slpc->media_ratio_mode);
 
+	/* Enable SLPC Optimized Strategy for compute */
+	intel_guc_slpc_set_strategy(slpc, SLPC_OPTIMIZED_STRATEGY_COMPUTE);
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
index 6ac6503c39d4..1cb5fd44f05c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_slpc.h
@@ -45,5 +45,6 @@ void intel_guc_pm_intrmsk_enable(struct intel_gt *gt);
 void intel_guc_slpc_boost(struct intel_guc_slpc *slpc);
 void intel_guc_slpc_dec_waiters(struct intel_guc_slpc *slpc);
 int intel_guc_slpc_set_ignore_eff_freq(struct intel_guc_slpc *slpc, bool val);
+int intel_guc_slpc_set_strategy(struct intel_guc_slpc *slpc, u32 val);
 
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index cc076e9302ad..e5c645137cfe 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -2645,6 +2645,7 @@ MAKE_CONTEXT_POLICY_ADD(execution_quantum, EXECUTION_QUANTUM)
 MAKE_CONTEXT_POLICY_ADD(preemption_timeout, PREEMPTION_TIMEOUT)
 MAKE_CONTEXT_POLICY_ADD(priority, SCHEDULING_PRIORITY)
 MAKE_CONTEXT_POLICY_ADD(preempt_to_idle, PREEMPT_TO_IDLE_ON_QUANTUM_EXPIRY)
+MAKE_CONTEXT_POLICY_ADD(slpc_ctx_freq_req, SLPM_GT_FREQUENCY)
 
 #undef MAKE_CONTEXT_POLICY_ADD
 
@@ -2664,6 +2665,7 @@ static int guc_context_policy_init_v70(struct intel_context *ce, bool loop)
 	struct context_policy policy;
 	u32 execution_quantum;
 	u32 preemption_timeout;
+	u32 slpc_ctx_freq_req = 0;
 	unsigned long flags;
 	int ret;
 
@@ -2675,11 +2677,15 @@ static int guc_context_policy_init_v70(struct intel_context *ce, bool loop)
 	execution_quantum = engine->props.timeslice_duration_ms * 1000;
 	preemption_timeout = engine->props.preempt_timeout_ms * 1000;
 
+	if (ce && (ce->flags & BIT(CONTEXT_LOW_LATENCY)))
+		slpc_ctx_freq_req |= SLPC_CTX_FREQ_REQ_IS_COMPUTE;
+
 	__guc_context_policy_start_klv(&policy, ce->guc_id.id);
 
 	__guc_context_policy_add_priority(&policy, ce->guc_state.prio);
 	__guc_context_policy_add_execution_quantum(&policy, execution_quantum);
 	__guc_context_policy_add_preemption_timeout(&policy, preemption_timeout);
+	__guc_context_policy_add_slpc_ctx_freq_req(&policy, slpc_ctx_freq_req);
 
 	if (engine->flags & I915_ENGINE_WANT_FORCED_PREEMPTION)
 		__guc_context_policy_add_preempt_to_idle(&policy, 1);
diff --git a/drivers/gpu/drm/i915/i915_getparam.c b/drivers/gpu/drm/i915/i915_getparam.c
index 5c3fec63cb4c..95c58805b2a4 100644
--- a/drivers/gpu/drm/i915/i915_getparam.c
+++ b/drivers/gpu/drm/i915/i915_getparam.c
@@ -155,6 +155,12 @@ int i915_getparam_ioctl(struct drm_device *dev, void *data,
 		 */
 		value = 1;
 		break;
+	case I915_PARAM_HAS_CONTEXT_FREQ_HINT:
+		if (intel_uc_uses_guc_submission(&to_gt(i915)->uc))
+			value = 1;
+		else
+			value = -EINVAL;
+		break;
 	case I915_PARAM_HAS_CONTEXT_ISOLATION:
 		value = intel_engines_has_context_isolation(i915);
 		break;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 2ee338860b7e..558d95baf851 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -806,6 +806,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_PXP_STATUS		 58
 
+/*
+ * Query if kernel allows marking a context to send a Freq hint to SLPC. This
+ * will enable use of the strategies allowed by the SLPC algorithm.
+ */
+#define I915_PARAM_HAS_CONTEXT_FREQ_HINT	59
+
 /* Must be kept compact -- no holes and well documented */
 
 /**
@@ -2148,6 +2154,15 @@ struct drm_i915_gem_context_param {
  * -EIO: The firmware did not succeed in creating the protected context.
  */
 #define I915_CONTEXT_PARAM_PROTECTED_CONTENT    0xd
+
+/*
+ * I915_CONTEXT_PARAM_LOW_LATENCY:
+ *
+ * Mark this context as a low latency workload which requires aggressive GT
+ * frequency scaling. Use I915_PARAM_HAS_CONTEXT_FREQ_HINT to check if the kernel
+ * supports this per context flag.
+ */
+#define I915_CONTEXT_PARAM_LOW_LATENCY		0xe
 /* Must be kept compact -- no holes and well documented */
 
 	/** @value: Context parameter value to be set or queried */
-- 
cgit v1.2.3


From 9f08bb3959cd1672e43e939636dce7847c3ad313 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:41 +0200
Subject: drm/encoder: improve drm_encoder_slave.h kernel-doc

Document structs drm_encoder_slave_funcs, drm_encoder_slave, and
drm_i2c_encoder_driver.

v2: Actually document the structs instead of just silencing kernel-doc

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/19bc9672c8ae4f7aee235665a4d2360e8790193d.1709898638.git.jani.nikula@intel.com
---
 include/drm/drm_encoder_slave.h | 91 +++++++++++++++++++++++++++++++++--------
 1 file changed, 74 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_encoder_slave.h b/include/drm/drm_encoder_slave.h
index 7214101fd731..49172166a164 100644
--- a/include/drm/drm_encoder_slave.h
+++ b/include/drm/drm_encoder_slave.h
@@ -34,12 +34,6 @@
 
 /**
  * struct drm_encoder_slave_funcs - Entry points exposed by a slave encoder driver
- * @set_config:	Initialize any encoder-specific modesetting parameters.
- *		The meaning of the @params parameter is implementation
- *		dependent. It will usually be a structure with DVO port
- *		data format settings or timings. It's not required for
- *		the new parameters to take effect until the next mode
- *		is set.
  *
  * Most of its members are analogous to the function pointers in
  * &drm_encoder_helper_funcs and they can optionally be used to
@@ -48,41 +42,85 @@
  * if the encoder is the currently selected one for the connector.
  */
 struct drm_encoder_slave_funcs {
+	/**
+	 * @set_config: Initialize any encoder-specific modesetting parameters.
+	 * The meaning of the @params parameter is implementation dependent. It
+	 * will usually be a structure with DVO port data format settings or
+	 * timings. It's not required for the new parameters to take effect
+	 * until the next mode is set.
+	 */
 	void (*set_config)(struct drm_encoder *encoder,
 			   void *params);
 
+	/**
+	 * @destroy: Analogous to &drm_encoder_funcs @destroy callback.
+	 */
 	void (*destroy)(struct drm_encoder *encoder);
+
+	/**
+	 * @dpms: Analogous to &drm_encoder_helper_funcs @dpms callback. Wrapped
+	 * by drm_i2c_encoder_dpms().
+	 */
 	void (*dpms)(struct drm_encoder *encoder, int mode);
+
+	/**
+	 * @save: Save state. Wrapped by drm_i2c_encoder_save().
+	 */
 	void (*save)(struct drm_encoder *encoder);
+
+	/**
+	 * @restore: Restore state. Wrapped by drm_i2c_encoder_restore().
+	 */
 	void (*restore)(struct drm_encoder *encoder);
+
+	/**
+	 * @mode_fixup: Analogous to &drm_encoder_helper_funcs @mode_fixup
+	 * callback. Wrapped by drm_i2c_encoder_mode_fixup().
+	 */
 	bool (*mode_fixup)(struct drm_encoder *encoder,
 			   const struct drm_display_mode *mode,
 			   struct drm_display_mode *adjusted_mode);
+
+	/**
+	 * @mode_valid: Analogous to &drm_encoder_helper_funcs @mode_valid.
+	 */
 	int (*mode_valid)(struct drm_encoder *encoder,
 			  struct drm_display_mode *mode);
+	/**
+	 * @mode_set: Analogous to &drm_encoder_helper_funcs @mode_set
+	 * callback. Wrapped by drm_i2c_encoder_mode_set().
+	 */
 	void (*mode_set)(struct drm_encoder *encoder,
 			 struct drm_display_mode *mode,
 			 struct drm_display_mode *adjusted_mode);
 
+	/**
+	 * @detect: Analogous to &drm_encoder_helper_funcs @detect
+	 * callback. Wrapped by drm_i2c_encoder_detect().
+	 */
 	enum drm_connector_status (*detect)(struct drm_encoder *encoder,
 					    struct drm_connector *connector);
+	/**
+	 * @get_modes: Get modes.
+	 */
 	int (*get_modes)(struct drm_encoder *encoder,
 			 struct drm_connector *connector);
+	/**
+	 * @create_resources: Create resources.
+	 */
 	int (*create_resources)(struct drm_encoder *encoder,
 				 struct drm_connector *connector);
+	/**
+	 * @set_property: Set property.
+	 */
 	int (*set_property)(struct drm_encoder *encoder,
 			    struct drm_connector *connector,
 			    struct drm_property *property,
 			    uint64_t val);
-
 };
 
 /**
  * struct drm_encoder_slave - Slave encoder struct
- * @base: DRM encoder object.
- * @slave_funcs: Slave encoder callbacks.
- * @slave_priv: Slave encoder private data.
- * @bus_priv: Bus specific data.
  *
  * A &drm_encoder_slave has two sets of callbacks, @slave_funcs and the
  * ones in @base. The former are never actually called by the common
@@ -95,10 +133,24 @@ struct drm_encoder_slave_funcs {
  * this.
  */
 struct drm_encoder_slave {
+	/**
+	 * @base: DRM encoder object.
+	 */
 	struct drm_encoder base;
 
+	/**
+	 * @slave_funcs: Slave encoder callbacks.
+	 */
 	const struct drm_encoder_slave_funcs *slave_funcs;
+
+	/**
+	 * @slave_priv: Slave encoder private data.
+	 */
 	void *slave_priv;
+
+	/**
+	 * @bus_priv: Bus specific data.
+	 */
 	void *bus_priv;
 };
 #define to_encoder_slave(x) container_of((x), struct drm_encoder_slave, base)
@@ -112,16 +164,20 @@ int drm_i2c_encoder_init(struct drm_device *dev,
 /**
  * struct drm_i2c_encoder_driver
  *
- * Describes a device driver for an encoder connected to the GPU
- * through an I2C bus. In addition to the entry points in @i2c_driver
- * an @encoder_init function should be provided. It will be called to
- * give the driver an opportunity to allocate any per-encoder data
- * structures and to initialize the @slave_funcs and (optionally)
- * @slave_priv members of @encoder.
+ * Describes a device driver for an encoder connected to the GPU through an I2C
+ * bus.
  */
 struct drm_i2c_encoder_driver {
+	/**
+	 * @i2c_driver: I2C device driver description.
+	 */
 	struct i2c_driver i2c_driver;
 
+	/**
+	 * @encoder_init: Callback to allocate any per-encoder data structures
+	 * and to initialize the @slave_funcs and (optionally) @slave_priv
+	 * members of @encoder.
+	 */
 	int (*encoder_init)(struct i2c_client *client,
 			    struct drm_device *dev,
 			    struct drm_encoder_slave *encoder);
@@ -133,6 +189,7 @@ struct drm_i2c_encoder_driver {
 
 /**
  * drm_i2c_encoder_get_client - Get the I2C client corresponding to an encoder
+ * @encoder: The encoder
  */
 static inline struct i2c_client *drm_i2c_encoder_get_client(struct drm_encoder *encoder)
 {
-- 
cgit v1.2.3


From c68ca0a9ea872cf53e84d13ab6546daa2108c63a Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:43 +0200
Subject: drm/i915: fix i915_gsc_proxy_mei_interface.h kernel-doc

There's no proper way to document function pointer members, but at least
silence the warnings.

Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/a84bd76162290940f709f5cb6e432e5e1f75a3b9.1709898638.git.jani.nikula@intel.com
---
 include/drm/i915_gsc_proxy_mei_interface.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/i915_gsc_proxy_mei_interface.h b/include/drm/i915_gsc_proxy_mei_interface.h
index 9462341d3ae1..850dfbf40607 100644
--- a/include/drm/i915_gsc_proxy_mei_interface.h
+++ b/include/drm/i915_gsc_proxy_mei_interface.h
@@ -21,7 +21,7 @@ struct i915_gsc_proxy_component_ops {
 	struct module *owner;
 
 	/**
-	 * send - Sends a proxy message to ME FW.
+	 * @send: Sends a proxy message to ME FW.
 	 * @dev: device struct corresponding to the mei device
 	 * @buf: message buffer to send
 	 * @size: size of the message
@@ -30,7 +30,7 @@ struct i915_gsc_proxy_component_ops {
 	int (*send)(struct device *dev, const void *buf, size_t size);
 
 	/**
-	 * recv - Receives a proxy message from ME FW.
+	 * @recv: Receives a proxy message from ME FW.
 	 * @dev: device struct corresponding to the mei device
 	 * @buf: message buffer to contain the received message
 	 * @size: size of the buffer
-- 
cgit v1.2.3


From efcb7ef88a8c9ba027d36e0925b9c1ddce38c3bc Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:44 +0200
Subject: drm/i915/hdcp: fix i915_hdcp_interface.h kernel-doc warnings

Make the documentation match code.

v2: Small fixups while at it (Lucas)

Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/5c7ba8db172101c40b686463f169ec579a509f29.1709898638.git.jani.nikula@intel.com
---
 include/drm/i915_hdcp_interface.h | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/drm/i915_hdcp_interface.h b/include/drm/i915_hdcp_interface.h
index 4c9c8167c2d5..d776ed7dcd00 100644
--- a/include/drm/i915_hdcp_interface.h
+++ b/include/drm/i915_hdcp_interface.h
@@ -54,7 +54,7 @@ enum hdcp_ddi {
 };
 
 /**
- * enum hdcp_tc - ME/GSC Firmware defined index for transcoders
+ * enum hdcp_transcoder - ME/GSC Firmware defined index for transcoders
  * @HDCP_INVALID_TRANSCODER: Index for Invalid transcoder
  * @HDCP_TRANSCODER_EDP: Index for EDP Transcoder
  * @HDCP_TRANSCODER_DSI0: Index for DSI0 Transcoder
@@ -106,7 +106,7 @@ struct hdcp_port_data {
  *			    And Prepare AKE_Init.
  * @verify_receiver_cert_prepare_km: Verify the Receiver Certificate
  *				     AKE_Send_Cert and prepare
-				     AKE_Stored_Km/AKE_No_Stored_Km
+ *				     AKE_Stored_Km/AKE_No_Stored_Km
  * @verify_hprime: Verify AKE_Send_H_prime
  * @store_pairing_info: Store pairing info received
  * @initiate_locality_check: Prepare LC_Init
@@ -170,14 +170,22 @@ struct i915_hdcp_ops {
 /**
  * struct i915_hdcp_arbiter - Used for communication between i915
  * and hdcp drivers for the HDCP2.2 services
- * @hdcp_dev: device that provide the HDCP2.2 service from MEI Bus.
- * @hdcp_ops: Ops implemented by hdcp driver or intel_hdcp_gsc , used by i915 driver.
  */
 struct i915_hdcp_arbiter {
+	/**
+	 * @hdcp_dev: device that provides the HDCP2.2 service from MEI Bus.
+	 */
 	struct device *hdcp_dev;
+
+	/**
+	 * @ops: Ops implemented by hdcp driver or intel_hdcp_gsc, used by i915
+	 * driver.
+	 */
 	const struct i915_hdcp_ops *ops;
 
-	/* To protect the above members. */
+	/**
+	 * @mutex: To protect the above members.
+	 */
 	struct mutex mutex;
 };
 
-- 
cgit v1.2.3


From 6e0fe04fe8c8b1f5d3ce7cad4ac51cfde29c2bd1 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:45 +0200
Subject: drm/i915/pxp: fix i915_pxp_tee_interface.h kernel-doc warnings

Make documentation match code. Slightly fix up the documentation
comments while at it.

v2:
- Move comments next to members instead of struct comment (Lucas)
- Small fixups while at it

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/7c26256dc00f970f94d145b73e341c36f553dfe4.1709898638.git.jani.nikula@intel.com
---
 include/drm/i915_pxp_tee_interface.h | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/drm/i915_pxp_tee_interface.h b/include/drm/i915_pxp_tee_interface.h
index 7d96985f2d05..a532d32f58f3 100644
--- a/include/drm/i915_pxp_tee_interface.h
+++ b/include/drm/i915_pxp_tee_interface.h
@@ -12,20 +12,26 @@ struct scatterlist;
 
 /**
  * struct i915_pxp_component_ops - ops for PXP services.
- * @owner: Module providing the ops
- * @send: sends data to PXP
- * @receive: receives data from PXP
  */
 struct i915_pxp_component_ops {
 	/**
-	 * @owner: owner of the module provding the ops
+	 * @owner: Module providing the ops.
 	 */
 	struct module *owner;
 
+	/**
+	 * @send: Send a PXP message.
+	 */
 	int (*send)(struct device *dev, const void *message, size_t size,
 		    unsigned long timeout_ms);
+	/**
+	 * @recv: Receive a PXP message.
+	 */
 	int (*recv)(struct device *dev, void *buffer, size_t size,
 		    unsigned long timeout_ms);
+	/**
+	 * @gsc_command: Send a GSC command.
+	 */
 	ssize_t (*gsc_command)(struct device *dev, u8 client_id, u32 fence_id,
 			       struct scatterlist *sg_in, size_t total_in_len,
 			       struct scatterlist *sg_out);
@@ -35,14 +41,21 @@ struct i915_pxp_component_ops {
 /**
  * struct i915_pxp_component - Used for communication between i915 and TEE
  * drivers for the PXP services
- * @tee_dev: device that provide the PXP service from TEE Bus.
- * @pxp_ops: Ops implemented by TEE driver, used by i915 driver.
  */
 struct i915_pxp_component {
+	/**
+	 * @tee_dev: device that provide the PXP service from TEE Bus.
+	 */
 	struct device *tee_dev;
+
+	/**
+	 * @ops: Ops implemented by TEE driver, used by i915 driver.
+	 */
 	const struct i915_pxp_component_ops *ops;
 
-	/* To protect the above members. */
+	/**
+	 * @mutex: To protect the above members.
+	 */
 	struct mutex mutex;
 };
 
-- 
cgit v1.2.3


From 4f51e057c0a28f52fc17824c628cf3123b0ef12f Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 18:07:50 +0200
Subject: drm/ttm: fix ttm_bo.h kernel-doc warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some renames, some formatting fixes, add some missing documentation.

v3: Fix struct ttm_buffer_object .sg documentation (Christian)

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Huang Rui <ray.huang@amd.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240308160750.3741833-1-jani.nikula@intel.com
---
 include/drm/ttm/ttm_bo.h | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
index 0223a41a64b2..6ccf96c91f3a 100644
--- a/include/drm/ttm/ttm_bo.h
+++ b/include/drm/ttm/ttm_bo.h
@@ -83,6 +83,9 @@ enum ttm_bo_type {
  * @resource: structure describing current placement.
  * @ttm: TTM structure holding system pages.
  * @deleted: True if the object is only a zombie and already deleted.
+ * @bulk_move: The bulk move object.
+ * @priority: Priority for LRU, BOs with lower priority are evicted first.
+ * @pin_count: Pin count.
  *
  * Base class for TTM buffer object, that deals with data placement and CPU
  * mappings. GPU mappings are really up to the driver, but for simpler GPUs
@@ -128,26 +131,27 @@ struct ttm_buffer_object {
 	struct work_struct delayed_delete;
 
 	/**
-	 * Special members that are protected by the reserve lock
-	 * and the bo::lock when written to. Can be read with
-	 * either of these locks held.
+	 * @sg: external source of pages and DMA addresses, protected by the
+	 * reservation lock.
 	 */
 	struct sg_table *sg;
 };
 
+#define TTM_BO_MAP_IOMEM_MASK 0x80
+
 /**
  * struct ttm_bo_kmap_obj
  *
  * @virtual: The current kernel virtual address.
  * @page: The page when kmap'ing a single page.
  * @bo_kmap_type: Type of bo_kmap.
+ * @bo: The TTM BO.
  *
  * Object describing a kernel mapping. Since a TTM bo may be located
  * in various memory types with various caching policies, the
  * mapping can either be an ioremap, a vmap, a kmap or part of a
  * premapped region.
  */
-#define TTM_BO_MAP_IOMEM_MASK 0x80
 struct ttm_bo_kmap_obj {
 	void *virtual;
 	struct page *page;
@@ -171,6 +175,7 @@ struct ttm_bo_kmap_obj {
  * @force_alloc: Don't check the memory account during suspend or CPU page
  * faults. Should only be used by TTM internally.
  * @resv: Reservation object to allow reserved evictions with.
+ * @bytes_moved: Statistics on how many bytes have been moved.
  *
  * Context for TTM operations like changing buffer placement or general memory
  * allocation.
@@ -264,7 +269,7 @@ static inline int ttm_bo_reserve(struct ttm_buffer_object *bo,
  * ttm_bo_reserve_slowpath:
  * @bo: A pointer to a struct ttm_buffer_object.
  * @interruptible: Sleep interruptible if waiting.
- * @sequence: Set (@bo)->sequence to this value after lock
+ * @ticket: Ticket used to acquire the ww_mutex.
  *
  * This is called after ttm_bo_reserve returns -EAGAIN and we backed off
  * from all our other reservations. Because there are no other reservations
@@ -303,7 +308,7 @@ static inline void ttm_bo_assign_mem(struct ttm_buffer_object *bo,
 }
 
 /**
- * ttm_bo_move_null = assign memory for a buffer object.
+ * ttm_bo_move_null - assign memory for a buffer object.
  * @bo: The bo to assign the memory to
  * @new_mem: The memory to be assigned.
  *
-- 
cgit v1.2.3


From 2c6f6831876acabfc917248927e7622fa789eb24 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:48 +0200
Subject: drm/ttm: make ttm_caching.h self-contained
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include <linux/pgtable.h> for pgprot_t.

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Huang Rui <ray.huang@amd.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/fb87ab4b4490c53e9ece66d53c4f178ead244cb5.1709898638.git.jani.nikula@intel.com
---
 include/drm/ttm/ttm_caching.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/ttm/ttm_caching.h b/include/drm/ttm/ttm_caching.h
index 235a743d90e1..a18f43e93aba 100644
--- a/include/drm/ttm/ttm_caching.h
+++ b/include/drm/ttm/ttm_caching.h
@@ -25,6 +25,8 @@
 #ifndef _TTM_CACHING_H_
 #define _TTM_CACHING_H_
 
+#include <linux/pgtable.h>
+
 #define TTM_NUM_CACHING_TYPES	3
 
 /**
-- 
cgit v1.2.3


From bc6adc2e340bb3b59f85562ff4f47316015a95ae Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:49 +0200
Subject: drm/ttm: fix ttm_execbuf_util.h kernel-doc warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix some formatting errors and excess documentation.

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Huang Rui <ray.huang@amd.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/b0f6d243c4e5eb1564ef2f4bb5bb834ee2c0305b.1709898638.git.jani.nikula@intel.com
---
 include/drm/ttm/ttm_execbuf_util.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/drm/ttm/ttm_execbuf_util.h b/include/drm/ttm/ttm_execbuf_util.h
index 03aca29d3ce4..fac1e3e57ebd 100644
--- a/include/drm/ttm/ttm_execbuf_util.h
+++ b/include/drm/ttm/ttm_execbuf_util.h
@@ -52,7 +52,7 @@ struct ttm_validate_buffer {
 };
 
 /**
- * function ttm_eu_backoff_reservation
+ * ttm_eu_backoff_reservation
  *
  * @ticket:   ww_acquire_ctx from reserve call
  * @list:     thread private list of ttm_validate_buffer structs.
@@ -64,14 +64,13 @@ void ttm_eu_backoff_reservation(struct ww_acquire_ctx *ticket,
 				struct list_head *list);
 
 /**
- * function ttm_eu_reserve_buffers
+ * ttm_eu_reserve_buffers
  *
  * @ticket:  [out] ww_acquire_ctx filled in by call, or NULL if only
  *           non-blocking reserves should be tried.
  * @list:    thread private list of ttm_validate_buffer structs.
  * @intr:    should the wait be interruptible
  * @dups:    [out] optional list of duplicates.
- * @del_lru: true if BOs should be removed from the LRU.
  *
  * Tries to reserve bos pointed to by the list entries for validation.
  * If the function returns 0, all buffers are marked as "unfenced",
@@ -102,7 +101,7 @@ int ttm_eu_reserve_buffers(struct ww_acquire_ctx *ticket,
 			   struct list_head *dups);
 
 /**
- * function ttm_eu_fence_buffer_objects.
+ * ttm_eu_fence_buffer_objects
  *
  * @ticket:      ww_acquire_ctx from reserve call
  * @list:        thread private list of ttm_validate_buffer structs.
-- 
cgit v1.2.3


From c136883b10ce62c4101f96cd27c7f691fa7b9882 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:50 +0200
Subject: drm/ttm: fix ttm_kmap_iter.h kernel-doc warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's no proper way to document function pointer members, but at least
silence the warnings.

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Huang Rui <ray.huang@amd.com>
Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/e80e74ac4b6c5f1df3bc2dd98651ba289aae8e83.1709898638.git.jani.nikula@intel.com
---
 include/drm/ttm/ttm_kmap_iter.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/ttm/ttm_kmap_iter.h b/include/drm/ttm/ttm_kmap_iter.h
index cc5c09a211b4..fe72631a6e93 100644
--- a/include/drm/ttm/ttm_kmap_iter.h
+++ b/include/drm/ttm/ttm_kmap_iter.h
@@ -20,7 +20,7 @@ struct iosys_map;
  */
 struct ttm_kmap_iter_ops {
 	/**
-	 * kmap_local() - Map a PAGE_SIZE part of the resource using
+	 * @map_local: Map a PAGE_SIZE part of the resource using
 	 * kmap_local semantics.
 	 * @res_iter: Pointer to the struct ttm_kmap_iter representing
 	 * the resource.
@@ -31,7 +31,7 @@ struct ttm_kmap_iter_ops {
 	void (*map_local)(struct ttm_kmap_iter *res_iter,
 			  struct iosys_map *dmap, pgoff_t i);
 	/**
-	 * unmap_local() - Unmap a PAGE_SIZE part of the resource previously
+	 * @unmap_local: Unmap a PAGE_SIZE part of the resource previously
 	 * mapped using kmap_local.
 	 * @res_iter: Pointer to the struct ttm_kmap_iter representing
 	 * the resource.
-- 
cgit v1.2.3


From 2042ada01519d0751bcffa1434209ffb6b53bf96 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:51 +0200
Subject: drm/ttm: make ttm_pool.h self-contained
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

struct seq_file needs a forward declaration in some configs. Sort the
forward declarations while at it.

Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Huang Rui <ray.huang@amd.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202403072259.EEC2Vf1X-lkp@intel.com/
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/b92373759bba305ddf8d24fdca345f195400e206.1709898638.git.jani.nikula@intel.com
---
 include/drm/ttm/ttm_pool.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h
index 4490d43c63e3..160d954a261e 100644
--- a/include/drm/ttm/ttm_pool.h
+++ b/include/drm/ttm/ttm_pool.h
@@ -32,9 +32,10 @@
 #include <drm/ttm/ttm_caching.h>
 
 struct device;
-struct ttm_tt;
-struct ttm_pool;
+struct seq_file;
 struct ttm_operation_ctx;
+struct ttm_pool;
+struct ttm_tt;
 
 /**
  * struct ttm_pool_type - Pool for a certain memory type
-- 
cgit v1.2.3


From 8e3ed5440b0c305dcd1d5fa7419bd8066d22ef42 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Fri, 8 Mar 2024 13:55:42 +0200
Subject: drm/i2c: silence ch7006.h and sil164.h kernel-doc warnings

Mark some members private to silence kernel-doc warnings, and add FIXME
comments.

Acked-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/ffc58be256d71e6a98eb9f13337add64458d3476.1709898638.git.jani.nikula@intel.com
---
 include/drm/i2c/ch7006.h | 1 +
 include/drm/i2c/sil164.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/drm/i2c/ch7006.h b/include/drm/i2c/ch7006.h
index 8390b437a1f8..5305b9797f93 100644
--- a/include/drm/i2c/ch7006.h
+++ b/include/drm/i2c/ch7006.h
@@ -37,6 +37,7 @@
  * meaning.
  */
 struct ch7006_encoder_params {
+	/* private: FIXME: document the members */
 	enum {
 		CH7006_FORMAT_RGB16 = 0,
 		CH7006_FORMAT_YCrCb24m16,
diff --git a/include/drm/i2c/sil164.h b/include/drm/i2c/sil164.h
index 205e27384c83..ddf248693c8b 100644
--- a/include/drm/i2c/sil164.h
+++ b/include/drm/i2c/sil164.h
@@ -36,6 +36,7 @@
  * See "http://www.siliconimage.com/docs/SiI-DS-0021-E-164.pdf".
  */
 struct sil164_encoder_params {
+	/* private: FIXME: document the members */
 	enum {
 		SIL164_INPUT_EDGE_FALLING = 0,
 		SIL164_INPUT_EDGE_RISING
-- 
cgit v1.2.3


From ec144244a43f6e4ca9767bf9fb1b2c9e293c2a31 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 27 Feb 2024 11:14:48 +0100
Subject: drm/gem-shmem: Acquire reservation lock in GEM pin/unpin callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export drm_gem_shmem_pin_locked() and acquire the reservation lock
directly in GEM pin callback. Same for unpin. Prepares for further
changes.

Dma-buf locking semantics require callers to hold the buffer's
reservation lock when invoking the pin and unpin callbacks. Prepare
gem-shmem accordingly by pushing locking out of the implementation.
A follow-up patch will fix locking for all GEM code at once.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # virtio-gpu
Acked-by: Christian König <christian.koenig@amd.com>
Acked-by: Zack Rusin <zack.rusin@broadcom.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240227113853.8464-2-tzimmermann@suse.de
---
 drivers/gpu/drm/drm_gem_shmem_helper.c |  6 ++++--
 include/drm/drm_gem_shmem_helper.h     | 16 ++++++++++++++--
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index e435f986cd13..0ac3dddb917f 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -228,7 +228,7 @@ void drm_gem_shmem_put_pages(struct drm_gem_shmem_object *shmem)
 }
 EXPORT_SYMBOL(drm_gem_shmem_put_pages);
 
-static int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem)
+int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem)
 {
 	int ret;
 
@@ -238,13 +238,15 @@ static int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem)
 
 	return ret;
 }
+EXPORT_SYMBOL(drm_gem_shmem_pin_locked);
 
-static void drm_gem_shmem_unpin_locked(struct drm_gem_shmem_object *shmem)
+void drm_gem_shmem_unpin_locked(struct drm_gem_shmem_object *shmem)
 {
 	dma_resv_assert_held(shmem->base.resv);
 
 	drm_gem_shmem_put_pages(shmem);
 }
+EXPORT_SYMBOL(drm_gem_shmem_unpin_locked);
 
 /**
  * drm_gem_shmem_pin - Pin backing pages for a shmem GEM object
diff --git a/include/drm/drm_gem_shmem_helper.h b/include/drm/drm_gem_shmem_helper.h
index bf0c31aa8fbe..eb12aa9a8c55 100644
--- a/include/drm/drm_gem_shmem_helper.h
+++ b/include/drm/drm_gem_shmem_helper.h
@@ -108,6 +108,9 @@ void drm_gem_shmem_vunmap(struct drm_gem_shmem_object *shmem,
 			  struct iosys_map *map);
 int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct *vma);
 
+int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem);
+void drm_gem_shmem_unpin_locked(struct drm_gem_shmem_object *shmem);
+
 int drm_gem_shmem_madvise(struct drm_gem_shmem_object *shmem, int madv);
 
 static inline bool drm_gem_shmem_is_purgeable(struct drm_gem_shmem_object *shmem)
@@ -172,8 +175,15 @@ static inline void drm_gem_shmem_object_print_info(struct drm_printer *p, unsign
 static inline int drm_gem_shmem_object_pin(struct drm_gem_object *obj)
 {
 	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
+	int ret;
+
+	ret = dma_resv_lock_interruptible(shmem->base.resv, NULL);
+	if (ret)
+		return ret;
+	ret = drm_gem_shmem_pin_locked(shmem);
+	dma_resv_unlock(shmem->base.resv);
 
-	return drm_gem_shmem_pin(shmem);
+	return ret;
 }
 
 /**
@@ -187,7 +197,9 @@ static inline void drm_gem_shmem_object_unpin(struct drm_gem_object *obj)
 {
 	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
 
-	drm_gem_shmem_unpin(shmem);
+	dma_resv_lock(shmem->base.resv, NULL);
+	drm_gem_shmem_unpin_locked(shmem);
+	dma_resv_unlock(shmem->base.resv);
 }
 
 /**
-- 
cgit v1.2.3


From a78027847226493ea6f09a00875fa4871fd29e69 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 27 Feb 2024 11:14:56 +0100
Subject: drm/gem: Acquire reservation lock in drm_gem_{pin/unpin}()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Acquire the buffer object's reservation lock in drm_gem_pin() and
remove locking the drivers' GEM callbacks where necessary. Same for
unpin().

DRM drivers and memory managers modified by this patch will now have
correct dma-buf locking semantics: the caller is responsible for
holding the reservation lock when calling the pin or unpin callback.

DRM drivers and memory managers that are not modified will now be
protected against concurent invocation of their pin and unpin callbacks.

PRIME does not implement struct dma_buf_ops.pin, which requires
the caller to hold the reservation lock. It does implement struct
dma_buf_ops.attach, which requires to callee to acquire the
reservation lock. The PRIME code uses drm_gem_pin(), so locks
are now taken as specified. Same for unpin and detach.

The patch harmonizes GEM pin and unpin to have non-interruptible
reservation locking across all drivers, as is already the case for
vmap and vunmap. This affects gem-shmem, gem-vram, loongson, qxl and
radeon.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Zack Rusin <zack.rusin@broadcom.com>
Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # virtio-gpu
Link: https://patchwork.freedesktop.org/patch/msgid/20240227113853.8464-10-tzimmermann@suse.de
---
 drivers/gpu/drm/drm_gem.c               | 22 ++++++++++++++++++++--
 drivers/gpu/drm/drm_gem_vram_helper.c   | 15 +--------------
 drivers/gpu/drm/drm_internal.h          |  2 ++
 drivers/gpu/drm/loongson/lsdc_gem.c     | 13 ++-----------
 drivers/gpu/drm/msm/msm_gem_prime.c     |  4 ----
 drivers/gpu/drm/nouveau/nouveau_prime.c | 11 -----------
 drivers/gpu/drm/qxl/qxl_prime.c         | 14 +-------------
 drivers/gpu/drm/radeon/radeon_prime.c   | 11 -----------
 drivers/gpu/drm/vmwgfx/vmwgfx_gem.c     | 25 ++++++-------------------
 include/drm/drm_gem_shmem_helper.h      | 11 +----------
 10 files changed, 33 insertions(+), 95 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 44a948b80ee1..e0f80c6a7096 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -1161,7 +1161,7 @@ void drm_gem_print_info(struct drm_printer *p, unsigned int indent,
 		obj->funcs->print_info(p, indent, obj);
 }
 
-int drm_gem_pin(struct drm_gem_object *obj)
+int drm_gem_pin_locked(struct drm_gem_object *obj)
 {
 	if (obj->funcs->pin)
 		return obj->funcs->pin(obj);
@@ -1169,12 +1169,30 @@ int drm_gem_pin(struct drm_gem_object *obj)
 	return 0;
 }
 
-void drm_gem_unpin(struct drm_gem_object *obj)
+void drm_gem_unpin_locked(struct drm_gem_object *obj)
 {
 	if (obj->funcs->unpin)
 		obj->funcs->unpin(obj);
 }
 
+int drm_gem_pin(struct drm_gem_object *obj)
+{
+	int ret;
+
+	dma_resv_lock(obj->resv, NULL);
+	ret = drm_gem_pin_locked(obj);
+	dma_resv_unlock(obj->resv);
+
+	return ret;
+}
+
+void drm_gem_unpin(struct drm_gem_object *obj)
+{
+	dma_resv_lock(obj->resv, NULL);
+	drm_gem_unpin_locked(obj);
+	dma_resv_unlock(obj->resv);
+}
+
 int drm_gem_vmap(struct drm_gem_object *obj, struct iosys_map *map)
 {
 	int ret;
diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c b/drivers/gpu/drm/drm_gem_vram_helper.c
index 15029d89badf..5a16b3e0a413 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -774,11 +774,6 @@ EXPORT_SYMBOL(drm_gem_vram_simple_display_pipe_cleanup_fb);
 static int drm_gem_vram_object_pin(struct drm_gem_object *gem)
 {
 	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
-	int ret;
-
-	ret = ttm_bo_reserve(&gbo->bo, true, false, NULL);
-	if (ret)
-		return ret;
 
 	/*
 	 * Fbdev console emulation is the use case of these PRIME
@@ -789,10 +784,7 @@ static int drm_gem_vram_object_pin(struct drm_gem_object *gem)
 	 * the buffer to be pinned to VRAM, implement a callback that
 	 * sets the flags accordingly.
 	 */
-	ret = drm_gem_vram_pin_locked(gbo, 0);
-	ttm_bo_unreserve(&gbo->bo);
-
-	return ret;
+	return drm_gem_vram_pin_locked(gbo, 0);
 }
 
 /**
@@ -803,13 +795,8 @@ static int drm_gem_vram_object_pin(struct drm_gem_object *gem)
 static void drm_gem_vram_object_unpin(struct drm_gem_object *gem)
 {
 	struct drm_gem_vram_object *gbo = drm_gem_vram_of_gem(gem);
-	int ret;
 
-	ret = ttm_bo_reserve(&gbo->bo, true, false, NULL);
-	if (ret)
-		return;
 	drm_gem_vram_unpin_locked(gbo);
-	ttm_bo_unreserve(&gbo->bo);
 }
 
 /**
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index d69744c9ac15..2215baef9a3e 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -173,6 +173,8 @@ void drm_gem_release(struct drm_device *dev, struct drm_file *file_private);
 void drm_gem_print_info(struct drm_printer *p, unsigned int indent,
 			const struct drm_gem_object *obj);
 
+int drm_gem_pin_locked(struct drm_gem_object *obj);
+void drm_gem_unpin_locked(struct drm_gem_object *obj);
 int drm_gem_pin(struct drm_gem_object *obj);
 void drm_gem_unpin(struct drm_gem_object *obj);
 int drm_gem_vmap(struct drm_gem_object *obj, struct iosys_map *map);
diff --git a/drivers/gpu/drm/loongson/lsdc_gem.c b/drivers/gpu/drm/loongson/lsdc_gem.c
index 04293df2f0de..a720d8f53209 100644
--- a/drivers/gpu/drm/loongson/lsdc_gem.c
+++ b/drivers/gpu/drm/loongson/lsdc_gem.c
@@ -19,33 +19,24 @@ static int lsdc_gem_prime_pin(struct drm_gem_object *obj)
 	struct lsdc_bo *lbo = gem_to_lsdc_bo(obj);
 	int ret;
 
-	ret = lsdc_bo_reserve(lbo);
-	if (unlikely(ret))
-		return ret;
+	dma_resv_assert_held(obj->resv);
 
 	ret = lsdc_bo_pin(lbo, LSDC_GEM_DOMAIN_GTT, NULL);
 	if (likely(ret == 0))
 		lbo->sharing_count++;
 
-	lsdc_bo_unreserve(lbo);
-
 	return ret;
 }
 
 static void lsdc_gem_prime_unpin(struct drm_gem_object *obj)
 {
 	struct lsdc_bo *lbo = gem_to_lsdc_bo(obj);
-	int ret;
 
-	ret = lsdc_bo_reserve(lbo);
-	if (unlikely(ret))
-		return;
+	dma_resv_assert_held(obj->resv);
 
 	lsdc_bo_unpin(lbo);
 	if (lbo->sharing_count)
 		lbo->sharing_count--;
-
-	lsdc_bo_unreserve(lbo);
 }
 
 static struct sg_table *lsdc_gem_prime_get_sg_table(struct drm_gem_object *obj)
diff --git a/drivers/gpu/drm/msm/msm_gem_prime.c b/drivers/gpu/drm/msm/msm_gem_prime.c
index 0d22df53ab98..ee267490c935 100644
--- a/drivers/gpu/drm/msm/msm_gem_prime.c
+++ b/drivers/gpu/drm/msm/msm_gem_prime.c
@@ -53,11 +53,9 @@ int msm_gem_prime_pin(struct drm_gem_object *obj)
 	if (obj->import_attach)
 		return 0;
 
-	msm_gem_lock(obj);
 	pages = msm_gem_pin_pages_locked(obj);
 	if (IS_ERR(pages))
 		ret = PTR_ERR(pages);
-	msm_gem_unlock(obj);
 
 	return ret;
 }
@@ -67,7 +65,5 @@ void msm_gem_prime_unpin(struct drm_gem_object *obj)
 	if (obj->import_attach)
 		return;
 
-	msm_gem_lock(obj);
 	msm_gem_unpin_pages_locked(obj);
-	msm_gem_unlock(obj);
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_prime.c b/drivers/gpu/drm/nouveau/nouveau_prime.c
index 774f9bd03110..b58ab595faf8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_prime.c
+++ b/drivers/gpu/drm/nouveau/nouveau_prime.c
@@ -86,17 +86,12 @@ unlock:
 int nouveau_gem_prime_pin(struct drm_gem_object *obj)
 {
 	struct nouveau_bo *nvbo = nouveau_gem_object(obj);
-	struct ttm_buffer_object *bo = &nvbo->bo;
 	int ret;
 
-	ret = ttm_bo_reserve(bo, false, false, NULL);
-	if (ret)
-		return -EINVAL;
 	/* pin buffer into GTT */
 	ret = nouveau_bo_pin_locked(nvbo, NOUVEAU_GEM_DOMAIN_GART, false);
 	if (ret)
 		ret = -EINVAL;
-	ttm_bo_unreserve(bo);
 
 	return ret;
 }
@@ -104,14 +99,8 @@ int nouveau_gem_prime_pin(struct drm_gem_object *obj)
 void nouveau_gem_prime_unpin(struct drm_gem_object *obj)
 {
 	struct nouveau_bo *nvbo = nouveau_gem_object(obj);
-	struct ttm_buffer_object *bo = &nvbo->bo;
-	int ret;
 
-	ret = ttm_bo_reserve(bo, false, false, NULL);
-	if (ret)
-		return;
 	nouveau_bo_unpin_locked(nvbo);
-	ttm_bo_unreserve(bo);
 }
 
 struct dma_buf *nouveau_gem_prime_export(struct drm_gem_object *gobj,
diff --git a/drivers/gpu/drm/qxl/qxl_prime.c b/drivers/gpu/drm/qxl/qxl_prime.c
index f2646603e12e..19bf551a7b31 100644
--- a/drivers/gpu/drm/qxl/qxl_prime.c
+++ b/drivers/gpu/drm/qxl/qxl_prime.c
@@ -31,27 +31,15 @@
 int qxl_gem_prime_pin(struct drm_gem_object *obj)
 {
 	struct qxl_bo *bo = gem_to_qxl_bo(obj);
-	int r;
 
-	r = qxl_bo_reserve(bo);
-	if (r)
-		return r;
-	r = qxl_bo_pin_locked(bo);
-	qxl_bo_unreserve(bo);
-
-	return r;
+	return qxl_bo_pin_locked(bo);
 }
 
 void qxl_gem_prime_unpin(struct drm_gem_object *obj)
 {
 	struct qxl_bo *bo = gem_to_qxl_bo(obj);
-	int r;
 
-	r = qxl_bo_reserve(bo);
-	if (r)
-		return;
 	qxl_bo_unpin_locked(bo);
-	qxl_bo_unreserve(bo);
 }
 
 struct sg_table *qxl_gem_prime_get_sg_table(struct drm_gem_object *obj)
diff --git a/drivers/gpu/drm/radeon/radeon_prime.c b/drivers/gpu/drm/radeon/radeon_prime.c
index b3cfc99f4d7e..a77881f035e7 100644
--- a/drivers/gpu/drm/radeon/radeon_prime.c
+++ b/drivers/gpu/drm/radeon/radeon_prime.c
@@ -73,32 +73,21 @@ int radeon_gem_prime_pin(struct drm_gem_object *obj)
 	struct radeon_bo *bo = gem_to_radeon_bo(obj);
 	int ret = 0;
 
-	ret = radeon_bo_reserve(bo, false);
-	if (unlikely(ret != 0))
-		return ret;
-
 	/* pin buffer into GTT */
 	ret = radeon_bo_pin(bo, RADEON_GEM_DOMAIN_GTT, NULL);
 	if (likely(ret == 0))
 		bo->prime_shared_count++;
 
-	radeon_bo_unreserve(bo);
 	return ret;
 }
 
 void radeon_gem_prime_unpin(struct drm_gem_object *obj)
 {
 	struct radeon_bo *bo = gem_to_radeon_bo(obj);
-	int ret = 0;
-
-	ret = radeon_bo_reserve(bo, false);
-	if (unlikely(ret != 0))
-		return;
 
 	radeon_bo_unpin(bo);
 	if (bo->prime_shared_count)
 		bo->prime_shared_count--;
-	radeon_bo_unreserve(bo);
 }
 
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c
index 12787bb9c111..186150f41fbc 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c
@@ -48,33 +48,20 @@ static void vmw_gem_object_close(struct drm_gem_object *obj,
 {
 }
 
-static int vmw_gem_pin_private(struct drm_gem_object *obj, bool do_pin)
+static int vmw_gem_object_pin(struct drm_gem_object *obj)
 {
-	struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(obj);
 	struct vmw_bo *vbo = to_vmw_bo(obj);
-	int ret;
-
-	ret = ttm_bo_reserve(bo, false, false, NULL);
-	if (unlikely(ret != 0))
-		goto err;
-
-	vmw_bo_pin_reserved(vbo, do_pin);
-
-	ttm_bo_unreserve(bo);
-
-err:
-	return ret;
-}
 
+	vmw_bo_pin_reserved(vbo, true);
 
-static int vmw_gem_object_pin(struct drm_gem_object *obj)
-{
-	return vmw_gem_pin_private(obj, true);
+	return 0;
 }
 
 static void vmw_gem_object_unpin(struct drm_gem_object *obj)
 {
-	vmw_gem_pin_private(obj, false);
+	struct vmw_bo *vbo = to_vmw_bo(obj);
+
+	vmw_bo_pin_reserved(vbo, false);
 }
 
 static struct sg_table *vmw_gem_object_get_sg_table(struct drm_gem_object *obj)
diff --git a/include/drm/drm_gem_shmem_helper.h b/include/drm/drm_gem_shmem_helper.h
index eb12aa9a8c55..efbc9f27312b 100644
--- a/include/drm/drm_gem_shmem_helper.h
+++ b/include/drm/drm_gem_shmem_helper.h
@@ -175,15 +175,8 @@ static inline void drm_gem_shmem_object_print_info(struct drm_printer *p, unsign
 static inline int drm_gem_shmem_object_pin(struct drm_gem_object *obj)
 {
 	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
-	int ret;
 
-	ret = dma_resv_lock_interruptible(shmem->base.resv, NULL);
-	if (ret)
-		return ret;
-	ret = drm_gem_shmem_pin_locked(shmem);
-	dma_resv_unlock(shmem->base.resv);
-
-	return ret;
+	return drm_gem_shmem_pin_locked(shmem);
 }
 
 /**
@@ -197,9 +190,7 @@ static inline void drm_gem_shmem_object_unpin(struct drm_gem_object *obj)
 {
 	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
 
-	dma_resv_lock(shmem->base.resv, NULL);
 	drm_gem_shmem_unpin_locked(shmem);
-	dma_resv_unlock(shmem->base.resv);
 }
 
 /**
-- 
cgit v1.2.3


From b4b0193e83cb987143583e2b4011b35331f429bd Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 27 Feb 2024 11:14:57 +0100
Subject: drm/fbdev-generic: Fix locking with drm_client_buffer_vmap_local()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Temporarily lock the fbdev buffer object during updates to prevent
memory managers from evicting/moving the buffer. Moving a buffer
object while update its content results in undefined behaviour.

Fbdev-generic updates its buffer object from a shadow buffer. Gem-shmem
and gem-dma helpers do not move buffer objects, so they are safe to be
used with fbdev-generic. Gem-vram and qxl are based on TTM, but pin
buffer objects are part of the vmap operation. So both are also safe
to be used with fbdev-generic.

Amdgpu and nouveau do not pin or lock the buffer object during an
update. Their TTM-based memory management could move the buffer object
while the update is ongoing.

The new vmap_local and vunmap_local helpers hold the buffer object's
reservation lock during the buffer update. This prevents moving the
buffer object on all memory managers.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
Tested-by: Dmitry Osipenko <dmitry.osipenko@collabora.com> # virtio-gpu
Acked-by: Zack Rusin <zack.rusin@broadcom.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240227113853.8464-11-tzimmermann@suse.de
---
 drivers/gpu/drm/drm_client.c        | 68 ++++++++++++++++++++++++++++++++-----
 drivers/gpu/drm/drm_fbdev_generic.c |  4 +--
 drivers/gpu/drm/drm_gem.c           | 12 +++++++
 include/drm/drm_client.h            | 10 ++++++
 include/drm/drm_gem.h               |  3 ++
 5 files changed, 87 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c
index 9403b3f576f7..2cc81831236b 100644
--- a/drivers/gpu/drm/drm_client.c
+++ b/drivers/gpu/drm/drm_client.c
@@ -304,6 +304,66 @@ err_delete:
 	return ERR_PTR(ret);
 }
 
+/**
+ * drm_client_buffer_vmap_local - Map DRM client buffer into address space
+ * @buffer: DRM client buffer
+ * @map_copy: Returns the mapped memory's address
+ *
+ * This function maps a client buffer into kernel address space. If the
+ * buffer is already mapped, it returns the existing mapping's address.
+ *
+ * Client buffer mappings are not ref'counted. Each call to
+ * drm_client_buffer_vmap_local() should be closely followed by a call to
+ * drm_client_buffer_vunmap_local(). See drm_client_buffer_vmap() for
+ * long-term mappings.
+ *
+ * The returned address is a copy of the internal value. In contrast to
+ * other vmap interfaces, you don't need it for the client's vunmap
+ * function. So you can modify it at will during blit and draw operations.
+ *
+ * Returns:
+ *	0 on success, or a negative errno code otherwise.
+ */
+int drm_client_buffer_vmap_local(struct drm_client_buffer *buffer,
+				 struct iosys_map *map_copy)
+{
+	struct drm_gem_object *gem = buffer->gem;
+	struct iosys_map *map = &buffer->map;
+	int ret;
+
+	drm_gem_lock(gem);
+
+	ret = drm_gem_vmap(gem, map);
+	if (ret)
+		goto err_drm_gem_vmap_unlocked;
+	*map_copy = *map;
+
+	return 0;
+
+err_drm_gem_vmap_unlocked:
+	drm_gem_unlock(gem);
+	return 0;
+}
+EXPORT_SYMBOL(drm_client_buffer_vmap_local);
+
+/**
+ * drm_client_buffer_vunmap_local - Unmap DRM client buffer
+ * @buffer: DRM client buffer
+ *
+ * This function removes a client buffer's memory mapping established
+ * with drm_client_buffer_vunmap_local(). Calling this function is only
+ * required by clients that manage their buffer mappings by themselves.
+ */
+void drm_client_buffer_vunmap_local(struct drm_client_buffer *buffer)
+{
+	struct drm_gem_object *gem = buffer->gem;
+	struct iosys_map *map = &buffer->map;
+
+	drm_gem_vunmap(gem, map);
+	drm_gem_unlock(gem);
+}
+EXPORT_SYMBOL(drm_client_buffer_vunmap_local);
+
 /**
  * drm_client_buffer_vmap - Map DRM client buffer into address space
  * @buffer: DRM client buffer
@@ -331,14 +391,6 @@ drm_client_buffer_vmap(struct drm_client_buffer *buffer,
 	struct iosys_map *map = &buffer->map;
 	int ret;
 
-	/*
-	 * FIXME: The dependency on GEM here isn't required, we could
-	 * convert the driver handle to a dma-buf instead and use the
-	 * backend-agnostic dma-buf vmap support instead. This would
-	 * require that the handle2fd prime ioctl is reworked to pull the
-	 * fd_install step out of the driver backend hooks, to make that
-	 * final step optional for internal users.
-	 */
 	ret = drm_gem_vmap_unlocked(buffer->gem, map);
 	if (ret)
 		return ret;
diff --git a/drivers/gpu/drm/drm_fbdev_generic.c b/drivers/gpu/drm/drm_fbdev_generic.c
index d647d89764cb..be357f926fae 100644
--- a/drivers/gpu/drm/drm_fbdev_generic.c
+++ b/drivers/gpu/drm/drm_fbdev_generic.c
@@ -197,14 +197,14 @@ static int drm_fbdev_generic_damage_blit(struct drm_fb_helper *fb_helper,
 	 */
 	mutex_lock(&fb_helper->lock);
 
-	ret = drm_client_buffer_vmap(buffer, &map);
+	ret = drm_client_buffer_vmap_local(buffer, &map);
 	if (ret)
 		goto out;
 
 	dst = map;
 	drm_fbdev_generic_damage_blit_real(fb_helper, clip, &dst);
 
-	drm_client_buffer_vunmap(buffer);
+	drm_client_buffer_vunmap_local(buffer);
 
 out:
 	mutex_unlock(&fb_helper->lock);
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index e0f80c6a7096..d4bbc5d109c8 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -1227,6 +1227,18 @@ void drm_gem_vunmap(struct drm_gem_object *obj, struct iosys_map *map)
 }
 EXPORT_SYMBOL(drm_gem_vunmap);
 
+void drm_gem_lock(struct drm_gem_object *obj)
+{
+	dma_resv_lock(obj->resv, NULL);
+}
+EXPORT_SYMBOL(drm_gem_lock);
+
+void drm_gem_unlock(struct drm_gem_object *obj)
+{
+	dma_resv_unlock(obj->resv);
+}
+EXPORT_SYMBOL(drm_gem_unlock);
+
 int drm_gem_vmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map)
 {
 	int ret;
diff --git a/include/drm/drm_client.h b/include/drm/drm_client.h
index d47458ecdac4..bc0e66f9c425 100644
--- a/include/drm/drm_client.h
+++ b/include/drm/drm_client.h
@@ -141,6 +141,13 @@ struct drm_client_buffer {
 
 	/**
 	 * @gem: GEM object backing this buffer
+	 *
+	 * FIXME: The dependency on GEM here isn't required, we could
+	 * convert the driver handle to a dma-buf instead and use the
+	 * backend-agnostic dma-buf vmap support instead. This would
+	 * require that the handle2fd prime ioctl is reworked to pull the
+	 * fd_install step out of the driver backend hooks, to make that
+	 * final step optional for internal users.
 	 */
 	struct drm_gem_object *gem;
 
@@ -159,6 +166,9 @@ struct drm_client_buffer *
 drm_client_framebuffer_create(struct drm_client_dev *client, u32 width, u32 height, u32 format);
 void drm_client_framebuffer_delete(struct drm_client_buffer *buffer);
 int drm_client_framebuffer_flush(struct drm_client_buffer *buffer, struct drm_rect *rect);
+int drm_client_buffer_vmap_local(struct drm_client_buffer *buffer,
+				 struct iosys_map *map_copy);
+void drm_client_buffer_vunmap_local(struct drm_client_buffer *buffer);
 int drm_client_buffer_vmap(struct drm_client_buffer *buffer,
 			   struct iosys_map *map);
 void drm_client_buffer_vunmap(struct drm_client_buffer *buffer);
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index 2ebec3984cd4..bae4865b2101 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -527,6 +527,9 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj);
 void drm_gem_put_pages(struct drm_gem_object *obj, struct page **pages,
 		bool dirty, bool accessed);
 
+void drm_gem_lock(struct drm_gem_object *obj);
+void drm_gem_unlock(struct drm_gem_object *obj);
+
 int drm_gem_vmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map);
 void drm_gem_vunmap_unlocked(struct drm_gem_object *obj, struct iosys_map *map);
 
-- 
cgit v1.2.3


From 38d707c54df4ca58cd9ceae2ddcbd6f606b99e9f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 8 Mar 2024 11:58:54 +0100
Subject: sched/balancing: Change 'enum cpu_idle_type' to have more natural
 definitions

The cpu_idle_type enum has the confusingly inverted property
that 'not idle' is 1, and 'idle' is '0'.

This resulted in a number of unnecessary complications in the code.

Reverse the order, remove the CPU_NOT_IDLE type, and convert
all code to a natural boolean form.

It's much more readable:

  -       enum cpu_idle_type idle = this_rq->idle_balance ?
  -                                               CPU_IDLE : CPU_NOT_IDLE;
  -
  +       enum cpu_idle_type idle = this_rq->idle_balance;

  --------------------------------

  -       if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
  +       if (!env->idle || !busiest->sum_nr_running)

  --------------------------------

And gets rid of the double negation in these usages:

  -               if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
  +               if (env->idle && env->src_rq->nr_running <= 1)

Furthermore, this makes code much more obvious where there's
differentiation between CPU_IDLE and CPU_NEWLY_IDLE.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Link: https://lore.kernel.org/r/20240308105901.1096078-4-mingo@kernel.org
---
 include/linux/sched/idle.h |  2 +-
 kernel/sched/fair.c        | 27 ++++++++++++---------------
 2 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h
index 478084f9105e..e670ac282333 100644
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -5,8 +5,8 @@
 #include <linux/sched.h>
 
 enum cpu_idle_type {
+	__CPU_NOT_IDLE = 0,
 	CPU_IDLE,
-	CPU_NOT_IDLE,
 	CPU_NEWLY_IDLE,
 	CPU_MAX_IDLE_TYPES
 };
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ef89b36aed1..3a510cf1fb00 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9070,7 +9070,7 @@ static int detach_tasks(struct lb_env *env)
 		 * We don't want to steal all, otherwise we may be treated likewise,
 		 * which could at worst lead to a livelock crash.
 		 */
-		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
+		if (env->idle && env->src_rq->nr_running <= 1)
 			break;
 
 		env->loop++;
@@ -9803,7 +9803,7 @@ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
 static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
 			       struct sched_group *group)
 {
-	if (env->idle == CPU_NOT_IDLE)
+	if (!env->idle)
 		return false;
 
 	/*
@@ -9827,7 +9827,7 @@ static inline long sibling_imbalance(struct lb_env *env,
 	int ncores_busiest, ncores_local;
 	long imbalance;
 
-	if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+	if (!env->idle || !busiest->sum_nr_running)
 		return 0;
 
 	ncores_busiest = sds->busiest->cores;
@@ -9927,8 +9927,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 				sgs->group_misfit_task_load = rq->misfit_task_load;
 				*sg_status |= SG_OVERLOAD;
 			}
-		} else if ((env->idle != CPU_NOT_IDLE) &&
-			   sched_reduced_capacity(rq, env->sd)) {
+		} else if (env->idle && sched_reduced_capacity(rq, env->sd)) {
 			/* Check for a task running on a CPU with reduced capacity */
 			if (sgs->group_misfit_task_load < load)
 				sgs->group_misfit_task_load = load;
@@ -9940,7 +9939,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	sgs->group_weight = group->group_weight;
 
 	/* Check if dst CPU is idle and preferred to this group */
-	if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+	if (!local_group && env->idle && sgs->sum_h_nr_running &&
 	    sched_group_asym(env, sgs, group))
 		sgs->group_asym_packing = 1;
 
@@ -10698,7 +10697,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 			 * waiting task in this overloaded busiest group. Let's
 			 * try to pull it.
 			 */
-			if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
+			if (env->idle && env->imbalance == 0) {
 				env->migration_type = migrate_task;
 				env->imbalance = 1;
 			}
@@ -10913,7 +10912,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 		goto force_balance;
 
 	if (busiest->group_type != group_overloaded) {
-		if (env->idle == CPU_NOT_IDLE) {
+		if (!env->idle) {
 			/*
 			 * If the busiest group is not overloaded (and as a
 			 * result the local one too) but this CPU is already
@@ -11121,7 +11120,7 @@ asym_active_balance(struct lb_env *env)
 	 * the lower priority @env::dst_cpu help it. Do not follow
 	 * CPU priority.
 	 */
-	return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
+	return env->idle && sched_use_asym_prio(env->sd, env->dst_cpu) &&
 	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
 		!sched_use_asym_prio(env->sd, env->src_cpu));
 }
@@ -11159,7 +11158,7 @@ static int need_active_balance(struct lb_env *env)
 	 * because of other sched_class or IRQs if more capacity stays
 	 * available on dst_cpu.
 	 */
-	if ((env->idle != CPU_NOT_IDLE) &&
+	if (env->idle &&
 	    (env->src_rq->cfs.h_nr_running == 1)) {
 		if ((check_cpu_capacity(env->src_rq, sd)) &&
 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
@@ -11735,8 +11734,8 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
 				 * env->dst_cpu, so we can't know our idle
 				 * state even if we migrated tasks. Update it.
 				 */
-				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
-				busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+				idle = idle_cpu(cpu);
+				busy = !idle && !sched_idle_cpu(cpu);
 			}
 			sd->last_balance = jiffies;
 			interval = get_sd_balance_interval(sd, busy);
@@ -12416,9 +12415,7 @@ out:
 static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 {
 	struct rq *this_rq = this_rq();
-	enum cpu_idle_type idle = this_rq->idle_balance ?
-						CPU_IDLE : CPU_NOT_IDLE;
-
+	enum cpu_idle_type idle = this_rq->idle_balance;
 	/*
 	 * If this CPU has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle CPUs whose ticks are
-- 
cgit v1.2.3


From 86dd6c04ef9f213e14d60c9f64bce1cc019f816e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 8 Mar 2024 12:18:08 +0100
Subject: sched/balancing: Rename scheduler_tick() => sched_tick()

- Standardize on prefixing scheduler-internal functions defined
  in <linux/sched.h> with sched_*() prefix. scheduler_tick() was
  the only function using the scheduler_ prefix. Harmonize it.

- The other reason to rename it is the NOHZ scheduler tick
  handling functions are already named sched_tick_*().
  Make the 'git grep sched_tick' more meaningful.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://lore.kernel.org/r/20240308111819.1101550-3-mingo@kernel.org
---
 Documentation/scheduler/sched-domains.rst                            | 4 ++--
 Documentation/translations/zh_CN/scheduler/sched-domains.rst         | 4 ++--
 include/linux/sched.h                                                | 2 +-
 kernel/sched/core.c                                                  | 4 ++--
 kernel/sched/loadavg.c                                               | 2 +-
 kernel/time/timer.c                                                  | 2 +-
 kernel/workqueue.c                                                   | 2 +-
 tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst
index 6577b068f921..541d6c617971 100644
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -32,13 +32,13 @@ load of each of its member CPUs, and only when the load of a group becomes
 out of balance are tasks moved between groups.
 
 In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU
-through scheduler_tick(). It raises a softirq after the next regularly scheduled
+through sched_tick(). It raises a softirq after the next regularly scheduled
 rebalancing event for the current runqueue has arrived. The actual load
 balancing workhorse, sched_balance_softirq()->rebalance_domains(), is then run
 in softirq context (SCHED_SOFTIRQ).
 
 The latter function takes two arguments: the runqueue of current CPU and whether
-the CPU was idle at the time the scheduler_tick() happened and iterates over all
+the CPU was idle at the time the sched_tick() happened and iterates over all
 sched domains our CPU is on, starting from its base domain and going up the ->parent
 chain. While doing that, it checks to see if the current domain has exhausted its
 rebalance interval. If so, it runs load_balance() on that domain. It then checks
diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst
index fbc326668e37..fa0c0bcc6ba5 100644
--- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst
+++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst
@@ -34,12 +34,12 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空，如果是这
 调度域中的负载均衡发生在调度组中。也就是说，每个组被视为一个实体。组的负载被定义为它
 管辖的每个CPU的负载之和。仅当组的负载不均衡后，任务才在组之间发生迁移。
 
-在kernel/sched/core.c中，trigger_load_balance()在每个CPU上通过scheduler_tick()
+在kernel/sched/core.c中，trigger_load_balance()在每个CPU上通过sched_tick()
 周期执行。在当前运行队列下一个定期调度再平衡事件到达后，它引发一个软中断。负载均衡真正
 的工作由sched_balance_softirq()->rebalance_domains()完成，在软中断上下文中执行
 （SCHED_SOFTIRQ）。
 
-后一个函数有两个入参：当前CPU的运行队列、它在scheduler_tick()调用时是否空闲。函数会从
+后一个函数有两个入参：当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从
 当前CPU所在的基调度域开始迭代执行，并沿着parent指针链向上进入更高层级的调度域。在迭代
 过程中，函数会检查当前调度域是否已经耗尽了再平衡的时间间隔，如果是，它在该调度域运行
 load_balance()。接下来它检查父调度域（如果存在），再后来父调度域的父调度域，以此类推。
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 17cb0761ff65..7eb7f31af796 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -301,7 +301,7 @@ enum {
 	TASK_COMM_LEN = 16,
 };
 
-extern void scheduler_tick(void);
+extern void sched_tick(void);
 
 #define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d44efa0d0611..71b7a08a6502 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5662,7 +5662,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
  */
-void scheduler_tick(void)
+void sched_tick(void)
 {
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);
@@ -6585,7 +6585,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
  *      paths. For example, see arch/x86/entry_64.S.
  *
  *      To drive preemption between tasks, the scheduler sets the flag in timer
- *      interrupt handler scheduler_tick().
+ *      interrupt handler sched_tick().
  *
  *   3. Wakeups don't really cause entry into schedule(). They add a
  *      task to the run-queue and that's it.
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 52c8f8226b0d..ca9da66cc894 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -379,7 +379,7 @@ void calc_global_load(void)
 }
 
 /*
- * Called from scheduler_tick() to periodically update this CPU's
+ * Called from sched_tick() to periodically update this CPU's
  * active count.
  */
 void calc_global_load_tick(struct rq *this_rq)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e69e75d3858c..ff49ddcc9800 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2478,7 +2478,7 @@ void update_process_times(int user_tick)
 	if (in_irq())
 		irq_work_tick();
 #endif
-	scheduler_tick();
+	sched_tick();
 	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
 		run_posix_cpu_timers();
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bf2bdac46843..8fbb0ec39079 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1464,7 +1464,7 @@ void wq_worker_sleeping(struct task_struct *task)
  * wq_worker_tick - a scheduler tick occurred while a kworker is running
  * @task: task currently running
  *
- * Called from scheduler_tick(). We're in the IRQ context and the current
+ * Called from sched_tick(). We're in the IRQ context and the current
  * worker's fields which follow the 'K' locking rule can be accessed safely.
  */
 void wq_worker_tick(struct task_struct *task)
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
index 25432b8cd5bd..073a748b9380 100644
--- a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
@@ -19,7 +19,7 @@ fail() { # mesg
 
 FILTER=set_ftrace_filter
 FUNC1="schedule"
-FUNC2="scheduler_tick"
+FUNC2="sched_tick"
 
 ALL_FUNCS="#### all functions enabled ####"
 
-- 
cgit v1.2.3


From 4c3e509ea9f249458e8692f8298cceac73105948 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 8 Mar 2024 12:18:11 +0100
Subject: sched/balancing: Rename load_balance() => sched_balance_rq()

Standardize scheduler load-balancing function names on the
sched_balance_() prefix.

Also load_balance() has become somewhat of a misnomer: historically
it was the first and primary load-balancing function that was called,
but with the introduction of sched domains, it's become a lower
layer function that balances runqueues.

Rename it to sched_balance_rq() accordingly.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Link: https://lore.kernel.org/r/20240308111819.1101550-6-mingo@kernel.org
---
 Documentation/scheduler/sched-domains.rst          |  4 +--
 Documentation/scheduler/sched-stats.rst            | 32 +++++++++++-----------
 .../translations/zh_CN/scheduler/sched-domains.rst |  4 +--
 .../translations/zh_CN/scheduler/sched-stats.rst   | 30 ++++++++++----------
 include/linux/sched/topology.h                     |  2 +-
 kernel/sched/fair.c                                | 10 +++----
 6 files changed, 41 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst
index 5d8e8b8b269e..5e996fe973b1 100644
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -41,11 +41,11 @@ The latter function takes two arguments: the runqueue of current CPU and whether
 the CPU was idle at the time the sched_tick() happened and iterates over all
 sched domains our CPU is on, starting from its base domain and going up the ->parent
 chain. While doing that, it checks to see if the current domain has exhausted its
-rebalance interval. If so, it runs load_balance() on that domain. It then checks
+rebalance interval. If so, it runs sched_balance_rq() on that domain. It then checks
 the parent sched_domain (if it exists), and the parent of the parent and so
 forth.
 
-Initially, load_balance() finds the busiest group in the current sched domain.
+Initially, sched_balance_rq() finds the busiest group in the current sched domain.
 If it succeeds, it looks for the busiest runqueue of all the CPUs' runqueues in
 that group. If it manages to find such a runqueue, it locks both our initial
 CPU's runqueue and the newly found busiest one and starts moving tasks from it
diff --git a/Documentation/scheduler/sched-stats.rst b/Documentation/scheduler/sched-stats.rst
index 73c412666655..7c2b16c4729d 100644
--- a/Documentation/scheduler/sched-stats.rst
+++ b/Documentation/scheduler/sched-stats.rst
@@ -77,53 +77,53 @@ domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 
 The first field is a bit mask indicating what cpus this domain operates over.
 
-The next 24 are a variety of load_balance() statistics in grouped into types
+The next 24 are a variety of sched_balance_rq() statistics in grouped into types
 of idleness (idle, busy, and newly idle):
 
-    1)  # of times in this domain load_balance() was called when the
+    1)  # of times in this domain sched_balance_rq() was called when the
         cpu was idle
-    2)  # of times in this domain load_balance() checked but found
+    2)  # of times in this domain sched_balance_rq() checked but found
         the load did not require balancing when the cpu was idle
-    3)  # of times in this domain load_balance() tried to move one or
+    3)  # of times in this domain sched_balance_rq() tried to move one or
         more tasks and failed, when the cpu was idle
     4)  sum of imbalances discovered (if any) with each call to
-        load_balance() in this domain when the cpu was idle
+        sched_balance_rq() in this domain when the cpu was idle
     5)  # of times in this domain pull_task() was called when the cpu
         was idle
     6)  # of times in this domain pull_task() was called even though
         the target task was cache-hot when idle
-    7)  # of times in this domain load_balance() was called but did
+    7)  # of times in this domain sched_balance_rq() was called but did
         not find a busier queue while the cpu was idle
     8)  # of times in this domain a busier queue was found while the
         cpu was idle but no busier group was found
-    9)  # of times in this domain load_balance() was called when the
+    9)  # of times in this domain sched_balance_rq() was called when the
         cpu was busy
-    10) # of times in this domain load_balance() checked but found the
+    10) # of times in this domain sched_balance_rq() checked but found the
         load did not require balancing when busy
-    11) # of times in this domain load_balance() tried to move one or
+    11) # of times in this domain sched_balance_rq() tried to move one or
         more tasks and failed, when the cpu was busy
     12) sum of imbalances discovered (if any) with each call to
-        load_balance() in this domain when the cpu was busy
+        sched_balance_rq() in this domain when the cpu was busy
     13) # of times in this domain pull_task() was called when busy
     14) # of times in this domain pull_task() was called even though the
         target task was cache-hot when busy
-    15) # of times in this domain load_balance() was called but did not
+    15) # of times in this domain sched_balance_rq() was called but did not
         find a busier queue while the cpu was busy
     16) # of times in this domain a busier queue was found while the cpu
         was busy but no busier group was found
 
-    17) # of times in this domain load_balance() was called when the
+    17) # of times in this domain sched_balance_rq() was called when the
         cpu was just becoming idle
-    18) # of times in this domain load_balance() checked but found the
+    18) # of times in this domain sched_balance_rq() checked but found the
         load did not require balancing when the cpu was just becoming idle
-    19) # of times in this domain load_balance() tried to move one or more
+    19) # of times in this domain sched_balance_rq() tried to move one or more
         tasks and failed, when the cpu was just becoming idle
     20) sum of imbalances discovered (if any) with each call to
-        load_balance() in this domain when the cpu was just becoming idle
+        sched_balance_rq() in this domain when the cpu was just becoming idle
     21) # of times in this domain pull_task() was called when newly idle
     22) # of times in this domain pull_task() was called even though the
         target task was cache-hot when just becoming idle
-    23) # of times in this domain load_balance() was called but did not
+    23) # of times in this domain sched_balance_rq() was called but did not
         find a busier queue while the cpu was just becoming idle
     24) # of times in this domain a busier queue was found while the cpu
         was just becoming idle but no busier group was found
diff --git a/Documentation/translations/zh_CN/scheduler/sched-domains.rst b/Documentation/translations/zh_CN/scheduler/sched-domains.rst
index e6590fd80640..06363169c56b 100644
--- a/Documentation/translations/zh_CN/scheduler/sched-domains.rst
+++ b/Documentation/translations/zh_CN/scheduler/sched-domains.rst
@@ -42,9 +42,9 @@ CPU共享。任意两个组的CPU掩码的交集不一定为空，如果是这
 后一个函数有两个入参：当前CPU的运行队列、它在sched_tick()调用时是否空闲。函数会从
 当前CPU所在的基调度域开始迭代执行，并沿着parent指针链向上进入更高层级的调度域。在迭代
 过程中，函数会检查当前调度域是否已经耗尽了再平衡的时间间隔，如果是，它在该调度域运行
-load_balance()。接下来它检查父调度域（如果存在），再后来父调度域的父调度域，以此类推。
+sched_balance_rq()。接下来它检查父调度域（如果存在），再后来父调度域的父调度域，以此类推。
 
-起初，load_balance()查找当前调度域中最繁忙的调度组。如果成功，在该调度组管辖的全部CPU
+起初，sched_balance_rq()查找当前调度域中最繁忙的调度组。如果成功，在该调度组管辖的全部CPU
 的运行队列中找出最繁忙的运行队列。如能找到，对当前的CPU运行队列和新找到的最繁忙运行
 队列均加锁，并把任务从最繁忙队列中迁移到当前CPU上。被迁移的任务数量等于在先前迭代执行
 中计算出的该调度域的调度组的不均衡值。
diff --git a/Documentation/translations/zh_CN/scheduler/sched-stats.rst b/Documentation/translations/zh_CN/scheduler/sched-stats.rst
index c5e0be663837..09eee2517610 100644
--- a/Documentation/translations/zh_CN/scheduler/sched-stats.rst
+++ b/Documentation/translations/zh_CN/scheduler/sched-stats.rst
@@ -75,42 +75,42 @@ domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 繁忙，新空闲）：
 
 
-    1)  当CPU空闲时，load_balance()在这个调度域中被调用了#次
-    2)  当CPU空闲时，load_balance()在这个调度域中被调用，但是发现负载无需
+    1)  当CPU空闲时，sched_balance_rq()在这个调度域中被调用了#次
+    2)  当CPU空闲时，sched_balance_rq()在这个调度域中被调用，但是发现负载无需
         均衡#次
-    3)  当CPU空闲时，load_balance()在这个调度域中被调用，试图迁移1个或更多
+    3)  当CPU空闲时，sched_balance_rq()在这个调度域中被调用，试图迁移1个或更多
         任务且失败了#次
-    4)  当CPU空闲时，load_balance()在这个调度域中被调用，发现不均衡（如果有）
+    4)  当CPU空闲时，sched_balance_rq()在这个调度域中被调用，发现不均衡（如果有）
         #次
     5)  当CPU空闲时，pull_task()在这个调度域中被调用#次
     6)  当CPU空闲时，尽管目标任务是热缓存状态，pull_task()依然被调用#次
-    7)  当CPU空闲时，load_balance()在这个调度域中被调用，未能找到更繁忙的
+    7)  当CPU空闲时，sched_balance_rq()在这个调度域中被调用，未能找到更繁忙的
         队列#次
     8)  当CPU空闲时，在调度域中找到了更繁忙的队列，但未找到更繁忙的调度组
         #次
-    9)  当CPU繁忙时，load_balance()在这个调度域中被调用了#次
-    10) 当CPU繁忙时，load_balance()在这个调度域中被调用，但是发现负载无需
+    9)  当CPU繁忙时，sched_balance_rq()在这个调度域中被调用了#次
+    10) 当CPU繁忙时，sched_balance_rq()在这个调度域中被调用，但是发现负载无需
         均衡#次
-    11) 当CPU繁忙时，load_balance()在这个调度域中被调用，试图迁移1个或更多
+    11) 当CPU繁忙时，sched_balance_rq()在这个调度域中被调用，试图迁移1个或更多
         任务且失败了#次
-    12) 当CPU繁忙时，load_balance()在这个调度域中被调用，发现不均衡（如果有）
+    12) 当CPU繁忙时，sched_balance_rq()在这个调度域中被调用，发现不均衡（如果有）
         #次
     13) 当CPU繁忙时，pull_task()在这个调度域中被调用#次
     14) 当CPU繁忙时，尽管目标任务是热缓存状态，pull_task()依然被调用#次
-    15) 当CPU繁忙时，load_balance()在这个调度域中被调用，未能找到更繁忙的
+    15) 当CPU繁忙时，sched_balance_rq()在这个调度域中被调用，未能找到更繁忙的
         队列#次
     16) 当CPU繁忙时，在调度域中找到了更繁忙的队列，但未找到更繁忙的调度组
         #次
-    17) 当CPU新空闲时，load_balance()在这个调度域中被调用了#次
-    18) 当CPU新空闲时，load_balance()在这个调度域中被调用，但是发现负载无需
+    17) 当CPU新空闲时，sched_balance_rq()在这个调度域中被调用了#次
+    18) 当CPU新空闲时，sched_balance_rq()在这个调度域中被调用，但是发现负载无需
         均衡#次
-    19) 当CPU新空闲时，load_balance()在这个调度域中被调用，试图迁移1个或更多
+    19) 当CPU新空闲时，sched_balance_rq()在这个调度域中被调用，试图迁移1个或更多
         任务且失败了#次
-    20) 当CPU新空闲时，load_balance()在这个调度域中被调用，发现不均衡（如果有）
+    20) 当CPU新空闲时，sched_balance_rq()在这个调度域中被调用，发现不均衡（如果有）
         #次
     21) 当CPU新空闲时，pull_task()在这个调度域中被调用#次
     22) 当CPU新空闲时，尽管目标任务是热缓存状态，pull_task()依然被调用#次
-    23) 当CPU新空闲时，load_balance()在这个调度域中被调用，未能找到更繁忙的
+    23) 当CPU新空闲时，sched_balance_rq()在这个调度域中被调用，未能找到更繁忙的
         队列#次
     24) 当CPU新空闲时，在调度域中找到了更繁忙的队列，但未找到更繁忙的调度组
         #次
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 18572c9ea724..c8fe9bab981b 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -110,7 +110,7 @@ struct sched_domain {
 	unsigned long last_decay_max_lb_cost;
 
 #ifdef CONFIG_SCHEDSTATS
-	/* load_balance() stats */
+	/* sched_balance_rq() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
 	unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
 	unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 330788b0c617..0d2753c50be9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6866,7 +6866,7 @@ dequeue_throttle:
 
 #ifdef CONFIG_SMP
 
-/* Working cpumask for: load_balance, load_balance_newidle. */
+/* Working cpumask for: sched_balance_rq, load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
 static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask);
 static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
@@ -11242,7 +11242,7 @@ static int should_we_balance(struct lb_env *env)
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  */
-static int load_balance(int this_cpu, struct rq *this_rq,
+static int sched_balance_rq(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *continue_balancing)
 {
@@ -11647,7 +11647,7 @@ out_unlock:
 static atomic_t sched_balance_running = ATOMIC_INIT(0);
 
 /*
- * Scale the max load_balance interval with the number of CPUs in the system.
+ * Scale the max sched_balance_rq interval with the number of CPUs in the system.
  * This trades load-balance latency on larger machines for less cross talk.
  */
 void update_max_interval(void)
@@ -11727,7 +11727,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
+			if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
 				 * The LBF_DST_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
@@ -12353,7 +12353,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 
-			pulled_task = load_balance(this_cpu, this_rq,
+			pulled_task = sched_balance_rq(this_cpu, this_rq,
 						   sd, CPU_NEWLY_IDLE,
 						   &continue_balancing);
 
-- 
cgit v1.2.3


From e27b02e23a701e5761f1d6028643e1203a1c56de Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 9 Mar 2024 14:45:38 -0800
Subject: ssb: drop use of non-existing CONFIG_SSB_DEBUG symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CONFIG_SSB_DEBUG is used in one header file and nowhere else, so
remove it and the now-empty inline function that contained it. Also
remove the call to the empty inline function. The empty "default:"
case is kept to prevent 2 compiler warnings:

  drivers/ssb/main.c:1133:9: warning: enumeration value 'SSB_BUSTYPE_PCMCIA' not handled in switch [-Wswitch]
  drivers/ssb/main.c:1133:9: warning: enumeration value 'SSB_BUSTYPE_SDIO' not handled in switch [-Wswitch]

Reported-by: Ying Sun <sunying@isrc.iscas.ac.cn>
Link: https://lore.kernel.org/lkml/4e8525fe.607e2.18a8ddfdce8.Coremail.sunying@isrc.iscas.ac.cn/
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Michael Büsch <m@bues.ch>
Cc: linux-wireless@vger.kernel.org
Cc: Kalle Valo <kvalo@kernel.org>
Acked-by: Michael Büsch <m@bues.ch>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240309224540.22682-1-rdunlap@infradead.org
---
 drivers/ssb/main.c      | 1 -
 include/linux/ssb/ssb.h | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/drivers/ssb/main.c b/drivers/ssb/main.c
index bdb6595ffd2d..8210bf28a340 100644
--- a/drivers/ssb/main.c
+++ b/drivers/ssb/main.c
@@ -1146,7 +1146,6 @@ u32 ssb_dma_translation(struct ssb_device *dev)
 				return SSB_PCI_DMA;
 		}
 	default:
-		__ssb_dma_not_implemented(dev);
 	}
 	return 0;
 }
diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h
index 1f326da289d3..a2257380c3f1 100644
--- a/include/linux/ssb/ssb.h
+++ b/include/linux/ssb/ssb.h
@@ -621,14 +621,6 @@ extern u32 ssb_dma_translation(struct ssb_device *dev);
 #define SSB_DMA_TRANSLATION_MASK	0xC0000000
 #define SSB_DMA_TRANSLATION_SHIFT	30
 
-static inline void __cold __ssb_dma_not_implemented(struct ssb_device *dev)
-{
-#ifdef CONFIG_SSB_DEBUG
-	printk(KERN_ERR "SSB: BUG! Calling DMA API for "
-	       "unsupported bustype %d\n", dev->bus->bustype);
-#endif /* DEBUG */
-}
-
 #ifdef CONFIG_SSB_PCIHOST
 /* PCI-host wrapper driver */
 extern int ssb_pcihost_register(struct pci_driver *driver);
-- 
cgit v1.2.3


From 8e61e3192a77e1006c1c54c022173026f7afae2c Mon Sep 17 00:00:00 2001
From: Matt Roper <matthew.d.roper@intel.com>
Date: Tue, 12 Mar 2024 14:12:25 -0700
Subject: drm/xe/uapi: Add IP version and stepping to GT list query

For modern platforms (MTL and later), both kernel and userspace drivers
are expected to apply GT programming and workarounds based on the IP
version and stepping self-reported by the GT hardware via the GMD_ID
registers.  Since userspace drivers can't access these registers
directly, pass along the version and stepping information via the GT
list query.  Note that the new query fields will remain 0's when running
on pre-GMD_ID platforms.  Userspace is expected to continue using PCI
devid / revid on those older platforms.

Although the hardware also has a GMD_ID register for display
version/stepping, that value is intentionally *not* included anywhere in
the Xe uapi.  Display userspace should be using platform-agnostic APIs
and auto-detecting platform capabilities rather than matching specific
IP versions.

v2:
 - s/revid/rev/  (Lucas)
 - Fix kerneldoc copy/paste mistakes

Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240312211229.2871288-4-matthew.d.roper@intel.com
---
 drivers/gpu/drm/xe/xe_query.c |  8 ++++++++
 include/uapi/drm/xe_drm.h     | 10 +++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
index a6a20a6dd360..e80321b34918 100644
--- a/drivers/gpu/drm/xe/xe_query.c
+++ b/drivers/gpu/drm/xe/xe_query.c
@@ -12,6 +12,7 @@
 #include <drm/xe_drm.h>
 
 #include "regs/xe_engine_regs.h"
+#include "regs/xe_gt_regs.h"
 #include "xe_bo.h"
 #include "xe_device.h"
 #include "xe_exec_queue.h"
@@ -401,6 +402,13 @@ static int query_gt_list(struct xe_device *xe, struct drm_xe_device_query *query
 				BIT(gt_to_tile(gt)->id) << 1;
 		gt_list->gt_list[id].far_mem_regions = xe->info.mem_region_mask ^
 			gt_list->gt_list[id].near_mem_regions;
+
+		gt_list->gt_list[id].ip_ver_major =
+			REG_FIELD_GET(GMD_ID_ARCH_MASK, gt->info.gmdid);
+		gt_list->gt_list[id].ip_ver_minor =
+			REG_FIELD_GET(GMD_ID_RELEASE_MASK, gt->info.gmdid);
+		gt_list->gt_list[id].ip_ver_rev =
+			REG_FIELD_GET(GMD_ID_REVID, gt->info.gmdid);
 	}
 
 	if (copy_to_user(query_ptr, gt_list, size)) {
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 2fc19177d2b0..808ad1c308ec 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -459,8 +459,16 @@ struct drm_xe_gt {
 	 * by struct drm_xe_query_mem_regions' mem_class.
 	 */
 	__u64 far_mem_regions;
+	/** @ip_ver_major: Graphics/media IP major version on GMD_ID platforms */
+	__u16 ip_ver_major;
+	/** @ip_ver_minor: Graphics/media IP minor version on GMD_ID platforms */
+	__u16 ip_ver_minor;
+	/** @ip_ver_rev: Graphics/media IP revision version on GMD_ID platforms */
+	__u16 ip_ver_rev;
+	/** @pad2: MBZ */
+	__u16 pad2;
 	/** @reserved: Reserved */
-	__u64 reserved[8];
+	__u64 reserved[7];
 };
 
 /**
-- 
cgit v1.2.3


From 1793f69326266269a8117d3f5324ac78db18931c Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 13 Mar 2024 07:58:36 +0000
Subject: vmlinux.lds.h: Fix a typo in comment

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20240313075839.8321-2-richard.weiyang@gmail.com
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5dd3a61d673d..514d3002ad8a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -3,7 +3,7 @@
  * linker scripts.
  *
  * A minimal linker scripts has following content:
- * [This is a sample, architectures may have special requiriements]
+ * [This is a sample, architectures may have special requirements]
  *
  * OUTPUT_FORMAT(...)
  * OUTPUT_ARCH(...)
-- 
cgit v1.2.3


From 64be03575f9e9772ebdebc7f067d533348602083 Mon Sep 17 00:00:00 2001
From: Fiona Klute <fiona.klute@gmx.de>
Date: Mon, 11 Mar 2024 11:37:13 +0100
Subject: wifi: rtw88: SDIO device driver for RTL8723CS

This driver uses the new rtw8703b chip driver code.

Acked-by: Ping-Ke Shih <pkshih@realtek.com>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For SDIO
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Fiona Klute <fiona.klute@gmx.de>
Signed-off-by: Kalle Valo <kvalo@kernel.org>
Link: https://msgid.link/20240311103735.615541-10-fiona.klute@gmx.de
---
 drivers/net/wireless/realtek/rtw88/Kconfig     | 18 ++++++++++++++
 drivers/net/wireless/realtek/rtw88/Makefile    |  6 +++++
 drivers/net/wireless/realtek/rtw88/rtw8723cs.c | 34 ++++++++++++++++++++++++++
 include/linux/mmc/sdio_ids.h                   |  1 +
 4 files changed, 59 insertions(+)
 create mode 100644 drivers/net/wireless/realtek/rtw88/rtw8723cs.c

(limited to 'include')

diff --git a/drivers/net/wireless/realtek/rtw88/Kconfig b/drivers/net/wireless/realtek/rtw88/Kconfig
index 07b5b2f6eef7..22838ede03cd 100644
--- a/drivers/net/wireless/realtek/rtw88/Kconfig
+++ b/drivers/net/wireless/realtek/rtw88/Kconfig
@@ -31,6 +31,10 @@ config RTW88_8822C
 config RTW88_8723X
 	tristate
 
+config RTW88_8703B
+	tristate
+	select RTW88_8723X
+
 config RTW88_8723D
 	tristate
 	select RTW88_8723X
@@ -126,6 +130,20 @@ config RTW88_8723DS
 
 	  802.11n SDIO wireless network adapter
 
+config RTW88_8723CS
+	tristate "Realtek 8723CS SDIO wireless network adapter"
+	depends on MMC
+	select RTW88_CORE
+	select RTW88_SDIO
+	select RTW88_8703B
+	help
+	  Select this option to enable support for 8723CS chipset (EXPERIMENTAL)
+
+	  This module adds support for the 8723CS 802.11n SDIO
+	  wireless network adapter.
+
+	  If you choose to build a module, it'll be called rtw88_8723cs.
+
 config RTW88_8723DU
 	tristate "Realtek 8723DU USB wireless network adapter"
 	depends on USB
diff --git a/drivers/net/wireless/realtek/rtw88/Makefile b/drivers/net/wireless/realtek/rtw88/Makefile
index 22516c984608..8f47359b4380 100644
--- a/drivers/net/wireless/realtek/rtw88/Makefile
+++ b/drivers/net/wireless/realtek/rtw88/Makefile
@@ -47,6 +47,12 @@ rtw88_8822cu-objs		:= rtw8822cu.o
 obj-$(CONFIG_RTW88_8723X)	+= rtw88_8723x.o
 rtw88_8723x-objs		:= rtw8723x.o
 
+obj-$(CONFIG_RTW88_8703B)	+= rtw88_8703b.o
+rtw88_8703b-objs		:= rtw8703b.o rtw8703b_tables.o
+
+obj-$(CONFIG_RTW88_8723CS)	+= rtw88_8723cs.o
+rtw88_8723cs-objs		:= rtw8723cs.o
+
 obj-$(CONFIG_RTW88_8723D)	+= rtw88_8723d.o
 rtw88_8723d-objs		:= rtw8723d.o rtw8723d_table.o
 
diff --git a/drivers/net/wireless/realtek/rtw88/rtw8723cs.c b/drivers/net/wireless/realtek/rtw88/rtw8723cs.c
new file mode 100644
index 000000000000..8d38d36be8c0
--- /dev/null
+++ b/drivers/net/wireless/realtek/rtw88/rtw8723cs.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* Copyright Fiona Klute <fiona.klute@gmx.de> */
+
+#include <linux/mmc/sdio_func.h>
+#include <linux/mmc/sdio_ids.h>
+#include <linux/module.h>
+#include "main.h"
+#include "rtw8703b.h"
+#include "sdio.h"
+
+static const struct sdio_device_id rtw_8723cs_id_table[] = {
+	{
+		SDIO_DEVICE(SDIO_VENDOR_ID_REALTEK,
+			    SDIO_DEVICE_ID_REALTEK_RTW8723CS),
+		.driver_data = (kernel_ulong_t)&rtw8703b_hw_spec,
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(sdio, rtw_8723cs_id_table);
+
+static struct sdio_driver rtw_8723cs_driver = {
+	.name = "rtw8723cs",
+	.id_table = rtw_8723cs_id_table,
+	.probe = rtw_sdio_probe,
+	.remove = rtw_sdio_remove,
+	.drv = {
+		.pm = &rtw_sdio_pm_ops,
+		.shutdown = rtw_sdio_shutdown
+	}};
+module_sdio_driver(rtw_8723cs_driver);
+
+MODULE_AUTHOR("Fiona Klute <fiona.klute@gmx.de>");
+MODULE_DESCRIPTION("Realtek 802.11n wireless 8723cs driver");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h
index 7fada7a714fe..7cddfdac2f57 100644
--- a/include/linux/mmc/sdio_ids.h
+++ b/include/linux/mmc/sdio_ids.h
@@ -124,6 +124,7 @@
 #define SDIO_DEVICE_ID_REALTEK_RTW8723DS_2ANT	0xd723
 #define SDIO_DEVICE_ID_REALTEK_RTW8723DS_1ANT	0xd724
 #define SDIO_DEVICE_ID_REALTEK_RTW8821DS	0xd821
+#define SDIO_DEVICE_ID_REALTEK_RTW8723CS	0xb703
 
 #define SDIO_VENDOR_ID_SIANO			0x039a
 #define SDIO_DEVICE_ID_SIANO_NOVA_B0		0x0201
-- 
cgit v1.2.3


From a0b39da11618897e0a2b85ea28e34b8cbcbd3274 Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang <hsinyi@chromium.org>
Date: Thu, 7 Mar 2024 14:57:41 -0800
Subject: drm_edid: Add a function to get EDID base block

It's found that some panels have variants that they share the same panel id
although their EDID and names are different. Besides panel id, now we need
more information from the EDID base block to distinguish these panel
variants.

Add drm_edid_read_base_block() to return the EDID base block, which is
wrapped in struct drm_edid.

Caller can further use it to get panel id or check if the block contains
certain strings, such as panel name.

Merge drm_edid_get_panel_id() and edid_extract_panel_id() into one
function.

Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240307230653.1807557-2-hsinyi@chromium.org
---
 drivers/gpu/drm/drm_edid.c        | 71 +++++++++++++++++++++++----------------
 drivers/gpu/drm/panel/panel-edp.c |  8 +++--
 include/drm/drm_edid.h            |  3 +-
 3 files changed, 50 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 923c4423151c..ecd6e9255b67 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -2749,8 +2749,27 @@ const struct drm_edid *drm_edid_read(struct drm_connector *connector)
 }
 EXPORT_SYMBOL(drm_edid_read);
 
-static u32 edid_extract_panel_id(const struct edid *edid)
+/**
+ * drm_edid_get_panel_id - Get a panel's ID from EDID
+ * @drm_edid: EDID that contains panel ID.
+ *
+ * This function uses the first block of the EDID of a panel and (assuming
+ * that the EDID is valid) extracts the ID out of it. The ID is a 32-bit value
+ * (16 bits of manufacturer ID and 16 bits of per-manufacturer ID) that's
+ * supposed to be different for each different modem of panel.
+ *
+ * Return: A 32-bit ID that should be different for each make/model of panel.
+ *         See the functions drm_edid_encode_panel_id() and
+ *         drm_edid_decode_panel_id() for some details on the structure of this
+ *         ID. Return 0 if the EDID size is less than a base block.
+ */
+u32 drm_edid_get_panel_id(const struct drm_edid *drm_edid)
 {
+	const struct edid *edid = drm_edid->edid;
+
+	if (drm_edid->size < EDID_LENGTH)
+		return 0;
+
 	/*
 	 * We represent the ID as a 32-bit number so it can easily be compared
 	 * with "==".
@@ -2768,60 +2787,54 @@ static u32 edid_extract_panel_id(const struct edid *edid)
 	       (u32)edid->mfg_id[1] << 16   |
 	       (u32)EDID_PRODUCT_ID(edid);
 }
+EXPORT_SYMBOL(drm_edid_get_panel_id);
 
 /**
- * drm_edid_get_panel_id - Get a panel's ID through DDC
+ * drm_edid_read_base_block - Get a panel's EDID base block
  * @adapter: I2C adapter to use for DDC
  *
- * This function reads the first block of the EDID of a panel and (assuming
- * that the EDID is valid) extracts the ID out of it. The ID is a 32-bit value
- * (16 bits of manufacturer ID and 16 bits of per-manufacturer ID) that's
- * supposed to be different for each different modem of panel.
+ * This function returns the drm_edid containing the first block of the EDID of
+ * a panel.
  *
  * This function is intended to be used during early probing on devices where
  * more than one panel might be present. Because of its intended use it must
- * assume that the EDID of the panel is correct, at least as far as the ID
- * is concerned (in other words, we don't process any overrides here).
+ * assume that the EDID of the panel is correct, at least as far as the base
+ * block is concerned (in other words, we don't process any overrides here).
+ *
+ * Caller should call drm_edid_free() after use.
  *
  * NOTE: it's expected that this function and drm_do_get_edid() will both
  * be read the EDID, but there is no caching between them. Since we're only
  * reading the first block, hopefully this extra overhead won't be too big.
  *
- * Return: A 32-bit ID that should be different for each make/model of panel.
- *         See the functions drm_edid_encode_panel_id() and
- *         drm_edid_decode_panel_id() for some details on the structure of this
- *         ID.
+ * WARNING: Only use this function when the connector is unknown. For example,
+ * during the early probe of panel. The EDID read from the function is temporary
+ * and should be replaced by the full EDID returned from other drm_edid_read.
+ *
+ * Return: Pointer to allocated EDID base block, or NULL on any failure.
  */
-
-u32 drm_edid_get_panel_id(struct i2c_adapter *adapter)
+const struct drm_edid *drm_edid_read_base_block(struct i2c_adapter *adapter)
 {
 	enum edid_block_status status;
 	void *base_block;
-	u32 panel_id = 0;
-
-	/*
-	 * There are no manufacturer IDs of 0, so if there is a problem reading
-	 * the EDID then we'll just return 0.
-	 */
 
 	base_block = kzalloc(EDID_LENGTH, GFP_KERNEL);
 	if (!base_block)
-		return 0;
+		return NULL;
 
 	status = edid_block_read(base_block, 0, drm_do_probe_ddc_edid, adapter);
 
 	edid_block_status_print(status, base_block, 0);
 
-	if (edid_block_status_valid(status, edid_block_tag(base_block)))
-		panel_id = edid_extract_panel_id(base_block);
-	else
+	if (!edid_block_status_valid(status, edid_block_tag(base_block))) {
 		edid_block_dump(KERN_NOTICE, base_block, 0);
+		kfree(base_block);
+		return NULL;
+	}
 
-	kfree(base_block);
-
-	return panel_id;
+	return _drm_edid_alloc(base_block, EDID_LENGTH);
 }
-EXPORT_SYMBOL(drm_edid_get_panel_id);
+EXPORT_SYMBOL(drm_edid_read_base_block);
 
 /**
  * drm_get_edid_switcheroo - get EDID data for a vga_switcheroo output
@@ -2906,7 +2919,7 @@ EXPORT_SYMBOL(drm_edid_duplicate);
  */
 static u32 edid_get_quirks(const struct drm_edid *drm_edid)
 {
-	u32 panel_id = edid_extract_panel_id(drm_edid->edid);
+	u32 panel_id = drm_edid_get_panel_id(drm_edid);
 	const struct edid_quirk *quirk;
 	int i;
 
diff --git a/drivers/gpu/drm/panel/panel-edp.c b/drivers/gpu/drm/panel/panel-edp.c
index 2906a382aae0..f7db5b10e233 100644
--- a/drivers/gpu/drm/panel/panel-edp.c
+++ b/drivers/gpu/drm/panel/panel-edp.c
@@ -766,6 +766,7 @@ static const struct edp_panel_entry *find_edp_panel(u32 panel_id);
 static int generic_edp_panel_probe(struct device *dev, struct panel_edp *panel)
 {
 	struct panel_desc *desc;
+	const struct drm_edid *base_block;
 	u32 panel_id;
 	char vend[4];
 	u16 product_id;
@@ -795,8 +796,11 @@ static int generic_edp_panel_probe(struct device *dev, struct panel_edp *panel)
 		goto exit;
 	}
 
-	panel_id = drm_edid_get_panel_id(panel->ddc);
-	if (!panel_id) {
+	base_block = drm_edid_read_base_block(panel->ddc);
+	if (base_block) {
+		panel_id = drm_edid_get_panel_id(base_block);
+		drm_edid_free(base_block);
+	} else {
 		dev_err(dev, "Couldn't identify panel via EDID\n");
 		ret = -EIO;
 		goto exit;
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 7923bc00dc7a..9686a7cee6a6 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -410,7 +410,8 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
 	void *data);
 struct edid *drm_get_edid(struct drm_connector *connector,
 			  struct i2c_adapter *adapter);
-u32 drm_edid_get_panel_id(struct i2c_adapter *adapter);
+const struct drm_edid *drm_edid_read_base_block(struct i2c_adapter *adapter);
+u32 drm_edid_get_panel_id(const struct drm_edid *drm_edid);
 struct edid *drm_get_edid_switcheroo(struct drm_connector *connector,
 				     struct i2c_adapter *adapter);
 struct edid *drm_edid_duplicate(const struct edid *edid);
-- 
cgit v1.2.3


From 6e3fdedcf0bc03c852d9fdbb5443f1e43103195f Mon Sep 17 00:00:00 2001
From: Hsin-Yi Wang <hsinyi@chromium.org>
Date: Thu, 7 Mar 2024 14:57:42 -0800
Subject: drm/edid: Add a function to match EDID with identity

Create a type drm_edid_ident as the identity of an EDID. Currently it
contains panel id and monitor name.

Create a function that can match a given EDID and an identity:
1. Reject if the panel id doesn't match.
2. If name is not null in identity, try to match it in the detailed timing
   blocks. Note that some panel vendors put the monitor name after
   EDID_DETAIL_MONITOR_STRING.

Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
Reviewed-by: Douglas Anderson <dianders@chromium.org>
Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240307230653.1807557-3-hsinyi@chromium.org
---
 drivers/gpu/drm/drm_edid.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_edid.h     |  9 +++++++
 2 files changed, 74 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index ecd6e9255b67..8fed2131f424 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -102,6 +102,11 @@ struct detailed_mode_closure {
 	int modes;
 };
 
+struct drm_edid_match_closure {
+	const struct drm_edid_ident *ident;
+	bool matched;
+};
+
 #define LEVEL_DMT	0
 #define LEVEL_GTF	1
 #define LEVEL_GTF2	2
@@ -5455,6 +5460,66 @@ drm_parse_hdmi_vsdb_audio(struct drm_connector *connector, const u8 *db)
 		    connector->audio_latency[0], connector->audio_latency[1]);
 }
 
+static void
+match_identity(const struct detailed_timing *timing, void *data)
+{
+	struct drm_edid_match_closure *closure = data;
+	unsigned int i;
+	const char *name = closure->ident->name;
+	unsigned int name_len = strlen(name);
+	const char *desc = timing->data.other_data.data.str.str;
+	unsigned int desc_len = ARRAY_SIZE(timing->data.other_data.data.str.str);
+
+	if (name_len > desc_len ||
+	    !(is_display_descriptor(timing, EDID_DETAIL_MONITOR_NAME) ||
+	      is_display_descriptor(timing, EDID_DETAIL_MONITOR_STRING)))
+		return;
+
+	if (strncmp(name, desc, name_len))
+		return;
+
+	for (i = name_len; i < desc_len; i++) {
+		if (desc[i] == '\n')
+			break;
+		/* Allow white space before EDID string terminator. */
+		if (!isspace(desc[i]))
+			return;
+	}
+
+	closure->matched = true;
+}
+
+/**
+ * drm_edid_match - match drm_edid with given identity
+ * @drm_edid: EDID
+ * @ident: the EDID identity to match with
+ *
+ * Check if the EDID matches with the given identity.
+ *
+ * Return: True if the given identity matched with EDID, false otherwise.
+ */
+bool drm_edid_match(const struct drm_edid *drm_edid,
+		    const struct drm_edid_ident *ident)
+{
+	if (!drm_edid || drm_edid_get_panel_id(drm_edid) != ident->panel_id)
+		return false;
+
+	/* Match with name only if it's not NULL. */
+	if (ident->name) {
+		struct drm_edid_match_closure closure = {
+			.ident = ident,
+			.matched = false,
+		};
+
+		drm_for_each_detailed_block(drm_edid, match_identity, &closure);
+
+		return closure.matched;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(drm_edid_match);
+
 static void
 monitor_name(const struct detailed_timing *timing, void *data)
 {
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 9686a7cee6a6..6f65bbf655a1 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -312,6 +312,13 @@ struct edid {
 	u8 checksum;
 } __packed;
 
+/* EDID matching */
+struct drm_edid_ident {
+	/* ID encoded by drm_edid_encode_panel_id() */
+	u32 panel_id;
+	const char *name;
+};
+
 #define EDID_PRODUCT_ID(e) ((e)->prod_code[0] | ((e)->prod_code[1] << 8))
 
 /* Short Audio Descriptor */
@@ -412,6 +419,8 @@ struct edid *drm_get_edid(struct drm_connector *connector,
 			  struct i2c_adapter *adapter);
 const struct drm_edid *drm_edid_read_base_block(struct i2c_adapter *adapter);
 u32 drm_edid_get_panel_id(const struct drm_edid *drm_edid);
+bool drm_edid_match(const struct drm_edid *drm_edid,
+		    const struct drm_edid_ident *ident);
 struct edid *drm_get_edid_switcheroo(struct drm_connector *connector,
 				     struct i2c_adapter *adapter);
 struct edid *drm_edid_duplicate(const struct edid *edid);
-- 
cgit v1.2.3


From 7d2cc63eca0c993c99d18893214abf8f85d566d8 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Fri, 8 Mar 2024 06:38:07 +0100
Subject: bpf: Take return from set_memory_ro() into account with
 bpf_prog_lock_ro()

set_memory_ro() can fail, leaving memory unprotected.

Check its return and take it into account as an error.

Link: https://github.com/KSPP/linux/issues/7
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: linux-hardening@vger.kernel.org <linux-hardening@vger.kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Message-ID: <286def78955e04382b227cb3e4b6ba272a7442e3.1709850515.git.christophe.leroy@csgroup.eu>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 5 +++--
 kernel/bpf/core.c      | 4 +++-
 kernel/bpf/verifier.c  | 8 ++++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c99bc3df2d28..9107ee1de66f 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -887,14 +887,15 @@ bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
 
 #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
 
-static inline void bpf_prog_lock_ro(struct bpf_prog *fp)
+static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp)
 {
 #ifndef CONFIG_BPF_JIT_ALWAYS_ON
 	if (!fp->jited) {
 		set_vm_flush_reset_perms(fp);
-		set_memory_ro((unsigned long)fp, fp->pages);
+		return set_memory_ro((unsigned long)fp, fp->pages);
 	}
 #endif
+	return 0;
 }
 
 static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 696bc55de8e8..63f100def31b 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2403,7 +2403,9 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	}
 
 finalize:
-	bpf_prog_lock_ro(fp);
+	*err = bpf_prog_lock_ro(fp);
+	if (*err)
+		return fp;
 
 	/* The tail call compatibility check can only be done at
 	 * this late stage as we need to determine, if we deal
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7b208e5d38f6..de7813947981 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19263,10 +19263,14 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 	 * bpf_prog_load will add the kallsyms for the main program.
 	 */
 	for (i = 1; i < env->subprog_cnt; i++) {
-		bpf_prog_lock_ro(func[i]);
-		bpf_prog_kallsyms_add(func[i]);
+		err = bpf_prog_lock_ro(func[i]);
+		if (err)
+			goto out_free;
 	}
 
+	for (i = 1; i < env->subprog_cnt; i++)
+		bpf_prog_kallsyms_add(func[i]);
+
 	/* Last step: make now unused interpreter insns from main
 	 * prog consistent for later dump requests, so they can
 	 * later look the same as if they were interpreted only.
-- 
cgit v1.2.3


From e60adf513275c3a38e5cb67f7fd12387e43a3ff5 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Fri, 8 Mar 2024 06:38:08 +0100
Subject: bpf: Take return from set_memory_rox() into account with
 bpf_jit_binary_lock_ro()

set_memory_rox() can fail, leaving memory unprotected.

Check return and bail out when bpf_jit_binary_lock_ro() returns
an error.

Link: https://github.com/KSPP/linux/issues/7
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: linux-hardening@vger.kernel.org <linux-hardening@vger.kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Puranjay Mohan <puranjay12@gmail.com>
Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com>  # s390x
Acked-by: Tiezhu Yang <yangtiezhu@loongson.cn>  # LoongArch
Reviewed-by: Johan Almbladh <johan.almbladh@anyfinetworks.com> # MIPS Part
Message-ID: <036b6393f23a2032ce75a1c92220b2afcb798d5d.1709850515.git.christophe.leroy@csgroup.eu>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/arm/net/bpf_jit_32.c        | 25 ++++++++++++-------------
 arch/loongarch/net/bpf_jit.c     | 22 ++++++++++++++++------
 arch/mips/net/bpf_jit_comp.c     |  3 ++-
 arch/parisc/net/bpf_jit_core.c   |  8 +++++++-
 arch/s390/net/bpf_jit_comp.c     |  6 +++++-
 arch/sparc/net/bpf_jit_comp_64.c |  6 +++++-
 arch/x86/net/bpf_jit_comp32.c    |  3 +--
 include/linux/filter.h           |  5 +++--
 8 files changed, 51 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 1d672457d02f..01516f83a95a 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -2222,28 +2222,21 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	/* If building the body of the JITed code fails somehow,
 	 * we fall back to the interpretation.
 	 */
-	if (build_body(&ctx) < 0) {
-		image_ptr = NULL;
-		bpf_jit_binary_free(header);
-		prog = orig_prog;
-		goto out_imms;
-	}
+	if (build_body(&ctx) < 0)
+		goto out_free;
 	build_epilogue(&ctx);
 
 	/* 3.) Extra pass to validate JITed Code */
-	if (validate_code(&ctx)) {
-		image_ptr = NULL;
-		bpf_jit_binary_free(header);
-		prog = orig_prog;
-		goto out_imms;
-	}
+	if (validate_code(&ctx))
+		goto out_free;
 	flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx));
 
 	if (bpf_jit_enable > 1)
 		/* there are 2 passes here */
 		bpf_jit_dump(prog->len, image_size, 2, ctx.target);
 
-	bpf_jit_binary_lock_ro(header);
+	if (bpf_jit_binary_lock_ro(header))
+		goto out_free;
 	prog->bpf_func = (void *)ctx.target;
 	prog->jited = 1;
 	prog->jited_len = image_size;
@@ -2260,5 +2253,11 @@ out:
 		bpf_jit_prog_release_other(prog, prog == orig_prog ?
 					   tmp : orig_prog);
 	return prog;
+
+out_free:
+	image_ptr = NULL;
+	bpf_jit_binary_free(header);
+	prog = orig_prog;
+	goto out_imms;
 }
 
diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index e73323d759d0..7dbefd4ba210 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -1294,16 +1294,19 @@ skip_init_ctx:
 	flush_icache_range((unsigned long)header, (unsigned long)(ctx.image + ctx.idx));
 
 	if (!prog->is_func || extra_pass) {
+		int err;
+
 		if (extra_pass && ctx.idx != jit_data->ctx.idx) {
 			pr_err_once("multi-func JIT bug %d != %d\n",
 				    ctx.idx, jit_data->ctx.idx);
-			bpf_jit_binary_free(header);
-			prog->bpf_func = NULL;
-			prog->jited = 0;
-			prog->jited_len = 0;
-			goto out_offset;
+			goto out_free;
+		}
+		err = bpf_jit_binary_lock_ro(header);
+		if (err) {
+			pr_err_once("bpf_jit_binary_lock_ro() returned %d\n",
+				    err);
+			goto out_free;
 		}
-		bpf_jit_binary_lock_ro(header);
 	} else {
 		jit_data->ctx = ctx;
 		jit_data->image = image_ptr;
@@ -1334,6 +1337,13 @@ out:
 	out_offset = -1;
 
 	return prog;
+
+out_free:
+	bpf_jit_binary_free(header);
+	prog->bpf_func = NULL;
+	prog->jited = 0;
+	prog->jited_len = 0;
+	goto out_offset;
 }
 
 /* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
diff --git a/arch/mips/net/bpf_jit_comp.c b/arch/mips/net/bpf_jit_comp.c
index a40d926b6513..e355dfca4400 100644
--- a/arch/mips/net/bpf_jit_comp.c
+++ b/arch/mips/net/bpf_jit_comp.c
@@ -1012,7 +1012,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 	bpf_prog_fill_jited_linfo(prog, &ctx.descriptors[1]);
 
 	/* Set as read-only exec and flush instruction cache */
-	bpf_jit_binary_lock_ro(header);
+	if (bpf_jit_binary_lock_ro(header))
+		goto out_err;
 	flush_icache_range((unsigned long)header,
 			   (unsigned long)&ctx.target[ctx.jit_index]);
 
diff --git a/arch/parisc/net/bpf_jit_core.c b/arch/parisc/net/bpf_jit_core.c
index d6ee2fd45550..979f45d4d1fb 100644
--- a/arch/parisc/net/bpf_jit_core.c
+++ b/arch/parisc/net/bpf_jit_core.c
@@ -167,7 +167,13 @@ skip_init_ctx:
 	bpf_flush_icache(jit_data->header, ctx->insns + ctx->ninsns);
 
 	if (!prog->is_func || extra_pass) {
-		bpf_jit_binary_lock_ro(jit_data->header);
+		if (bpf_jit_binary_lock_ro(jit_data->header)) {
+			bpf_jit_binary_free(jit_data->header);
+			prog->bpf_func = NULL;
+			prog->jited = 0;
+			prog->jited_len = 0;
+			goto out_offset;
+		}
 		prologue_len = ctx->epilogue_offset - ctx->body_len;
 		for (i = 0; i < prog->len; i++)
 			ctx->offset[i] += prologue_len;
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index b418333bb086..e613eebfd349 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -2111,7 +2111,11 @@ skip_init_ctx:
 		print_fn_code(jit.prg_buf, jit.size_prg);
 	}
 	if (!fp->is_func || extra_pass) {
-		bpf_jit_binary_lock_ro(header);
+		if (bpf_jit_binary_lock_ro(header)) {
+			bpf_jit_binary_free(header);
+			fp = orig_fp;
+			goto free_addrs;
+		}
 	} else {
 		jit_data->header = header;
 		jit_data->ctx = jit;
diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c
index fa0759bfe498..73bf0aea8baf 100644
--- a/arch/sparc/net/bpf_jit_comp_64.c
+++ b/arch/sparc/net/bpf_jit_comp_64.c
@@ -1602,7 +1602,11 @@ skip_init_ctx:
 	bpf_flush_icache(header, (u8 *)header + header->size);
 
 	if (!prog->is_func || extra_pass) {
-		bpf_jit_binary_lock_ro(header);
+		if (bpf_jit_binary_lock_ro(header)) {
+			bpf_jit_binary_free(header);
+			prog = orig_prog;
+			goto out_off;
+		}
 	} else {
 		jit_data->ctx = ctx;
 		jit_data->image = image_ptr;
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index c10083a8e68e..de0f9e5f9f73 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2600,8 +2600,7 @@ out_image:
 	if (bpf_jit_enable > 1)
 		bpf_jit_dump(prog->len, proglen, pass + 1, image);
 
-	if (image) {
-		bpf_jit_binary_lock_ro(header);
+	if (image && !bpf_jit_binary_lock_ro(header)) {
 		prog->bpf_func = (void *)image;
 		prog->jited = 1;
 		prog->jited_len = proglen;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9107ee1de66f..c0d51bff8f96 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -898,10 +898,11 @@ static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp)
 	return 0;
 }
 
-static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
+static inline int __must_check
+bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
 {
 	set_vm_flush_reset_perms(hdr);
-	set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
+	return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT);
 }
 
 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
-- 
cgit v1.2.3


From 734364d0ddee6c145705bac89400266c972efc0b Mon Sep 17 00:00:00 2001
From: Abel Vesa <abel.vesa@linaro.org>
Date: Fri, 23 Feb 2024 16:37:55 +0200
Subject: dt-bindings: arm: qcom,ids: Add SoC ID for X1E80100

Add the ID for the Qualcomm X1E80100 SoC.

Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
Reviewed-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240223-x1e80100-socinfo-v1-1-be581ca60f27@linaro.org
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 include/dt-bindings/arm/qcom,ids.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h
index 19ac7b36f608..d040033dc8ee 100644
--- a/include/dt-bindings/arm/qcom,ids.h
+++ b/include/dt-bindings/arm/qcom,ids.h
@@ -258,6 +258,7 @@
 #define QCOM_ID_QRU1000			539
 #define QCOM_ID_SM8475_2		540
 #define QCOM_ID_QDU1000			545
+#define QCOM_ID_X1E80100		555
 #define QCOM_ID_SM8650			557
 #define QCOM_ID_SM4450			568
 #define QCOM_ID_QDU1010			587
-- 
cgit v1.2.3


From 01c0cce88c5480cc2505b79330246ef12eda938f Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Wed, 28 Feb 2024 08:35:32 +0200
Subject: drm/omapdrm: Fix console with deferred ops

Commit 95da53d63dcf ("drm/omapdrm: Use regular fbdev I/O helpers")
stopped console from updating for command mode displays because there is
no damage handling in fb_sys_write() unlike we had earlier in
drm_fb_helper_sys_write().

Let's fix the issue by adding FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS and
FB_DMAMEM_HELPERS_DEFERRED as suggested by Thomas. We cannot use the
FB_DEFAULT_DEFERRED_OPS as fb_deferred_io_mmap() won't work properly
for write-combine.

Fixes: 95da53d63dcf ("drm/omapdrm: Use regular fbdev I/O helpers")
Suggested-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240228063540.4444-3-tony@atomide.com
---
 drivers/gpu/drm/omapdrm/Kconfig      |  2 +-
 drivers/gpu/drm/omapdrm/omap_fbdev.c | 28 ++++++++++++++++++++++------
 drivers/video/fbdev/core/Kconfig     |  6 ++++++
 include/linux/fb.h                   |  4 ++++
 4 files changed, 33 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/omapdrm/Kconfig b/drivers/gpu/drm/omapdrm/Kconfig
index b715301ec79f..6c49270cb290 100644
--- a/drivers/gpu/drm/omapdrm/Kconfig
+++ b/drivers/gpu/drm/omapdrm/Kconfig
@@ -4,7 +4,7 @@ config DRM_OMAP
 	depends on DRM && OF
 	depends on ARCH_OMAP2PLUS
 	select DRM_KMS_HELPER
-	select FB_DMAMEM_HELPERS if DRM_FBDEV_EMULATION
+	select FB_DMAMEM_HELPERS_DEFERRED if DRM_FBDEV_EMULATION
 	select VIDEOMODE_HELPERS
 	select HDMI
 	default n
diff --git a/drivers/gpu/drm/omapdrm/omap_fbdev.c b/drivers/gpu/drm/omapdrm/omap_fbdev.c
index 7c5af3de1e72..523be34682ca 100644
--- a/drivers/gpu/drm/omapdrm/omap_fbdev.c
+++ b/drivers/gpu/drm/omapdrm/omap_fbdev.c
@@ -51,6 +51,10 @@ static void pan_worker(struct work_struct *work)
 	omap_gem_roll(bo, fbi->var.yoffset * npages);
 }
 
+FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(omap_fbdev,
+				   drm_fb_helper_damage_range,
+				   drm_fb_helper_damage_area)
+
 static int omap_fbdev_pan_display(struct fb_var_screeninfo *var,
 		struct fb_info *fbi)
 {
@@ -78,11 +82,9 @@ fallback:
 
 static int omap_fbdev_fb_mmap(struct fb_info *info, struct vm_area_struct *vma)
 {
-	struct drm_fb_helper *helper = info->par;
-	struct drm_framebuffer *fb = helper->fb;
-	struct drm_gem_object *bo = drm_gem_fb_get_obj(fb, 0);
+	vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
 
-	return drm_gem_mmap_obj(bo, omap_gem_mmap_size(bo), vma);
+	return fb_deferred_io_mmap(info, vma);
 }
 
 static void omap_fbdev_fb_destroy(struct fb_info *info)
@@ -94,6 +96,7 @@ static void omap_fbdev_fb_destroy(struct fb_info *info)
 
 	DBG();
 
+	fb_deferred_io_cleanup(info);
 	drm_fb_helper_fini(helper);
 
 	omap_gem_unpin(bo);
@@ -104,15 +107,19 @@ static void omap_fbdev_fb_destroy(struct fb_info *info)
 	kfree(fbdev);
 }
 
+/*
+ * For now, we cannot use FB_DEFAULT_DEFERRED_OPS and fb_deferred_io_mmap()
+ * because we use write-combine.
+ */
 static const struct fb_ops omap_fb_ops = {
 	.owner = THIS_MODULE,
-	__FB_DEFAULT_DMAMEM_OPS_RDWR,
+	__FB_DEFAULT_DEFERRED_OPS_RDWR(omap_fbdev),
 	.fb_check_var	= drm_fb_helper_check_var,
 	.fb_set_par	= drm_fb_helper_set_par,
 	.fb_setcmap	= drm_fb_helper_setcmap,
 	.fb_blank	= drm_fb_helper_blank,
 	.fb_pan_display = omap_fbdev_pan_display,
-	__FB_DEFAULT_DMAMEM_OPS_DRAW,
+	__FB_DEFAULT_DEFERRED_OPS_DRAW(omap_fbdev),
 	.fb_ioctl	= drm_fb_helper_ioctl,
 	.fb_mmap	= omap_fbdev_fb_mmap,
 	.fb_destroy	= omap_fbdev_fb_destroy,
@@ -213,6 +220,15 @@ static int omap_fbdev_create(struct drm_fb_helper *helper,
 	fbi->fix.smem_start = dma_addr;
 	fbi->fix.smem_len = bo->size;
 
+	/* deferred I/O */
+	helper->fbdefio.delay = HZ / 20;
+	helper->fbdefio.deferred_io = drm_fb_helper_deferred_io;
+
+	fbi->fbdefio = &helper->fbdefio;
+	ret = fb_deferred_io_init(fbi);
+	if (ret)
+		goto fail;
+
 	/* if we have DMM, then we can use it for scrolling by just
 	 * shuffling pages around in DMM rather than doing sw blit.
 	 */
diff --git a/drivers/video/fbdev/core/Kconfig b/drivers/video/fbdev/core/Kconfig
index db09fe87fcd4..0ab8848ba2f1 100644
--- a/drivers/video/fbdev/core/Kconfig
+++ b/drivers/video/fbdev/core/Kconfig
@@ -144,6 +144,12 @@ config FB_DMAMEM_HELPERS
 	select FB_SYS_IMAGEBLIT
 	select FB_SYSMEM_FOPS
 
+config FB_DMAMEM_HELPERS_DEFERRED
+	bool
+	depends on FB_CORE
+	select FB_DEFERRED_IO
+	select FB_DMAMEM_HELPERS
+
 config FB_IOMEM_FOPS
 	tristate
 	depends on FB_CORE
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 708e6a177b1b..5e210bf72fc9 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -688,6 +688,10 @@ extern int fb_deferred_io_fsync(struct file *file, loff_t start,
 	__FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, sys) \
 	__FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, sys)
 
+#define FB_GEN_DEFAULT_DEFERRED_DMAMEM_OPS(__prefix, __damage_range, __damage_area) \
+	__FB_GEN_DEFAULT_DEFERRED_OPS_RDWR(__prefix, __damage_range, sys) \
+	__FB_GEN_DEFAULT_DEFERRED_OPS_DRAW(__prefix, __damage_area, sys)
+
 /*
  * Initializes struct fb_ops for deferred I/O.
  */
-- 
cgit v1.2.3


From e3362acd796789dc0562eb1a3937007b0beb0c5b Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Sat, 16 Mar 2024 08:35:40 +0100
Subject: bpf: Remove arch_unprotect_bpf_trampoline()

Last user of arch_unprotect_bpf_trampoline() was removed by
commit 187e2af05abe ("bpf: struct_ops supports more than one page for
trampolines.")

Remove arch_unprotect_bpf_trampoline()

Reported-by: Daniel Borkmann <daniel@iogearbox.net>
Fixes: 187e2af05abe ("bpf: struct_ops supports more than one page for trampolines.")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Link: https://lore.kernel.org/r/42c635bb54d3af91db0f9b85d724c7c290069f67.1710574353.git.christophe.leroy@csgroup.eu
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c | 4 ----
 arch/x86/net/bpf_jit_comp.c   | 4 ----
 include/linux/bpf.h           | 1 -
 kernel/bpf/trampoline.c       | 7 -------
 4 files changed, 16 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index c5b461dda438..132c8ffba109 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2180,10 +2180,6 @@ void arch_protect_bpf_trampoline(void *image, unsigned int size)
 {
 }
 
-void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
-{
-}
-
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
 				void *ro_image_end, const struct btf_func_model *m,
 				u32 flags, struct bpf_tramp_links *tlinks,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index a7ba8e178645..7a56d2d84512 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3008,10 +3008,6 @@ void arch_protect_bpf_trampoline(void *image, unsigned int size)
 {
 }
 
-void arch_unprotect_bpf_trampoline(void *image, unsigned int size)
-{
-}
-
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
 				const struct btf_func_model *m, u32 flags,
 				struct bpf_tramp_links *tlinks,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4f20f62f9d63..d89bdefb42e2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1117,7 +1117,6 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 void *arch_alloc_bpf_trampoline(unsigned int size);
 void arch_free_bpf_trampoline(void *image, unsigned int size);
 void arch_protect_bpf_trampoline(void *image, unsigned int size);
-void arch_unprotect_bpf_trampoline(void *image, unsigned int size);
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 			     struct bpf_tramp_links *tlinks, void *func_addr);
 
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index db7599c59c78..04fd1abd3661 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -1078,13 +1078,6 @@ void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
 	set_memory_rox((long)image, 1);
 }
 
-void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size)
-{
-	WARN_ON_ONCE(size > PAGE_SIZE);
-	set_memory_nx((long)image, 1);
-	set_memory_rw((long)image, 1);
-}
-
 int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 				    struct bpf_tramp_links *tlinks, void *func_addr)
 {
-- 
cgit v1.2.3


From c733239f8f530872a1f80d8c45dcafbaff368737 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Sat, 16 Mar 2024 08:35:41 +0100
Subject: bpf: Check return from set_memory_rox()

arch_protect_bpf_trampoline() and alloc_new_pack() call
set_memory_rox() which can fail, leading to unprotected memory.

Take into account return from set_memory_rox() function and add
__must_check flag to arch_protect_bpf_trampoline().

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/fe1c163c83767fde5cab31d209a4a6be3ddb3a73.1710574353.git.christophe.leroy@csgroup.eu
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 arch/arm64/net/bpf_jit_comp.c  |  3 ++-
 arch/x86/net/bpf_jit_comp.c    |  3 ++-
 include/linux/bpf.h            |  2 +-
 kernel/bpf/bpf_struct_ops.c    |  8 ++++++--
 kernel/bpf/core.c              | 28 +++++++++++++++++++++-------
 kernel/bpf/trampoline.c        |  8 +++++---
 net/bpf/bpf_dummy_struct_ops.c |  4 +++-
 7 files changed, 40 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 132c8ffba109..bc16eb694657 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -2176,8 +2176,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size)
 	bpf_prog_pack_free(image, size);
 }
 
-void arch_protect_bpf_trampoline(void *image, unsigned int size)
+int arch_protect_bpf_trampoline(void *image, unsigned int size)
 {
+	return 0;
 }
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *ro_image,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7a56d2d84512..4900b1ee019f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3004,8 +3004,9 @@ void arch_free_bpf_trampoline(void *image, unsigned int size)
 	bpf_prog_pack_free(image, size);
 }
 
-void arch_protect_bpf_trampoline(void *image, unsigned int size)
+int arch_protect_bpf_trampoline(void *image, unsigned int size)
 {
+	return 0;
 }
 
 int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d89bdefb42e2..17843e66a1d3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1116,7 +1116,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
 				void *func_addr);
 void *arch_alloc_bpf_trampoline(unsigned int size);
 void arch_free_bpf_trampoline(void *image, unsigned int size);
-void arch_protect_bpf_trampoline(void *image, unsigned int size);
+int __must_check arch_protect_bpf_trampoline(void *image, unsigned int size);
 int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
 			     struct bpf_tramp_links *tlinks, void *func_addr);
 
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 3fcd35314ce5..86c7884abaf8 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -740,8 +740,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		if (err)
 			goto reset_unlock;
 	}
-	for (i = 0; i < st_map->image_pages_cnt; i++)
-		arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE);
+	for (i = 0; i < st_map->image_pages_cnt; i++) {
+		err = arch_protect_bpf_trampoline(st_map->image_pages[i],
+						  PAGE_SIZE);
+		if (err)
+			goto reset_unlock;
+	}
 
 	if (st_map->map.map_flags & BPF_F_LINK) {
 		err = 0;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 63f100def31b..5aacb1d3c4cc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -908,23 +908,30 @@ static LIST_HEAD(pack_list);
 static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
 {
 	struct bpf_prog_pack *pack;
+	int err;
 
 	pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
 		       GFP_KERNEL);
 	if (!pack)
 		return NULL;
 	pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE);
-	if (!pack->ptr) {
-		kfree(pack);
-		return NULL;
-	}
+	if (!pack->ptr)
+		goto out;
 	bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
 	bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
-	list_add_tail(&pack->list, &pack_list);
 
 	set_vm_flush_reset_perms(pack->ptr);
-	set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	err = set_memory_rox((unsigned long)pack->ptr,
+			     BPF_PROG_PACK_SIZE / PAGE_SIZE);
+	if (err)
+		goto out;
+	list_add_tail(&pack->list, &pack_list);
 	return pack;
+
+out:
+	bpf_jit_free_exec(pack->ptr);
+	kfree(pack);
+	return NULL;
 }
 
 void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
@@ -939,9 +946,16 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
 		size = round_up(size, PAGE_SIZE);
 		ptr = bpf_jit_alloc_exec(size);
 		if (ptr) {
+			int err;
+
 			bpf_fill_ill_insns(ptr, size);
 			set_vm_flush_reset_perms(ptr);
-			set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
+			err = set_memory_rox((unsigned long)ptr,
+					     size / PAGE_SIZE);
+			if (err) {
+				bpf_jit_free_exec(ptr);
+				ptr = NULL;
+			}
 		}
 		goto out;
 	}
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 04fd1abd3661..cc50607f8d8c 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -456,7 +456,9 @@ again:
 	if (err < 0)
 		goto out_free;
 
-	arch_protect_bpf_trampoline(im->image, im->size);
+	err = arch_protect_bpf_trampoline(im->image, im->size);
+	if (err)
+		goto out_free;
 
 	WARN_ON(tr->cur_image && total == 0);
 	if (tr->cur_image)
@@ -1072,10 +1074,10 @@ void __weak arch_free_bpf_trampoline(void *image, unsigned int size)
 	bpf_jit_free_exec(image);
 }
 
-void __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
+int __weak arch_protect_bpf_trampoline(void *image, unsigned int size)
 {
 	WARN_ON_ONCE(size > PAGE_SIZE);
-	set_memory_rox((long)image, 1);
+	return set_memory_rox((long)image, 1);
 }
 
 int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index de33dc1b0daa..25b75844891a 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -133,7 +133,9 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (err < 0)
 		goto out;
 
-	arch_protect_bpf_trampoline(image, PAGE_SIZE);
+	err = arch_protect_bpf_trampoline(image, PAGE_SIZE);
+	if (err)
+		goto out;
 	prog_ret = dummy_ops_call_op(image, args);
 
 	err = dummy_ops_copy_args(args);
-- 
cgit v1.2.3


From 8522f6b760ca588928eede740d5d69dd1e936b49 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 28 Feb 2024 15:00:02 +0100
Subject: printk: Let no_printk() use _printk()

When printk-indexing is enabled, each printk() invocation emits a
pi_entry structure, containing the format string and other information
related to its location in the kernel sources.  This is even true for
no_printk(): while the actual code to print the message is optimized out
by the compiler due to the always-false check, the pi_entry structure is
still emitted.

As the main purpose of no_printk() is to provide a helper to maintain
printf()-style format checking when debugging is disabled, this leads to
the inclusion in the index of lots of printk formats that cannot be
emitted by the current kernel.

Fix this by switching no_printk() from printk() to _printk().

This reduces the size of an arm64 defconfig kernel with
CONFIG_PRINTK_INDEX=y by 576 KiB.

Fixes: 337015573718b161 ("printk: Userspace format indexing support")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/56cf92edccffea970e1f40a075334dd6cf5bb2a4.1709127473.git.geert+renesas@glider.be
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 955e31860095..2fde40cc9677 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -126,7 +126,7 @@ struct va_format {
 #define no_printk(fmt, ...)				\
 ({							\
 	if (0)						\
-		printk(fmt, ##__VA_ARGS__);		\
+		_printk(fmt, ##__VA_ARGS__);		\
 	0;						\
 })
 
-- 
cgit v1.2.3


From c26ec799042a3888935d59b599f33e41efedf5f8 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 28 Feb 2024 15:00:03 +0100
Subject: dev_printk: Add and use dev_no_printk()

When printk-indexing is enabled, each dev_printk() invocation emits a
pi_entry structure.  This is even true when the dev_printk() is
protected by an always-false check, as is typically the case for debug
messages: while the actual code to print the message is optimized out by
the compiler, the pi_entry structure is still emitted.

Avoid emitting pi_entry structures for unavailable dev_printk() kernel
messages by:
  1. Introducing a dev_no_printk() helper, mimicked after the existing
     no_printk() helper, which calls _dev_printk() instead of
     dev_printk(),
  2. Replacing all "if (0) dev_printk(...)" constructs by calls to the
     new helper.

This reduces the size of an arm64 defconfig kernel with
CONFIG_PRINTK_INDEX=y by 957 KiB.

Fixes: ad7d61f159db7397 ("printk: index: Add indexing support to dev_printk")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/8583d54f1687c801c6cda8edddf2cf0344c6e883.1709127473.git.geert+renesas@glider.be
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/dev_printk.h | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h
index 6bfe70decc9f..ae80a303c216 100644
--- a/include/linux/dev_printk.h
+++ b/include/linux/dev_printk.h
@@ -129,6 +129,16 @@ void _dev_info(const struct device *dev, const char *fmt, ...)
 		_dev_printk(level, dev, fmt, ##__VA_ARGS__);		\
 	})
 
+/*
+ * Dummy dev_printk for disabled debugging statements to use whilst maintaining
+ * gcc's format checking.
+ */
+#define dev_no_printk(level, dev, fmt, ...)				\
+	({								\
+		if (0)							\
+			_dev_printk(level, dev, fmt, ##__VA_ARGS__);	\
+	})
+
 /*
  * #defines for all the dev_<level> macros to prefix with whatever
  * possible use of #define dev_fmt(fmt) ...
@@ -158,10 +168,7 @@ void _dev_info(const struct device *dev, const char *fmt, ...)
 	dev_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__)
 #else
 #define dev_dbg(dev, fmt, ...)						\
-({									\
-	if (0)								\
-		dev_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__); \
-})
+	dev_no_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__)
 #endif
 
 #ifdef CONFIG_PRINTK
@@ -247,20 +254,14 @@ do {									\
 } while (0)
 #else
 #define dev_dbg_ratelimited(dev, fmt, ...)				\
-do {									\
-	if (0)								\
-		dev_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__); \
-} while (0)
+	dev_no_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__)
 #endif
 
 #ifdef VERBOSE_DEBUG
 #define dev_vdbg	dev_dbg
 #else
 #define dev_vdbg(dev, fmt, ...)						\
-({									\
-	if (0)								\
-		dev_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__); \
-})
+	dev_no_printk(KERN_DEBUG, dev, dev_fmt(fmt), ##__VA_ARGS__)
 #endif
 
 /*
-- 
cgit v1.2.3


From 0381e588949380704005dacdbef6e9a2920260df Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 28 Feb 2024 15:00:04 +0100
Subject: dyndbg: Use *no_printk() helpers

When printk-indexing is enabled, each printk() or dev_printk()
invocation emits a pi_entry structure.  This is even true when the call
is protected by an always-false check: while the actual code to print
the message is optimized out by the compiler, the pi_entry structure is
still emitted.

Fix this by replacing "if (0) *printk(...)" constructs by calls to the
corresponding *no_printk() helpers.

Note that this has minimal impact, as most (all?) callers of
dynamic_{pr,dev}_debug() are protected by checks for DYNAMIC_DEBUG
anyway.  Still, using the helpers serves as a good example to follow.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/32e18ceb528b99d9147be9c1af578a1965e2bad9.1709127473.git.geert+renesas@glider.be
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/dynamic_debug.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index 4fcbf4d4fd0a..ff44ec346162 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -305,9 +305,9 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 #define DYNAMIC_DEBUG_BRANCH(descriptor) false
 
 #define dynamic_pr_debug(fmt, ...)					\
-	do { if (0) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__); } while (0)
+	no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
 #define dynamic_dev_dbg(dev, fmt, ...)					\
-	do { if (0) dev_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__); } while (0)
+	dev_no_printk(KERN_DEBUG, dev, fmt, ##__VA_ARGS__)
 #define dynamic_hex_dump(prefix_str, prefix_type, rowsize,		\
 			 groupsize, buf, len, ascii)			\
 	do { if (0)							\
-- 
cgit v1.2.3


From b37cafacbf98ead69c2ecb220fb82b06aee916b9 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 28 Feb 2024 15:00:05 +0100
Subject: ceph: Use no_printk() helper

When printk-indexing is enabled, each printk() invocation emits a
pi_entry structure.  This is even true when the call is protected by an
always-false check: while the actual code to print the message is
optimized out by the compiler, the pi_entry structure is still emitted.

Fix this by replacing "if (0) printk(...)" constructs by calls to the
no_printk() helper.

This reduces the size of an arm64 kernel with CONFIG_PRINTK_INDEX=y and
CONFIG_CEPH_FS=y by ca. 4 KiB.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/a21767eccb823435a7f18cdf115d7d572b4e945d.1709127473.git.geert+renesas@glider.be
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/ceph/ceph_debug.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index 11a92a946016..5f904591fa5f 100644
--- a/include/linux/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -27,17 +27,13 @@
 		 ##__VA_ARGS__)
 # else
 /* faux printk call just to see any compiler warnings. */
-#  define dout(fmt, ...)	do {				\
-		if (0)						\
-			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
-	} while (0)
-#  define doutc(client, fmt, ...)	do {			\
-		if (0)						\
-			printk(KERN_DEBUG "[%pU %llu] " fmt,	\
-			&client->fsid,				\
-			client->monc.auth->global_id,		\
-			##__VA_ARGS__);				\
-		} while (0)
+#  define dout(fmt, ...)					\
+		no_printk(KERN_DEBUG fmt, ##__VA_ARGS__)
+#  define doutc(client, fmt, ...)				\
+		no_printk(KERN_DEBUG "[%pU %llu] " fmt,		\
+			  &client->fsid,			\
+			  client->monc.auth->global_id,		\
+			  ##__VA_ARGS__)
 # endif
 
 #else
-- 
cgit v1.2.3


From d4dfc5700e867b22ab94f960f9a9972696a637d5 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 19 Mar 2024 16:38:49 -0700
Subject: bpf: pass whole link instead of prog when triggering raw tracepoint

Instead of passing prog as an argument to bpf_trace_runX() helpers, that
are called from tracepoint triggering calls, store BPF link itself
(struct bpf_raw_tp_link for raw tracepoints). This will allow to pass
extra information like BPF cookie into raw tracepoint registration.

Instead of replacing `struct bpf_prog *prog = __data;` with
corresponding `struct bpf_raw_tp_link *link = __data;` assignment in
`__bpf_trace_##call` I just passed `__data` through into underlying
bpf_trace_runX() call. This works well because we implicitly cast `void *`,
and it also avoids naming clashes with arguments coming from
tracepoint's "proto" list. We could have run into the same problem with
"prog", we just happened to not have a tracepoint that has "prog" input
argument. We are less lucky with "link", as there are tracepoints using
"link" argument name already. So instead of trying to avoid naming
conflicts, let's just remove intermediate local variable. It doesn't
hurt readibility, it's either way a bit of a maze of calls and macros,
that requires careful reading.

Acked-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Message-ID: <20240319233852.1977493-3-andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h          |  5 +++++
 include/linux/trace_events.h | 36 ++++++++++++++++++++----------------
 include/trace/bpf_probe.h    |  3 +--
 kernel/bpf/syscall.c         |  9 ++-------
 kernel/trace/bpf_trace.c     | 18 ++++++++++--------
 5 files changed, 38 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 17843e66a1d3..2ea8ce59f582 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1607,6 +1607,11 @@ struct bpf_tracing_link {
 	struct bpf_prog *tgt_prog;
 };
 
+struct bpf_raw_tp_link {
+	struct bpf_link link;
+	struct bpf_raw_event_map *btp;
+};
+
 struct bpf_link_primer {
 	struct bpf_link *link;
 	struct file *file;
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index d68ff9b1247f..a7fc6fb6de3c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -759,8 +759,11 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
+
+struct bpf_raw_tp_link;
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link);
+
 struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name);
 void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp);
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
@@ -788,11 +791,12 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	return -EOPNOTSUPP;
 }
-static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p)
+struct bpf_raw_tp_link;
+static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
 {
 	return -EOPNOTSUPP;
 }
-static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p)
+static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
 {
 	return -EOPNOTSUPP;
 }
@@ -903,31 +907,31 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog, u64 bpf_cookie);
 void perf_event_free_bpf_prog(struct perf_event *event);
 
-void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
-void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
-void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run1(struct bpf_raw_tp_link *link, u64 arg1);
+void bpf_trace_run2(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2);
+void bpf_trace_run3(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3);
-void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run4(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4);
-void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run5(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5);
-void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run6(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
-void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run7(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
-void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run8(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
 		    u64 arg8);
-void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run9(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
 		    u64 arg8, u64 arg9);
-void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run10(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
 		     u64 arg8, u64 arg9, u64 arg10);
-void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run11(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
 		     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
-void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
+void bpf_trace_run12(struct bpf_raw_tp_link *link, u64 arg1, u64 arg2,
 		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
 		     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h
index e609cd7da47e..a2ea11cc912e 100644
--- a/include/trace/bpf_probe.h
+++ b/include/trace/bpf_probe.h
@@ -46,8 +46,7 @@
 static notrace void							\
 __bpf_trace_##call(void *__data, proto)					\
 {									\
-	struct bpf_prog *prog = __data;					\
-	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args));	\
+	CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(__data, CAST_TO_U64(args));	\
 }
 
 #undef DECLARE_EVENT_CLASS
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ae2ff73bde7e..1cb4c3809af4 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3469,17 +3469,12 @@ out_put_prog:
 	return err;
 }
 
-struct bpf_raw_tp_link {
-	struct bpf_link link;
-	struct bpf_raw_event_map *btp;
-};
-
 static void bpf_raw_tp_link_release(struct bpf_link *link)
 {
 	struct bpf_raw_tp_link *raw_tp =
 		container_of(link, struct bpf_raw_tp_link, link);
 
-	bpf_probe_unregister(raw_tp->btp, raw_tp->link.prog);
+	bpf_probe_unregister(raw_tp->btp, raw_tp);
 	bpf_put_raw_tracepoint(raw_tp->btp);
 }
 
@@ -3833,7 +3828,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 		goto out_put_btp;
 	}
 
-	err = bpf_probe_register(link->btp, prog);
+	err = bpf_probe_register(link->btp, link);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		goto out_put_btp;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 30ecf62f8a17..17de91ad4a1f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2366,8 +2366,10 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
 }
 
 static __always_inline
-void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
+void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
 {
+	struct bpf_prog *prog = link->link.prog;
+
 	cant_sleep();
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
@@ -2404,12 +2406,12 @@ out:
 #define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
 
 #define BPF_TRACE_DEFN_x(x)						\
-	void bpf_trace_run##x(struct bpf_prog *prog,			\
+	void bpf_trace_run##x(struct bpf_raw_tp_link *link,		\
 			      REPEAT(x, SARG, __DL_COM, __SEQ_0_11))	\
 	{								\
 		u64 args[x];						\
 		REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);			\
-		__bpf_trace_run(prog, args);				\
+		__bpf_trace_run(link, args);				\
 	}								\
 	EXPORT_SYMBOL_GPL(bpf_trace_run##x)
 BPF_TRACE_DEFN_x(1);
@@ -2425,9 +2427,10 @@ BPF_TRACE_DEFN_x(10);
 BPF_TRACE_DEFN_x(11);
 BPF_TRACE_DEFN_x(12);
 
-int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
 {
 	struct tracepoint *tp = btp->tp;
+	struct bpf_prog *prog = link->link.prog;
 
 	/*
 	 * check that program doesn't access arguments beyond what's
@@ -2439,13 +2442,12 @@ int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 	if (prog->aux->max_tp_access > btp->writable_size)
 		return -EINVAL;
 
-	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func,
-						   prog);
+	return tracepoint_probe_register_may_exist(tp, (void *)btp->bpf_func, link);
 }
 
-int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_raw_tp_link *link)
 {
-	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
+	return tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, link);
 }
 
 int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
-- 
cgit v1.2.3


From 68ca5d4eebb8c4de246ee5f634eee26bc689562d Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 19 Mar 2024 16:38:50 -0700
Subject: bpf: support BPF cookie in raw tracepoint (raw_tp, tp_btf) programs

Wire up BPF cookie for raw tracepoint programs (both BTF and non-BTF
aware variants). This brings them up to part w.r.t. BPF cookie usage
with classic tracepoint and fentry/fexit programs.

Acked-by: Stanislav Fomichev <sdf@google.com>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Message-ID: <20240319233852.1977493-4-andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       |  6 ++++--
 kernel/bpf/syscall.c           | 13 +++++++++----
 kernel/trace/bpf_trace.c       | 13 +++++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 5 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2ea8ce59f582..62762390c93d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1610,6 +1610,7 @@ struct bpf_tracing_link {
 struct bpf_raw_tp_link {
 	struct bpf_link link;
 	struct bpf_raw_event_map *btp;
+	u64 cookie;
 };
 
 struct bpf_link_primer {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3c42b9f1bada..9585f5345353 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1662,8 +1662,10 @@ union bpf_attr {
 	} query;
 
 	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
-		__u64 name;
-		__u32 prog_fd;
+		__u64		name;
+		__u32		prog_fd;
+		__u32		:32;
+		__aligned_u64	cookie;
 	} raw_tracepoint;
 
 	struct { /* anonymous struct for BPF_BTF_LOAD */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1cb4c3809af4..e44c276e8617 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3774,7 +3774,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
 #endif /* CONFIG_PERF_EVENTS */
 
 static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
-				  const char __user *user_tp_name)
+				  const char __user *user_tp_name, u64 cookie)
 {
 	struct bpf_link_primer link_primer;
 	struct bpf_raw_tp_link *link;
@@ -3821,6 +3821,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
 	bpf_link_init(&link->link, BPF_LINK_TYPE_RAW_TRACEPOINT,
 		      &bpf_raw_tp_link_lops, prog);
 	link->btp = btp;
+	link->cookie = cookie;
 
 	err = bpf_link_prime(&link->link, &link_primer);
 	if (err) {
@@ -3841,11 +3842,13 @@ out_put_btp:
 	return err;
 }
 
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.cookie
 
 static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 {
 	struct bpf_prog *prog;
+	void __user *tp_name;
+	__u64 cookie;
 	int fd;
 
 	if (CHECK_ATTR(BPF_RAW_TRACEPOINT_OPEN))
@@ -3855,7 +3858,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);
 
-	fd = bpf_raw_tp_link_attach(prog, u64_to_user_ptr(attr->raw_tracepoint.name));
+	tp_name = u64_to_user_ptr(attr->raw_tracepoint.name);
+	cookie = attr->raw_tracepoint.cookie;
+	fd = bpf_raw_tp_link_attach(prog, tp_name, cookie);
 	if (fd < 0)
 		bpf_prog_put(prog);
 	return fd;
@@ -5193,7 +5198,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 			goto out;
 		}
 		if (prog->expected_attach_type == BPF_TRACE_RAW_TP)
-			ret = bpf_raw_tp_link_attach(prog, NULL);
+			ret = bpf_raw_tp_link_attach(prog, NULL, attr->link_create.tracing.cookie);
 		else if (prog->expected_attach_type == BPF_TRACE_ITER)
 			ret = bpf_iter_link_attach(attr, uattr, prog);
 		else if (prog->expected_attach_type == BPF_LSM_CGROUP)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 17de91ad4a1f..434e3ece6688 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -2004,6 +2004,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_stackid_proto_raw_tp;
 	case BPF_FUNC_get_stack:
 		return &bpf_get_stack_proto_raw_tp;
+	case BPF_FUNC_get_attach_cookie:
+		return &bpf_get_attach_cookie_proto_tracing;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
@@ -2066,6 +2068,9 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_func_arg_cnt:
 		return bpf_prog_has_trampoline(prog) ? &bpf_get_func_arg_cnt_proto : NULL;
 	case BPF_FUNC_get_attach_cookie:
+		if (prog->type == BPF_PROG_TYPE_TRACING &&
+		    prog->expected_attach_type == BPF_TRACE_RAW_TP)
+			return &bpf_get_attach_cookie_proto_tracing;
 		return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto_tracing : NULL;
 	default:
 		fn = raw_tp_prog_func_proto(func_id, prog);
@@ -2369,15 +2374,23 @@ static __always_inline
 void __bpf_trace_run(struct bpf_raw_tp_link *link, u64 *args)
 {
 	struct bpf_prog *prog = link->link.prog;
+	struct bpf_run_ctx *old_run_ctx;
+	struct bpf_trace_run_ctx run_ctx;
 
 	cant_sleep();
 	if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
 		bpf_prog_inc_misses_counter(prog);
 		goto out;
 	}
+
+	run_ctx.bpf_cookie = link->cookie;
+	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
 	rcu_read_lock();
 	(void) bpf_prog_run(prog, args);
 	rcu_read_unlock();
+
+	bpf_reset_run_ctx(old_run_ctx);
 out:
 	this_cpu_dec(*(prog->active));
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 3c42b9f1bada..bf80b614c4db 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1664,6 +1664,7 @@ union bpf_attr {
 	struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */
 		__u64 name;
 		__u32 prog_fd;
+		__aligned_u64 cookie;
 	} raw_tracepoint;
 
 	struct { /* anonymous struct for BPF_BTF_LOAD */
-- 
cgit v1.2.3


From 5a95f39d9b214d59cbe742002767737e03d43844 Mon Sep 17 00:00:00 2001
From: Christian König <christian.koenig@amd.com>
Date: Tue, 12 Mar 2024 11:37:58 +0100
Subject: drm/ttm: warn when resv objs are mixed in a bulk_move
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The BOs in a bulk move must share all the same reservation object
to make sure that we lock the whole bulk during eviction.

Actually document and enforce that with a warning.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240312105555.3065-1-christian.koenig@amd.com
---
 drivers/gpu/drm/ttm/ttm_resource.c | 1 +
 include/drm/ttm/ttm_resource.h     | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
index 65155f2013ca..be8d286513f9 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -105,6 +105,7 @@ static void ttm_lru_bulk_move_add(struct ttm_lru_bulk_move *bulk,
 		pos->first = res;
 		pos->last = res;
 	} else {
+		WARN_ON(pos->first->bo->base.resv != res->bo->base.resv);
 		ttm_lru_bulk_move_pos_tail(pos, res);
 	}
 }
diff --git a/include/drm/ttm/ttm_resource.h b/include/drm/ttm/ttm_resource.h
index 7561023db43d..69769355139f 100644
--- a/include/drm/ttm/ttm_resource.h
+++ b/include/drm/ttm/ttm_resource.h
@@ -251,6 +251,9 @@ struct ttm_lru_bulk_move_pos {
  *
  * Container for the current bulk move state. Should be used with
  * ttm_lru_bulk_move_init() and ttm_bo_set_bulk_move().
+ * All BOs in a bulk_move structure need to share the same reservation object to
+ * ensure that the bulk as a whole is locked for eviction even if only one BO of
+ * the bulk is evicted.
  */
 struct ttm_lru_bulk_move {
 	struct ttm_lru_bulk_move_pos pos[TTM_NUM_MEM_TYPES][TTM_MAX_BO_PRIORITY];
-- 
cgit v1.2.3


From 281a2cc71a1b9fce8fde59b39671ca55c2bf6307 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 19 Mar 2024 11:12:48 +0200
Subject: drm/mst: read sideband messaging cap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Amend drm_dp_read_mst_cap() to return an enum, indicating "SST", "SST
with sideband messaging", or "MST". Modify all call sites to take the
new return value into account.

v2:
- Rename enumerators (Ville)

Cc: Arun R Murthy <arun.r.murthy@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@redhat.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Acked-by: Maxime Ripard <mripard@kernel.org>
Acked-by: Danilo Krummrich <dakr@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/b32a3704934871a67d06420b760e148b76c5ced8.1710839496.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/display/drm_dp_mst_topology.c | 20 +++++++++++++-------
 drivers/gpu/drm/i915/display/intel_dp.c       |  4 ++--
 drivers/gpu/drm/nouveau/nouveau_dp.c          |  2 +-
 include/drm/display/drm_dp_mst_helper.h       | 23 ++++++++++++++++++++++-
 4 files changed, 38 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index 03d528209426..c193be3577f7 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -3608,24 +3608,30 @@ fixed20_12 drm_dp_get_vc_payload_bw(const struct drm_dp_mst_topology_mgr *mgr,
 EXPORT_SYMBOL(drm_dp_get_vc_payload_bw);
 
 /**
- * drm_dp_read_mst_cap() - check whether or not a sink supports MST
+ * drm_dp_read_mst_cap() - Read the sink's MST mode capability
  * @aux: The DP AUX channel to use
  * @dpcd: A cached copy of the DPCD capabilities for this sink
  *
- * Returns: %True if the sink supports MST, %false otherwise
+ * Returns: enum drm_dp_mst_mode to indicate MST mode capability
  */
-bool drm_dp_read_mst_cap(struct drm_dp_aux *aux,
-			 const u8 dpcd[DP_RECEIVER_CAP_SIZE])
+enum drm_dp_mst_mode drm_dp_read_mst_cap(struct drm_dp_aux *aux,
+					 const u8 dpcd[DP_RECEIVER_CAP_SIZE])
 {
 	u8 mstm_cap;
 
 	if (dpcd[DP_DPCD_REV] < DP_DPCD_REV_12)
-		return false;
+		return DRM_DP_SST;
 
 	if (drm_dp_dpcd_readb(aux, DP_MSTM_CAP, &mstm_cap) != 1)
-		return false;
+		return DRM_DP_SST;
+
+	if (mstm_cap & DP_MST_CAP)
+		return DRM_DP_MST;
+
+	if (mstm_cap & DP_SINGLE_STREAM_SIDEBAND_MSG)
+		return DRM_DP_SST_SIDEBAND_MSG;
 
-	return mstm_cap & DP_MST_CAP;
+	return DRM_DP_SST;
 }
 EXPORT_SYMBOL(drm_dp_read_mst_cap);
 
diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index b6d45ab41ff7..3fb9a5ac1551 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -4046,7 +4046,7 @@ intel_dp_can_mst(struct intel_dp *intel_dp)
 
 	return i915->display.params.enable_dp_mst &&
 		intel_dp_mst_source_support(intel_dp) &&
-		drm_dp_read_mst_cap(&intel_dp->aux, intel_dp->dpcd);
+		drm_dp_read_mst_cap(&intel_dp->aux, intel_dp->dpcd) == DRM_DP_MST;
 }
 
 static void
@@ -4055,7 +4055,7 @@ intel_dp_configure_mst(struct intel_dp *intel_dp)
 	struct drm_i915_private *i915 = dp_to_i915(intel_dp);
 	struct intel_encoder *encoder =
 		&dp_to_dig_port(intel_dp)->base;
-	bool sink_can_mst = drm_dp_read_mst_cap(&intel_dp->aux, intel_dp->dpcd);
+	bool sink_can_mst = drm_dp_read_mst_cap(&intel_dp->aux, intel_dp->dpcd) == DRM_DP_MST;
 
 	drm_dbg_kms(&i915->drm,
 		    "[ENCODER:%d:%s] MST support: port: %s, sink: %s, modparam: %s\n",
diff --git a/drivers/gpu/drm/nouveau/nouveau_dp.c b/drivers/gpu/drm/nouveau/nouveau_dp.c
index 7de7707ec6a8..fb06ee17d9e5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dp.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dp.c
@@ -181,7 +181,7 @@ nouveau_dp_probe_dpcd(struct nouveau_connector *nv_connector,
 	if (nouveau_mst) {
 		mstm = outp->dp.mstm;
 		if (mstm)
-			mstm->can_mst = drm_dp_read_mst_cap(aux, dpcd);
+			mstm->can_mst = drm_dp_read_mst_cap(aux, dpcd) == DRM_DP_MST;
 	}
 
 	if (nouveau_dp_has_sink_count(connector, outp)) {
diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 9b19d8bd520a..3c9e128c444a 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -818,7 +818,28 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
 
 void drm_dp_mst_topology_mgr_destroy(struct drm_dp_mst_topology_mgr *mgr);
 
-bool drm_dp_read_mst_cap(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE]);
+/**
+ * enum drm_dp_mst_mode - sink's MST mode capability
+ */
+enum drm_dp_mst_mode {
+	/**
+	 * @DRM_DP_SST: The sink does not support MST nor single stream sideband
+	 * messaging.
+	 */
+	DRM_DP_SST,
+	/**
+	 * @DRM_DP_MST: Sink supports MST, more than one stream and single
+	 * stream sideband messaging.
+	 */
+	DRM_DP_MST,
+	/**
+	 * @DRM_DP_SST_SIDEBAND_MSG: Sink supports only one stream and single
+	 * stream sideband messaging.
+	 */
+	DRM_DP_SST_SIDEBAND_MSG,
+};
+
+enum drm_dp_mst_mode drm_dp_read_mst_cap(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE]);
 int drm_dp_mst_topology_mgr_set_mst(struct drm_dp_mst_topology_mgr *mgr, bool mst_state);
 
 int drm_dp_mst_hpd_irq_handle_event(struct drm_dp_mst_topology_mgr *mgr,
-- 
cgit v1.2.3


From 8f69cba096b5cfd09be2d06c15d08dbd4a58c9d7 Mon Sep 17 00:00:00 2001
From: "Xin Li (Intel)" <xin@zytor.com>
Date: Fri, 22 Mar 2024 01:16:16 -0700
Subject: x86: Rename __{start,end}_init_task to __{start,end}_init_stack

The stack of a task has been separated from the memory of a task_struct
struture for a long time on x86, as a result __{start,end}_init_task no
longer mark the start and end of the init_task structure, but its stack
only.

Rename __{start,end}_init_task to __{start,end}_init_stack.

Note other architectures are not affected because __{start,end}_init_task
are used on x86 only.

Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Juergen Gross <jgross@suse.com>
Link: https://lore.kernel.org/r/20240322081616.3346181-1-xin@zytor.com
---
 arch/x86/kernel/vmlinux.lds.S     | 2 +-
 include/asm-generic/vmlinux.lds.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 91085c3430de..4c04a366f07d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -173,7 +173,7 @@ SECTIONS
 		INIT_TASK_DATA(THREAD_SIZE)
 
 		/* equivalent to task_pt_regs(&init_task) */
-		__top_init_kernel_stack = __end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
+		__top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE;
 
 #ifdef CONFIG_X86_32
 		/* 32 bit has nosave before _edata */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f7749d0f2562..87082434e244 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -399,13 +399,13 @@
 
 #define INIT_TASK_DATA(align)						\
 	. = ALIGN(align);						\
-	__start_init_task = .;						\
+	__start_init_stack = .;						\
 	init_thread_union = .;						\
 	init_stack = .;							\
 	KEEP(*(.data..init_task))					\
 	KEEP(*(.data..init_thread_info))				\
-	. = __start_init_task + THREAD_SIZE;				\
-	__end_init_task = .;
+	. = __start_init_stack + THREAD_SIZE;				\
+	__end_init_stack = .;
 
 #define JUMP_TABLE_DATA							\
 	. = ALIGN(8);							\
-- 
cgit v1.2.3


From 91a1d97ef482c1e4c9d4c1c656a53b0f6b16d0ed Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 13 Mar 2024 19:01:03 +0100
Subject: jump_label,module: Don't alloc static_key_mod for __ro_after_init
 keys

When a static_key is marked ro_after_init, its state will never change
(after init), therefore jump_label_update() will never need to iterate
the entries, and thus module load won't actually need to track this --
avoiding the static_key::next write.

Therefore, mark these keys such that jump_label_add_module() might
recognise them and avoid the modification.

Use the special state: 'static_key_linked(key) && !static_key_mod(key)'
to denote such keys.

jump_label_add_module() does not exist under CONFIG_JUMP_LABEL=n, so the
newly-introduced jump_label_init_ro() can be defined as a nop for that
configuration.

[ mingo: Renamed jump_label_ro() to jump_label_init_ro() ]

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Link: https://lore.kernel.org/r/20240313180106.2917308-2-vschneid@redhat.com
---
 include/asm-generic/sections.h |  5 ++++
 include/linux/jump_label.h     |  3 +++
 init/main.c                    |  1 +
 kernel/jump_label.c            | 53 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index db13bb620f52..c768de6f19a9 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -180,6 +180,11 @@ static inline bool is_kernel_rodata(unsigned long addr)
 	       addr < (unsigned long)__end_rodata;
 }
 
+static inline bool is_kernel_ro_after_init(unsigned long addr)
+{
+	return addr >= (unsigned long)__start_ro_after_init &&
+	       addr < (unsigned long)__end_ro_after_init;
+}
 /**
  * is_kernel_inittext - checks if the pointer address is located in the
  *                      .init.text section
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f0a949b7c973..f5a2727ca4a9 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -216,6 +216,7 @@ extern struct jump_entry __start___jump_table[];
 extern struct jump_entry __stop___jump_table[];
 
 extern void jump_label_init(void);
+extern void jump_label_init_ro(void);
 extern void jump_label_lock(void);
 extern void jump_label_unlock(void);
 extern void arch_jump_label_transform(struct jump_entry *entry,
@@ -265,6 +266,8 @@ static __always_inline void jump_label_init(void)
 	static_key_initialized = true;
 }
 
+static __always_inline void jump_label_init_ro(void) { }
+
 static __always_inline bool static_key_false(struct static_key *key)
 {
 	if (unlikely_notrace(static_key_count(key) > 0))
diff --git a/init/main.c b/init/main.c
index 2ca52474d0c3..6c3f251d6ef8 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1408,6 +1408,7 @@ static void mark_readonly(void)
 		 * insecure pages which are W+X.
 		 */
 		flush_module_init_free_work();
+		jump_label_init_ro();
 		mark_rodata_ro();
 		debug_checkwx();
 		rodata_test();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index d9c822bbffb8..3218fa5688b9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -530,6 +530,45 @@ void __init jump_label_init(void)
 	cpus_read_unlock();
 }
 
+static inline bool static_key_sealed(struct static_key *key)
+{
+	return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK);
+}
+
+static inline void static_key_seal(struct static_key *key)
+{
+	unsigned long type = key->type & JUMP_TYPE_TRUE;
+	key->type = JUMP_TYPE_LINKED | type;
+}
+
+void jump_label_init_ro(void)
+{
+	struct jump_entry *iter_start = __start___jump_table;
+	struct jump_entry *iter_stop = __stop___jump_table;
+	struct jump_entry *iter;
+
+	if (WARN_ON_ONCE(!static_key_initialized))
+		return;
+
+	cpus_read_lock();
+	jump_label_lock();
+
+	for (iter = iter_start; iter < iter_stop; iter++) {
+		struct static_key *iterk = jump_entry_key(iter);
+
+		if (!is_kernel_ro_after_init((unsigned long)iterk))
+			continue;
+
+		if (static_key_sealed(iterk))
+			continue;
+
+		static_key_seal(iterk);
+	}
+
+	jump_label_unlock();
+	cpus_read_unlock();
+}
+
 #ifdef CONFIG_MODULES
 
 enum jump_label_type jump_label_init_type(struct jump_entry *entry)
@@ -650,6 +689,15 @@ static int jump_label_add_module(struct module *mod)
 			static_key_set_entries(key, iter);
 			continue;
 		}
+
+		/*
+		 * If the key was sealed at init, then there's no need to keep a
+		 * reference to its module entries - just patch them now and be
+		 * done with it.
+		 */
+		if (static_key_sealed(key))
+			goto do_poke;
+
 		jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
 		if (!jlm)
 			return -ENOMEM;
@@ -675,6 +723,7 @@ static int jump_label_add_module(struct module *mod)
 		static_key_set_linked(key);
 
 		/* Only update if we've changed from our initial state */
+do_poke:
 		if (jump_label_type(iter) != jump_label_init_type(iter))
 			__jump_label_update(key, iter, iter_stop, true);
 	}
@@ -699,6 +748,10 @@ static void jump_label_del_module(struct module *mod)
 		if (within_module((unsigned long)key, mod))
 			continue;
 
+		/* No @jlm allocated because key was sealed at init. */
+		if (static_key_sealed(key))
+			continue;
+
 		/* No memory during module load */
 		if (WARN_ON(!static_key_linked(key)))
 			continue;
-- 
cgit v1.2.3


From 5edeb7d312628961046eec9b26a7e72f44baf846 Mon Sep 17 00:00:00 2001
From: Joy Zou <joy.zou@nxp.com>
Date: Mon, 18 Mar 2024 17:56:32 +0800
Subject: regulator: pca9450: add pca9451a support

Adding support for new pmic pca9451a.

Signed-off-by: Joy Zou <joy.zou@nxp.com>
Link: https://msgid.link/r/20240318095633.4079027-3-joy.zou@nxp.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/pca9450-regulator.c | 194 +++++++++++++++++++++++++++++++++-
 include/linux/regulator/pca9450.h     |   1 +
 2 files changed, 193 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/regulator/pca9450-regulator.c b/drivers/regulator/pca9450-regulator.c
index 2ab365d2749f..242ff25f5b83 100644
--- a/drivers/regulator/pca9450-regulator.c
+++ b/drivers/regulator/pca9450-regulator.c
@@ -106,6 +106,14 @@ static const struct linear_range pca9450_dvs_buck_volts[] = {
 	REGULATOR_LINEAR_RANGE(600000,  0x00, 0x7F, 12500),
 };
 
+/*
+ * BUCK1/3
+ * 0.65 to 2.2375V (12.5mV step)
+ */
+static const struct linear_range pca9451a_dvs_buck_volts[] = {
+	REGULATOR_LINEAR_RANGE(650000, 0x00, 0x7F, 12500),
+};
+
 /*
  * BUCK4/5/6
  * 0.6V to 3.4V (25mV step)
@@ -662,6 +670,178 @@ static const struct pca9450_regulator_desc pca9450bc_regulators[] = {
 	},
 };
 
+static const struct pca9450_regulator_desc pca9451a_regulators[] = {
+	{
+		.desc = {
+			.name = "buck1",
+			.of_match = of_match_ptr("BUCK1"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_BUCK1,
+			.ops = &pca9450_dvs_buck_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_BUCK1_VOLTAGE_NUM,
+			.linear_ranges = pca9451a_dvs_buck_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9451a_dvs_buck_volts),
+			.vsel_reg = PCA9450_REG_BUCK1OUT_DVS0,
+			.vsel_mask = BUCK1OUT_DVS0_MASK,
+			.enable_reg = PCA9450_REG_BUCK1CTRL,
+			.enable_mask = BUCK1_ENMODE_MASK,
+			.enable_val = BUCK_ENMODE_ONREQ,
+			.ramp_mask = BUCK1_RAMP_MASK,
+			.ramp_delay_table = pca9450_dvs_buck_ramp_table,
+			.n_ramp_values = ARRAY_SIZE(pca9450_dvs_buck_ramp_table),
+			.owner = THIS_MODULE,
+			.of_parse_cb = pca9450_set_dvs_levels,
+		},
+		.dvs = {
+			.run_reg = PCA9450_REG_BUCK1OUT_DVS0,
+			.run_mask = BUCK1OUT_DVS0_MASK,
+			.standby_reg = PCA9450_REG_BUCK1OUT_DVS1,
+			.standby_mask = BUCK1OUT_DVS1_MASK,
+		},
+	},
+	{
+		.desc = {
+			.name = "buck2",
+			.of_match = of_match_ptr("BUCK2"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_BUCK2,
+			.ops = &pca9450_dvs_buck_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_BUCK2_VOLTAGE_NUM,
+			.linear_ranges = pca9450_dvs_buck_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_dvs_buck_volts),
+			.vsel_reg = PCA9450_REG_BUCK2OUT_DVS0,
+			.vsel_mask = BUCK2OUT_DVS0_MASK,
+			.enable_reg = PCA9450_REG_BUCK2CTRL,
+			.enable_mask = BUCK2_ENMODE_MASK,
+			.enable_val = BUCK_ENMODE_ONREQ_STBYREQ,
+			.ramp_mask = BUCK2_RAMP_MASK,
+			.ramp_delay_table = pca9450_dvs_buck_ramp_table,
+			.n_ramp_values = ARRAY_SIZE(pca9450_dvs_buck_ramp_table),
+			.owner = THIS_MODULE,
+			.of_parse_cb = pca9450_set_dvs_levels,
+		},
+		.dvs = {
+			.run_reg = PCA9450_REG_BUCK2OUT_DVS0,
+			.run_mask = BUCK2OUT_DVS0_MASK,
+			.standby_reg = PCA9450_REG_BUCK2OUT_DVS1,
+			.standby_mask = BUCK2OUT_DVS1_MASK,
+		},
+	},
+	{
+		.desc = {
+			.name = "buck4",
+			.of_match = of_match_ptr("BUCK4"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_BUCK4,
+			.ops = &pca9450_buck_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_BUCK4_VOLTAGE_NUM,
+			.linear_ranges = pca9450_buck_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_buck_volts),
+			.vsel_reg = PCA9450_REG_BUCK4OUT,
+			.vsel_mask = BUCK4OUT_MASK,
+			.enable_reg = PCA9450_REG_BUCK4CTRL,
+			.enable_mask = BUCK4_ENMODE_MASK,
+			.enable_val = BUCK_ENMODE_ONREQ,
+			.owner = THIS_MODULE,
+		},
+	},
+	{
+		.desc = {
+			.name = "buck5",
+			.of_match = of_match_ptr("BUCK5"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_BUCK5,
+			.ops = &pca9450_buck_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_BUCK5_VOLTAGE_NUM,
+			.linear_ranges = pca9450_buck_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_buck_volts),
+			.vsel_reg = PCA9450_REG_BUCK5OUT,
+			.vsel_mask = BUCK5OUT_MASK,
+			.enable_reg = PCA9450_REG_BUCK5CTRL,
+			.enable_mask = BUCK5_ENMODE_MASK,
+			.enable_val = BUCK_ENMODE_ONREQ,
+			.owner = THIS_MODULE,
+		},
+	},
+	{
+		.desc = {
+			.name = "buck6",
+			.of_match = of_match_ptr("BUCK6"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_BUCK6,
+			.ops = &pca9450_buck_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_BUCK6_VOLTAGE_NUM,
+			.linear_ranges = pca9450_buck_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_buck_volts),
+			.vsel_reg = PCA9450_REG_BUCK6OUT,
+			.vsel_mask = BUCK6OUT_MASK,
+			.enable_reg = PCA9450_REG_BUCK6CTRL,
+			.enable_mask = BUCK6_ENMODE_MASK,
+			.enable_val = BUCK_ENMODE_ONREQ,
+			.owner = THIS_MODULE,
+		},
+	},
+	{
+		.desc = {
+			.name = "ldo1",
+			.of_match = of_match_ptr("LDO1"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_LDO1,
+			.ops = &pca9450_ldo_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_LDO1_VOLTAGE_NUM,
+			.linear_ranges = pca9450_ldo1_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_ldo1_volts),
+			.vsel_reg = PCA9450_REG_LDO1CTRL,
+			.vsel_mask = LDO1OUT_MASK,
+			.enable_reg = PCA9450_REG_LDO1CTRL,
+			.enable_mask = LDO1_EN_MASK,
+			.owner = THIS_MODULE,
+		},
+	},
+	{
+		.desc = {
+			.name = "ldo4",
+			.of_match = of_match_ptr("LDO4"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_LDO4,
+			.ops = &pca9450_ldo_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_LDO4_VOLTAGE_NUM,
+			.linear_ranges = pca9450_ldo34_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_ldo34_volts),
+			.vsel_reg = PCA9450_REG_LDO4CTRL,
+			.vsel_mask = LDO4OUT_MASK,
+			.enable_reg = PCA9450_REG_LDO4CTRL,
+			.enable_mask = LDO4_EN_MASK,
+			.owner = THIS_MODULE,
+		},
+	},
+	{
+		.desc = {
+			.name = "ldo5",
+			.of_match = of_match_ptr("LDO5"),
+			.regulators_node = of_match_ptr("regulators"),
+			.id = PCA9450_LDO5,
+			.ops = &pca9450_ldo_regulator_ops,
+			.type = REGULATOR_VOLTAGE,
+			.n_voltages = PCA9450_LDO5_VOLTAGE_NUM,
+			.linear_ranges = pca9450_ldo5_volts,
+			.n_linear_ranges = ARRAY_SIZE(pca9450_ldo5_volts),
+			.vsel_reg = PCA9450_REG_LDO5CTRL_H,
+			.vsel_mask = LDO5HOUT_MASK,
+			.enable_reg = PCA9450_REG_LDO5CTRL_H,
+			.enable_mask = LDO5H_EN_MASK,
+			.owner = THIS_MODULE,
+		},
+	},
+};
+
 static irqreturn_t pca9450_irq_handler(int irq, void *data)
 {
 	struct pca9450 *pca9450 = data;
@@ -729,6 +909,10 @@ static int pca9450_i2c_probe(struct i2c_client *i2c)
 		regulator_desc = pca9450bc_regulators;
 		pca9450->rcnt = ARRAY_SIZE(pca9450bc_regulators);
 		break;
+	case PCA9450_TYPE_PCA9451A:
+		regulator_desc = pca9451a_regulators;
+		pca9450->rcnt = ARRAY_SIZE(pca9451a_regulators);
+		break;
 	default:
 		dev_err(&i2c->dev, "Unknown device type");
 		return -EINVAL;
@@ -755,7 +939,8 @@ static int pca9450_i2c_probe(struct i2c_client *i2c)
 
 	/* Check your board and dts for match the right pmic */
 	if (((device_id >> 4) != 0x1 && type == PCA9450_TYPE_PCA9450A) ||
-	    ((device_id >> 4) != 0x3 && type == PCA9450_TYPE_PCA9450BC)) {
+	    ((device_id >> 4) != 0x3 && type == PCA9450_TYPE_PCA9450BC) ||
+	    ((device_id >> 4) != 0x9 && type == PCA9450_TYPE_PCA9451A)) {
 		dev_err(&i2c->dev, "Device id(%x) mismatched\n",
 			device_id >> 4);
 		return -EINVAL;
@@ -846,7 +1031,8 @@ static int pca9450_i2c_probe(struct i2c_client *i2c)
 	}
 
 	dev_info(&i2c->dev, "%s probed.\n",
-		type == PCA9450_TYPE_PCA9450A ? "pca9450a" : "pca9450bc");
+		type == PCA9450_TYPE_PCA9450A ? "pca9450a" :
+		(type == PCA9450_TYPE_PCA9451A ? "pca9451a" : "pca9450bc"));
 
 	return 0;
 }
@@ -864,6 +1050,10 @@ static const struct of_device_id pca9450_of_match[] = {
 		.compatible = "nxp,pca9450c",
 		.data = (void *)PCA9450_TYPE_PCA9450BC,
 	},
+	{
+		.compatible = "nxp,pca9451a",
+		.data = (void *)PCA9450_TYPE_PCA9451A,
+	},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, pca9450_of_match);
diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h
index 505c908dbb81..243633c8dceb 100644
--- a/include/linux/regulator/pca9450.h
+++ b/include/linux/regulator/pca9450.h
@@ -9,6 +9,7 @@
 enum pca9450_chip_type {
 	PCA9450_TYPE_PCA9450A = 0,
 	PCA9450_TYPE_PCA9450BC,
+	PCA9450_TYPE_PCA9451A,
 	PCA9450_TYPE_AMOUNT,
 };
 
-- 
cgit v1.2.3


From d650d1c46d8471bf8ebf556629ac13077f13e647 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 8 Mar 2024 18:27:46 +0200
Subject: spi: xilinx: Fix kernel documentation in the xilinx_spi.h

While updating the data structure layout the kernel documentation
became outdated. Synchronize kernel documentation with the actual
data structure layout.

Fixes: 1dd46599f83a ("spi: xilinx: add force_irq for QSPI mode")
Fixes: 082339bc63cc ("spi: spi-xilinx: Add run run-time endian detection")
Reviewed-by: Michal Simek <michal.simek@amd.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20240308162920.46816-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/xilinx_spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/spi/xilinx_spi.h b/include/linux/spi/xilinx_spi.h
index 3934ce789d87..fd6add419e94 100644
--- a/include/linux/spi/xilinx_spi.h
+++ b/include/linux/spi/xilinx_spi.h
@@ -5,10 +5,10 @@
 /**
  * struct xspi_platform_data - Platform data of the Xilinx SPI driver
  * @num_chipselect:	Number of chip select by the IP.
- * @little_endian:	If registers should be accessed little endian or not.
  * @bits_per_word:	Number of bits per word.
  * @devices:		Devices to add when the driver is probed.
  * @num_devices:	Number of devices in the devices array.
+ * @force_irq:		If set, forces QSPI transaction requirements.
  */
 struct xspi_platform_data {
 	u16 num_chipselect;
-- 
cgit v1.2.3


From 8f40647d87610ecff6637d673024fe7bd045c913 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 8 Mar 2024 18:27:47 +0200
Subject: spi: xilinx: Add necessary inclusion and forward declaration

xilinx_spi.h is mnissing inclusion and forward declaration, add them.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Michal Simek <michal.simek@amd.com>
Link: https://msgid.link/r/20240308162920.46816-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/xilinx_spi.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/spi/xilinx_spi.h b/include/linux/spi/xilinx_spi.h
index fd6add419e94..4ba8f53ce570 100644
--- a/include/linux/spi/xilinx_spi.h
+++ b/include/linux/spi/xilinx_spi.h
@@ -2,6 +2,10 @@
 #ifndef __LINUX_SPI_XILINX_SPI_H
 #define __LINUX_SPI_XILINX_SPI_H
 
+#include <linux/types.h>
+
+struct spi_board_info;
+
 /**
  * struct xspi_platform_data - Platform data of the Xilinx SPI driver
  * @num_chipselect:	Number of chip select by the IP.
-- 
cgit v1.2.3


From a39111b1cf0864b1782f30f9a1fa65260d057327 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 8 Mar 2024 18:27:48 +0200
Subject: spi: xilinx: Make num_chipselect 8-bit in the struct
 xspi_platform_data

There is no use for whole 16-bit for the number of chip select pins.
Drop it to 8 bits and reshuffle the data structure layout to avoid
unnecessary paddings.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Michal Simek <michal.simek@amd.com>
Link: https://msgid.link/r/20240308162920.46816-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/xilinx_spi.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/spi/xilinx_spi.h b/include/linux/spi/xilinx_spi.h
index 4ba8f53ce570..1b8d984668b6 100644
--- a/include/linux/spi/xilinx_spi.h
+++ b/include/linux/spi/xilinx_spi.h
@@ -8,17 +8,17 @@ struct spi_board_info;
 
 /**
  * struct xspi_platform_data - Platform data of the Xilinx SPI driver
- * @num_chipselect:	Number of chip select by the IP.
- * @bits_per_word:	Number of bits per word.
  * @devices:		Devices to add when the driver is probed.
  * @num_devices:	Number of devices in the devices array.
+ * @num_chipselect:	Number of chip select by the IP.
+ * @bits_per_word:	Number of bits per word.
  * @force_irq:		If set, forces QSPI transaction requirements.
  */
 struct xspi_platform_data {
-	u16 num_chipselect;
-	u8 bits_per_word;
 	struct spi_board_info *devices;
 	u8 num_devices;
+	u8 num_chipselect;
+	u8 bits_per_word;
 	bool force_irq;
 };
 
-- 
cgit v1.2.3


From 7fd54c205f104317b853fc417ac7e9d0b9531ddb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 7 Mar 2024 21:47:45 +0200
Subject: spi: pxa2xx: Kill pxa2xx_set_spi_info()

There is the only one user of the pxa2xx_set_spi_info(). Unexport it
and inline to the actual user.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://msgid.link/r/20240307195056.4059864-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 arch/arm/mach-pxa/devices.c    | 18 ------------------
 arch/arm/mach-pxa/spitz.c      | 14 +++++++++++++-
 include/linux/spi/pxa2xx_spi.h |  8 --------
 3 files changed, 13 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c
index 661b3fc43275..1e4cd502340e 100644
--- a/arch/arm/mach-pxa/devices.c
+++ b/arch/arm/mach-pxa/devices.c
@@ -7,7 +7,6 @@
 #include <linux/clk-provider.h>
 #include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
-#include <linux/spi/pxa2xx_spi.h>
 #include <linux/platform_data/i2c-pxa.h>
 #include <linux/soc/pxa/cpu.h>
 
@@ -665,23 +664,6 @@ struct platform_device pxa27x_device_gpio = {
 	.resource	= pxa_resource_gpio,
 };
 
-/* pxa2xx-spi platform-device ID equals respective SSP platform-device ID + 1.
- * See comment in arch/arm/mach-pxa/ssp.c::ssp_probe() */
-void __init pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_controller *info)
-{
-	struct platform_device *pd;
-
-	pd = platform_device_alloc("pxa2xx-spi", id);
-	if (pd == NULL) {
-		printk(KERN_ERR "pxa2xx-spi: failed to allocate device id %d\n",
-		       id);
-		return;
-	}
-
-	pd->dev.platform_data = info;
-	platform_device_add(pd);
-}
-
 static struct resource pxa_dma_resource[] = {
 	[0] = {
 		.start	= 0x40000000,
diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c
index cc691b199429..318402ad685e 100644
--- a/arch/arm/mach-pxa/spitz.c
+++ b/arch/arm/mach-pxa/spitz.c
@@ -585,6 +585,9 @@ static struct gpiod_lookup_table spitz_spi_gpio_table = {
 
 static void __init spitz_spi_init(void)
 {
+	struct platform_device *pd;
+	int id = 2;
+
 	if (machine_is_akita())
 		gpiod_add_lookup_table(&akita_lcdcon_gpio_table);
 	else
@@ -592,7 +595,16 @@ static void __init spitz_spi_init(void)
 
 	gpiod_add_lookup_table(&spitz_ads7846_gpio_table);
 	gpiod_add_lookup_table(&spitz_spi_gpio_table);
-	pxa2xx_set_spi_info(2, &spitz_spi_info);
+
+	/* pxa2xx-spi platform-device ID equals respective SSP platform-device ID + 1 */
+	pd = platform_device_alloc("pxa2xx-spi", id);
+	if (pd == NULL) {
+		pr_err("pxa2xx-spi: failed to allocate device id %d\n", id);
+	} else {
+		pd->dev.platform_data = &spitz_spi_info;
+		platform_device_add(pd);
+	}
+
 	spi_register_board_info(ARRAY_AND_SIZE(spitz_spi_devices));
 }
 #else
diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index ca2cd4e30ead..56aba2f737b1 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -45,12 +45,4 @@ struct pxa2xx_spi_chip {
 	u32 timeout;
 };
 
-#if defined(CONFIG_ARCH_PXA) || defined(CONFIG_ARCH_MMP)
-
-#include <linux/clk.h>
-
-extern void pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_controller *info);
-
-#endif
-
 #endif	/* __LINUX_SPI_PXA2XX_SPI_H */
-- 
cgit v1.2.3


From e3f209e269d32ebc0ba7f497f5d2af21ed4f0dd0 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 7 Mar 2024 21:47:46 +0200
Subject: spi: pxa2xx: Make num_chipselect 8-bit in the struct
 pxa2xx_spi_controller

There is no use for whole 16-bit for the number of chip select pins.
Drop it to 8 bits.

Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20240307195056.4059864-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/spi/pxa2xx.rst   | 2 +-
 include/linux/spi/pxa2xx_spi.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/spi/pxa2xx.rst b/Documentation/spi/pxa2xx.rst
index 19479b801826..0997d8eaf190 100644
--- a/Documentation/spi/pxa2xx.rst
+++ b/Documentation/spi/pxa2xx.rst
@@ -24,7 +24,7 @@ arch/.../mach-*/board-*.c as a "platform device". The host controller configurat
 is passed to the driver via a table found in include/linux/spi/pxa2xx_spi.h::
 
   struct pxa2xx_spi_controller {
-	u16 num_chipselect;
+	u8 num_chipselect;
 	u8 enable_dma;
 	...
   };
diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
index 56aba2f737b1..e5a4a045fb67 100644
--- a/include/linux/spi/pxa2xx_spi.h
+++ b/include/linux/spi/pxa2xx_spi.h
@@ -17,7 +17,7 @@ struct dma_chan;
  * (resides in device.platform_data).
  */
 struct pxa2xx_spi_controller {
-	u16 num_chipselect;
+	u8 num_chipselect;
 	u8 enable_dma;
 	u8 dma_burst_size;
 	bool is_target;
-- 
cgit v1.2.3


From cee28113db17f0de58df0eaea4e2756c404ee01f Mon Sep 17 00:00:00 2001
From: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Date: Tue, 19 Mar 2024 10:22:36 +0200
Subject: ASoC: dmaengine_pcm: Allow passing component name via config

At the moment we cannot instantiate two dmaengine_pcms with the same
parent device, as the components will be named the same, leading to
conflicts.

Add 'name' field to the snd_dmaengine_pcm_config, and use that (if
defined) as the component name instead of deriving the component name
from the device.

Signed-off-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Link: https://msgid.link/r/20240319-xilinx-dp-audio-v2-1-92d6d3a7ca7e@ideasonboard.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/dmaengine_pcm.h         | 2 ++
 sound/soc/soc-core.c                  | 8 +++++---
 sound/soc/soc-generic-dmaengine-pcm.c | 3 +++
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h
index d70c55f17df7..c11aaf8079fb 100644
--- a/include/sound/dmaengine_pcm.h
+++ b/include/sound/dmaengine_pcm.h
@@ -118,6 +118,7 @@ int snd_dmaengine_pcm_refine_runtime_hwparams(
  *   which do not use devicetree.
  * @process: Callback used to apply processing on samples transferred from/to
  *   user space.
+ * @name: Component name. If null, dev_name will be used.
  * @compat_filter_fn: Will be used as the filter function when requesting a
  *  channel for platforms which do not use devicetree. The filter parameter
  *  will be the DAI's DMA data.
@@ -143,6 +144,7 @@ struct snd_dmaengine_pcm_config {
 	int (*process)(struct snd_pcm_substream *substream,
 		       int channel, unsigned long hwoff,
 		       unsigned long bytes);
+	const char *name;
 	dma_filter_fn compat_filter_fn;
 	struct device *dma_dev;
 	const char *chan_names[SNDRV_PCM_STREAM_LAST + 1];
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 2ec13d1634b6..3ab6626ad680 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -2796,10 +2796,12 @@ int snd_soc_component_initialize(struct snd_soc_component *component,
 	INIT_LIST_HEAD(&component->list);
 	mutex_init(&component->io_mutex);
 
-	component->name = fmt_single_name(dev, &component->id);
 	if (!component->name) {
-		dev_err(dev, "ASoC: Failed to allocate name\n");
-		return -ENOMEM;
+		component->name = fmt_single_name(dev, &component->id);
+		if (!component->name) {
+			dev_err(dev, "ASoC: Failed to allocate name\n");
+			return -ENOMEM;
+		}
 	}
 
 	component->dev		= dev;
diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c
index 092ca09f3631..83db1a83d8ba 100644
--- a/sound/soc/soc-generic-dmaengine-pcm.c
+++ b/sound/soc/soc-generic-dmaengine-pcm.c
@@ -441,6 +441,9 @@ int snd_dmaengine_pcm_register(struct device *dev,
 	pcm->config = config;
 	pcm->flags = flags;
 
+	if (config->name)
+		pcm->component.name = config->name;
+
 	ret = dmaengine_pcm_request_chan_of(pcm, dev, config);
 	if (ret)
 		goto err_free_dma;
-- 
cgit v1.2.3


From ada5caa4e081b067736e872f2701e1c677290f22 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Date: Mon, 25 Mar 2024 02:57:29 +0000
Subject: fbdev: omap2: replace of_graph_get_next_endpoint()

From DT point of view, in general, drivers should be asking for a
specific port number because their function is fixed in the binding.

of_graph_get_next_endpoint() doesn't match to this concept.

Simply replace

	- of_graph_get_next_endpoint(xxx, NULL);
	+ of_graph_get_endpoint_by_regs(xxx, 0, -1);

Link: https://lore.kernel.org/r/20240202174941.GA310089-robh@kernel.org
Signed-off-by: Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Helge Deller <deller@gmx.de>
---
 drivers/video/fbdev/omap2/omapfb/dss/dsi.c    |  3 ++-
 drivers/video/fbdev/omap2/omapfb/dss/dss-of.c | 20 +-------------------
 drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c  |  3 ++-
 drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c  |  3 ++-
 drivers/video/fbdev/omap2/omapfb/dss/venc.c   |  3 ++-
 drivers/video/fbdev/pxafb.c                   |  2 +-
 include/video/omapfb_dss.h                    |  3 ---
 7 files changed, 10 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
index b7eb17a16ec4..1f13bcf73da5 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
@@ -28,6 +28,7 @@
 #include <linux/debugfs.h>
 #include <linux/pm_runtime.h>
 #include <linux/of.h>
+#include <linux/of_graph.h>
 #include <linux/of_platform.h>
 #include <linux/component.h>
 
@@ -5079,7 +5080,7 @@ static int dsi_probe_of(struct platform_device *pdev)
 	struct device_node *ep;
 	struct omap_dsi_pin_config pin_cfg;
 
-	ep = omapdss_of_get_first_endpoint(node);
+	ep = of_graph_get_endpoint_by_regs(node, 0, -1);
 	if (!ep)
 		return 0;
 
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c
index 0282d4eef139..14965a3fd05b 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dss-of.c
@@ -130,24 +130,6 @@ static struct device_node *omapdss_of_get_remote_port(const struct device_node *
 	return np;
 }
 
-struct device_node *
-omapdss_of_get_first_endpoint(const struct device_node *parent)
-{
-	struct device_node *port, *ep;
-
-	port = omapdss_of_get_next_port(parent, NULL);
-
-	if (!port)
-		return NULL;
-
-	ep = omapdss_of_get_next_endpoint(port, NULL);
-
-	of_node_put(port);
-
-	return ep;
-}
-EXPORT_SYMBOL_GPL(omapdss_of_get_first_endpoint);
-
 struct omap_dss_device *
 omapdss_of_find_source_for_first_ep(struct device_node *node)
 {
@@ -155,7 +137,7 @@ omapdss_of_find_source_for_first_ep(struct device_node *node)
 	struct device_node *src_port;
 	struct omap_dss_device *src;
 
-	ep = omapdss_of_get_first_endpoint(node);
+	ep = of_graph_get_endpoint_by_regs(node, 0, -1);
 	if (!ep)
 		return ERR_PTR(-EINVAL);
 
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
index f05b4e35a842..8f407ec134dc 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
@@ -20,6 +20,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/clk.h>
 #include <linux/of.h>
+#include <linux/of_graph.h>
 #include <linux/regulator/consumer.h>
 #include <linux/component.h>
 #include <video/omapfb_dss.h>
@@ -529,7 +530,7 @@ static int hdmi_probe_of(struct platform_device *pdev)
 	struct device_node *ep;
 	int r;
 
-	ep = omapdss_of_get_first_endpoint(node);
+	ep = of_graph_get_endpoint_by_regs(node, 0, -1);
 	if (!ep)
 		return 0;
 
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
index 03292945b1d4..4ad219f522b9 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
@@ -25,6 +25,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/clk.h>
 #include <linux/of.h>
+#include <linux/of_graph.h>
 #include <linux/regulator/consumer.h>
 #include <linux/component.h>
 #include <video/omapfb_dss.h>
@@ -561,7 +562,7 @@ static int hdmi_probe_of(struct platform_device *pdev)
 	struct device_node *ep;
 	int r;
 
-	ep = omapdss_of_get_first_endpoint(node);
+	ep = of_graph_get_endpoint_by_regs(node, 0, -1);
 	if (!ep)
 		return 0;
 
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/venc.c b/drivers/video/fbdev/omap2/omapfb/dss/venc.c
index c9d40e28a06f..0bd80d3b8f1b 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/venc.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/venc.c
@@ -24,6 +24,7 @@
 #include <linux/regulator/consumer.h>
 #include <linux/pm_runtime.h>
 #include <linux/of.h>
+#include <linux/of_graph.h>
 #include <linux/component.h>
 
 #include <video/omapfb_dss.h>
@@ -764,7 +765,7 @@ static int venc_probe_of(struct platform_device *pdev)
 	u32 channels;
 	int r;
 
-	ep = omapdss_of_get_first_endpoint(node);
+	ep = of_graph_get_endpoint_by_regs(node, 0, -1);
 	if (!ep)
 		return 0;
 
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index fa943612c4e2..2ef56fa28aff 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -2171,7 +2171,7 @@ static int of_get_pxafb_mode_info(struct device *dev,
 	u32 bus_width;
 	int ret, i;
 
-	np = of_graph_get_next_endpoint(dev->of_node, NULL);
+	np = of_graph_get_endpoint_by_regs(dev->of_node, 0, -1);
 	if (!np) {
 		dev_err(dev, "could not find endpoint\n");
 		return -EINVAL;
diff --git a/include/video/omapfb_dss.h b/include/video/omapfb_dss.h
index e8eaac2cb7b8..a8c0c3eeeb5b 100644
--- a/include/video/omapfb_dss.h
+++ b/include/video/omapfb_dss.h
@@ -819,9 +819,6 @@ struct device_node *
 omapdss_of_get_next_endpoint(const struct device_node *parent,
 			     struct device_node *prev);
 
-struct device_node *
-omapdss_of_get_first_endpoint(const struct device_node *parent);
-
 struct omap_dss_device *
 omapdss_of_find_source_for_first_ep(struct device_node *node);
 #else
-- 
cgit v1.2.3


From 4184e4912ca69d4f18a800144539af3b37c6a663 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Tue, 12 Mar 2024 17:44:28 +0100
Subject: dt-bindings: pinctrl: samsung: drop unused header with register
 constants

The bindings header for Samsung pin controller DTS pin values (holding
register values in fact) was deprecated in v6.1 kernel in
commit 9d9292576810 ("dt-bindings: pinctrl: samsung: deprecate header
with register constants").  This was enough of time for users to switch
to in-DTS headers, so drop the bindings header.

Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240312164428.692552-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 MAINTAINERS                           |  1 -
 include/dt-bindings/pinctrl/samsung.h | 95 -----------------------------------
 2 files changed, 96 deletions(-)
 delete mode 100644 include/dt-bindings/pinctrl/samsung.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index aa3b947fb080..643dba658346 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17489,7 +17489,6 @@ C:	irc://irc.libera.chat/linux-exynos
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/pinctrl/samsung.git
 F:	Documentation/devicetree/bindings/pinctrl/samsung,pinctrl*yaml
 F:	drivers/pinctrl/samsung/
-F:	include/dt-bindings/pinctrl/samsung.h
 
 PIN CONTROLLER - SINGLE
 M:	Tony Lindgren <tony@atomide.com>
diff --git a/include/dt-bindings/pinctrl/samsung.h b/include/dt-bindings/pinctrl/samsung.h
deleted file mode 100644
index d1da5ff68d0c..000000000000
--- a/include/dt-bindings/pinctrl/samsung.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Samsung's Exynos pinctrl bindings
- *
- * Copyright (c) 2016 Samsung Electronics Co., Ltd.
- *		http://www.samsung.com
- * Author: Krzysztof Kozlowski <krzk@kernel.org>
- */
-
-#ifndef __DT_BINDINGS_PINCTRL_SAMSUNG_H__
-#define __DT_BINDINGS_PINCTRL_SAMSUNG_H__
-
-/*
- * These bindings are deprecated, because they do not match the actual
- * concept of bindings but rather contain pure register values.
- * Instead include the header in the DTS source directory.
- */
-#warning "These bindings are deprecated. Instead use the header in the DTS source directory."
-
-#define EXYNOS_PIN_PULL_NONE		0
-#define EXYNOS_PIN_PULL_DOWN		1
-#define EXYNOS_PIN_PULL_UP		3
-
-#define S3C64XX_PIN_PULL_NONE		0
-#define S3C64XX_PIN_PULL_DOWN		1
-#define S3C64XX_PIN_PULL_UP		2
-
-/* Pin function in power down mode */
-#define EXYNOS_PIN_PDN_OUT0		0
-#define EXYNOS_PIN_PDN_OUT1		1
-#define EXYNOS_PIN_PDN_INPUT		2
-#define EXYNOS_PIN_PDN_PREV		3
-
-/* Drive strengths for Exynos3250, Exynos4 (all) and Exynos5250 */
-#define EXYNOS4_PIN_DRV_LV1		0
-#define EXYNOS4_PIN_DRV_LV2		2
-#define EXYNOS4_PIN_DRV_LV3		1
-#define EXYNOS4_PIN_DRV_LV4		3
-
-/* Drive strengths for Exynos5260 */
-#define EXYNOS5260_PIN_DRV_LV1		0
-#define EXYNOS5260_PIN_DRV_LV2		1
-#define EXYNOS5260_PIN_DRV_LV4		2
-#define EXYNOS5260_PIN_DRV_LV6		3
-
-/*
- * Drive strengths for Exynos5410, Exynos542x, Exynos5800 and Exynos850 (except
- * GPIO_HSI block)
- */
-#define EXYNOS5420_PIN_DRV_LV1		0
-#define EXYNOS5420_PIN_DRV_LV2		1
-#define EXYNOS5420_PIN_DRV_LV3		2
-#define EXYNOS5420_PIN_DRV_LV4		3
-
-/* Drive strengths for Exynos5433 */
-#define EXYNOS5433_PIN_DRV_FAST_SR1	0
-#define EXYNOS5433_PIN_DRV_FAST_SR2	1
-#define EXYNOS5433_PIN_DRV_FAST_SR3	2
-#define EXYNOS5433_PIN_DRV_FAST_SR4	3
-#define EXYNOS5433_PIN_DRV_FAST_SR5	4
-#define EXYNOS5433_PIN_DRV_FAST_SR6	5
-#define EXYNOS5433_PIN_DRV_SLOW_SR1	8
-#define EXYNOS5433_PIN_DRV_SLOW_SR2	9
-#define EXYNOS5433_PIN_DRV_SLOW_SR3	0xa
-#define EXYNOS5433_PIN_DRV_SLOW_SR4	0xb
-#define EXYNOS5433_PIN_DRV_SLOW_SR5	0xc
-#define EXYNOS5433_PIN_DRV_SLOW_SR6	0xf
-
-/* Drive strengths for Exynos850 GPIO_HSI block */
-#define EXYNOS850_HSI_PIN_DRV_LV1	0	/* 1x   */
-#define EXYNOS850_HSI_PIN_DRV_LV1_5	1	/* 1.5x */
-#define EXYNOS850_HSI_PIN_DRV_LV2	2	/* 2x   */
-#define EXYNOS850_HSI_PIN_DRV_LV2_5	3	/* 2.5x */
-#define EXYNOS850_HSI_PIN_DRV_LV3	4	/* 3x   */
-#define EXYNOS850_HSI_PIN_DRV_LV4	5	/* 4x   */
-
-#define EXYNOS_PIN_FUNC_INPUT		0
-#define EXYNOS_PIN_FUNC_OUTPUT		1
-#define EXYNOS_PIN_FUNC_2		2
-#define EXYNOS_PIN_FUNC_3		3
-#define EXYNOS_PIN_FUNC_4		4
-#define EXYNOS_PIN_FUNC_5		5
-#define EXYNOS_PIN_FUNC_6		6
-#define EXYNOS_PIN_FUNC_EINT		0xf
-#define EXYNOS_PIN_FUNC_F		EXYNOS_PIN_FUNC_EINT
-
-/* Drive strengths for Exynos7 FSYS1 block */
-#define EXYNOS7_FSYS1_PIN_DRV_LV1	0
-#define EXYNOS7_FSYS1_PIN_DRV_LV2	4
-#define EXYNOS7_FSYS1_PIN_DRV_LV3	2
-#define EXYNOS7_FSYS1_PIN_DRV_LV4	6
-#define EXYNOS7_FSYS1_PIN_DRV_LV5	1
-#define EXYNOS7_FSYS1_PIN_DRV_LV6	5
-
-#endif /* __DT_BINDINGS_PINCTRL_SAMSUNG_H__ */
-- 
cgit v1.2.3


From d7cdb5946f6394c92916ea9502393b7ffe11fed9 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 14 Mar 2024 16:32:18 +0100
Subject: media: videobuf2: Update vb2_is_busy() logic

Do not rely on the number of allocated buffers to know if the
queue is busy but on a flag set when at least one buffer has been allocated
by REQBUFS or CREATE_BUFS ioctl.
The flag is reset when REQBUFS is called with count = 0 or the file
handle is closed.
This is needed because remove buffers feature will be able to remove
all the buffers from a queue while streaming so relying on the number
of allocated buffers in the queue won't be possible.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 4 ++++
 include/media/videobuf2-core.h                  | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index b6bf8f232f48..d8b3c04cb3b5 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -854,6 +854,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 		__vb2_queue_free(q, q_num_bufs);
 		mutex_unlock(&q->mmap_lock);
 
+		q->is_busy = 0;
 		/*
 		 * In case of REQBUFS(0) return immediately without calling
 		 * driver's queue_setup() callback and allocating resources.
@@ -966,6 +967,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 	 */
 	*count = allocated_buffers;
 	q->waiting_for_buffers = !q->is_output;
+	q->is_busy = 1;
 
 	return 0;
 
@@ -1091,6 +1093,7 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
 	 * to the userspace.
 	 */
 	*count = allocated_buffers;
+	q->is_busy = 1;
 
 	return 0;
 
@@ -2555,6 +2558,7 @@ void vb2_core_queue_release(struct vb2_queue *q)
 	__vb2_queue_free(q, vb2_get_num_buffers(q));
 	kfree(q->bufs);
 	q->bufs = NULL;
+	q->is_busy = 0;
 	mutex_unlock(&q->mmap_lock);
 }
 EXPORT_SYMBOL_GPL(vb2_core_queue_release);
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 8b86996b2719..667bf9ee1101 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -582,6 +582,7 @@ struct vb2_buf_ops {
  *		released. Used to prevent destroying the queue by other threads.
  * @is_multiplanar: set if buffer type is multiplanar
  * @is_output:	set if buffer type is output
+ * @is_busy:	set if at least one buffer has been allocated at some time.
  * @copy_timestamp: set if vb2-core should set timestamps
  * @last_buffer_dequeued: used in poll() and DQBUF to immediately return if the
  *		last decoded buffer was already dequeued. Set for capture queues
@@ -647,6 +648,7 @@ struct vb2_queue {
 	unsigned int			waiting_in_dqbuf:1;
 	unsigned int			is_multiplanar:1;
 	unsigned int			is_output:1;
+	unsigned int			is_busy:1;
 	unsigned int			copy_timestamp:1;
 	unsigned int			last_buffer_dequeued:1;
 
@@ -1166,7 +1168,7 @@ static inline unsigned int vb2_get_num_buffers(struct vb2_queue *q)
  */
 static inline bool vb2_is_busy(struct vb2_queue *q)
 {
-	return vb2_get_num_buffers(q) > 0;
+	return !!q->is_busy;
 }
 
 /**
-- 
cgit v1.2.3


From 6662edcd32ccf939849cd1cd4ff04733a20ef97a Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 14 Mar 2024 16:32:19 +0100
Subject: media: videobuf2: Add min_reqbufs_allocation field to vb2_queue
 structure

Add 'min_reqbufs_allocation' field in the vb2_queue structure so drivers
can specify the minimum number of buffers to allocate when calling
VIDIOC_REQBUFS.
When initializing the queue, v4l2 core makes sure that the following
constraints are respected:
- the minimum number of buffers to allocate must be at least 2 because
one buffer is used by the hardware while the other is being processed
by userspace.
-if the driver needs 'min_queued_buffers' in the queue before calling
start_streaming(), then the minimum requirement is 'min_queued_buffers + 1'
to keep at least one buffer available for userspace.

Simplify __vb2_init_fileio() by using 'min_reqbufs_allocation' directly
to avoid duplicating the minimum number of buffers to allocate computation.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 38 +++++++++++++++----------
 include/media/videobuf2-core.h                  | 15 +++++++++-
 2 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index d8b3c04cb3b5..58c495b253ce 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -866,7 +866,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 	/*
 	 * Make sure the requested values and current defaults are sane.
 	 */
-	num_buffers = max_t(unsigned int, *count, q->min_queued_buffers);
+	num_buffers = max_t(unsigned int, *count, q->min_reqbufs_allocation);
 	num_buffers = min_t(unsigned int, num_buffers, q->max_num_buffers);
 	memset(q->alloc_devs, 0, sizeof(q->alloc_devs));
 	/*
@@ -918,7 +918,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 	 * There is no point in continuing if we can't allocate the minimum
 	 * number of buffers needed by this vb2_queue.
 	 */
-	if (allocated_buffers < q->min_queued_buffers)
+	if (allocated_buffers < q->min_reqbufs_allocation)
 		ret = -ENOMEM;
 
 	/*
@@ -2524,6 +2524,25 @@ int vb2_core_queue_init(struct vb2_queue *q)
 	if (WARN_ON(q->supports_requests && q->min_queued_buffers))
 		return -EINVAL;
 
+	/*
+	 * The minimum requirement is 2: one buffer is used
+	 * by the hardware while the other is being processed by userspace.
+	 */
+	if (q->min_reqbufs_allocation < 2)
+		q->min_reqbufs_allocation = 2;
+
+	/*
+	 * If the driver needs 'min_queued_buffers' in the queue before
+	 * calling start_streaming() then the minimum requirement is
+	 * 'min_queued_buffers + 1' to keep at least one buffer available
+	 * for userspace.
+	 */
+	if (q->min_reqbufs_allocation < q->min_queued_buffers + 1)
+		q->min_reqbufs_allocation = q->min_queued_buffers + 1;
+
+	if (WARN_ON(q->min_reqbufs_allocation > q->max_num_buffers))
+		return -EINVAL;
+
 	INIT_LIST_HEAD(&q->queued_list);
 	INIT_LIST_HEAD(&q->done_list);
 	spin_lock_init(&q->done_lock);
@@ -2717,7 +2736,6 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read)
 	struct vb2_fileio_data *fileio;
 	struct vb2_buffer *vb;
 	int i, ret;
-	unsigned int count = 0;
 
 	/*
 	 * Sanity check
@@ -2738,18 +2756,8 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read)
 	if (q->streaming || vb2_get_num_buffers(q) > 0)
 		return -EBUSY;
 
-	/*
-	 * Start with q->min_queued_buffers + 1, driver can increase it in
-	 * queue_setup()
-	 *
-	 * 'min_queued_buffers' buffers need to be queued up before you
-	 * can start streaming, plus 1 for userspace (or in this case,
-	 * kernelspace) processing.
-	 */
-	count = max(2, q->min_queued_buffers + 1);
-
 	dprintk(q, 3, "setting up file io: mode %s, count %d, read_once %d, write_immediately %d\n",
-		(read) ? "read" : "write", count, q->fileio_read_once,
+		(read) ? "read" : "write", q->min_reqbufs_allocation, q->fileio_read_once,
 		q->fileio_write_immediately);
 
 	fileio = kzalloc(sizeof(*fileio), GFP_KERNEL);
@@ -2763,7 +2771,7 @@ static int __vb2_init_fileio(struct vb2_queue *q, int read)
 	 * Request buffers and use MMAP type to force driver
 	 * to allocate buffers by itself.
 	 */
-	fileio->count = count;
+	fileio->count = q->min_reqbufs_allocation;
 	fileio->memory = VB2_MEMORY_MMAP;
 	fileio->type = q->type;
 	q->fileio = fileio;
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 667bf9ee1101..4a8b9135cec8 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -549,9 +549,21 @@ struct vb2_buf_ops {
  *		@start_streaming can be called. Used when a DMA engine
  *		cannot be started unless at least this number of buffers
  *		have been queued into the driver.
- *		VIDIOC_REQBUFS will ensure at least @min_queued_buffers
+ *		VIDIOC_REQBUFS will ensure at least @min_queued_buffers + 1
  *		buffers will be allocated. Note that VIDIOC_CREATE_BUFS will not
  *		modify the requested buffer count.
+ * @min_reqbufs_allocation: the minimum number of buffers to be allocated when
+ *		calling VIDIOC_REQBUFS. Note that VIDIOC_CREATE_BUFS will *not*
+ *		modify the requested buffer count and does not use this field.
+ *		Drivers can set this if there has to be a certain number of
+ *		buffers available for the hardware to work effectively.
+ *		This allows calling VIDIOC_REQBUFS with a buffer count of 1 and
+ *		it will be automatically adjusted to a workable	buffer count.
+ *		If set, then @min_reqbufs_allocation must be larger than
+ *		@min_queued_buffers + 1.
+ *		If this field is > 3, then it is highly recommended that the
+ *		driver implements the V4L2_CID_MIN_BUFFERS_FOR_CAPTURE/OUTPUT
+ *		control.
  * @alloc_devs:	&struct device memory type/allocator-specific per-plane device
  */
 /*
@@ -622,6 +634,7 @@ struct vb2_queue {
 	u32				timestamp_flags;
 	gfp_t				gfp_flags;
 	u32				min_queued_buffers;
+	u32				min_reqbufs_allocation;
 
 	struct device			*alloc_devs[VB2_MAX_PLANES];
 
-- 
cgit v1.2.3


From 6e423b75d0492e82fa6f057c615427ebd7ebe76c Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 14 Mar 2024 16:32:21 +0100
Subject: media: core: Rework how create_buf index returned value is computed

When REMOVE_BUFS will be introduced holes could created in bufs array.
To be able to reuse these unused indices reworking how create->index
is set is mandatory.
Let __vb2_queue_alloc() decide which first index is correct and
forward this to the caller.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Acked-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 18 +++++++++++++-----
 drivers/media/common/videobuf2/videobuf2-v4l2.c | 14 +++++++++-----
 include/media/videobuf2-core.h                  |  5 ++++-
 3 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 58c495b253ce..8e819d198c34 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -442,12 +442,15 @@ static void vb2_queue_remove_buffer(struct vb2_buffer *vb)
  * __vb2_queue_alloc() - allocate vb2 buffer structures and (for MMAP type)
  * video buffer memory for all buffers/planes on the queue and initializes the
  * queue
+ * @first_index: index of the first created buffer, all newly allocated buffers
+ *		 have indices in the range [first_index..first_index+count-1]
  *
  * Returns the number of buffers successfully allocated.
  */
 static int __vb2_queue_alloc(struct vb2_queue *q, enum vb2_memory memory,
 			     unsigned int num_buffers, unsigned int num_planes,
-			     const unsigned plane_sizes[VB2_MAX_PLANES])
+			     const unsigned int plane_sizes[VB2_MAX_PLANES],
+			     unsigned int *first_index)
 {
 	unsigned int q_num_buffers = vb2_get_num_buffers(q);
 	unsigned int buffer, plane;
@@ -461,6 +464,8 @@ static int __vb2_queue_alloc(struct vb2_queue *q, enum vb2_memory memory,
 	num_buffers = min_t(unsigned int, num_buffers,
 			    q->max_num_buffers - q_num_buffers);
 
+	*first_index = q_num_buffers;
+
 	for (buffer = 0; buffer < num_buffers; ++buffer) {
 		/* Allocate vb2 buffer structures */
 		vb = kzalloc(q->buf_struct_size, GFP_KERNEL);
@@ -820,7 +825,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 	unsigned int q_num_bufs = vb2_get_num_buffers(q);
 	unsigned plane_sizes[VB2_MAX_PLANES] = { };
 	bool non_coherent_mem = flags & V4L2_MEMORY_FLAG_NON_COHERENT;
-	unsigned int i;
+	unsigned int i, first_index;
 	int ret = 0;
 
 	if (q->streaming) {
@@ -907,8 +912,10 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 
 	/* Finally, allocate buffers and video memory */
 	allocated_buffers =
-		__vb2_queue_alloc(q, memory, num_buffers, num_planes, plane_sizes);
+		__vb2_queue_alloc(q, memory, num_buffers, num_planes, plane_sizes, &first_index);
 	if (allocated_buffers == 0) {
+		/* There shouldn't be any buffers allocated, so first_index == 0 */
+		WARN_ON(first_index);
 		dprintk(q, 1, "memory allocation failed\n");
 		ret = -ENOMEM;
 		goto error;
@@ -982,7 +989,8 @@ EXPORT_SYMBOL_GPL(vb2_core_reqbufs);
 int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
 			 unsigned int flags, unsigned int *count,
 			 unsigned int requested_planes,
-			 const unsigned int requested_sizes[])
+			 const unsigned int requested_sizes[],
+			 unsigned int *first_index)
 {
 	unsigned int num_planes = 0, num_buffers, allocated_buffers;
 	unsigned plane_sizes[VB2_MAX_PLANES] = { };
@@ -1044,7 +1052,7 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
 
 	/* Finally, allocate buffers and video memory */
 	allocated_buffers = __vb2_queue_alloc(q, memory, num_buffers,
-				num_planes, plane_sizes);
+				num_planes, plane_sizes, first_index);
 	if (allocated_buffers == 0) {
 		dprintk(q, 1, "memory allocation failed\n");
 		ret = -ENOMEM;
diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index c575198e8354..03e8080a68a8 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -795,11 +795,15 @@ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create)
 	for (i = 0; i < requested_planes; i++)
 		if (requested_sizes[i] == 0)
 			return -EINVAL;
-	return ret ? ret : vb2_core_create_bufs(q, create->memory,
-						create->flags,
-						&create->count,
-						requested_planes,
-						requested_sizes);
+	if (ret)
+		return ret;
+
+	return vb2_core_create_bufs(q, create->memory,
+				    create->flags,
+				    &create->count,
+				    requested_planes,
+				    requested_sizes,
+				    &create->index);
 }
 EXPORT_SYMBOL_GPL(vb2_create_bufs);
 
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 4a8b9135cec8..42526e289c8e 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -826,6 +826,8 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
  * @count: requested buffer count.
  * @requested_planes: number of planes requested.
  * @requested_sizes: array with the size of the planes.
+ * @first_index: index of the first created buffer, all allocated buffers have
+ *		 indices in the range [first_index..first_index+count-1]
  *
  * Videobuf2 core helper to implement VIDIOC_CREATE_BUFS() operation. It is
  * called internally by VB2 by an API-specific handler, like
@@ -842,7 +844,8 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
 			 unsigned int flags, unsigned int *count,
 			 unsigned int requested_planes,
-			 const unsigned int requested_sizes[]);
+			 const unsigned int requested_sizes[],
+			 unsigned int *first_index);
 
 /**
  * vb2_core_prepare_buf() - Pass ownership of a buffer from userspace
-- 
cgit v1.2.3


From a286b0837e1390c558aabebc4f10b25b1f05bdce Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 14 Mar 2024 16:32:22 +0100
Subject: media: core: Add bitmap manage bufs array entries

Add a bitmap field to know which of bufs array entries are
used or not.
Remove no more used num_buffers field from queue structure.
Use bitmap_find_next_zero_area() to find the first possible
range when creating new buffers to fill the gaps.
If no suitable range is found try to allocate less buffers
than requested.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Acked-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/common/videobuf2/videobuf2-core.c | 71 ++++++++++++++++++-------
 include/media/videobuf2-core.h                  | 18 ++++---
 2 files changed, 64 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 8e819d198c34..ec81426d4d79 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -421,11 +421,12 @@ static void init_buffer_cache_hints(struct vb2_queue *q, struct vb2_buffer *vb)
  */
 static void vb2_queue_add_buffer(struct vb2_queue *q, struct vb2_buffer *vb, unsigned int index)
 {
-	WARN_ON(index >= q->max_num_buffers || q->bufs[index] || vb->vb2_queue);
+	WARN_ON(index >= q->max_num_buffers || test_bit(index, q->bufs_bitmap) || vb->vb2_queue);
 
 	q->bufs[index] = vb;
 	vb->index = index;
 	vb->vb2_queue = q;
+	set_bit(index, q->bufs_bitmap);
 }
 
 /**
@@ -434,6 +435,7 @@ static void vb2_queue_add_buffer(struct vb2_queue *q, struct vb2_buffer *vb, uns
  */
 static void vb2_queue_remove_buffer(struct vb2_buffer *vb)
 {
+	clear_bit(vb->index, vb->vb2_queue->bufs_bitmap);
 	vb->vb2_queue->bufs[vb->index] = NULL;
 	vb->vb2_queue = NULL;
 }
@@ -452,9 +454,9 @@ static int __vb2_queue_alloc(struct vb2_queue *q, enum vb2_memory memory,
 			     const unsigned int plane_sizes[VB2_MAX_PLANES],
 			     unsigned int *first_index)
 {
-	unsigned int q_num_buffers = vb2_get_num_buffers(q);
 	unsigned int buffer, plane;
 	struct vb2_buffer *vb;
+	unsigned long index = q->max_num_buffers;
 	int ret;
 
 	/*
@@ -462,9 +464,25 @@ static int __vb2_queue_alloc(struct vb2_queue *q, enum vb2_memory memory,
 	 * in the queue is below q->max_num_buffers
 	 */
 	num_buffers = min_t(unsigned int, num_buffers,
-			    q->max_num_buffers - q_num_buffers);
+			    q->max_num_buffers - vb2_get_num_buffers(q));
+
+	while (num_buffers) {
+		index = bitmap_find_next_zero_area(q->bufs_bitmap, q->max_num_buffers,
+						   0, num_buffers, 0);
+
+		if (index < q->max_num_buffers)
+			break;
+		/* Try to find free space for less buffers */
+		num_buffers--;
+	}
+
+	/* If there is no space left to allocate buffers return 0 to indicate the error */
+	if (!num_buffers) {
+		*first_index = 0;
+		return 0;
+	}
 
-	*first_index = q_num_buffers;
+	*first_index = index;
 
 	for (buffer = 0; buffer < num_buffers; ++buffer) {
 		/* Allocate vb2 buffer structures */
@@ -484,7 +502,7 @@ static int __vb2_queue_alloc(struct vb2_queue *q, enum vb2_memory memory,
 			vb->planes[plane].min_length = plane_sizes[plane];
 		}
 
-		vb2_queue_add_buffer(q, vb, q_num_buffers + buffer);
+		vb2_queue_add_buffer(q, vb, index++);
 		call_void_bufop(q, init_buffer, vb);
 
 		/* Allocate video buffer memory for the MMAP type */
@@ -664,7 +682,6 @@ static void __vb2_queue_free(struct vb2_queue *q, unsigned int buffers)
 		kfree(vb);
 	}
 
-	q->num_buffers -= buffers;
 	if (!vb2_get_num_buffers(q)) {
 		q->memory = VB2_MEMORY_UNKNOWN;
 		INIT_LIST_HEAD(&q->queued_list);
@@ -818,6 +835,32 @@ static bool verify_coherency_flags(struct vb2_queue *q, bool non_coherent_mem)
 	return true;
 }
 
+static int vb2_core_allocated_buffers_storage(struct vb2_queue *q)
+{
+	if (!q->bufs)
+		q->bufs = kcalloc(q->max_num_buffers, sizeof(*q->bufs), GFP_KERNEL);
+	if (!q->bufs)
+		return -ENOMEM;
+
+	if (!q->bufs_bitmap)
+		q->bufs_bitmap = bitmap_zalloc(q->max_num_buffers, GFP_KERNEL);
+	if (!q->bufs_bitmap) {
+		kfree(q->bufs);
+		q->bufs = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void vb2_core_free_buffers_storage(struct vb2_queue *q)
+{
+	kfree(q->bufs);
+	q->bufs = NULL;
+	bitmap_free(q->bufs_bitmap);
+	q->bufs_bitmap = NULL;
+}
+
 int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 		     unsigned int flags, unsigned int *count)
 {
@@ -879,10 +922,7 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 	 * in the queue_setup op.
 	 */
 	mutex_lock(&q->mmap_lock);
-	if (!q->bufs)
-		q->bufs = kcalloc(q->max_num_buffers, sizeof(*q->bufs), GFP_KERNEL);
-	if (!q->bufs)
-		ret = -ENOMEM;
+	ret = vb2_core_allocated_buffers_storage(q);
 	q->memory = memory;
 	mutex_unlock(&q->mmap_lock);
 	if (ret)
@@ -954,7 +994,6 @@ int vb2_core_reqbufs(struct vb2_queue *q, enum vb2_memory memory,
 	}
 
 	mutex_lock(&q->mmap_lock);
-	q->num_buffers = allocated_buffers;
 
 	if (ret < 0) {
 		/*
@@ -982,6 +1021,7 @@ error:
 	mutex_lock(&q->mmap_lock);
 	q->memory = VB2_MEMORY_UNKNOWN;
 	mutex_unlock(&q->mmap_lock);
+	vb2_core_free_buffers_storage(q);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(vb2_core_reqbufs);
@@ -1015,11 +1055,8 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
 		 * value in the queue_setup op.
 		 */
 		mutex_lock(&q->mmap_lock);
+		ret = vb2_core_allocated_buffers_storage(q);
 		q->memory = memory;
-		if (!q->bufs)
-			q->bufs = kcalloc(q->max_num_buffers, sizeof(*q->bufs), GFP_KERNEL);
-		if (!q->bufs)
-			ret = -ENOMEM;
 		mutex_unlock(&q->mmap_lock);
 		if (ret)
 			return ret;
@@ -1082,7 +1119,6 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
 	}
 
 	mutex_lock(&q->mmap_lock);
-	q->num_buffers += allocated_buffers;
 
 	if (ret < 0) {
 		/*
@@ -2583,8 +2619,7 @@ void vb2_core_queue_release(struct vb2_queue *q)
 	__vb2_queue_cancel(q);
 	mutex_lock(&q->mmap_lock);
 	__vb2_queue_free(q, vb2_get_num_buffers(q));
-	kfree(q->bufs);
-	q->bufs = NULL;
+	vb2_core_free_buffers_storage(q);
 	q->is_busy = 0;
 	mutex_unlock(&q->mmap_lock);
 }
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index 42526e289c8e..b9333e2c7797 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -346,8 +346,8 @@ struct vb2_buffer {
  *			describes the requested number of planes and sizes\[\]
  *			contains the requested plane sizes. In this case
  *			\*num_buffers are being allocated additionally to
- *			q->num_buffers. If either \*num_planes or the requested
- *			sizes are invalid callback must return %-EINVAL.
+ *			the buffers already allocated. If either \*num_planes
+ *			or the requested sizes are invalid callback must return %-EINVAL.
  * @wait_prepare:	release any locks taken while calling vb2 functions;
  *			it is called before an ioctl needs to wait for a new
  *			buffer to arrive; required to avoid a deadlock in
@@ -571,8 +571,9 @@ struct vb2_buf_ops {
  * @mmap_lock:	private mutex used when buffers are allocated/freed/mmapped
  * @memory:	current memory type used
  * @dma_dir:	DMA mapping direction.
- * @bufs:	videobuf2 buffer structures
- * @num_buffers: number of allocated/used buffers
+ * @bufs:	videobuf2 buffer structures. If it is non-NULL then
+ *		bufs_bitmap is also non-NULL.
+ * @bufs_bitmap: bitmap tracking whether each bufs[] entry is used
  * @max_num_buffers: upper limit of number of allocated/used buffers.
  *		     If set to 0 v4l2 core will change it VB2_MAX_FRAME
  *		     for backward compatibility.
@@ -643,7 +644,7 @@ struct vb2_queue {
 	unsigned int			memory;
 	enum dma_data_direction		dma_dir;
 	struct vb2_buffer		**bufs;
-	unsigned int			num_buffers;
+	unsigned long			*bufs_bitmap;
 	unsigned int			max_num_buffers;
 
 	struct list_head		queued_list;
@@ -1173,7 +1174,10 @@ static inline bool vb2_fileio_is_active(struct vb2_queue *q)
  */
 static inline unsigned int vb2_get_num_buffers(struct vb2_queue *q)
 {
-	return q->num_buffers;
+	if (q->bufs_bitmap)
+		return bitmap_weight(q->bufs_bitmap, q->max_num_buffers);
+
+	return 0;
 }
 
 /**
@@ -1282,7 +1286,7 @@ static inline struct vb2_buffer *vb2_get_buffer(struct vb2_queue *q,
 	if (index >= q->max_num_buffers)
 		return NULL;
 
-	if (index < q->num_buffers)
+	if (test_bit(index, q->bufs_bitmap))
 		return q->bufs[index];
 	return NULL;
 }
-- 
cgit v1.2.3


From a3293a85381ec9680aa2929547fbc76c5d87a1b2 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 14 Mar 2024 16:32:24 +0100
Subject: media: v4l2: Add REMOVE_BUFS ioctl

VIDIOC_REMOVE_BUFS ioctl allows to remove buffers from a queue.
The number of buffers to remove in given by count field of
struct v4l2_remove_buffers and the range start at the index
specified in the same structure.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
[hverkuil: vidioc-remove-bufs.rst: mention no bufs are freed on error]
---
 .../userspace-api/media/v4l/user-func.rst          |  1 +
 .../userspace-api/media/v4l/vidioc-remove-bufs.rst | 86 ++++++++++++++++++++++
 .../userspace-api/media/v4l/vidioc-reqbufs.rst     |  1 +
 drivers/media/common/videobuf2/videobuf2-core.c    | 38 ++++++++++
 drivers/media/common/videobuf2/videobuf2-v4l2.c    | 20 ++++-
 drivers/media/v4l2-core/v4l2-dev.c                 |  3 +
 drivers/media/v4l2-core/v4l2-ioctl.c               | 30 ++++++++
 include/media/v4l2-ioctl.h                         |  4 +
 include/media/videobuf2-core.h                     | 10 +++
 include/media/videobuf2-v4l2.h                     |  2 +
 include/uapi/linux/videodev2.h                     | 17 +++++
 11 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/userspace-api/media/v4l/vidioc-remove-bufs.rst

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/user-func.rst b/Documentation/userspace-api/media/v4l/user-func.rst
index 15ff0bf7bbe6..6f661138801c 100644
--- a/Documentation/userspace-api/media/v4l/user-func.rst
+++ b/Documentation/userspace-api/media/v4l/user-func.rst
@@ -62,6 +62,7 @@ Function Reference
     vidioc-query-dv-timings
     vidioc-querystd
     vidioc-reqbufs
+    vidioc-remove-bufs
     vidioc-s-hw-freq-seek
     vidioc-streamon
     vidioc-subdev-enum-frame-interval
diff --git a/Documentation/userspace-api/media/v4l/vidioc-remove-bufs.rst b/Documentation/userspace-api/media/v4l/vidioc-remove-bufs.rst
new file mode 100644
index 000000000000..1995b39af9ba
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/vidioc-remove-bufs.rst
@@ -0,0 +1,86 @@
+.. SPDX-License-Identifier: GFDL-1.1-no-invariants-or-later
+.. c:namespace:: V4L
+
+.. _VIDIOC_REMOVE_BUFS:
+
+************************
+ioctl VIDIOC_REMOVE_BUFS
+************************
+
+Name
+====
+
+VIDIOC_REMOVE_BUFS - Removes buffers from a queue
+
+Synopsis
+========
+
+.. c:macro:: VIDIOC_REMOVE_BUFS
+
+``int ioctl(int fd, VIDIOC_REMOVE_BUFS, struct v4l2_remove_buffers *argp)``
+
+Arguments
+=========
+
+``fd``
+    File descriptor returned by :c:func:`open()`.
+
+``argp``
+    Pointer to struct :c:type:`v4l2_remove_buffers`.
+
+Description
+===========
+
+Applications can optionally call the :ref:`VIDIOC_REMOVE_BUFS` ioctl to
+remove buffers from a queue.
+:ref:`VIDIOC_CREATE_BUFS` ioctl support is mandatory to enable :ref:`VIDIOC_REMOVE_BUFS`.
+This ioctl is available if the ``V4L2_BUF_CAP_SUPPORTS_REMOVE_BUFS`` capability
+is set on the queue when :c:func:`VIDIOC_REQBUFS` or :c:func:`VIDIOC_CREATE_BUFS`
+are invoked.
+
+.. c:type:: v4l2_remove_buffers
+
+.. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.5cm}|
+
+.. flat-table:: struct v4l2_remove_buffers
+    :header-rows:  0
+    :stub-columns: 0
+    :widths:       1 1 2
+
+    * - __u32
+      - ``index``
+      - The starting buffer index to remove. This field is ignored if count == 0.
+    * - __u32
+      - ``count``
+      - The number of buffers to be removed with indices 'index' until 'index + count - 1'.
+        All buffers in this range must be valid and in DEQUEUED state.
+        :ref:`VIDIOC_REMOVE_BUFS` will always check the validity of ``type`, if it is
+        invalid it returns ``EINVAL`` error code.
+        If count is set to 0 :ref:`VIDIOC_REMOVE_BUFS` will do nothing and return 0.
+    * - __u32
+      - ``type``
+      - Type of the stream or buffers, this is the same as the struct
+	:c:type:`v4l2_format` ``type`` field. See
+	:c:type:`v4l2_buf_type` for valid values.
+    * - __u32
+      - ``reserved``\ [13]
+      - A place holder for future extensions. Drivers and applications
+	must set the array to zero.
+
+Return Value
+============
+
+On success 0 is returned, on error -1 and the ``errno`` variable is set
+appropriately. The generic error codes are described at the
+:ref:`Generic Error Codes <gen-errors>` chapter. If an error occurs, no
+buffers will be freed and one of the error codes below will be returned:
+
+EBUSY
+    File I/O is in progress.
+    One or more of the buffers in the range ``index`` to ``index + count - 1`` are not
+    in DEQUEUED state.
+
+EINVAL
+    One or more of the buffers in the range ``index`` to ``index + count - 1`` do not
+    exist in the queue.
+    The buffer type (``type`` field) is not valid.
diff --git a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst
index 0b3a41a45d05..bbc22dd76032 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-reqbufs.rst
@@ -121,6 +121,7 @@ aborting or finishing any DMA in progress, an implicit
 .. _V4L2-BUF-CAP-SUPPORTS-M2M-HOLD-CAPTURE-BUF:
 .. _V4L2-BUF-CAP-SUPPORTS-MMAP-CACHE-HINTS:
 .. _V4L2-BUF-CAP-SUPPORTS-MAX-NUM-BUFFERS:
+.. _V4L2-BUF-CAP-SUPPORTS-REMOVE-BUFS:
 
 .. raw:: latex
 
diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c
index 009cea95d662..0b2b48e1b2df 100644
--- a/drivers/media/common/videobuf2/videobuf2-core.c
+++ b/drivers/media/common/videobuf2/videobuf2-core.c
@@ -1691,6 +1691,44 @@ int vb2_core_prepare_buf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb)
 }
 EXPORT_SYMBOL_GPL(vb2_core_prepare_buf);
 
+int vb2_core_remove_bufs(struct vb2_queue *q, unsigned int start, unsigned int count)
+{
+	unsigned int i, ret = 0;
+	unsigned int q_num_bufs = vb2_get_num_buffers(q);
+
+	if (count == 0)
+		return 0;
+
+	if (count > q_num_bufs)
+		return -EINVAL;
+
+	if (start > q->max_num_buffers - count)
+		return -EINVAL;
+
+	mutex_lock(&q->mmap_lock);
+
+	/* Check that all buffers in the range exist */
+	for (i = start; i < start + count; i++) {
+		struct vb2_buffer *vb = vb2_get_buffer(q, i);
+
+		if (!vb) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+		if (vb->state != VB2_BUF_STATE_DEQUEUED) {
+			ret = -EBUSY;
+			goto unlock;
+		}
+	}
+	__vb2_queue_free(q, start, count);
+	dprintk(q, 2, "%u buffers removed\n", count);
+
+unlock:
+	mutex_unlock(&q->mmap_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(vb2_core_remove_bufs);
+
 /*
  * vb2_start_streaming() - Attempt to start streaming.
  * @q:		videobuf2 queue
diff --git a/drivers/media/common/videobuf2/videobuf2-v4l2.c b/drivers/media/common/videobuf2/videobuf2-v4l2.c
index 03e8080a68a8..293f3d5f1c4e 100644
--- a/drivers/media/common/videobuf2/videobuf2-v4l2.c
+++ b/drivers/media/common/videobuf2/videobuf2-v4l2.c
@@ -685,7 +685,7 @@ static void vb2_set_flags_and_caps(struct vb2_queue *q, u32 memory,
 		*flags &= V4L2_MEMORY_FLAG_NON_COHERENT;
 	}
 
-	*caps = V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS;
+	*caps |= V4L2_BUF_CAP_SUPPORTS_ORPHANED_BUFS;
 	if (q->io_modes & VB2_MMAP)
 		*caps |= V4L2_BUF_CAP_SUPPORTS_MMAP;
 	if (q->io_modes & VB2_USERPTR)
@@ -1001,6 +1001,24 @@ EXPORT_SYMBOL_GPL(vb2_poll);
 
 /* vb2 ioctl helpers */
 
+int vb2_ioctl_remove_bufs(struct file *file, void *priv,
+			  struct v4l2_remove_buffers *d)
+{
+	struct video_device *vdev = video_devdata(file);
+
+	if (vdev->queue->type != d->type)
+		return -EINVAL;
+
+	if (d->count == 0)
+		return 0;
+
+	if (vb2_queue_is_busy(vdev->queue, file))
+		return -EBUSY;
+
+	return vb2_core_remove_bufs(vdev->queue, d->index, d->count);
+}
+EXPORT_SYMBOL_GPL(vb2_ioctl_remove_bufs);
+
 int vb2_ioctl_reqbufs(struct file *file, void *priv,
 			  struct v4l2_requestbuffers *p)
 {
diff --git a/drivers/media/v4l2-core/v4l2-dev.c b/drivers/media/v4l2-core/v4l2-dev.c
index d13954bd31fd..e39e9742fdb5 100644
--- a/drivers/media/v4l2-core/v4l2-dev.c
+++ b/drivers/media/v4l2-core/v4l2-dev.c
@@ -722,6 +722,9 @@ static void determine_valid_ioctls(struct video_device *vdev)
 		SET_VALID_IOCTL(ops, VIDIOC_PREPARE_BUF, vidioc_prepare_buf);
 		SET_VALID_IOCTL(ops, VIDIOC_STREAMON, vidioc_streamon);
 		SET_VALID_IOCTL(ops, VIDIOC_STREAMOFF, vidioc_streamoff);
+		/* VIDIOC_CREATE_BUFS support is mandatory to enable VIDIOC_REMOVE_BUFS */
+		if (ops->vidioc_create_bufs)
+			SET_VALID_IOCTL(ops, VIDIOC_REMOVE_BUFS, vidioc_remove_bufs);
 	}
 
 	if (is_vid || is_vbi || is_meta) {
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 6e7b8b682d13..5aeff5519407 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -489,6 +489,14 @@ static void v4l_print_create_buffers(const void *arg, bool write_only)
 	v4l_print_format(&p->format, write_only);
 }
 
+static void v4l_print_remove_buffers(const void *arg, bool write_only)
+{
+	const struct v4l2_remove_buffers *p = arg;
+
+	pr_cont("type=%s, index=%u, count=%u\n",
+		prt_names(p->type, v4l2_type_names), p->index, p->count);
+}
+
 static void v4l_print_streamparm(const void *arg, bool write_only)
 {
 	const struct v4l2_streamparm *p = arg;
@@ -2092,6 +2100,7 @@ static int v4l_overlay(const struct v4l2_ioctl_ops *ops,
 static int v4l_reqbufs(const struct v4l2_ioctl_ops *ops,
 				struct file *file, void *fh, void *arg)
 {
+	struct video_device *vfd = video_devdata(file);
 	struct v4l2_requestbuffers *p = arg;
 	int ret = check_fmt(file, p->type);
 
@@ -2100,6 +2109,10 @@ static int v4l_reqbufs(const struct v4l2_ioctl_ops *ops,
 
 	memset_after(p, 0, flags);
 
+	p->capabilities = 0;
+	if (is_valid_ioctl(vfd, VIDIOC_REMOVE_BUFS))
+		p->capabilities = V4L2_BUF_CAP_SUPPORTS_REMOVE_BUFS;
+
 	return ops->vidioc_reqbufs(file, fh, p);
 }
 
@@ -2133,6 +2146,7 @@ static int v4l_dqbuf(const struct v4l2_ioctl_ops *ops,
 static int v4l_create_bufs(const struct v4l2_ioctl_ops *ops,
 				struct file *file, void *fh, void *arg)
 {
+	struct video_device *vfd = video_devdata(file);
 	struct v4l2_create_buffers *create = arg;
 	int ret = check_fmt(file, create->format.type);
 
@@ -2143,6 +2157,10 @@ static int v4l_create_bufs(const struct v4l2_ioctl_ops *ops,
 
 	v4l_sanitize_format(&create->format);
 
+	create->capabilities = 0;
+	if (is_valid_ioctl(vfd, VIDIOC_REMOVE_BUFS))
+		create->capabilities = V4L2_BUF_CAP_SUPPORTS_REMOVE_BUFS;
+
 	ret = ops->vidioc_create_bufs(file, fh, create);
 
 	if (create->format.type == V4L2_BUF_TYPE_VIDEO_CAPTURE ||
@@ -2161,6 +2179,17 @@ static int v4l_prepare_buf(const struct v4l2_ioctl_ops *ops,
 	return ret ? ret : ops->vidioc_prepare_buf(file, fh, b);
 }
 
+static int v4l_remove_bufs(const struct v4l2_ioctl_ops *ops,
+			   struct file *file, void *fh, void *arg)
+{
+	struct v4l2_remove_buffers *remove = arg;
+
+	if (ops->vidioc_remove_bufs)
+		return ops->vidioc_remove_bufs(file, fh, remove);
+
+	return -ENOTTY;
+}
+
 static int v4l_g_parm(const struct v4l2_ioctl_ops *ops,
 				struct file *file, void *fh, void *arg)
 {
@@ -2910,6 +2939,7 @@ static const struct v4l2_ioctl_info v4l2_ioctls[] = {
 	IOCTL_INFO(VIDIOC_ENUM_FREQ_BANDS, v4l_enum_freq_bands, v4l_print_freq_band, 0),
 	IOCTL_INFO(VIDIOC_DBG_G_CHIP_INFO, v4l_dbg_g_chip_info, v4l_print_dbg_chip_info, INFO_FL_CLEAR(v4l2_dbg_chip_info, match)),
 	IOCTL_INFO(VIDIOC_QUERY_EXT_CTRL, v4l_query_ext_ctrl, v4l_print_query_ext_ctrl, INFO_FL_CTRL | INFO_FL_CLEAR(v4l2_query_ext_ctrl, id)),
+	IOCTL_INFO(VIDIOC_REMOVE_BUFS, v4l_remove_bufs, v4l_print_remove_buffers, INFO_FL_PRIO | INFO_FL_QUEUE | INFO_FL_CLEAR(v4l2_remove_buffers, type)),
 };
 #define V4L2_IOCTLS ARRAY_SIZE(v4l2_ioctls)
 
diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h
index edb733f21604..bdbb7e542321 100644
--- a/include/media/v4l2-ioctl.h
+++ b/include/media/v4l2-ioctl.h
@@ -163,6 +163,8 @@ struct v4l2_fh;
  *	:ref:`VIDIOC_CREATE_BUFS <vidioc_create_bufs>` ioctl
  * @vidioc_prepare_buf: pointer to the function that implements
  *	:ref:`VIDIOC_PREPARE_BUF <vidioc_prepare_buf>` ioctl
+ * @vidioc_remove_bufs: pointer to the function that implements
+ *	:ref:`VIDIOC_REMOVE_BUFS <vidioc_remove_bufs>` ioctl
  * @vidioc_overlay: pointer to the function that implements
  *	:ref:`VIDIOC_OVERLAY <vidioc_overlay>` ioctl
  * @vidioc_g_fbuf: pointer to the function that implements
@@ -422,6 +424,8 @@ struct v4l2_ioctl_ops {
 				  struct v4l2_create_buffers *b);
 	int (*vidioc_prepare_buf)(struct file *file, void *fh,
 				  struct v4l2_buffer *b);
+	int (*vidioc_remove_bufs)(struct file *file, void *fh,
+				  struct v4l2_remove_buffers *d);
 
 	int (*vidioc_overlay)(struct file *file, void *fh, unsigned int i);
 	int (*vidioc_g_fbuf)(struct file *file, void *fh,
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index b9333e2c7797..955237ac503d 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -870,6 +870,16 @@ int vb2_core_create_bufs(struct vb2_queue *q, enum vb2_memory memory,
  */
 int vb2_core_prepare_buf(struct vb2_queue *q, struct vb2_buffer *vb, void *pb);
 
+/**
+ * vb2_core_remove_bufs() -
+ * @q:		pointer to &struct vb2_queue with videobuf2 queue.
+ * @start:	first index of the range of buffers to remove.
+ * @count:	number of buffers to remove.
+ *
+ *  Return: returns zero on success; an error code otherwise.
+ */
+int vb2_core_remove_bufs(struct vb2_queue *q, unsigned int start, unsigned int count);
+
 /**
  * vb2_core_qbuf() - Queue a buffer from userspace
  *
diff --git a/include/media/videobuf2-v4l2.h b/include/media/videobuf2-v4l2.h
index 5a845887850b..77ce8238ab30 100644
--- a/include/media/videobuf2-v4l2.h
+++ b/include/media/videobuf2-v4l2.h
@@ -334,6 +334,8 @@ int vb2_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type i);
 int vb2_ioctl_streamoff(struct file *file, void *priv, enum v4l2_buf_type i);
 int vb2_ioctl_expbuf(struct file *file, void *priv,
 	struct v4l2_exportbuffer *p);
+int vb2_ioctl_remove_bufs(struct file *file, void *priv,
+			  struct v4l2_remove_buffers *p);
 
 /* struct v4l2_file_operations helpers */
 
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index a8015e5e7fa4..2663213b76a4 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1036,6 +1036,7 @@ struct v4l2_requestbuffers {
 #define V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF	(1 << 5)
 #define V4L2_BUF_CAP_SUPPORTS_MMAP_CACHE_HINTS		(1 << 6)
 #define V4L2_BUF_CAP_SUPPORTS_MAX_NUM_BUFFERS		(1 << 7)
+#define V4L2_BUF_CAP_SUPPORTS_REMOVE_BUFS		(1 << 8)
 
 /**
  * struct v4l2_plane - plane info for multi-planar buffers
@@ -2624,6 +2625,20 @@ struct v4l2_create_buffers {
 	__u32			reserved[5];
 };
 
+/**
+ * struct v4l2_remove_buffers - VIDIOC_REMOVE_BUFS argument
+ * @index:	the first buffer to be removed
+ * @count:	number of buffers to removed
+ * @type:	enum v4l2_buf_type
+ * @reserved:	future extensions
+ */
+struct v4l2_remove_buffers {
+	__u32			index;
+	__u32			count;
+	__u32			type;
+	__u32			reserved[13];
+};
+
 /*
  *	I O C T L   C O D E S   F O R   V I D E O   D E V I C E S
  *
@@ -2723,6 +2738,8 @@ struct v4l2_create_buffers {
 #define VIDIOC_DBG_G_CHIP_INFO  _IOWR('V', 102, struct v4l2_dbg_chip_info)
 
 #define VIDIOC_QUERY_EXT_CTRL	_IOWR('V', 103, struct v4l2_query_ext_ctrl)
+#define VIDIOC_REMOVE_BUFS	_IOWR('V', 104, struct v4l2_remove_buffers)
+
 
 /* Reminder: when adding new ioctls please add support for them to
    drivers/media/v4l2-core/v4l2-compat-ioctl32.c as well! */
-- 
cgit v1.2.3


From 2f2419502f6957b110dbc7d4b75e764e5f370ec2 Mon Sep 17 00:00:00 2001
From: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Date: Thu, 14 Mar 2024 16:32:25 +0100
Subject: media: v4l2: Add mem2mem helpers for REMOVE_BUFS ioctl

Create v4l2-mem2mem helpers for VIDIOC_REMOVE_BUFS ioctl and
make test drivers use it.

Signed-off-by: Benjamin Gaignard <benjamin.gaignard@collabora.com>
Reviewed-by: Mauro Carvalho Chehab <mchehab@kernel.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/test-drivers/vicodec/vicodec-core.c |  1 +
 drivers/media/test-drivers/vimc/vimc-capture.c    |  1 +
 drivers/media/test-drivers/visl/visl-video.c      |  1 +
 drivers/media/test-drivers/vivid/vivid-core.c     |  1 +
 drivers/media/v4l2-core/v4l2-mem2mem.c            | 15 +++++++++++++++
 include/media/v4l2-mem2mem.h                      |  2 ++
 6 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/drivers/media/test-drivers/vicodec/vicodec-core.c b/drivers/media/test-drivers/vicodec/vicodec-core.c
index e13f5452b927..3e011fe62ae1 100644
--- a/drivers/media/test-drivers/vicodec/vicodec-core.c
+++ b/drivers/media/test-drivers/vicodec/vicodec-core.c
@@ -1345,6 +1345,7 @@ static const struct v4l2_ioctl_ops vicodec_ioctl_ops = {
 	.vidioc_prepare_buf	= v4l2_m2m_ioctl_prepare_buf,
 	.vidioc_create_bufs	= v4l2_m2m_ioctl_create_bufs,
 	.vidioc_expbuf		= v4l2_m2m_ioctl_expbuf,
+	.vidioc_remove_bufs	= v4l2_m2m_ioctl_remove_bufs,
 
 	.vidioc_streamon	= v4l2_m2m_ioctl_streamon,
 	.vidioc_streamoff	= v4l2_m2m_ioctl_streamoff,
diff --git a/drivers/media/test-drivers/vimc/vimc-capture.c b/drivers/media/test-drivers/vimc/vimc-capture.c
index 97693561f1e4..ba7550b8ba7e 100644
--- a/drivers/media/test-drivers/vimc/vimc-capture.c
+++ b/drivers/media/test-drivers/vimc/vimc-capture.c
@@ -221,6 +221,7 @@ static const struct v4l2_ioctl_ops vimc_capture_ioctl_ops = {
 	.vidioc_expbuf = vb2_ioctl_expbuf,
 	.vidioc_streamon = vb2_ioctl_streamon,
 	.vidioc_streamoff = vb2_ioctl_streamoff,
+	.vidioc_remove_bufs = vb2_ioctl_remove_bufs,
 };
 
 static void vimc_capture_return_all_buffers(struct vimc_capture_device *vcapture,
diff --git a/drivers/media/test-drivers/visl/visl-video.c b/drivers/media/test-drivers/visl/visl-video.c
index b9a4b44bd0ed..f8d970319764 100644
--- a/drivers/media/test-drivers/visl/visl-video.c
+++ b/drivers/media/test-drivers/visl/visl-video.c
@@ -539,6 +539,7 @@ const struct v4l2_ioctl_ops visl_ioctl_ops = {
 	.vidioc_prepare_buf		= v4l2_m2m_ioctl_prepare_buf,
 	.vidioc_create_bufs		= v4l2_m2m_ioctl_create_bufs,
 	.vidioc_expbuf			= v4l2_m2m_ioctl_expbuf,
+	.vidioc_remove_bufs		= v4l2_m2m_ioctl_remove_bufs,
 
 	.vidioc_streamon		= v4l2_m2m_ioctl_streamon,
 	.vidioc_streamoff		= v4l2_m2m_ioctl_streamoff,
diff --git a/drivers/media/test-drivers/vivid/vivid-core.c b/drivers/media/test-drivers/vivid/vivid-core.c
index e2d4f10003f3..0273bc9863b0 100644
--- a/drivers/media/test-drivers/vivid/vivid-core.c
+++ b/drivers/media/test-drivers/vivid/vivid-core.c
@@ -769,6 +769,7 @@ static const struct v4l2_ioctl_ops vivid_ioctl_ops = {
 	.vidioc_expbuf			= vb2_ioctl_expbuf,
 	.vidioc_streamon		= vb2_ioctl_streamon,
 	.vidioc_streamoff		= vb2_ioctl_streamoff,
+	.vidioc_remove_bufs		= vb2_ioctl_remove_bufs,
 
 	.vidioc_enum_input		= vivid_enum_input,
 	.vidioc_g_input			= vivid_g_input,
diff --git a/drivers/media/v4l2-core/v4l2-mem2mem.c b/drivers/media/v4l2-core/v4l2-mem2mem.c
index 75517134a5e9..eb22d6172462 100644
--- a/drivers/media/v4l2-core/v4l2-mem2mem.c
+++ b/drivers/media/v4l2-core/v4l2-mem2mem.c
@@ -1386,6 +1386,21 @@ int v4l2_m2m_ioctl_create_bufs(struct file *file, void *priv,
 }
 EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_create_bufs);
 
+int v4l2_m2m_ioctl_remove_bufs(struct file *file, void *priv,
+			       struct v4l2_remove_buffers *remove)
+{
+	struct v4l2_fh *fh = file->private_data;
+	struct vb2_queue *q = v4l2_m2m_get_vq(fh->m2m_ctx, remove->type);
+
+	if (!q)
+		return -EINVAL;
+	if (q->type != remove->type)
+		return -EINVAL;
+
+	return vb2_core_remove_bufs(q, remove->index, remove->count);
+}
+EXPORT_SYMBOL_GPL(v4l2_m2m_ioctl_remove_bufs);
+
 int v4l2_m2m_ioctl_querybuf(struct file *file, void *priv,
 				struct v4l2_buffer *buf)
 {
diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h
index 7f1af1f7f912..0af330cf91c3 100644
--- a/include/media/v4l2-mem2mem.h
+++ b/include/media/v4l2-mem2mem.h
@@ -867,6 +867,8 @@ int v4l2_m2m_ioctl_reqbufs(struct file *file, void *priv,
 				struct v4l2_requestbuffers *rb);
 int v4l2_m2m_ioctl_create_bufs(struct file *file, void *fh,
 				struct v4l2_create_buffers *create);
+int v4l2_m2m_ioctl_remove_bufs(struct file *file, void *priv,
+			       struct v4l2_remove_buffers *d);
 int v4l2_m2m_ioctl_querybuf(struct file *file, void *fh,
 				struct v4l2_buffer *buf);
 int v4l2_m2m_ioctl_expbuf(struct file *file, void *fh,
-- 
cgit v1.2.3


From 22d5607400c62c72da9b60e3324744be83e147a4 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 24 Mar 2024 00:45:50 +0000
Subject: sched/fair: Check if a task has a fitting CPU when updating misfit

If a misfit task is affined to a subset of the possible CPUs, we need to
verify that one of these CPUs can fit it. Otherwise the load balancer
code will continuously trigger needlessly leading the balance_interval
to increase in return and eventually end up with a situation where real
imbalances take a long time to address because of this impossible
imbalance situation.

This can happen in Android world where it's common for background tasks
to be restricted to little cores.

Similarly if we can't fit the biggest core, triggering misfit is
pointless as it is the best we can ever get on this system.

To be able to detect that; we use asym_cap_list to iterate through
capacities in the system to see if the task is able to run at a higher
capacity level based on its p->cpus_ptr. We do that when the affinity
change, a fair task is forked, or when a task switched to fair policy.
We store the max_allowed_capacity in task_struct to allow for cheap
comparison in the fast path.

Improve check_misfit_status() function by removing redundant checks.
misfit_task_load will be 0 if the task can't move to a bigger CPU. And
nohz_balancer_kick() already checks for cpu_check_capacity() before
calling check_misfit_status().

Test:
=====

Add

	trace_printk("balance_interval = %lu\n", interval)

in get_sd_balance_interval().

run
	if [ "$MASK" != "0" ]; then
		adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
	fi
	sleep 10
	// parse ftrace buffer counting the occurrence of each valaue

Where MASK is either:

	* 0: no busy task running
	* 1: busy task is pinned to 1 cpu; handled today to not cause
	  misfit
	* f: busy task pinned to little cores, simulates busy background
	  task, demonstrates the problem to be fixed

Results:
========

Note how occurrence of balance_interval = 128 overshoots for MASK = f.

BEFORE
------

	MASK=0

		   1 balance_interval = 175
		 120 balance_interval = 128
		 846 balance_interval = 64
		  55 balance_interval = 63
		 215 balance_interval = 32
		   2 balance_interval = 31
		   2 balance_interval = 16
		   4 balance_interval = 8
		1870 balance_interval = 4
		  65 balance_interval = 2

	MASK=1

		  27 balance_interval = 175
		  37 balance_interval = 127
		 840 balance_interval = 64
		 167 balance_interval = 63
		 449 balance_interval = 32
		  84 balance_interval = 31
		 304 balance_interval = 16
		1156 balance_interval = 8
		2781 balance_interval = 4
		 428 balance_interval = 2

	MASK=f

		   1 balance_interval = 175
		1328 balance_interval = 128
		  44 balance_interval = 64
		 101 balance_interval = 63
		  25 balance_interval = 32
		   5 balance_interval = 31
		  23 balance_interval = 16
		  23 balance_interval = 8
		4306 balance_interval = 4
		 177 balance_interval = 2

AFTER
-----

Note how the high values almost disappear for all MASK values. The
system has background tasks that could trigger the problem without
simulate it even with MASK=0.

	MASK=0

		 103 balance_interval = 63
		  19 balance_interval = 31
		 194 balance_interval = 8
		4827 balance_interval = 4
		 179 balance_interval = 2

	MASK=1

		 131 balance_interval = 63
		   1 balance_interval = 31
		  87 balance_interval = 8
		3600 balance_interval = 4
		   7 balance_interval = 2

	MASK=f

		   8 balance_interval = 127
		 182 balance_interval = 63
		   3 balance_interval = 31
		   9 balance_interval = 16
		 415 balance_interval = 8
		3415 balance_interval = 4
		  21 balance_interval = 2

Signed-off-by: Qais Yousef <qyousef@layalina.io>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20240324004552.999936-3-qyousef@layalina.io
---
 include/linux/sched.h |  1 +
 init/init_task.c      |  1 +
 kernel/sched/fair.c   | 66 ++++++++++++++++++++++++++++++++++++++-------------
 3 files changed, 52 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3ed40e9f6155..c75fd46506df 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -835,6 +835,7 @@ struct task_struct {
 #endif
 
 	unsigned int			policy;
+	unsigned long			max_allowed_capacity;
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			*user_cpus_ptr;
diff --git a/init/init_task.c b/init/init_task.c
index 4daee6d761c8..2558b719e053 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.cpus_ptr	= &init_task.cpus_mask,
 	.user_cpus_ptr	= NULL,
 	.cpus_mask	= CPU_MASK_ALL,
+	.max_allowed_capacity	= SCHED_CAPACITY_SCALE,
 	.nr_cpus_allowed= NR_CPUS,
 	.mm		= NULL,
 	.active_mm	= &init_mm,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e8270e2e15cb..c47c4f2e28f7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5098,15 +5098,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
 
 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
 {
+	int cpu = cpu_of(rq);
+
 	if (!sched_asym_cpucap_active())
 		return;
 
-	if (!p || p->nr_cpus_allowed == 1) {
-		rq->misfit_task_load = 0;
-		return;
-	}
+	/*
+	 * Affinity allows us to go somewhere higher?  Or are we on biggest
+	 * available CPU already? Or do we fit into this CPU ?
+	 */
+	if (!p || (p->nr_cpus_allowed == 1) ||
+	    (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+	    task_fits_cpu(p, cpu)) {
 
-	if (task_fits_cpu(p, cpu_of(rq))) {
 		rq->misfit_task_load = 0;
 		return;
 	}
@@ -8253,6 +8257,36 @@ static void task_dead_fair(struct task_struct *p)
 	remove_entity_load_avg(&p->se);
 }
 
+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+	struct asym_cap_data *entry;
+
+	if (!sched_asym_cpucap_active())
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+		cpumask_t *cpumask;
+
+		cpumask = cpu_capacity_span(entry);
+		if (!cpumask_intersects(p->cpus_ptr, cpumask))
+			continue;
+
+		p->max_allowed_capacity = entry->capacity;
+		break;
+	}
+	rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+	set_cpus_allowed_common(p, ctx);
+	set_task_max_allowed_capacity(p);
+}
+
 static int
 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -8261,6 +8295,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
 	return sched_balance_newidle(rq, rf) != 0;
 }
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
 #endif /* CONFIG_SMP */
 
 static void set_next_buddy(struct sched_entity *se)
@@ -9610,16 +9646,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 				(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
 }
 
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
 {
-	return rq->misfit_task_load &&
-		(arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
-		 check_cpu_capacity(rq, sd));
+	return rq->misfit_task_load;
 }
 
 /*
@@ -11923,7 +11953,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
 		 * to run the misfit task on.
 		 */
-		if (check_misfit_status(rq, sd)) {
+		if (check_misfit_status(rq)) {
 			flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 			goto unlock;
 		}
@@ -12648,6 +12678,8 @@ static void task_fork_fair(struct task_struct *p)
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
 
+	set_task_max_allowed_capacity(p);
+
 	cfs_rq = task_cfs_rq(current);
 	curr = cfs_rq->curr;
 	if (curr)
@@ -12771,6 +12803,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
 	attach_task_cfs_rq(p);
 
+	set_task_max_allowed_capacity(p);
+
 	if (task_on_rq_queued(p)) {
 		/*
 		 * We were most likely switched from sched_rt, so
@@ -13142,7 +13176,7 @@ DEFINE_SCHED_CLASS(fair) = {
 	.rq_offline		= rq_offline_fair,
 
 	.task_dead		= task_dead_fair,
-	.set_cpus_allowed	= set_cpus_allowed_common,
+	.set_cpus_allowed	= set_cpus_allowed_fair,
 #endif
 
 	.task_tick		= task_tick_fair,
-- 
cgit v1.2.3


From 1c7b963c20650f1ffd787a088680c9a086ee349f Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Tue, 19 Mar 2024 11:26:23 -0700
Subject: wifi: nl80211: rename enum plink_actions

kernel-doc flagged the following issue:
include/uapi/linux/nl80211.h:6081: warning: expecting prototype for enum nl80211_plink_action. Prototype was for enum plink_actions instead

This is because the documentation doesn't match the code. Normally the
correct fix for such an issue is to modify the documentation to match
the code. However, in this case, since the actual name plink_actions
is not referenced by any code, rename it to nl80211_plink_action to
give it a proper prefix and match the documentation.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://msgid.link/20240319-kdoc-nl80211-v1-1-549e09d52866@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index f23ecbdd84a2..9397c61a48dc 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -6077,7 +6077,7 @@ enum nl80211_plink_state {
  * @NL80211_PLINK_ACTION_BLOCK: block traffic from this mesh peer
  * @NUM_NL80211_PLINK_ACTIONS: number of possible actions
  */
-enum plink_actions {
+enum nl80211_plink_action {
 	NL80211_PLINK_ACTION_NO_ACTION,
 	NL80211_PLINK_ACTION_OPEN,
 	NL80211_PLINK_ACTION_BLOCK,
-- 
cgit v1.2.3


From ec3a97d93b88602fffd84ce30a461f6eaeb6f067 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Tue, 19 Mar 2024 11:26:24 -0700
Subject: wifi: nl80211: fix nl80211 uapi comment style issues

Currently kernel-doc raises "warning: bad line:" for several comments
that have invalid multi-line comment style; they are missing the
leading '*'. And checkpatch.pl raises "WARNING: please, no space
before tabs" for a large number of comments which have space then tab
after the leading '*'.

Fix those issues.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://msgid.link/20240319-kdoc-nl80211-v1-2-549e09d52866@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 166 +++++++++++++++++++++----------------------
 1 file changed, 83 insertions(+), 83 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 9397c61a48dc..4de84fbfd4fa 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -442,11 +442,11 @@
  *	stations connected and using at least that link as one of its links.
  *
  * @NL80211_CMD_GET_MPATH: Get mesh path attributes for mesh path to
- * 	destination %NL80211_ATTR_MAC on the interface identified by
- * 	%NL80211_ATTR_IFINDEX.
+ *	destination %NL80211_ATTR_MAC on the interface identified by
+ *	%NL80211_ATTR_IFINDEX.
  * @NL80211_CMD_SET_MPATH:  Set mesh path attributes for mesh path to
- * 	destination %NL80211_ATTR_MAC on the interface identified by
- * 	%NL80211_ATTR_IFINDEX.
+ *	destination %NL80211_ATTR_MAC on the interface identified by
+ *	%NL80211_ATTR_IFINDEX.
  * @NL80211_CMD_NEW_MPATH: Create a new mesh path for the destination given by
  *	%NL80211_ATTR_MAC via %NL80211_ATTR_MPATH_NEXT_HOP.
  * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by
@@ -476,15 +476,15 @@
  *	after being queried by the kernel. CRDA replies by sending a regulatory
  *	domain structure which consists of %NL80211_ATTR_REG_ALPHA set to our
  *	current alpha2 if it found a match. It also provides
- * 	NL80211_ATTR_REG_RULE_FLAGS, and a set of regulatory rules. Each
- * 	regulatory rule is a nested set of attributes  given by
- * 	%NL80211_ATTR_REG_RULE_FREQ_[START|END] and
- * 	%NL80211_ATTR_FREQ_RANGE_MAX_BW with an attached power rule given by
- * 	%NL80211_ATTR_REG_RULE_POWER_MAX_ANT_GAIN and
- * 	%NL80211_ATTR_REG_RULE_POWER_MAX_EIRP.
+ *	NL80211_ATTR_REG_RULE_FLAGS, and a set of regulatory rules. Each
+ *	regulatory rule is a nested set of attributes  given by
+ *	%NL80211_ATTR_REG_RULE_FREQ_[START|END] and
+ *	%NL80211_ATTR_FREQ_RANGE_MAX_BW with an attached power rule given by
+ *	%NL80211_ATTR_REG_RULE_POWER_MAX_ANT_GAIN and
+ *	%NL80211_ATTR_REG_RULE_POWER_MAX_EIRP.
  * @NL80211_CMD_REQ_SET_REG: ask the wireless core to set the regulatory domain
- * 	to the specified ISO/IEC 3166-1 alpha2 country code. The core will
- * 	store this as a valid request and then query userspace for it.
+ *	to the specified ISO/IEC 3166-1 alpha2 country code. The core will
+ *	store this as a valid request and then query userspace for it.
  *
  * @NL80211_CMD_GET_MESH_CONFIG: Get mesh networking properties for the
  *	interface identified by %NL80211_ATTR_IFINDEX
@@ -574,31 +574,31 @@
  * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries.
  *
  * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain
- * 	has been changed and provides details of the request information
- * 	that caused the change such as who initiated the regulatory request
- * 	(%NL80211_ATTR_REG_INITIATOR), the wiphy_idx
- * 	(%NL80211_ATTR_REG_ALPHA2) on which the request was made from if
- * 	the initiator was %NL80211_REGDOM_SET_BY_COUNTRY_IE or
- * 	%NL80211_REGDOM_SET_BY_DRIVER, the type of regulatory domain
- * 	set (%NL80211_ATTR_REG_TYPE), if the type of regulatory domain is
- * 	%NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on
- * 	to (%NL80211_ATTR_REG_ALPHA2).
+ *	has been changed and provides details of the request information
+ *	that caused the change such as who initiated the regulatory request
+ *	(%NL80211_ATTR_REG_INITIATOR), the wiphy_idx
+ *	(%NL80211_ATTR_REG_ALPHA2) on which the request was made from if
+ *	the initiator was %NL80211_REGDOM_SET_BY_COUNTRY_IE or
+ *	%NL80211_REGDOM_SET_BY_DRIVER, the type of regulatory domain
+ *	set (%NL80211_ATTR_REG_TYPE), if the type of regulatory domain is
+ *	%NL80211_REG_TYPE_COUNTRY the alpha2 to which we have moved on
+ *	to (%NL80211_ATTR_REG_ALPHA2).
  * @NL80211_CMD_REG_BEACON_HINT: indicates to userspace that an AP beacon
- * 	has been found while world roaming thus enabling active scan or
- * 	any mode of operation that initiates TX (beacons) on a channel
- * 	where we would not have been able to do either before. As an example
- * 	if you are world roaming (regulatory domain set to world or if your
- * 	driver is using a custom world roaming regulatory domain) and while
- * 	doing a passive scan on the 5 GHz band you find an AP there (if not
- * 	on a DFS channel) you will now be able to actively scan for that AP
- * 	or use AP mode on your card on that same channel. Note that this will
- * 	never be used for channels 1-11 on the 2 GHz band as they are always
- * 	enabled world wide. This beacon hint is only sent if your device had
- * 	either disabled active scanning or beaconing on a channel. We send to
- * 	userspace the wiphy on which we removed a restriction from
- * 	(%NL80211_ATTR_WIPHY) and the channel on which this occurred
- * 	before (%NL80211_ATTR_FREQ_BEFORE) and after (%NL80211_ATTR_FREQ_AFTER)
- * 	the beacon hint was processed.
+ *	has been found while world roaming thus enabling active scan or
+ *	any mode of operation that initiates TX (beacons) on a channel
+ *	where we would not have been able to do either before. As an example
+ *	if you are world roaming (regulatory domain set to world or if your
+ *	driver is using a custom world roaming regulatory domain) and while
+ *	doing a passive scan on the 5 GHz band you find an AP there (if not
+ *	on a DFS channel) you will now be able to actively scan for that AP
+ *	or use AP mode on your card on that same channel. Note that this will
+ *	never be used for channels 1-11 on the 2 GHz band as they are always
+ *	enabled world wide. This beacon hint is only sent if your device had
+ *	either disabled active scanning or beaconing on a channel. We send to
+ *	userspace the wiphy on which we removed a restriction from
+ *	(%NL80211_ATTR_WIPHY) and the channel on which this occurred
+ *	before (%NL80211_ATTR_FREQ_BEFORE) and after (%NL80211_ATTR_FREQ_AFTER)
+ *	the beacon hint was processed.
  *
  * @NL80211_CMD_AUTHENTICATE: authentication request and notification.
  *	This command is used both as a command (request to authenticate) and
@@ -1715,21 +1715,21 @@ enum nl80211_commands {
  *	(see &enum nl80211_plink_action).
  * @NL80211_ATTR_MPATH_NEXT_HOP: MAC address of the next hop for a mesh path.
  * @NL80211_ATTR_MPATH_INFO: information about a mesh_path, part of mesh path
- * 	info given for %NL80211_CMD_GET_MPATH, nested attribute described at
+ *	info given for %NL80211_CMD_GET_MPATH, nested attribute described at
  *	&enum nl80211_mpath_info.
  *
  * @NL80211_ATTR_MNTR_FLAGS: flags, nested element with NLA_FLAG attributes of
  *      &enum nl80211_mntr_flags.
  *
  * @NL80211_ATTR_REG_ALPHA2: an ISO-3166-alpha2 country code for which the
- * 	current regulatory domain should be set to or is already set to.
- * 	For example, 'CR', for Costa Rica. This attribute is used by the kernel
- * 	to query the CRDA to retrieve one regulatory domain. This attribute can
- * 	also be used by userspace to query the kernel for the currently set
- * 	regulatory domain. We chose an alpha2 as that is also used by the
- * 	IEEE-802.11 country information element to identify a country.
- * 	Users can also simply ask the wireless core to set regulatory domain
- * 	to a specific alpha2.
+ *	current regulatory domain should be set to or is already set to.
+ *	For example, 'CR', for Costa Rica. This attribute is used by the kernel
+ *	to query the CRDA to retrieve one regulatory domain. This attribute can
+ *	also be used by userspace to query the kernel for the currently set
+ *	regulatory domain. We chose an alpha2 as that is also used by the
+ *	IEEE-802.11 country information element to identify a country.
+ *	Users can also simply ask the wireless core to set regulatory domain
+ *	to a specific alpha2.
  * @NL80211_ATTR_REG_RULES: a nested array of regulatory domain regulatory
  *	rules.
  *
@@ -1772,9 +1772,9 @@ enum nl80211_commands {
  * @NL80211_ATTR_BSS: scan result BSS
  *
  * @NL80211_ATTR_REG_INITIATOR: indicates who requested the regulatory domain
- * 	currently in effect. This could be any of the %NL80211_REGDOM_SET_BY_*
+ *	currently in effect. This could be any of the %NL80211_REGDOM_SET_BY_*
  * @NL80211_ATTR_REG_TYPE: indicates the type of the regulatory domain currently
- * 	set. This can be one of the nl80211_reg_type (%NL80211_REGDOM_TYPE_*)
+ *	set. This can be one of the nl80211_reg_type (%NL80211_REGDOM_TYPE_*)
  *
  * @NL80211_ATTR_SUPPORTED_COMMANDS: wiphy attribute that specifies
  *	an array of command numbers (i.e. a mapping index to command number)
@@ -1793,15 +1793,15 @@ enum nl80211_commands {
  *	a u32
  *
  * @NL80211_ATTR_FREQ_BEFORE: A channel which has suffered a regulatory change
- * 	due to considerations from a beacon hint. This attribute reflects
- * 	the state of the channel _before_ the beacon hint processing. This
- * 	attributes consists of a nested attribute containing
- * 	NL80211_FREQUENCY_ATTR_*
+ *	due to considerations from a beacon hint. This attribute reflects
+ *	the state of the channel _before_ the beacon hint processing. This
+ *	attributes consists of a nested attribute containing
+ *	NL80211_FREQUENCY_ATTR_*
  * @NL80211_ATTR_FREQ_AFTER: A channel which has suffered a regulatory change
- * 	due to considerations from a beacon hint. This attribute reflects
- * 	the state of the channel _after_ the beacon hint processing. This
- * 	attributes consists of a nested attribute containing
- * 	NL80211_FREQUENCY_ATTR_*
+ *	due to considerations from a beacon hint. This attribute reflects
+ *	the state of the channel _after_ the beacon hint processing. This
+ *	attributes consists of a nested attribute containing
+ *	NL80211_FREQUENCY_ATTR_*
  *
  * @NL80211_ATTR_CIPHER_SUITES: a set of u32 values indicating the supported
  *	cipher suites
@@ -2416,7 +2416,7 @@ enum nl80211_commands {
  *	scheduled scan is started.  Or the delay before a WoWLAN
  *	net-detect scan is started, counting from the moment the
  *	system is suspended.  This value is a u32, in seconds.
-
+ *
  * @NL80211_ATTR_REG_INDOOR: flag attribute, if set indicates that the device
  *      is operating in an indoor environment.
  *
@@ -3823,7 +3823,7 @@ enum nl80211_sta_bss_param {
  *	(u64, to this station)
  * @NL80211_STA_INFO_SIGNAL: signal strength of last received PPDU (u8, dBm)
  * @NL80211_STA_INFO_TX_BITRATE: current unicast tx rate, nested attribute
- * 	containing info as possible, see &enum nl80211_rate_info
+ *	containing info as possible, see &enum nl80211_rate_info
  * @NL80211_STA_INFO_RX_PACKETS: total received packet (MSDUs and MMPDUs)
  *	(u32, from this station)
  * @NL80211_STA_INFO_TX_PACKETS: total transmitted packets (MSDUs and MMPDUs)
@@ -4039,7 +4039,7 @@ enum nl80211_mpath_flags {
  * @NL80211_MPATH_INFO_METRIC: metric (cost) of this mesh path
  * @NL80211_MPATH_INFO_EXPTIME: expiration time for the path, in msec from now
  * @NL80211_MPATH_INFO_FLAGS: mesh path flags, enumerated in
- * 	&enum nl80211_mpath_flags;
+ *	&enum nl80211_mpath_flags;
  * @NL80211_MPATH_INFO_DISCOVERY_TIMEOUT: total path discovery timeout, in msec
  * @NL80211_MPATH_INFO_DISCOVERY_RETRIES: mesh path discovery retries
  * @NL80211_MPATH_INFO_HOP_COUNT: hop count to destination
@@ -4201,8 +4201,8 @@ enum nl80211_wmm_rule {
  * @NL80211_FREQUENCY_ATTR_DISABLED: Channel is disabled in current
  *	regulatory domain.
  * @NL80211_FREQUENCY_ATTR_NO_IR: no mechanisms that initiate radiation
- * 	are permitted on this channel, this includes sending probe
- * 	requests, or modes of operation that require beaconing.
+ *	are permitted on this channel, this includes sending probe
+ *	requests, or modes of operation that require beaconing.
  * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in mBm
@@ -4359,14 +4359,14 @@ enum nl80211_bitrate_attr {
 /**
  * enum nl80211_initiator - Indicates the initiator of a reg domain request
  * @NL80211_REGDOM_SET_BY_CORE: Core queried CRDA for a dynamic world
- * 	regulatory domain.
+ *	regulatory domain.
  * @NL80211_REGDOM_SET_BY_USER: User asked the wireless core to set the
- * 	regulatory domain.
+ *	regulatory domain.
  * @NL80211_REGDOM_SET_BY_DRIVER: a wireless drivers has hinted to the
- * 	wireless core it thinks its knows the regulatory domain we should be in.
+ *	wireless core it thinks its knows the regulatory domain we should be in.
  * @NL80211_REGDOM_SET_BY_COUNTRY_IE: the wireless core has received an
- * 	802.11 country information element with regulatory information it
- * 	thinks we should consider. cfg80211 only processes the country
+ *	802.11 country information element with regulatory information it
+ *	thinks we should consider. cfg80211 only processes the country
  *	code from the IE, and relies on the regulatory domain information
  *	structure passed by userspace (CRDA) from our wireless-regdb.
  *	If a channel is enabled but the country code indicates it should
@@ -4385,11 +4385,11 @@ enum nl80211_reg_initiator {
  *	to a specific country. When this is set you can count on the
  *	ISO / IEC 3166 alpha2 country code being valid.
  * @NL80211_REGDOM_TYPE_WORLD: the regulatory set domain is the world regulatory
- * 	domain.
+ *	domain.
  * @NL80211_REGDOM_TYPE_CUSTOM_WORLD: the regulatory domain set is a custom
- * 	driver specific world regulatory domain. These do not apply system-wide
- * 	and are only applicable to the individual devices which have requested
- * 	them to be applied.
+ *	driver specific world regulatory domain. These do not apply system-wide
+ *	and are only applicable to the individual devices which have requested
+ *	them to be applied.
  * @NL80211_REGDOM_TYPE_INTERSECTION: the regulatory domain set is the product
  *	of an intersection between two regulatory domains -- the previously
  *	set regulatory domain on the system and the last accepted regulatory
@@ -4406,21 +4406,21 @@ enum nl80211_reg_type {
  * enum nl80211_reg_rule_attr - regulatory rule attributes
  * @__NL80211_REG_RULE_ATTR_INVALID: attribute number 0 is reserved
  * @NL80211_ATTR_REG_RULE_FLAGS: a set of flags which specify additional
- * 	considerations for a given frequency range. These are the
- * 	&enum nl80211_reg_rule_flags.
+ *	considerations for a given frequency range. These are the
+ *	&enum nl80211_reg_rule_flags.
  * @NL80211_ATTR_FREQ_RANGE_START: starting frequencry for the regulatory
- * 	rule in KHz. This is not a center of frequency but an actual regulatory
- * 	band edge.
+ *	rule in KHz. This is not a center of frequency but an actual regulatory
+ *	band edge.
  * @NL80211_ATTR_FREQ_RANGE_END: ending frequency for the regulatory rule
- * 	in KHz. This is not a center a frequency but an actual regulatory
- * 	band edge.
+ *	in KHz. This is not a center a frequency but an actual regulatory
+ *	band edge.
  * @NL80211_ATTR_FREQ_RANGE_MAX_BW: maximum allowed bandwidth for this
  *	frequency range, in KHz.
  * @NL80211_ATTR_POWER_RULE_MAX_ANT_GAIN: the maximum allowed antenna gain
- * 	for a given frequency range. The value is in mBi (100 * dBi).
- * 	If you don't have one then don't send this.
+ *	for a given frequency range. The value is in mBi (100 * dBi).
+ *	If you don't have one then don't send this.
  * @NL80211_ATTR_POWER_RULE_MAX_EIRP: the maximum allowed EIRP for
- * 	a given frequency range. The value is in mBm (100 * dBm).
+ *	a given frequency range. The value is in mBm (100 * dBm).
  * @NL80211_ATTR_DFS_CAC_TIME: DFS CAC time in milliseconds.
  *	If not present or 0 default CAC time will be used.
  * @NL80211_ATTR_POWER_RULE_PSD: power spectral density (in dBm).
@@ -4507,8 +4507,8 @@ enum nl80211_sched_scan_match_attr {
  * @NL80211_RRF_PTP_ONLY: this is only for Point To Point links
  * @NL80211_RRF_PTMP_ONLY: this is only for Point To Multi Point links
  * @NL80211_RRF_NO_IR: no mechanisms that initiate radiation are allowed,
- * 	this includes probe requests or modes of operation that require
- * 	beaconing.
+ *	this includes probe requests or modes of operation that require
+ *	beaconing.
  * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated
  *	base on contiguous rules and wider channels will be allowed to cross
  *	multiple contiguous/overlapping frequency ranges.
@@ -4522,9 +4522,9 @@ enum nl80211_sched_scan_match_attr {
  * @NL80211_RRF_NO_EHT: EHT operation not allowed
  * @NL80211_RRF_PSD: Ruleset has power spectral density value
  * @NL80211_RRF_DFS_CONCURRENT: Operation on this channel is allowed for
-	peer-to-peer or adhoc communication under the control of a DFS master
-	which operates on the same channel (FCC-594280 D01 Section B.3).
-	Should be used together with %NL80211_RRF_DFS only.
+ *	peer-to-peer or adhoc communication under the control of a DFS master
+ *	which operates on the same channel (FCC-594280 D01 Section B.3).
+ *	Should be used together with %NL80211_RRF_DFS only.
  * @NL80211_RRF_NO_6GHZ_VLP_CLIENT: Client connection to VLP AP not allowed
  * @NL80211_RRF_NO_6GHZ_AFC_CLIENT: Client connection to AFC AP not allowed
  */
-- 
cgit v1.2.3


From b8cf4f4d700acaa7b3b40eb877a226b9a95164df Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Tue, 19 Mar 2024 11:26:25 -0700
Subject: wifi: nl80211: cleanup nl80211.h kernel-doc

Fix all of the kernel-doc issues in nl80211.h.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://msgid.link/20240319-kdoc-nl80211-v1-3-549e09d52866@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 68 ++++++++++++++++++++++++++------------------
 1 file changed, 40 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 4de84fbfd4fa..f917bc6c9b6f 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -413,8 +413,8 @@
  *	are like for %NL80211_CMD_SET_BEACON, and additionally parameters that
  *	do not change are used, these include %NL80211_ATTR_BEACON_INTERVAL,
  *	%NL80211_ATTR_DTIM_PERIOD, %NL80211_ATTR_SSID,
- *	%NL80211_ATTR_HIDDEN_SSID, %NL80211_ATTR_CIPHERS_PAIRWISE,
- *	%NL80211_ATTR_CIPHER_GROUP, %NL80211_ATTR_WPA_VERSIONS,
+ *	%NL80211_ATTR_HIDDEN_SSID, %NL80211_ATTR_CIPHER_SUITES_PAIRWISE,
+ *	%NL80211_ATTR_CIPHER_SUITE_GROUP, %NL80211_ATTR_WPA_VERSIONS,
  *	%NL80211_ATTR_AKM_SUITES, %NL80211_ATTR_PRIVACY,
  *	%NL80211_ATTR_AUTH_TYPE, %NL80211_ATTR_INACTIVITY_TIMEOUT,
  *	%NL80211_ATTR_ACL_POLICY and %NL80211_ATTR_MAC_ADDRS.
@@ -451,11 +451,6 @@
  *	%NL80211_ATTR_MAC via %NL80211_ATTR_MPATH_NEXT_HOP.
  * @NL80211_CMD_DEL_MPATH: Delete a mesh path to the destination given by
  *	%NL80211_ATTR_MAC.
- * @NL80211_CMD_NEW_PATH: Add a mesh path with given attributes to the
- *	interface identified by %NL80211_ATTR_IFINDEX.
- * @NL80211_CMD_DEL_PATH: Remove a mesh path identified by %NL80211_ATTR_MAC
- *	or, if no MAC address given, all mesh paths, on the interface identified
- *	by %NL80211_ATTR_IFINDEX.
  * @NL80211_CMD_SET_BSS: Set BSS attributes for BSS identified by
  *	%NL80211_ATTR_IFINDEX.
  *
@@ -1120,7 +1115,7 @@
  *	current configuration is not changed.  If it is present but
  *	set to zero, the configuration is changed to don't-care
  *	(i.e. the device can decide what to do).
- * @NL80211_CMD_NAN_FUNC_MATCH: Notification sent when a match is reported.
+ * @NL80211_CMD_NAN_MATCH: Notification sent when a match is reported.
  *	This will contain a %NL80211_ATTR_NAN_MATCH nested attribute and
  *	%NL80211_ATTR_COOKIE.
  *
@@ -1862,12 +1857,6 @@ enum nl80211_commands {
  *	that protected APs should be used. This is also used with NEW_BEACON to
  *	indicate that the BSS is to use protection.
  *
- * @NL80211_ATTR_CIPHERS_PAIRWISE: Used with CONNECT, ASSOCIATE, and NEW_BEACON
- *	to indicate which unicast key ciphers will be used with the connection
- *	(an array of u32).
- * @NL80211_ATTR_CIPHER_GROUP: Used with CONNECT, ASSOCIATE, and NEW_BEACON to
- *	indicate which group key cipher will be used with the connection (a
- *	u32).
  * @NL80211_ATTR_WPA_VERSIONS: Used with CONNECT, ASSOCIATE, and NEW_BEACON to
  *	indicate which WPA version(s) the AP we want to associate with is using
  *	(a u32 with flags from &enum nl80211_wpa_versions).
@@ -1898,6 +1887,7 @@ enum nl80211_commands {
  *	with %NL80211_KEY_* sub-attributes
  *
  * @NL80211_ATTR_PID: Process ID of a network namespace.
+ * @NL80211_ATTR_NETNS_FD: File descriptor of a network namespace.
  *
  * @NL80211_ATTR_GENERATION: Used to indicate consistent snapshots for
  *	dumps. This number increases whenever the object list being
@@ -1952,6 +1942,7 @@ enum nl80211_commands {
  *
  * @NL80211_ATTR_ACK: Flag attribute indicating that the frame was
  *	acknowledged by the recipient.
+ * @NL80211_ATTR_ACK_SIGNAL: Station's ack signal strength (s32)
  *
  * @NL80211_ATTR_PS_STATE: powersave state, using &enum nl80211_ps_state values.
  *
@@ -2149,6 +2140,9 @@ enum nl80211_commands {
  * @NL80211_ATTR_DISABLE_HE: Force HE capable interfaces to disable
  *      this feature during association. This is a flag attribute.
  *	Currently only supported in mac80211 drivers.
+ * @NL80211_ATTR_DISABLE_EHT: Force EHT capable interfaces to disable
+ *      this feature during association. This is a flag attribute.
+ *	Currently only supported in mac80211 drivers.
  * @NL80211_ATTR_HT_CAPABILITY_MASK: Specify which bits of the
  *      ATTR_HT_CAPABILITY to which attention should be paid.
  *      Currently, only mac80211 NICs support this feature.
@@ -2158,6 +2152,12 @@ enum nl80211_commands {
  *      All values are treated as suggestions and may be ignored
  *      by the driver as required.  The actual values may be seen in
  *      the station debugfs ht_caps file.
+ * @NL80211_ATTR_VHT_CAPABILITY_MASK: Specify which bits of the
+ *      ATTR_VHT_CAPABILITY to which attention should be paid.
+ *      Currently, only mac80211 NICs support this feature.
+ *      All values are treated as suggestions and may be ignored
+ *      by the driver as required.  The actual values may be seen in
+ *      the station debugfs vht_caps file.
  *
  * @NL80211_ATTR_DFS_REGION: region for regulatory rules which this country
  *    abides to when initiating radiation on DFS channels. A country maps
@@ -3565,7 +3565,7 @@ enum nl80211_sta_flags {
  * enum nl80211_sta_p2p_ps_status - station support of P2P PS
  *
  * @NL80211_P2P_PS_UNSUPPORTED: station doesn't support P2P PS mechanism
- * @@NL80211_P2P_PS_SUPPORTED: station supports P2P PS mechanism
+ * @NL80211_P2P_PS_SUPPORTED: station supports P2P PS mechanism
  * @NUM_NL80211_P2P_PS_STATUS: number of values
  */
 enum nl80211_sta_p2p_ps_status {
@@ -3603,9 +3603,9 @@ enum nl80211_he_gi {
 
 /**
  * enum nl80211_he_ltf - HE long training field
- * @NL80211_RATE_INFO_HE_1xLTF: 3.2 usec
- * @NL80211_RATE_INFO_HE_2xLTF: 6.4 usec
- * @NL80211_RATE_INFO_HE_4xLTF: 12.8 usec
+ * @NL80211_RATE_INFO_HE_1XLTF: 3.2 usec
+ * @NL80211_RATE_INFO_HE_2XLTF: 6.4 usec
+ * @NL80211_RATE_INFO_HE_4XLTF: 12.8 usec
  */
 enum nl80211_he_ltf {
 	NL80211_RATE_INFO_HE_1XLTF,
@@ -3720,7 +3720,7 @@ enum nl80211_eht_ru_alloc {
  * @NL80211_RATE_INFO_HE_GI: HE guard interval identifier
  *	(u8, see &enum nl80211_he_gi)
  * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1)
- * @NL80211_RATE_INFO_RU_ALLOC: HE RU allocation, if not present then
+ * @NL80211_RATE_INFO_HE_RU_ALLOC: HE RU allocation, if not present then
  *	non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc)
  * @NL80211_RATE_INFO_320_MHZ_WIDTH: 320 MHz bitrate
  * @NL80211_RATE_INFO_EHT_MCS: EHT MCS index (u8, 0-15)
@@ -3852,8 +3852,8 @@ enum nl80211_sta_bss_param {
  *	Contains a nested array of signal strength attributes (u8, dBm)
  * @NL80211_STA_INFO_CHAIN_SIGNAL_AVG: per-chain signal strength average
  *	Same format as NL80211_STA_INFO_CHAIN_SIGNAL.
- * @NL80211_STA_EXPECTED_THROUGHPUT: expected throughput considering also the
- *	802.11 header (u32, kbps)
+ * @NL80211_STA_INFO_EXPECTED_THROUGHPUT: expected throughput considering also
+ *	the 802.11 header (u32, kbps)
  * @NL80211_STA_INFO_RX_DROP_MISC: RX packets dropped for unspecified reasons
  *	(u64)
  * @NL80211_STA_INFO_BEACON_RX: number of beacons received from this peer (u64)
@@ -4179,7 +4179,7 @@ enum nl80211_band_attr {
  * @NL80211_WMMR_CW_MAX: Maximum contention window slot.
  * @NL80211_WMMR_AIFSN: Arbitration Inter Frame Space.
  * @NL80211_WMMR_TXOP: Maximum allowed tx operation time.
- * @nl80211_WMMR_MAX: highest possible wmm rule.
+ * @NL80211_WMMR_MAX: highest possible wmm rule.
  * @__NL80211_WMMR_LAST: Internal use.
  */
 enum nl80211_wmm_rule {
@@ -4203,6 +4203,7 @@ enum nl80211_wmm_rule {
  * @NL80211_FREQUENCY_ATTR_NO_IR: no mechanisms that initiate radiation
  *	are permitted on this channel, this includes sending probe
  *	requests, or modes of operation that require beaconing.
+ * @__NL80211_FREQUENCY_ATTR_NO_IBSS: obsolete, same as _NO_IR
  * @NL80211_FREQUENCY_ATTR_RADAR: Radar detection is mandatory
  *	on this channel in current regulatory domain.
  * @NL80211_FREQUENCY_ATTR_MAX_TX_POWER: Maximum transmission power in mBm
@@ -4357,7 +4358,7 @@ enum nl80211_bitrate_attr {
 };
 
 /**
- * enum nl80211_initiator - Indicates the initiator of a reg domain request
+ * enum nl80211_reg_initiator - Indicates the initiator of a reg domain request
  * @NL80211_REGDOM_SET_BY_CORE: Core queried CRDA for a dynamic world
  *	regulatory domain.
  * @NL80211_REGDOM_SET_BY_USER: User asked the wireless core to set the
@@ -4509,6 +4510,7 @@ enum nl80211_sched_scan_match_attr {
  * @NL80211_RRF_NO_IR: no mechanisms that initiate radiation are allowed,
  *	this includes probe requests or modes of operation that require
  *	beaconing.
+ * @__NL80211_RRF_NO_IBSS: obsolete, same as NO_IR
  * @NL80211_RRF_AUTO_BW: maximum available bandwidth should be calculated
  *	base on contiguous rules and wider channels will be allowed to cross
  *	multiple contiguous/overlapping frequency ranges.
@@ -4707,8 +4709,8 @@ enum nl80211_mntr_flags {
  *	alternate between Active and Doze states, but may not wake up
  *	for neighbor's beacons.
  *
- * @__NL80211_MESH_POWER_AFTER_LAST - internal use
- * @NL80211_MESH_POWER_MAX - highest possible power save level
+ * @__NL80211_MESH_POWER_AFTER_LAST: internal use
+ * @NL80211_MESH_POWER_MAX: highest possible power save level
  */
 
 enum nl80211_mesh_power_mode {
@@ -5728,7 +5730,7 @@ struct nl80211_pattern_support {
  *	"TCP connection wakeup" for more details. This is a nested attribute
  *	containing the exact information for establishing and keeping alive
  *	the TCP connection.
- * @NL80211_WOWLAN_TRIG_TCP_WAKEUP_MATCH: For wakeup reporting only, the
+ * @NL80211_WOWLAN_TRIG_WAKEUP_TCP_MATCH: For wakeup reporting only, the
  *	wakeup packet was received on the TCP connection
  * @NL80211_WOWLAN_TRIG_WAKEUP_TCP_CONNLOST: For wakeup reporting only, the
  *	TCP connection was lost or failed to be established
@@ -6404,6 +6406,7 @@ enum nl80211_feature_flags {
  *	receiving control port frames over nl80211 instead of the netdevice.
  * @NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT: This driver/device supports
  *	(average) ACK signal strength reporting.
+ * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: Backward-compatible ID
  * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate
  *      TXQs.
  * @NL80211_EXT_FEATURE_SCAN_RANDOM_SN: Driver/device supports randomizing the
@@ -6787,6 +6790,8 @@ enum nl80211_acl_policy {
  * @NL80211_SMPS_STATIC: static SMPS (use a single antenna)
  * @NL80211_SMPS_DYNAMIC: dynamic smps (start with a single antenna and
  *	turn on other antennas after CTS/RTS).
+ * @__NL80211_SMPS_AFTER_LAST: internal
+ * @NL80211_SMPS_MAX: highest used enumeration
  */
 enum nl80211_smps_mode {
 	NL80211_SMPS_OFF,
@@ -7008,6 +7013,8 @@ enum nl80211_bss_select_attr {
  * @NL80211_NAN_FUNC_PUBLISH: function is publish
  * @NL80211_NAN_FUNC_SUBSCRIBE: function is subscribe
  * @NL80211_NAN_FUNC_FOLLOW_UP: function is follow-up
+ * @__NL80211_NAN_FUNC_TYPE_AFTER_LAST: internal use
+ * @NL80211_NAN_FUNC_MAX_TYPE: internal use
  */
 enum nl80211_nan_function_type {
 	NL80211_NAN_FUNC_PUBLISH,
@@ -7168,7 +7175,7 @@ enum nl80211_nan_match_attributes {
 };
 
 /**
- * nl80211_external_auth_action - Action to perform with external
+ * enum nl80211_external_auth_action - Action to perform with external
  *     authentication request. Used by NL80211_ATTR_EXTERNAL_AUTH_ACTION.
  * @NL80211_EXTERNAL_AUTH_START: Start the authentication.
  * @NL80211_EXTERNAL_AUTH_ABORT: Abort the ongoing authentication.
@@ -7186,7 +7193,7 @@ enum nl80211_external_auth_action {
  * @NL80211_FTM_RESP_ATTR_LCI: The content of Measurement Report Element
  *	(9.4.2.22 in 802.11-2016) with type 8 - LCI (9.4.2.22.10),
  *	i.e. starting with the measurement token
- * @NL80211_FTM_RESP_ATTR_CIVIC: The content of Measurement Report Element
+ * @NL80211_FTM_RESP_ATTR_CIVICLOC: The content of Measurement Report Element
  *	(9.4.2.22 in 802.11-2016) with type 11 - Civic (Section 9.4.2.22.13),
  *	i.e. starting with the measurement token
  * @__NL80211_FTM_RESP_ATTR_LAST: Internal
@@ -7829,6 +7836,7 @@ enum nl80211_sae_pwe_mechanism {
  *
  * @NL80211_SAR_TYPE_POWER: power limitation specified in 0.25dBm unit
  *
+ * @NUM_NL80211_SAR_TYPE: internal
  */
 enum nl80211_sar_type {
 	NL80211_SAR_TYPE_POWER,
@@ -7842,6 +7850,8 @@ enum nl80211_sar_type {
 /**
  * enum nl80211_sar_attrs - Attributes for SAR spec
  *
+ * @__NL80211_SAR_ATTR_INVALID: Invalid
+ *
  * @NL80211_SAR_ATTR_TYPE: the SAR type as defined in &enum nl80211_sar_type.
  *
  * @NL80211_SAR_ATTR_SPECS: Nested array of SAR power
@@ -7873,6 +7883,8 @@ enum nl80211_sar_attrs {
 /**
  * enum nl80211_sar_specs_attrs - Attributes for SAR power limit specs
  *
+ * @__NL80211_SAR_ATTR_SPECS_INVALID: Invalid
+ *
  * @NL80211_SAR_ATTR_SPECS_POWER: Required (s32)value to specify the actual
  *	power limit value in units of 0.25 dBm if type is
  *	NL80211_SAR_TYPE_POWER. (i.e., a value of 44 represents 11 dBm).
-- 
cgit v1.2.3


From b7793a1a2f370c28b17d9554b58e9dc51afcfcbd Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Mar 2024 18:53:17 +0200
Subject: wifi: ieee80211: check for NULL in ieee80211_mle_size_ok()

For simplicity, we may want to pass a NULL element, and
while we should then pass also a zero length, just be a
bit more careful here.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.4d983653cb8d.Ic3ea99b60c61ac2f7d38cb9fd202a03c97a05601@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 3385a2cc5b09..a99f048404b8 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -5166,7 +5166,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len)
 	bool check_common_len = false;
 	u16 control;
 
-	if (len < fixed)
+	if (!data || len < fixed)
 		return false;
 
 	control = le16_to_cpu(mle->control);
-- 
cgit v1.2.3


From c121514df0daa800cc500dc2738e0b8a1c54af98 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Mar 2024 18:53:18 +0200
Subject: wifi: ieee80211: fix ieee80211_mle_basic_sta_prof_size_ok()

If there was a possibility of an MLE basic STA profile without
subelements, we might reject it because we account for the one
octet for sta_info_len twice (it's part of itself, and in the
fixed portion). Like in ieee80211_mle_reconf_sta_prof_size_ok,
subtract 1 to adjust that.

When reading the elements we did take this into account, and
since there are always elements, this never really mattered.

Fixes: 7b6f08771bf6 ("wifi: ieee80211: Support validating ML station profile length")
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.00bb0b20ed60.I8c41dd6fc14c4b187ab901dea15ade73c79fb98c@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index a99f048404b8..4fd9735bb75e 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -5302,7 +5302,7 @@ static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data,
 		info_len += 1;
 
 	return prof->sta_info_len >= info_len &&
-	       fixed + prof->sta_info_len <= len;
+	       fixed + prof->sta_info_len - 1 <= len;
 }
 
 /**
-- 
cgit v1.2.3


From 385972ded343d127e382294e8721a980fb20c106 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Mar 2024 18:53:20 +0200
Subject: wifi: mac80211: clarify the dormant/suspended links docs

Since I keep getting confused about this and asking about it,
update the documentation.

In the future, especially when we add more reasons for a link
to be disabled, we should add a per-link 'disable state' bitmap
instead of maintaining all these bitmaps with subset logic, to
indicate each of the different states separately.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.a5b24595b1a2.I92dd7d177b25df189b842d3bcddb1f2b13b1de13@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 353488ab94a2..47fa73a62f8a 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1921,10 +1921,12 @@ enum ieee80211_neg_ttlm_res {
  * @active_links: The bitmap of active links, or 0 for non-MLO.
  *	The driver shouldn't change this directly, but use the
  *	API calls meant for that purpose.
- * @dormant_links: bitmap of valid but disabled links, or 0 for non-MLO.
- *	Must be a subset of valid_links.
+ * @dormant_links: subset of the valid links that are disabled/suspended
+ *	due to advertised or negotiated TTLM respectively.
+ *	0 for non-MLO.
  * @suspended_links: subset of dormant_links representing links that are
- *	suspended.
+ *	suspended due to negotiated TTLM, and could be activated in the
+ *	future by tearing down the TTLM negotiation.
  *	0 for non-MLO.
  * @neg_ttlm: negotiated TID to link mapping info.
  *	see &struct ieee80211_neg_ttlm.
-- 
cgit v1.2.3


From ff30bf1e42644e48611aeb637d964fbd88e4fd27 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Mon, 18 Mar 2024 18:53:21 +0200
Subject: wifi: mac80211: fix BSS_CHANGED_MLD_TTLM description

BSS_CHANGED_MLD_TTLM purpose is to let the driver know that
negotiated TTLM was updated and as a result MLD suspended links
status was change.
Fix the description to better reflect it.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Reviewed-by: Ilan Peer <ilan.peer@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.52b893a70758.I2dcb322b389441f33605fb952450cc2724eb9efd@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 47fa73a62f8a..757ec33882c2 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -361,7 +361,7 @@ struct ieee80211_vif_chanctx_switch {
  * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response
  *	status changed.
  * @BSS_CHANGED_MLD_VALID_LINKS: MLD valid links status changed.
- * @BSS_CHANGED_MLD_TTLM: TID to link mapping was changed
+ * @BSS_CHANGED_MLD_TTLM: negotiated TID to link mapping was changed
  */
 enum ieee80211_bss_change {
 	BSS_CHANGED_ASSOC		= 1<<0,
-- 
cgit v1.2.3


From 56cc479188f70bbb6eacbe5ed1e43d96c91ccc6a Mon Sep 17 00:00:00 2001
From: Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Date: Mon, 18 Mar 2024 18:53:23 +0200
Subject: wifi: mac80211: handle indoor AFC/LPI AP on assoc success

Update power_type in bss_conf based on Indoor AFC and LPI power types
received in HE 6 GHz operation element on assoc success.

Signed-off-by: Anjaneyulu <pagadala.yesu.anjaneyulu@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.89c25dae34ff.Ifd8b2983f400623ac03dc032fc9a20025c9ca365@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/linux/ieee80211.h | 8 +++++---
 net/mac80211/mlme.c       | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 4fd9735bb75e..7f9b829dcb1e 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -2742,9 +2742,11 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len)
 #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR		0x40000000
 #define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED		0x80000000
 
-#define IEEE80211_6GHZ_CTRL_REG_LPI_AP	0
-#define IEEE80211_6GHZ_CTRL_REG_SP_AP	1
-#define IEEE80211_6GHZ_CTRL_REG_VLP_AP	2
+#define IEEE80211_6GHZ_CTRL_REG_LPI_AP		0
+#define IEEE80211_6GHZ_CTRL_REG_SP_AP		1
+#define IEEE80211_6GHZ_CTRL_REG_VLP_AP		2
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP	3
+#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP	4
 
 /**
  * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 47a2cba8313f..8cd3253c6188 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -4429,9 +4429,11 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
 			switch (u8_get_bits(he_6ghz_oper->control,
 					    IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) {
 			case IEEE80211_6GHZ_CTRL_REG_LPI_AP:
+			case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP:
 				bss_conf->power_type = IEEE80211_REG_LPI_AP;
 				break;
 			case IEEE80211_6GHZ_CTRL_REG_SP_AP:
+			case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP:
 				bss_conf->power_type = IEEE80211_REG_SP_AP;
 				break;
 			case IEEE80211_6GHZ_CTRL_REG_VLP_AP:
-- 
cgit v1.2.3


From 6943e00331bb26e78edb2969165e750864a6bc6d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 18 Mar 2024 18:53:25 +0200
Subject: wifi: mac80211: add flag to disallow puncturing in 5 GHz

Some devices may not be capable of handling puncturing in 5 GHz
only (vs. the current flag that just removes puncturing support
completely). Add a flag to support such devices: check and then
downgrade the channel width if needed.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.49759510da7d.I12c5a61f0be512e0c4e574c2f794ef4b37ecaf6b@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 3 +++
 net/mac80211/debugfs.c | 1 +
 net/mac80211/mlme.c    | 4 ++++
 3 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 757ec33882c2..88bb781f33f6 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2779,6 +2779,8 @@ struct ieee80211_txq {
  *
  * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT
  *	and connecting with a lower bandwidth instead
+ * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in
+ *	EHT in 5 GHz and connecting with a lower bandwidth instead
  *
  * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so
  *	no need to stop queues. This really should be set by a driver that
@@ -2843,6 +2845,7 @@ enum ieee80211_hw_flags {
 	IEEE80211_HW_DETECTS_COLOR_COLLISION,
 	IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX,
 	IEEE80211_HW_DISALLOW_PUNCTURING,
+	IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ,
 	IEEE80211_HW_HANDLES_QUIET_CSA,
 
 	/* keep last, obviously */
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 2f68e92a7404..98310188f330 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -498,6 +498,7 @@ static const char *hw_flag_names[] = {
 	FLAG(DETECTS_COLOR_COLLISION),
 	FLAG(MLO_MCAST_MULTI_LINK_TX),
 	FLAG(DISALLOW_PUNCTURING),
+	FLAG(DISALLOW_PUNCTURING_5GHZ),
 	FLAG(HANDLES_QUIET_CSA),
 #undef FLAG
 };
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 8cd3253c6188..20f325b6b28c 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -599,6 +599,10 @@ static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata,
 	    ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING))
 		return false;
 
+	if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ &&
+	    ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ))
+		return false;
+
 	return true;
 }
 
-- 
cgit v1.2.3


From a17a58ad2ff24f0d201fa5f9939182f3757d1737 Mon Sep 17 00:00:00 2001
From: Ayala Beker <ayala.beker@intel.com>
Date: Mon, 18 Mar 2024 18:53:31 +0200
Subject: wifi: mac80211: add support for tearing down negotiated TTLM

In order to activate a link that is currently inactive due to
a negotiated TTLM request, need to first tear down the negotiated
TTLM request.
Add support for sending TTLM teardown request and update the links
state accordingly.

Signed-off-by: Ayala Beker <ayala.beker@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Link: https://msgid.link/20240318184907.d480cbf46fcf.Idedad472469d2c27dd2a088cf80a13a1e1cf9b78@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h     |  9 +++++++
 net/mac80211/ieee80211_i.h |  2 ++
 net/mac80211/mlme.c        | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mac80211/status.c      | 22 +++++++++++++++++-
 4 files changed, 90 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 88bb781f33f6..f57c29de3a91 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7591,6 +7591,15 @@ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links);
 void ieee80211_set_active_links_async(struct ieee80211_vif *vif,
 				      u16 active_links);
 
+/**
+ * ieee80211_send_teardown_neg_ttlm - tear down a negotiated TTLM request
+ * @vif: the interface on which the tear down request should be sent.
+ *
+ * This function can be used to tear down a previously accepted negotiated
+ * TTLM request.
+ */
+void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif);
+
 /* for older drivers - let's not document these ... */
 int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw,
 				  struct ieee80211_chanctx_conf *ctx);
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index b6fead612b66..925e875f9b96 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -89,6 +89,7 @@ enum ieee80211_status_data {
 	IEEE80211_STATUS_TYPE_MASK	= 0x00f,
 	IEEE80211_STATUS_TYPE_INVALID	= 0,
 	IEEE80211_STATUS_TYPE_SMPS	= 1,
+	IEEE80211_STATUS_TYPE_NEG_TTLM	= 2,
 	IEEE80211_STATUS_SUBDATA_MASK	= 0xff0,
 };
 
@@ -595,6 +596,7 @@ struct ieee80211_if_managed {
 	/* TID-to-link mapping support */
 	struct wiphy_delayed_work ttlm_work;
 	struct ieee80211_adv_ttlm_info ttlm_info;
+	struct wiphy_work teardown_ttlm_work;
 
 	/* dialog token enumerator for neg TTLM request */
 	u8 dialog_token_alloc;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 2d73f0f4dd2d..b653c7d5025f 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6790,6 +6790,60 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata,
 		__ieee80211_disconnect(sdata);
 }
 
+static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy,
+					 struct wiphy_work *work)
+{
+	u16 new_dormant_links;
+	struct ieee80211_sub_if_data *sdata =
+		container_of(work, struct ieee80211_sub_if_data,
+			     u.mgd.neg_ttlm_timeout_work.work);
+
+	if (!sdata->vif.neg_ttlm.valid)
+		return;
+
+	memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm));
+	new_dormant_links =
+		sdata->vif.dormant_links & ~sdata->vif.suspended_links;
+	sdata->vif.suspended_links = 0;
+	ieee80211_vif_set_links(sdata, sdata->vif.valid_links,
+				new_dormant_links);
+	ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_TTLM |
+					       BSS_CHANGED_MLD_VALID_LINKS);
+}
+
+void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif)
+{
+	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_mgmt *mgmt;
+	struct sk_buff *skb;
+	int frame_len = offsetofend(struct ieee80211_mgmt,
+				  u.action.u.ttlm_tear_down);
+	struct ieee80211_tx_info *info;
+
+	skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len);
+	if (!skb)
+		return;
+
+	skb_reserve(skb, local->hw.extra_tx_headroom);
+	mgmt = skb_put_zero(skb, frame_len);
+	mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+					  IEEE80211_STYPE_ACTION);
+	memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN);
+	memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+	memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
+
+	mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT;
+	mgmt->u.action.u.ttlm_tear_down.action_code =
+		WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN;
+
+	info = IEEE80211_SKB_CB(skb);
+	info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+	info->status_data = IEEE80211_STATUS_TYPE_NEG_TTLM;
+	ieee80211_tx_skb(sdata, skb);
+}
+EXPORT_SYMBOL(ieee80211_send_teardown_neg_ttlm);
+
 void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata,
 				 struct sk_buff *skb)
 {
@@ -7421,6 +7475,8 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
 				ieee80211_tid_to_link_map_work);
 	wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work,
 				ieee80211_neg_ttlm_timeout_work);
+	wiphy_work_init(&ifmgd->teardown_ttlm_work,
+			ieee80211_teardown_ttlm_work);
 
 	ifmgd->flags = 0;
 	ifmgd->powersave = sdata->wdev.ps;
@@ -8609,6 +8665,8 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata)
 			  &ifmgd->beacon_connection_loss_work);
 	wiphy_work_cancel(sdata->local->hw.wiphy,
 			  &ifmgd->csa_connection_drop_work);
+	wiphy_work_cancel(sdata->local->hw.wiphy,
+			  &ifmgd->teardown_ttlm_work);
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
 				  &ifmgd->tdls_peer_del_work);
 	wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 1708b33cdc5e..dd8f857a1fbc 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -5,7 +5,7 @@
  * Copyright 2006-2007	Jiri Benc <jbenc@suse.cz>
  * Copyright 2008-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2021-2023  Intel Corporation
+ * Copyright 2021-2024  Intel Corporation
  */
 
 #include <linux/export.h>
@@ -696,6 +696,23 @@ static void ieee80211_handle_smps_status(struct ieee80211_sub_if_data *sdata,
 	wiphy_work_queue(sdata->local->hw.wiphy, &link->u.mgd.recalc_smps);
 }
 
+static void
+ieee80211_handle_teardown_ttlm_status(struct ieee80211_sub_if_data *sdata,
+				      bool acked)
+{
+	if (!sdata || !ieee80211_sdata_running(sdata))
+		return;
+
+	if (!acked)
+		return;
+
+	if (sdata->vif.type != NL80211_IFTYPE_STATION)
+		return;
+
+	wiphy_work_queue(sdata->local->hw.wiphy,
+			 &sdata->u.mgd.teardown_ttlm_work);
+}
+
 static void ieee80211_report_used_skb(struct ieee80211_local *local,
 				      struct sk_buff *skb, bool dropped,
 				      ktime_t ack_hwtstamp)
@@ -773,6 +790,9 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local,
 			ieee80211_handle_smps_status(sdata, acked,
 						     info->status_data);
 			break;
+		case IEEE80211_STATUS_TYPE_NEG_TTLM:
+			ieee80211_handle_teardown_ttlm_status(sdata, acked);
+			break;
 		}
 		rcu_read_unlock();
 	}
-- 
cgit v1.2.3


From 770546ae9f4c1ae1ebcaf0874f0dd9631d77ec97 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Sun, 24 Mar 2024 18:32:26 +0000
Subject: bpf: implement insn_is_cast_user() helper for JITs

Implement a helper function to check if an instruction is
addr_space_cast from as(0) to as(1). Use this helper in the x86 JIT.

Other JITs can use this helper when they add support for this instruction.

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
Link: https://lore.kernel.org/r/20240324183226.29674-1-puranjay12@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c |  3 +--
 include/linux/filter.h      | 10 ++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4900b1ee019f..3b639d6f2f54 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1351,8 +1351,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 			break;
 
 		case BPF_ALU64 | BPF_MOV | BPF_X:
-			if (insn->off == BPF_ADDR_SPACE_CAST &&
-			    insn->imm == 1U << 16) {
+			if (insn_is_cast_user(insn)) {
 				if (dst_reg != src_reg)
 					/* 32-bit mov */
 					emit_mov_reg(&prog, false, dst_reg, src_reg);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c0d51bff8f96..44934b968b57 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -228,6 +228,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn)
 	return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1;
 }
 
+/* addr_space_cast from as(0) to as(1) is for converting bpf arena pointers
+ * to pointers in user vma.
+ */
+static inline bool insn_is_cast_user(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) &&
+			      insn->off == BPF_ADDR_SPACE_CAST &&
+			      insn->imm == 1U << 16;
+}
+
 /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
 #define BPF_LD_IMM64(DST, IMM)					\
 	BPF_LD_IMM64_RAW(DST, 0, IMM)
-- 
cgit v1.2.3


From 21a8f8a0eb35ceb21e2c9ddd87468bc3b5ac87c0 Mon Sep 17 00:00:00 2001
From: Anup Patel <apatel@ventanamicro.com>
Date: Thu, 7 Mar 2024 19:33:00 +0530
Subject: irqchip: Add RISC-V incoming MSI controller early driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The RISC-V advanced interrupt architecture (AIA) specification
defines a new MSI controller called incoming message signalled
interrupt controller (IMSIC) which manages MSI on per-HART (or
per-CPU) basis. It also supports IPIs as software injected MSIs.
(For more details refer https://github.com/riscv/riscv-aia)

Add an early irqchip driver for RISC-V IMSIC which sets up the
IMSIC state and provide IPIs.

Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20240307140307.646078-3-apatel@ventanamicro.com
---
 drivers/irqchip/Kconfig                 |   7 +
 drivers/irqchip/Makefile                |   1 +
 drivers/irqchip/irq-riscv-imsic-early.c | 201 ++++++++
 drivers/irqchip/irq-riscv-imsic-state.c | 865 ++++++++++++++++++++++++++++++++
 drivers/irqchip/irq-riscv-imsic-state.h | 107 ++++
 include/linux/cpuhotplug.h              |   1 +
 include/linux/irqchip/riscv-imsic.h     |  87 ++++
 7 files changed, 1269 insertions(+)
 create mode 100644 drivers/irqchip/irq-riscv-imsic-early.c
 create mode 100644 drivers/irqchip/irq-riscv-imsic-state.c
 create mode 100644 drivers/irqchip/irq-riscv-imsic-state.h
 create mode 100644 include/linux/irqchip/riscv-imsic.h

(limited to 'include')

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 72c07a12f5e1..8610810b80da 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -540,6 +540,13 @@ config RISCV_INTC
 	depends on RISCV
 	select IRQ_DOMAIN_HIERARCHY
 
+config RISCV_IMSIC
+	bool
+	depends on RISCV
+	select IRQ_DOMAIN_HIERARCHY
+	select GENERIC_IRQ_MATRIX_ALLOCATOR
+	select GENERIC_MSI_IRQ
+
 config SIFIVE_PLIC
 	bool
 	depends on RISCV
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index ec4a18380998..972aa9fecf96 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_QCOM_MPM)			+= irq-qcom-mpm.o
 obj-$(CONFIG_CSKY_MPINTC)		+= irq-csky-mpintc.o
 obj-$(CONFIG_CSKY_APB_INTC)		+= irq-csky-apb-intc.o
 obj-$(CONFIG_RISCV_INTC)		+= irq-riscv-intc.o
+obj-$(CONFIG_RISCV_IMSIC)		+= irq-riscv-imsic-state.o irq-riscv-imsic-early.o
 obj-$(CONFIG_SIFIVE_PLIC)		+= irq-sifive-plic.o
 obj-$(CONFIG_STARFIVE_JH8100_INTC)	+= irq-starfive-jh8100-intc.o
 obj-$(CONFIG_IMX_IRQSTEER)		+= irq-imx-irqsteer.o
diff --git a/drivers/irqchip/irq-riscv-imsic-early.c b/drivers/irqchip/irq-riscv-imsic-early.c
new file mode 100644
index 000000000000..886418ec06cb
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-early.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "riscv-imsic: " fmt
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+
+#include "irq-riscv-imsic-state.h"
+
+static int imsic_parent_irq;
+
+#ifdef CONFIG_SMP
+static void imsic_ipi_send(unsigned int cpu)
+{
+	struct imsic_local_config *local = per_cpu_ptr(imsic->global.local, cpu);
+
+	writel_relaxed(IMSIC_IPI_ID, local->msi_va);
+}
+
+static void imsic_ipi_starting_cpu(void)
+{
+	/* Enable IPIs for current CPU. */
+	__imsic_id_set_enable(IMSIC_IPI_ID);
+}
+
+static void imsic_ipi_dying_cpu(void)
+{
+	/* Disable IPIs for current CPU. */
+	__imsic_id_clear_enable(IMSIC_IPI_ID);
+}
+
+static int __init imsic_ipi_domain_init(void)
+{
+	int virq;
+
+	/* Create IMSIC IPI multiplexing */
+	virq = ipi_mux_create(IMSIC_NR_IPI, imsic_ipi_send);
+	if (virq <= 0)
+		return virq < 0 ? virq : -ENOMEM;
+
+	/* Set vIRQ range */
+	riscv_ipi_set_virq_range(virq, IMSIC_NR_IPI, true);
+
+	/* Announce that IMSIC is providing IPIs */
+	pr_info("%pfwP: providing IPIs using interrupt %d\n", imsic->fwnode, IMSIC_IPI_ID);
+
+	return 0;
+}
+#else
+static void imsic_ipi_starting_cpu(void) { }
+static void imsic_ipi_dying_cpu(void) { }
+static int __init imsic_ipi_domain_init(void) { return 0; }
+#endif
+
+/*
+ * To handle an interrupt, we read the TOPEI CSR and write zero in one
+ * instruction. If TOPEI CSR is non-zero then we translate TOPEI.ID to
+ * Linux interrupt number and let Linux IRQ subsystem handle it.
+ */
+static void imsic_handle_irq(struct irq_desc *desc)
+{
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	int err, cpu = smp_processor_id();
+	struct imsic_vector *vec;
+	unsigned long local_id;
+
+	chained_irq_enter(chip, desc);
+
+	while ((local_id = csr_swap(CSR_TOPEI, 0))) {
+		local_id >>= TOPEI_ID_SHIFT;
+
+		if (local_id == IMSIC_IPI_ID) {
+			if (IS_ENABLED(CONFIG_SMP))
+				ipi_mux_process();
+			continue;
+		}
+
+		if (unlikely(!imsic->base_domain))
+			continue;
+
+		vec = imsic_vector_from_local_id(cpu, local_id);
+		if (!vec) {
+			pr_warn_ratelimited("vector not found for local ID 0x%lx\n", local_id);
+			continue;
+		}
+
+		err = generic_handle_domain_irq(imsic->base_domain, vec->hwirq);
+		if (unlikely(err))
+			pr_warn_ratelimited("hwirq 0x%x mapping not found\n", vec->hwirq);
+	}
+
+	chained_irq_exit(chip, desc);
+}
+
+static int imsic_starting_cpu(unsigned int cpu)
+{
+	/* Mark per-CPU IMSIC state as online */
+	imsic_state_online();
+
+	/* Enable per-CPU parent interrupt */
+	enable_percpu_irq(imsic_parent_irq, irq_get_trigger_type(imsic_parent_irq));
+
+	/* Setup IPIs */
+	imsic_ipi_starting_cpu();
+
+	/*
+	 * Interrupts identities might have been enabled/disabled while
+	 * this CPU was not running so sync-up local enable/disable state.
+	 */
+	imsic_local_sync_all();
+
+	/* Enable local interrupt delivery */
+	imsic_local_delivery(true);
+
+	return 0;
+}
+
+static int imsic_dying_cpu(unsigned int cpu)
+{
+	/* Cleanup IPIs */
+	imsic_ipi_dying_cpu();
+
+	/* Mark per-CPU IMSIC state as offline */
+	imsic_state_offline();
+
+	return 0;
+}
+
+static int __init imsic_early_probe(struct fwnode_handle *fwnode)
+{
+	struct irq_domain *domain;
+	int rc;
+
+	/* Find parent domain and register chained handler */
+	domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), DOMAIN_BUS_ANY);
+	if (!domain) {
+		pr_err("%pfwP: Failed to find INTC domain\n", fwnode);
+		return -ENOENT;
+	}
+	imsic_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT);
+	if (!imsic_parent_irq) {
+		pr_err("%pfwP: Failed to create INTC mapping\n", fwnode);
+		return -ENOENT;
+	}
+
+	/* Initialize IPI domain */
+	rc = imsic_ipi_domain_init();
+	if (rc) {
+		pr_err("%pfwP: Failed to initialize IPI domain\n", fwnode);
+		return rc;
+	}
+
+	/* Setup chained handler to the parent domain interrupt */
+	irq_set_chained_handler(imsic_parent_irq, imsic_handle_irq);
+
+	/*
+	 * Setup cpuhp state (must be done after setting imsic_parent_irq)
+	 *
+	 * Don't disable per-CPU IMSIC file when CPU goes offline
+	 * because this affects IPI and the masking/unmasking of
+	 * virtual IPIs is done via generic IPI-Mux
+	 */
+	cpuhp_setup_state(CPUHP_AP_IRQ_RISCV_IMSIC_STARTING, "irqchip/riscv/imsic:starting",
+			  imsic_starting_cpu, imsic_dying_cpu);
+
+	return 0;
+}
+
+static int __init imsic_early_dt_init(struct device_node *node, struct device_node *parent)
+{
+	struct fwnode_handle *fwnode = &node->fwnode;
+	int rc;
+
+	/* Setup IMSIC state */
+	rc = imsic_setup_state(fwnode);
+	if (rc) {
+		pr_err("%pfwP: failed to setup state (error %d)\n", fwnode, rc);
+		return rc;
+	}
+
+	/* Do early setup of IPIs */
+	rc = imsic_early_probe(fwnode);
+	if (rc)
+		return rc;
+
+	/* Ensure that OF platform device gets probed */
+	of_node_clear_flag(node, OF_POPULATED);
+	return 0;
+}
+
+IRQCHIP_DECLARE(riscv_imsic, "riscv,imsics", imsic_early_dt_init);
diff --git a/drivers/irqchip/irq-riscv-imsic-state.c b/drivers/irqchip/irq-riscv-imsic-state.c
new file mode 100644
index 000000000000..5479f872e62b
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-state.c
@@ -0,0 +1,865 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#define pr_fmt(fmt) "riscv-imsic: " fmt
+#include <linux/cpu.h>
+#include <linux/bitmap.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <asm/hwcap.h>
+
+#include "irq-riscv-imsic-state.h"
+
+#define IMSIC_DISABLE_EIDELIVERY		0
+#define IMSIC_ENABLE_EIDELIVERY			1
+#define IMSIC_DISABLE_EITHRESHOLD		1
+#define IMSIC_ENABLE_EITHRESHOLD		0
+
+static inline void imsic_csr_write(unsigned long reg, unsigned long val)
+{
+	csr_write(CSR_ISELECT, reg);
+	csr_write(CSR_IREG, val);
+}
+
+static inline unsigned long imsic_csr_read(unsigned long reg)
+{
+	csr_write(CSR_ISELECT, reg);
+	return csr_read(CSR_IREG);
+}
+
+static inline unsigned long imsic_csr_read_clear(unsigned long reg, unsigned long val)
+{
+	csr_write(CSR_ISELECT, reg);
+	return csr_read_clear(CSR_IREG, val);
+}
+
+static inline void imsic_csr_set(unsigned long reg, unsigned long val)
+{
+	csr_write(CSR_ISELECT, reg);
+	csr_set(CSR_IREG, val);
+}
+
+static inline void imsic_csr_clear(unsigned long reg, unsigned long val)
+{
+	csr_write(CSR_ISELECT, reg);
+	csr_clear(CSR_IREG, val);
+}
+
+struct imsic_priv *imsic;
+
+const struct imsic_global_config *imsic_get_global_config(void)
+{
+	return imsic ? &imsic->global : NULL;
+}
+EXPORT_SYMBOL_GPL(imsic_get_global_config);
+
+static bool __imsic_eix_read_clear(unsigned long id, bool pend)
+{
+	unsigned long isel, imask;
+
+	isel = id / BITS_PER_LONG;
+	isel *= BITS_PER_LONG / IMSIC_EIPx_BITS;
+	isel += pend ? IMSIC_EIP0 : IMSIC_EIE0;
+	imask = BIT(id & (__riscv_xlen - 1));
+
+	return !!(imsic_csr_read_clear(isel, imask) & imask);
+}
+
+static inline bool __imsic_id_read_clear_enabled(unsigned long id)
+{
+	return __imsic_eix_read_clear(id, false);
+}
+
+static inline bool __imsic_id_read_clear_pending(unsigned long id)
+{
+	return __imsic_eix_read_clear(id, true);
+}
+
+void __imsic_eix_update(unsigned long base_id, unsigned long num_id, bool pend, bool val)
+{
+	unsigned long id = base_id, last_id = base_id + num_id;
+	unsigned long i, isel, ireg;
+
+	while (id < last_id) {
+		isel = id / BITS_PER_LONG;
+		isel *= BITS_PER_LONG / IMSIC_EIPx_BITS;
+		isel += pend ? IMSIC_EIP0 : IMSIC_EIE0;
+
+		/*
+		 * Prepare the ID mask to be programmed in the
+		 * IMSIC EIEx and EIPx registers. These registers
+		 * are XLEN-wide and we must not touch IDs which
+		 * are < base_id and >= (base_id + num_id).
+		 */
+		ireg = 0;
+		for (i = id & (__riscv_xlen - 1); id < last_id && i < __riscv_xlen; i++) {
+			ireg |= BIT(i);
+			id++;
+		}
+
+		/*
+		 * The IMSIC EIEx and EIPx registers are indirectly
+		 * accessed via using ISELECT and IREG CSRs so we
+		 * need to access these CSRs without getting preempted.
+		 *
+		 * All existing users of this function call this
+		 * function with local IRQs disabled so we don't
+		 * need to do anything special here.
+		 */
+		if (val)
+			imsic_csr_set(isel, ireg);
+		else
+			imsic_csr_clear(isel, ireg);
+	}
+}
+
+static void __imsic_local_sync(struct imsic_local_priv *lpriv)
+{
+	struct imsic_local_config *mlocal;
+	struct imsic_vector *vec, *mvec;
+	int i;
+
+	lockdep_assert_held(&lpriv->lock);
+
+	for_each_set_bit(i, lpriv->dirty_bitmap, imsic->global.nr_ids + 1) {
+		if (!i || i == IMSIC_IPI_ID)
+			goto skip;
+		vec = &lpriv->vectors[i];
+
+		if (READ_ONCE(vec->enable))
+			__imsic_id_set_enable(i);
+		else
+			__imsic_id_clear_enable(i);
+
+		/*
+		 * If the ID was being moved to a new ID on some other CPU
+		 * then we can get a MSI during the movement so check the
+		 * ID pending bit and re-trigger the new ID on other CPU
+		 * using MMIO write.
+		 */
+		mvec = READ_ONCE(vec->move);
+		WRITE_ONCE(vec->move, NULL);
+		if (mvec && mvec != vec) {
+			if (__imsic_id_read_clear_pending(i)) {
+				mlocal = per_cpu_ptr(imsic->global.local, mvec->cpu);
+				writel_relaxed(mvec->local_id, mlocal->msi_va);
+			}
+
+			imsic_vector_free(&lpriv->vectors[i]);
+		}
+
+skip:
+		bitmap_clear(lpriv->dirty_bitmap, i, 1);
+	}
+}
+
+void imsic_local_sync_all(void)
+{
+	struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&lpriv->lock, flags);
+	bitmap_fill(lpriv->dirty_bitmap, imsic->global.nr_ids + 1);
+	__imsic_local_sync(lpriv);
+	raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+}
+
+void imsic_local_delivery(bool enable)
+{
+	if (enable) {
+		imsic_csr_write(IMSIC_EITHRESHOLD, IMSIC_ENABLE_EITHRESHOLD);
+		imsic_csr_write(IMSIC_EIDELIVERY, IMSIC_ENABLE_EIDELIVERY);
+		return;
+	}
+
+	imsic_csr_write(IMSIC_EIDELIVERY, IMSIC_DISABLE_EIDELIVERY);
+	imsic_csr_write(IMSIC_EITHRESHOLD, IMSIC_DISABLE_EITHRESHOLD);
+}
+
+#ifdef CONFIG_SMP
+static void imsic_local_timer_callback(struct timer_list *timer)
+{
+	struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&lpriv->lock, flags);
+	__imsic_local_sync(lpriv);
+	raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+}
+
+static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu)
+{
+	lockdep_assert_held(&lpriv->lock);
+
+	/*
+	 * The spinlock acquire/release semantics ensure that changes
+	 * to vector enable, vector move and dirty bitmap are visible
+	 * to the target CPU.
+	 */
+
+	/*
+	 * We schedule a timer on the target CPU if the target CPU is not
+	 * same as the current CPU. An offline CPU will unconditionally
+	 * synchronize IDs through imsic_starting_cpu() when the
+	 * CPU is brought up.
+	 */
+	if (cpu_online(cpu)) {
+		if (cpu == smp_processor_id()) {
+			__imsic_local_sync(lpriv);
+			return;
+		}
+
+		if (!timer_pending(&lpriv->timer)) {
+			lpriv->timer.expires = jiffies + 1;
+			add_timer_on(&lpriv->timer, cpu);
+		}
+	}
+}
+#else
+static void __imsic_remote_sync(struct imsic_local_priv *lpriv, unsigned int cpu)
+{
+	lockdep_assert_held(&lpriv->lock);
+	__imsic_local_sync(lpriv);
+}
+#endif
+
+void imsic_vector_mask(struct imsic_vector *vec)
+{
+	struct imsic_local_priv *lpriv;
+
+	lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu);
+	if (WARN_ON_ONCE(&lpriv->vectors[vec->local_id] != vec))
+		return;
+
+	/*
+	 * This function is called through Linux irq subsystem with
+	 * irqs disabled so no need to save/restore irq flags.
+	 */
+
+	raw_spin_lock(&lpriv->lock);
+
+	WRITE_ONCE(vec->enable, false);
+	bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1);
+	__imsic_remote_sync(lpriv, vec->cpu);
+
+	raw_spin_unlock(&lpriv->lock);
+}
+
+void imsic_vector_unmask(struct imsic_vector *vec)
+{
+	struct imsic_local_priv *lpriv;
+
+	lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu);
+	if (WARN_ON_ONCE(&lpriv->vectors[vec->local_id] != vec))
+		return;
+
+	/*
+	 * This function is called through Linux irq subsystem with
+	 * irqs disabled so no need to save/restore irq flags.
+	 */
+
+	raw_spin_lock(&lpriv->lock);
+
+	WRITE_ONCE(vec->enable, true);
+	bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1);
+	__imsic_remote_sync(lpriv, vec->cpu);
+
+	raw_spin_unlock(&lpriv->lock);
+}
+
+static bool imsic_vector_move_update(struct imsic_local_priv *lpriv, struct imsic_vector *vec,
+				     bool new_enable, struct imsic_vector *new_move)
+{
+	unsigned long flags;
+	bool enabled;
+
+	raw_spin_lock_irqsave(&lpriv->lock, flags);
+
+	/* Update enable and move details */
+	enabled = READ_ONCE(vec->enable);
+	WRITE_ONCE(vec->enable, new_enable);
+	WRITE_ONCE(vec->move, new_move);
+
+	/* Mark the vector as dirty and synchronize */
+	bitmap_set(lpriv->dirty_bitmap, vec->local_id, 1);
+	__imsic_remote_sync(lpriv, vec->cpu);
+
+	raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+
+	return enabled;
+}
+
+void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_vec)
+{
+	struct imsic_local_priv *old_lpriv, *new_lpriv;
+	bool enabled;
+
+	if (WARN_ON_ONCE(old_vec->cpu == new_vec->cpu))
+		return;
+
+	old_lpriv = per_cpu_ptr(imsic->lpriv, old_vec->cpu);
+	if (WARN_ON_ONCE(&old_lpriv->vectors[old_vec->local_id] != old_vec))
+		return;
+
+	new_lpriv = per_cpu_ptr(imsic->lpriv, new_vec->cpu);
+	if (WARN_ON_ONCE(&new_lpriv->vectors[new_vec->local_id] != new_vec))
+		return;
+
+	/*
+	 * Move and re-trigger the new vector based on the pending
+	 * state of the old vector because we might get a device
+	 * interrupt on the old vector while device was being moved
+	 * to the new vector.
+	 */
+	enabled = imsic_vector_move_update(old_lpriv, old_vec, false, new_vec);
+	imsic_vector_move_update(new_lpriv, new_vec, enabled, new_vec);
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+void imsic_vector_debug_show(struct seq_file *m, struct imsic_vector *vec, int ind)
+{
+	struct imsic_local_priv *lpriv;
+	struct imsic_vector *mvec;
+	bool is_enabled;
+
+	lpriv = per_cpu_ptr(imsic->lpriv, vec->cpu);
+	if (WARN_ON_ONCE(&lpriv->vectors[vec->local_id] != vec))
+		return;
+
+	is_enabled = imsic_vector_isenabled(vec);
+	mvec = imsic_vector_get_move(vec);
+
+	seq_printf(m, "%*starget_cpu      : %5u\n", ind, "", vec->cpu);
+	seq_printf(m, "%*starget_local_id : %5u\n", ind, "", vec->local_id);
+	seq_printf(m, "%*sis_reserved     : %5u\n", ind, "",
+		   (vec->local_id <= IMSIC_IPI_ID) ? 1 : 0);
+	seq_printf(m, "%*sis_enabled      : %5u\n", ind, "", is_enabled ? 1 : 0);
+	seq_printf(m, "%*sis_move_pending : %5u\n", ind, "", mvec ? 1 : 0);
+	if (mvec) {
+		seq_printf(m, "%*smove_cpu        : %5u\n", ind, "", mvec->cpu);
+		seq_printf(m, "%*smove_local_id   : %5u\n", ind, "", mvec->local_id);
+	}
+}
+
+void imsic_vector_debug_show_summary(struct seq_file *m, int ind)
+{
+	irq_matrix_debug_show(m, imsic->matrix, ind);
+}
+#endif
+
+struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id)
+{
+	struct imsic_local_priv *lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+
+	if (!lpriv || imsic->global.nr_ids < local_id)
+		return NULL;
+
+	return &lpriv->vectors[local_id];
+}
+
+struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask)
+{
+	struct imsic_vector *vec = NULL;
+	struct imsic_local_priv *lpriv;
+	unsigned long flags;
+	unsigned int cpu;
+	int local_id;
+
+	raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+	local_id = irq_matrix_alloc(imsic->matrix, mask, false, &cpu);
+	raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+	if (local_id < 0)
+		return NULL;
+
+	lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+	vec = &lpriv->vectors[local_id];
+	vec->hwirq = hwirq;
+	vec->enable = false;
+	vec->move = NULL;
+
+	return vec;
+}
+
+void imsic_vector_free(struct imsic_vector *vec)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+	vec->hwirq = UINT_MAX;
+	irq_matrix_free(imsic->matrix, vec->cpu, vec->local_id, false);
+	raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+}
+
+static void __init imsic_local_cleanup(void)
+{
+	struct imsic_local_priv *lpriv;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+
+		bitmap_free(lpriv->dirty_bitmap);
+		kfree(lpriv->vectors);
+	}
+
+	free_percpu(imsic->lpriv);
+}
+
+static int __init imsic_local_init(void)
+{
+	struct imsic_global_config *global = &imsic->global;
+	struct imsic_local_priv *lpriv;
+	struct imsic_vector *vec;
+	int cpu, i;
+
+	/* Allocate per-CPU private state */
+	imsic->lpriv = alloc_percpu(typeof(*imsic->lpriv));
+	if (!imsic->lpriv)
+		return -ENOMEM;
+
+	/* Setup per-CPU private state */
+	for_each_possible_cpu(cpu) {
+		lpriv = per_cpu_ptr(imsic->lpriv, cpu);
+
+		raw_spin_lock_init(&lpriv->lock);
+
+		/* Allocate dirty bitmap */
+		lpriv->dirty_bitmap = bitmap_zalloc(global->nr_ids + 1, GFP_KERNEL);
+		if (!lpriv->dirty_bitmap)
+			goto fail_local_cleanup;
+
+#ifdef CONFIG_SMP
+		/* Setup lazy timer for synchronization */
+		timer_setup(&lpriv->timer, imsic_local_timer_callback, TIMER_PINNED);
+#endif
+
+		/* Allocate vector array */
+		lpriv->vectors = kcalloc(global->nr_ids + 1, sizeof(*lpriv->vectors),
+					 GFP_KERNEL);
+		if (!lpriv->vectors)
+			goto fail_local_cleanup;
+
+		/* Setup vector array */
+		for (i = 0; i <= global->nr_ids; i++) {
+			vec = &lpriv->vectors[i];
+			vec->cpu = cpu;
+			vec->local_id = i;
+			vec->hwirq = UINT_MAX;
+		}
+	}
+
+	return 0;
+
+fail_local_cleanup:
+	imsic_local_cleanup();
+	return -ENOMEM;
+}
+
+void imsic_state_online(void)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+	irq_matrix_online(imsic->matrix);
+	raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+}
+
+void imsic_state_offline(void)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&imsic->matrix_lock, flags);
+	irq_matrix_offline(imsic->matrix);
+	raw_spin_unlock_irqrestore(&imsic->matrix_lock, flags);
+
+#ifdef CONFIG_SMP
+	struct imsic_local_priv *lpriv = this_cpu_ptr(imsic->lpriv);
+
+	raw_spin_lock_irqsave(&lpriv->lock, flags);
+	WARN_ON_ONCE(try_to_del_timer_sync(&lpriv->timer) < 0);
+	raw_spin_unlock_irqrestore(&lpriv->lock, flags);
+#endif
+}
+
+static int __init imsic_matrix_init(void)
+{
+	struct imsic_global_config *global = &imsic->global;
+
+	raw_spin_lock_init(&imsic->matrix_lock);
+	imsic->matrix = irq_alloc_matrix(global->nr_ids + 1,
+					 0, global->nr_ids + 1);
+	if (!imsic->matrix)
+		return -ENOMEM;
+
+	/* Reserve ID#0 because it is special and never implemented */
+	irq_matrix_assign_system(imsic->matrix, 0, false);
+
+	/* Reserve IPI ID because it is special and used internally */
+	irq_matrix_assign_system(imsic->matrix, IMSIC_IPI_ID, false);
+
+	return 0;
+}
+
+static int __init imsic_get_parent_hartid(struct fwnode_handle *fwnode,
+					  u32 index, unsigned long *hartid)
+{
+	struct of_phandle_args parent;
+	int rc;
+
+	/*
+	 * Currently, only OF fwnode is supported so extend this
+	 * function for ACPI support.
+	 */
+	if (!is_of_node(fwnode))
+		return -EINVAL;
+
+	rc = of_irq_parse_one(to_of_node(fwnode), index, &parent);
+	if (rc)
+		return rc;
+
+	/*
+	 * Skip interrupts other than external interrupts for
+	 * current privilege level.
+	 */
+	if (parent.args[0] != RV_IRQ_EXT)
+		return -EINVAL;
+
+	return riscv_of_parent_hartid(parent.np, hartid);
+}
+
+static int __init imsic_get_mmio_resource(struct fwnode_handle *fwnode,
+					  u32 index, struct resource *res)
+{
+	/*
+	 * Currently, only OF fwnode is supported so extend this
+	 * function for ACPI support.
+	 */
+	if (!is_of_node(fwnode))
+		return -EINVAL;
+
+	return of_address_to_resource(to_of_node(fwnode), index, res);
+}
+
+static int __init imsic_parse_fwnode(struct fwnode_handle *fwnode,
+				     struct imsic_global_config *global,
+				     u32 *nr_parent_irqs,
+				     u32 *nr_mmios)
+{
+	unsigned long hartid;
+	struct resource res;
+	int rc;
+	u32 i;
+
+	/*
+	 * Currently, only OF fwnode is supported so extend this
+	 * function for ACPI support.
+	 */
+	if (!is_of_node(fwnode))
+		return -EINVAL;
+
+	*nr_parent_irqs = 0;
+	*nr_mmios = 0;
+
+	/* Find number of parent interrupts */
+	while (!imsic_get_parent_hartid(fwnode, *nr_parent_irqs, &hartid))
+		(*nr_parent_irqs)++;
+	if (!*nr_parent_irqs) {
+		pr_err("%pfwP: no parent irqs available\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Find number of guest index bits in MSI address */
+	rc = of_property_read_u32(to_of_node(fwnode), "riscv,guest-index-bits",
+				  &global->guest_index_bits);
+	if (rc)
+		global->guest_index_bits = 0;
+
+	/* Find number of HART index bits */
+	rc = of_property_read_u32(to_of_node(fwnode), "riscv,hart-index-bits",
+				  &global->hart_index_bits);
+	if (rc) {
+		/* Assume default value */
+		global->hart_index_bits = __fls(*nr_parent_irqs);
+		if (BIT(global->hart_index_bits) < *nr_parent_irqs)
+			global->hart_index_bits++;
+	}
+
+	/* Find number of group index bits */
+	rc = of_property_read_u32(to_of_node(fwnode), "riscv,group-index-bits",
+				  &global->group_index_bits);
+	if (rc)
+		global->group_index_bits = 0;
+
+	/*
+	 * Find first bit position of group index.
+	 * If not specified assumed the default APLIC-IMSIC configuration.
+	 */
+	rc = of_property_read_u32(to_of_node(fwnode), "riscv,group-index-shift",
+				  &global->group_index_shift);
+	if (rc)
+		global->group_index_shift = IMSIC_MMIO_PAGE_SHIFT * 2;
+
+	/* Find number of interrupt identities */
+	rc = of_property_read_u32(to_of_node(fwnode), "riscv,num-ids",
+				  &global->nr_ids);
+	if (rc) {
+		pr_err("%pfwP: number of interrupt identities not found\n", fwnode);
+		return rc;
+	}
+
+	/* Find number of guest interrupt identities */
+	rc = of_property_read_u32(to_of_node(fwnode), "riscv,num-guest-ids",
+				  &global->nr_guest_ids);
+	if (rc)
+		global->nr_guest_ids = global->nr_ids;
+
+	/* Sanity check guest index bits */
+	i = BITS_PER_LONG - IMSIC_MMIO_PAGE_SHIFT;
+	if (i < global->guest_index_bits) {
+		pr_err("%pfwP: guest index bits too big\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Sanity check HART index bits */
+	i = BITS_PER_LONG - IMSIC_MMIO_PAGE_SHIFT - global->guest_index_bits;
+	if (i < global->hart_index_bits) {
+		pr_err("%pfwP: HART index bits too big\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Sanity check group index bits */
+	i = BITS_PER_LONG - IMSIC_MMIO_PAGE_SHIFT -
+	    global->guest_index_bits - global->hart_index_bits;
+	if (i < global->group_index_bits) {
+		pr_err("%pfwP: group index bits too big\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Sanity check group index shift */
+	i = global->group_index_bits + global->group_index_shift - 1;
+	if (i >= BITS_PER_LONG) {
+		pr_err("%pfwP: group index shift too big\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Sanity check number of interrupt identities */
+	if (global->nr_ids < IMSIC_MIN_ID ||
+	    global->nr_ids >= IMSIC_MAX_ID ||
+	    (global->nr_ids & IMSIC_MIN_ID) != IMSIC_MIN_ID) {
+		pr_err("%pfwP: invalid number of interrupt identities\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Sanity check number of guest interrupt identities */
+	if (global->nr_guest_ids < IMSIC_MIN_ID ||
+	    global->nr_guest_ids >= IMSIC_MAX_ID ||
+	    (global->nr_guest_ids & IMSIC_MIN_ID) != IMSIC_MIN_ID) {
+		pr_err("%pfwP: invalid number of guest interrupt identities\n", fwnode);
+		return -EINVAL;
+	}
+
+	/* Compute base address */
+	rc = imsic_get_mmio_resource(fwnode, 0, &res);
+	if (rc) {
+		pr_err("%pfwP: first MMIO resource not found\n", fwnode);
+		return -EINVAL;
+	}
+	global->base_addr = res.start;
+	global->base_addr &= ~(BIT(global->guest_index_bits +
+				   global->hart_index_bits +
+				   IMSIC_MMIO_PAGE_SHIFT) - 1);
+	global->base_addr &= ~((BIT(global->group_index_bits) - 1) <<
+			       global->group_index_shift);
+
+	/* Find number of MMIO register sets */
+	while (!imsic_get_mmio_resource(fwnode, *nr_mmios, &res))
+		(*nr_mmios)++;
+
+	return 0;
+}
+
+int __init imsic_setup_state(struct fwnode_handle *fwnode)
+{
+	u32 i, j, index, nr_parent_irqs, nr_mmios, nr_handlers = 0;
+	struct imsic_global_config *global;
+	struct imsic_local_config *local;
+	void __iomem **mmios_va = NULL;
+	struct resource *mmios = NULL;
+	unsigned long reloff, hartid;
+	phys_addr_t base_addr;
+	int rc, cpu;
+
+	/*
+	 * Only one IMSIC instance allowed in a platform for clean
+	 * implementation of SMP IRQ affinity and per-CPU IPIs.
+	 *
+	 * This means on a multi-socket (or multi-die) platform we
+	 * will have multiple MMIO regions for one IMSIC instance.
+	 */
+	if (imsic) {
+		pr_err("%pfwP: already initialized hence ignoring\n", fwnode);
+		return -EALREADY;
+	}
+
+	if (!riscv_isa_extension_available(NULL, SxAIA)) {
+		pr_err("%pfwP: AIA support not available\n", fwnode);
+		return -ENODEV;
+	}
+
+	imsic = kzalloc(sizeof(*imsic), GFP_KERNEL);
+	if (!imsic)
+		return -ENOMEM;
+	imsic->fwnode = fwnode;
+	global = &imsic->global;
+
+	global->local = alloc_percpu(typeof(*global->local));
+	if (!global->local) {
+		rc = -ENOMEM;
+		goto out_free_priv;
+	}
+
+	/* Parse IMSIC fwnode */
+	rc = imsic_parse_fwnode(fwnode, global, &nr_parent_irqs, &nr_mmios);
+	if (rc)
+		goto out_free_local;
+
+	/* Allocate MMIO resource array */
+	mmios = kcalloc(nr_mmios, sizeof(*mmios), GFP_KERNEL);
+	if (!mmios) {
+		rc = -ENOMEM;
+		goto out_free_local;
+	}
+
+	/* Allocate MMIO virtual address array */
+	mmios_va = kcalloc(nr_mmios, sizeof(*mmios_va), GFP_KERNEL);
+	if (!mmios_va) {
+		rc = -ENOMEM;
+		goto out_iounmap;
+	}
+
+	/* Parse and map MMIO register sets */
+	for (i = 0; i < nr_mmios; i++) {
+		rc = imsic_get_mmio_resource(fwnode, i, &mmios[i]);
+		if (rc) {
+			pr_err("%pfwP: unable to parse MMIO regset %d\n", fwnode, i);
+			goto out_iounmap;
+		}
+
+		base_addr = mmios[i].start;
+		base_addr &= ~(BIT(global->guest_index_bits +
+				   global->hart_index_bits +
+				   IMSIC_MMIO_PAGE_SHIFT) - 1);
+		base_addr &= ~((BIT(global->group_index_bits) - 1) <<
+			       global->group_index_shift);
+		if (base_addr != global->base_addr) {
+			rc = -EINVAL;
+			pr_err("%pfwP: address mismatch for regset %d\n", fwnode, i);
+			goto out_iounmap;
+		}
+
+		mmios_va[i] = ioremap(mmios[i].start, resource_size(&mmios[i]));
+		if (!mmios_va[i]) {
+			rc = -EIO;
+			pr_err("%pfwP: unable to map MMIO regset %d\n", fwnode, i);
+			goto out_iounmap;
+		}
+	}
+
+	/* Initialize local (or per-CPU )state */
+	rc = imsic_local_init();
+	if (rc) {
+		pr_err("%pfwP: failed to initialize local state\n",
+		       fwnode);
+		goto out_iounmap;
+	}
+
+	/* Configure handlers for target CPUs */
+	for (i = 0; i < nr_parent_irqs; i++) {
+		rc = imsic_get_parent_hartid(fwnode, i, &hartid);
+		if (rc) {
+			pr_warn("%pfwP: hart ID for parent irq%d not found\n", fwnode, i);
+			continue;
+		}
+
+		cpu = riscv_hartid_to_cpuid(hartid);
+		if (cpu < 0) {
+			pr_warn("%pfwP: invalid cpuid for parent irq%d\n", fwnode, i);
+			continue;
+		}
+
+		/* Find MMIO location of MSI page */
+		index = nr_mmios;
+		reloff = i * BIT(global->guest_index_bits) *
+			 IMSIC_MMIO_PAGE_SZ;
+		for (j = 0; nr_mmios; j++) {
+			if (reloff < resource_size(&mmios[j])) {
+				index = j;
+				break;
+			}
+
+			/*
+			 * MMIO region size may not be aligned to
+			 * BIT(global->guest_index_bits) * IMSIC_MMIO_PAGE_SZ
+			 * if holes are present.
+			 */
+			reloff -= ALIGN(resource_size(&mmios[j]),
+			BIT(global->guest_index_bits) * IMSIC_MMIO_PAGE_SZ);
+		}
+		if (index >= nr_mmios) {
+			pr_warn("%pfwP: MMIO not found for parent irq%d\n", fwnode, i);
+			continue;
+		}
+
+		local = per_cpu_ptr(global->local, cpu);
+		local->msi_pa = mmios[index].start + reloff;
+		local->msi_va = mmios_va[index] + reloff;
+
+		nr_handlers++;
+	}
+
+	/* If no CPU handlers found then can't take interrupts */
+	if (!nr_handlers) {
+		pr_err("%pfwP: No CPU handlers found\n", fwnode);
+		rc = -ENODEV;
+		goto out_local_cleanup;
+	}
+
+	/* Initialize matrix allocator */
+	rc = imsic_matrix_init();
+	if (rc) {
+		pr_err("%pfwP: failed to create matrix allocator\n", fwnode);
+		goto out_local_cleanup;
+	}
+
+	/* We don't need MMIO arrays anymore so let's free-up */
+	kfree(mmios_va);
+	kfree(mmios);
+
+	return 0;
+
+out_local_cleanup:
+	imsic_local_cleanup();
+out_iounmap:
+	for (i = 0; i < nr_mmios; i++) {
+		if (mmios_va[i])
+			iounmap(mmios_va[i]);
+	}
+	kfree(mmios_va);
+	kfree(mmios);
+out_free_local:
+	free_percpu(imsic->global.local);
+out_free_priv:
+	kfree(imsic);
+	imsic = NULL;
+	return rc;
+}
diff --git a/drivers/irqchip/irq-riscv-imsic-state.h b/drivers/irqchip/irq-riscv-imsic-state.h
new file mode 100644
index 000000000000..8ec9649d0d01
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-imsic-state.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#ifndef _IRQ_RISCV_IMSIC_STATE_H
+#define _IRQ_RISCV_IMSIC_STATE_H
+
+#include <linux/irqchip/riscv-imsic.h>
+#include <linux/irqdomain.h>
+#include <linux/fwnode.h>
+#include <linux/timer.h>
+
+#define IMSIC_IPI_ID				1
+#define IMSIC_NR_IPI				8
+
+struct imsic_vector {
+	/* Fixed details of the vector */
+	unsigned int				cpu;
+	unsigned int				local_id;
+	/* Details saved by driver in the vector */
+	unsigned int				hwirq;
+	/* Details accessed using local lock held */
+	bool					enable;
+	struct imsic_vector			*move;
+};
+
+struct imsic_local_priv {
+	/* Local lock to protect vector enable/move variables and dirty bitmap */
+	raw_spinlock_t				lock;
+
+	/* Local dirty bitmap for synchronization */
+	unsigned long				*dirty_bitmap;
+
+#ifdef CONFIG_SMP
+	/* Local timer for synchronization */
+	struct timer_list			timer;
+#endif
+
+	/* Local vector table */
+	struct imsic_vector			*vectors;
+};
+
+struct imsic_priv {
+	/* Device details */
+	struct fwnode_handle			*fwnode;
+
+	/* Global configuration common for all HARTs */
+	struct imsic_global_config		global;
+
+	/* Per-CPU state */
+	struct imsic_local_priv __percpu	*lpriv;
+
+	/* State of IRQ matrix allocator */
+	raw_spinlock_t				matrix_lock;
+	struct irq_matrix			*matrix;
+
+	/* IRQ domains (created by platform driver) */
+	struct irq_domain			*base_domain;
+};
+
+extern struct imsic_priv *imsic;
+
+void __imsic_eix_update(unsigned long base_id, unsigned long num_id, bool pend, bool val);
+
+static inline void __imsic_id_set_enable(unsigned long id)
+{
+	__imsic_eix_update(id, 1, false, true);
+}
+
+static inline void __imsic_id_clear_enable(unsigned long id)
+{
+	__imsic_eix_update(id, 1, false, false);
+}
+
+void imsic_local_sync_all(void);
+void imsic_local_delivery(bool enable);
+
+void imsic_vector_mask(struct imsic_vector *vec);
+void imsic_vector_unmask(struct imsic_vector *vec);
+
+static inline bool imsic_vector_isenabled(struct imsic_vector *vec)
+{
+	return READ_ONCE(vec->enable);
+}
+
+static inline struct imsic_vector *imsic_vector_get_move(struct imsic_vector *vec)
+{
+	return READ_ONCE(vec->move);
+}
+
+void imsic_vector_move(struct imsic_vector *old_vec, struct imsic_vector *new_vec);
+
+struct imsic_vector *imsic_vector_from_local_id(unsigned int cpu, unsigned int local_id);
+
+struct imsic_vector *imsic_vector_alloc(unsigned int hwirq, const struct cpumask *mask);
+void imsic_vector_free(struct imsic_vector *vector);
+
+void imsic_vector_debug_show(struct seq_file *m, struct imsic_vector *vec, int ind);
+void imsic_vector_debug_show_summary(struct seq_file *m, int ind);
+
+void imsic_state_online(void);
+void imsic_state_offline(void);
+int imsic_setup_state(struct fwnode_handle *fwnode);
+
+#endif
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 35e78ddb2b37..7a5785f405b6 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -146,6 +146,7 @@ enum cpuhp_state {
 	CPUHP_AP_IRQ_MIPS_GIC_STARTING,
 	CPUHP_AP_IRQ_LOONGARCH_STARTING,
 	CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING,
+	CPUHP_AP_IRQ_RISCV_IMSIC_STARTING,
 	CPUHP_AP_ARM_MVEBU_COHERENCY,
 	CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
 	CPUHP_AP_PERF_X86_STARTING,
diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h
new file mode 100644
index 000000000000..faf0b800b1b0
--- /dev/null
+++ b/include/linux/irqchip/riscv-imsic.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+#ifndef __LINUX_IRQCHIP_RISCV_IMSIC_H
+#define __LINUX_IRQCHIP_RISCV_IMSIC_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <asm/csr.h>
+
+#define IMSIC_MMIO_PAGE_SHIFT		12
+#define IMSIC_MMIO_PAGE_SZ		BIT(IMSIC_MMIO_PAGE_SHIFT)
+#define IMSIC_MMIO_PAGE_LE		0x00
+#define IMSIC_MMIO_PAGE_BE		0x04
+
+#define IMSIC_MIN_ID			63
+#define IMSIC_MAX_ID			2048
+
+#define IMSIC_EIDELIVERY		0x70
+
+#define IMSIC_EITHRESHOLD		0x72
+
+#define IMSIC_EIP0			0x80
+#define IMSIC_EIP63			0xbf
+#define IMSIC_EIPx_BITS			32
+
+#define IMSIC_EIE0			0xc0
+#define IMSIC_EIE63			0xff
+#define IMSIC_EIEx_BITS			32
+
+#define IMSIC_FIRST			IMSIC_EIDELIVERY
+#define IMSIC_LAST			IMSIC_EIE63
+
+#define IMSIC_MMIO_SETIPNUM_LE		0x00
+#define IMSIC_MMIO_SETIPNUM_BE		0x04
+
+struct imsic_local_config {
+	phys_addr_t				msi_pa;
+	void __iomem				*msi_va;
+};
+
+struct imsic_global_config {
+	/*
+	 * MSI Target Address Scheme
+	 *
+	 * XLEN-1                                                12     0
+	 * |                                                     |     |
+	 * -------------------------------------------------------------
+	 * |xxxxxx|Group Index|xxxxxxxxxxx|HART Index|Guest Index|  0  |
+	 * -------------------------------------------------------------
+	 */
+
+	/* Bits representing Guest index, HART index, and Group index */
+	u32					guest_index_bits;
+	u32					hart_index_bits;
+	u32					group_index_bits;
+	u32					group_index_shift;
+
+	/* Global base address matching all target MSI addresses */
+	phys_addr_t				base_addr;
+
+	/* Number of interrupt identities */
+	u32					nr_ids;
+
+	/* Number of guest interrupt identities */
+	u32					nr_guest_ids;
+
+	/* Per-CPU IMSIC addresses */
+	struct imsic_local_config __percpu	*local;
+};
+
+#ifdef CONFIG_RISCV_IMSIC
+
+const struct imsic_global_config *imsic_get_global_config(void);
+
+#else
+
+static inline const struct imsic_global_config *imsic_get_global_config(void)
+{
+	return NULL;
+}
+
+#endif
+
+#endif
-- 
cgit v1.2.3


From 2333df5ae51ead2188d07c99e841e159a664741e Mon Sep 17 00:00:00 2001
From: Anup Patel <apatel@ventanamicro.com>
Date: Thu, 7 Mar 2024 19:33:04 +0530
Subject: irqchip: Add RISC-V advanced PLIC driver for direct-mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The RISC-V advanced interrupt architecture (AIA) specification defines
advanced platform-level interrupt controller (APLIC) which has two modes
of operation: 1) Direct mode and 2) MSI mode.
(For more details, refer https://github.com/riscv/riscv-aia)

In APLIC direct-mode, wired interrupts are forwared to CPUs (or HARTs)
as a local external interrupt.

Add a platform irqchip driver for the RISC-V APLIC direct-mode to
support RISC-V platforms having only wired interrupts.

Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20240307140307.646078-7-apatel@ventanamicro.com
---
 drivers/irqchip/Kconfig                  |   5 +
 drivers/irqchip/Makefile                 |   1 +
 drivers/irqchip/irq-riscv-aplic-direct.c | 326 +++++++++++++++++++++++++++++++
 drivers/irqchip/irq-riscv-aplic-main.c   | 211 ++++++++++++++++++++
 drivers/irqchip/irq-riscv-aplic-main.h   |  44 +++++
 include/linux/irqchip/riscv-aplic.h      | 145 ++++++++++++++
 6 files changed, 732 insertions(+)
 create mode 100644 drivers/irqchip/irq-riscv-aplic-direct.c
 create mode 100644 drivers/irqchip/irq-riscv-aplic-main.c
 create mode 100644 drivers/irqchip/irq-riscv-aplic-main.h
 create mode 100644 include/linux/irqchip/riscv-aplic.h

(limited to 'include')

diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 8f6ce419ba4c..f6728471f592 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -540,6 +540,11 @@ config RISCV_INTC
 	depends on RISCV
 	select IRQ_DOMAIN_HIERARCHY
 
+config RISCV_APLIC
+	bool
+	depends on RISCV
+	select IRQ_DOMAIN_HIERARCHY
+
 config RISCV_IMSIC
 	bool
 	depends on RISCV
diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile
index c1390980b4eb..5adbe7a6f604 100644
--- a/drivers/irqchip/Makefile
+++ b/drivers/irqchip/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_QCOM_MPM)			+= irq-qcom-mpm.o
 obj-$(CONFIG_CSKY_MPINTC)		+= irq-csky-mpintc.o
 obj-$(CONFIG_CSKY_APB_INTC)		+= irq-csky-apb-intc.o
 obj-$(CONFIG_RISCV_INTC)		+= irq-riscv-intc.o
+obj-$(CONFIG_RISCV_APLIC)		+= irq-riscv-aplic-main.o irq-riscv-aplic-direct.o
 obj-$(CONFIG_RISCV_IMSIC)		+= irq-riscv-imsic-state.o irq-riscv-imsic-early.o irq-riscv-imsic-platform.o
 obj-$(CONFIG_SIFIVE_PLIC)		+= irq-sifive-plic.o
 obj-$(CONFIG_STARFIVE_JH8100_INTC)	+= irq-starfive-jh8100-intc.o
diff --git a/drivers/irqchip/irq-riscv-aplic-direct.c b/drivers/irqchip/irq-riscv-aplic-direct.c
new file mode 100644
index 000000000000..06bace9b7497
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-direct.c
@@ -0,0 +1,326 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bitops.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/irqchip.h>
+#include <linux/irqchip/chained_irq.h>
+#include <linux/irqchip/riscv-aplic.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/printk.h>
+#include <linux/smp.h>
+
+#include "irq-riscv-aplic-main.h"
+
+#define APLIC_DISABLE_IDELIVERY		0
+#define APLIC_ENABLE_IDELIVERY		1
+#define APLIC_DISABLE_ITHRESHOLD	1
+#define APLIC_ENABLE_ITHRESHOLD		0
+
+struct aplic_direct {
+	struct aplic_priv	priv;
+	struct irq_domain	*irqdomain;
+	struct cpumask		lmask;
+};
+
+struct aplic_idc {
+	unsigned int		hart_index;
+	void __iomem		*regs;
+	struct aplic_direct	*direct;
+};
+
+static unsigned int aplic_direct_parent_irq;
+static DEFINE_PER_CPU(struct aplic_idc, aplic_idcs);
+
+static void aplic_direct_irq_eoi(struct irq_data *d)
+{
+	/*
+	 * The fasteoi_handler requires irq_eoi() callback hence
+	 * provide a dummy handler.
+	 */
+}
+
+#ifdef CONFIG_SMP
+static int aplic_direct_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
+				     bool force)
+{
+	struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+	struct aplic_direct *direct = container_of(priv, struct aplic_direct, priv);
+	struct aplic_idc *idc;
+	unsigned int cpu, val;
+	struct cpumask amask;
+	void __iomem *target;
+
+	cpumask_and(&amask, &direct->lmask, mask_val);
+
+	if (force)
+		cpu = cpumask_first(&amask);
+	else
+		cpu = cpumask_any_and(&amask, cpu_online_mask);
+
+	if (cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	idc = per_cpu_ptr(&aplic_idcs, cpu);
+	target = priv->regs + APLIC_TARGET_BASE + (d->hwirq - 1) * sizeof(u32);
+	val = FIELD_PREP(APLIC_TARGET_HART_IDX, idc->hart_index);
+	val |= FIELD_PREP(APLIC_TARGET_IPRIO, APLIC_DEFAULT_PRIORITY);
+	writel(val, target);
+
+	irq_data_update_effective_affinity(d, cpumask_of(cpu));
+
+	return IRQ_SET_MASK_OK_DONE;
+}
+#endif
+
+static struct irq_chip aplic_direct_chip = {
+	.name		= "APLIC-DIRECT",
+	.irq_mask	= aplic_irq_mask,
+	.irq_unmask	= aplic_irq_unmask,
+	.irq_set_type	= aplic_irq_set_type,
+	.irq_eoi	= aplic_direct_irq_eoi,
+#ifdef CONFIG_SMP
+	.irq_set_affinity = aplic_direct_set_affinity,
+#endif
+	.flags		= IRQCHIP_SET_TYPE_MASKED |
+			  IRQCHIP_SKIP_SET_WAKE |
+			  IRQCHIP_MASK_ON_SUSPEND,
+};
+
+static int aplic_direct_irqdomain_translate(struct irq_domain *d, struct irq_fwspec *fwspec,
+					    unsigned long *hwirq, unsigned int *type)
+{
+	struct aplic_priv *priv = d->host_data;
+
+	return aplic_irqdomain_translate(fwspec, priv->gsi_base, hwirq, type);
+}
+
+static int aplic_direct_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+					unsigned int nr_irqs, void *arg)
+{
+	struct aplic_priv *priv = domain->host_data;
+	struct aplic_direct *direct = container_of(priv, struct aplic_direct, priv);
+	struct irq_fwspec *fwspec = arg;
+	irq_hw_number_t hwirq;
+	unsigned int type;
+	int i, ret;
+
+	ret = aplic_irqdomain_translate(fwspec, priv->gsi_base, &hwirq, &type);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_domain_set_info(domain, virq + i, hwirq + i, &aplic_direct_chip,
+				    priv, handle_fasteoi_irq, NULL, NULL);
+		irq_set_affinity(virq + i, &direct->lmask);
+	}
+
+	return 0;
+}
+
+static const struct irq_domain_ops aplic_direct_irqdomain_ops = {
+	.translate	= aplic_direct_irqdomain_translate,
+	.alloc		= aplic_direct_irqdomain_alloc,
+	.free		= irq_domain_free_irqs_top,
+};
+
+/*
+ * To handle an APLIC direct interrupts, we just read the CLAIMI register
+ * which will return highest priority pending interrupt and clear the
+ * pending bit of the interrupt. This process is repeated until CLAIMI
+ * register return zero value.
+ */
+static void aplic_direct_handle_irq(struct irq_desc *desc)
+{
+	struct aplic_idc *idc = this_cpu_ptr(&aplic_idcs);
+	struct irq_domain *irqdomain = idc->direct->irqdomain;
+	struct irq_chip *chip = irq_desc_get_chip(desc);
+	irq_hw_number_t hw_irq;
+	int irq;
+
+	chained_irq_enter(chip, desc);
+
+	while ((hw_irq = readl(idc->regs + APLIC_IDC_CLAIMI))) {
+		hw_irq = hw_irq >> APLIC_IDC_TOPI_ID_SHIFT;
+		irq = irq_find_mapping(irqdomain, hw_irq);
+
+		if (unlikely(irq <= 0)) {
+			dev_warn_ratelimited(idc->direct->priv.dev,
+					     "hw_irq %lu mapping not found\n", hw_irq);
+		} else {
+			generic_handle_irq(irq);
+		}
+	}
+
+	chained_irq_exit(chip, desc);
+}
+
+static void aplic_idc_set_delivery(struct aplic_idc *idc, bool en)
+{
+	u32 de = (en) ? APLIC_ENABLE_IDELIVERY : APLIC_DISABLE_IDELIVERY;
+	u32 th = (en) ? APLIC_ENABLE_ITHRESHOLD : APLIC_DISABLE_ITHRESHOLD;
+
+	/* Priority must be less than threshold for interrupt triggering */
+	writel(th, idc->regs + APLIC_IDC_ITHRESHOLD);
+
+	/* Delivery must be set to 1 for interrupt triggering */
+	writel(de, idc->regs + APLIC_IDC_IDELIVERY);
+}
+
+static int aplic_direct_dying_cpu(unsigned int cpu)
+{
+	if (aplic_direct_parent_irq)
+		disable_percpu_irq(aplic_direct_parent_irq);
+
+	return 0;
+}
+
+static int aplic_direct_starting_cpu(unsigned int cpu)
+{
+	if (aplic_direct_parent_irq) {
+		enable_percpu_irq(aplic_direct_parent_irq,
+				  irq_get_trigger_type(aplic_direct_parent_irq));
+	}
+
+	return 0;
+}
+
+static int aplic_direct_parse_parent_hwirq(struct device *dev, u32 index,
+					   u32 *parent_hwirq, unsigned long *parent_hartid)
+{
+	struct of_phandle_args parent;
+	int rc;
+
+	/*
+	 * Currently, only OF fwnode is supported so extend this
+	 * function for ACPI support.
+	 */
+	if (!is_of_node(dev->fwnode))
+		return -EINVAL;
+
+	rc = of_irq_parse_one(to_of_node(dev->fwnode), index, &parent);
+	if (rc)
+		return rc;
+
+	rc = riscv_of_parent_hartid(parent.np, parent_hartid);
+	if (rc)
+		return rc;
+
+	*parent_hwirq = parent.args[0];
+	return 0;
+}
+
+int aplic_direct_setup(struct device *dev, void __iomem *regs)
+{
+	int i, j, rc, cpu, current_cpu, setup_count = 0;
+	struct aplic_direct *direct;
+	struct irq_domain *domain;
+	struct aplic_priv *priv;
+	struct aplic_idc *idc;
+	unsigned long hartid;
+	u32 v, hwirq;
+
+	direct = devm_kzalloc(dev, sizeof(*direct), GFP_KERNEL);
+	if (!direct)
+		return -ENOMEM;
+	priv = &direct->priv;
+
+	rc = aplic_setup_priv(priv, dev, regs);
+	if (rc) {
+		dev_err(dev, "failed to create APLIC context\n");
+		return rc;
+	}
+
+	/* Setup per-CPU IDC and target CPU mask */
+	current_cpu = get_cpu();
+	for (i = 0; i < priv->nr_idcs; i++) {
+		rc = aplic_direct_parse_parent_hwirq(dev, i, &hwirq, &hartid);
+		if (rc) {
+			dev_warn(dev, "parent irq for IDC%d not found\n", i);
+			continue;
+		}
+
+		/*
+		 * Skip interrupts other than external interrupts for
+		 * current privilege level.
+		 */
+		if (hwirq != RV_IRQ_EXT)
+			continue;
+
+		cpu = riscv_hartid_to_cpuid(hartid);
+		if (cpu < 0) {
+			dev_warn(dev, "invalid cpuid for IDC%d\n", i);
+			continue;
+		}
+
+		cpumask_set_cpu(cpu, &direct->lmask);
+
+		idc = per_cpu_ptr(&aplic_idcs, cpu);
+		idc->hart_index = i;
+		idc->regs = priv->regs + APLIC_IDC_BASE + i * APLIC_IDC_SIZE;
+		idc->direct = direct;
+
+		aplic_idc_set_delivery(idc, true);
+
+		/*
+		 * Boot cpu might not have APLIC hart_index = 0 so check
+		 * and update target registers of all interrupts.
+		 */
+		if (cpu == current_cpu && idc->hart_index) {
+			v = FIELD_PREP(APLIC_TARGET_HART_IDX, idc->hart_index);
+			v |= FIELD_PREP(APLIC_TARGET_IPRIO, APLIC_DEFAULT_PRIORITY);
+			for (j = 1; j <= priv->nr_irqs; j++)
+				writel(v, priv->regs + APLIC_TARGET_BASE + (j - 1) * sizeof(u32));
+		}
+
+		setup_count++;
+	}
+	put_cpu();
+
+	/* Find parent domain and register chained handler */
+	domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(),
+					  DOMAIN_BUS_ANY);
+	if (!aplic_direct_parent_irq && domain) {
+		aplic_direct_parent_irq = irq_create_mapping(domain, RV_IRQ_EXT);
+		if (aplic_direct_parent_irq) {
+			irq_set_chained_handler(aplic_direct_parent_irq,
+						aplic_direct_handle_irq);
+
+			/*
+			 * Setup CPUHP notifier to enable parent
+			 * interrupt on all CPUs
+			 */
+			cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+					  "irqchip/riscv/aplic:starting",
+					  aplic_direct_starting_cpu,
+					  aplic_direct_dying_cpu);
+		}
+	}
+
+	/* Fail if we were not able to setup IDC for any CPU */
+	if (!setup_count)
+		return -ENODEV;
+
+	/* Setup global config and interrupt delivery */
+	aplic_init_hw_global(priv, false);
+
+	/* Create irq domain instance for the APLIC */
+	direct->irqdomain = irq_domain_create_linear(dev->fwnode, priv->nr_irqs + 1,
+						     &aplic_direct_irqdomain_ops, priv);
+	if (!direct->irqdomain) {
+		dev_err(dev, "failed to create direct irq domain\n");
+		return -ENOMEM;
+	}
+
+	/* Advertise the interrupt controller */
+	dev_info(dev, "%d interrupts directly connected to %d CPUs\n",
+		 priv->nr_irqs, priv->nr_idcs);
+
+	return 0;
+}
diff --git a/drivers/irqchip/irq-riscv-aplic-main.c b/drivers/irqchip/irq-riscv-aplic-main.c
new file mode 100644
index 000000000000..160ff99d6979
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-main.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#include <linux/bitfield.h>
+#include <linux/irqchip/riscv-aplic.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_irq.h>
+#include <linux/platform_device.h>
+#include <linux/printk.h>
+
+#include "irq-riscv-aplic-main.h"
+
+void aplic_irq_unmask(struct irq_data *d)
+{
+	struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+
+	writel(d->hwirq, priv->regs + APLIC_SETIENUM);
+}
+
+void aplic_irq_mask(struct irq_data *d)
+{
+	struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+
+	writel(d->hwirq, priv->regs + APLIC_CLRIENUM);
+}
+
+int aplic_irq_set_type(struct irq_data *d, unsigned int type)
+{
+	struct aplic_priv *priv = irq_data_get_irq_chip_data(d);
+	void __iomem *sourcecfg;
+	u32 val = 0;
+
+	switch (type) {
+	case IRQ_TYPE_NONE:
+		val = APLIC_SOURCECFG_SM_INACTIVE;
+		break;
+	case IRQ_TYPE_LEVEL_LOW:
+		val = APLIC_SOURCECFG_SM_LEVEL_LOW;
+		break;
+	case IRQ_TYPE_LEVEL_HIGH:
+		val = APLIC_SOURCECFG_SM_LEVEL_HIGH;
+		break;
+	case IRQ_TYPE_EDGE_FALLING:
+		val = APLIC_SOURCECFG_SM_EDGE_FALL;
+		break;
+	case IRQ_TYPE_EDGE_RISING:
+		val = APLIC_SOURCECFG_SM_EDGE_RISE;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	sourcecfg = priv->regs + APLIC_SOURCECFG_BASE;
+	sourcecfg += (d->hwirq - 1) * sizeof(u32);
+	writel(val, sourcecfg);
+
+	return 0;
+}
+
+int aplic_irqdomain_translate(struct irq_fwspec *fwspec, u32 gsi_base,
+			      unsigned long *hwirq, unsigned int *type)
+{
+	if (WARN_ON(fwspec->param_count < 2))
+		return -EINVAL;
+	if (WARN_ON(!fwspec->param[0]))
+		return -EINVAL;
+
+	/* For DT, gsi_base is always zero. */
+	*hwirq = fwspec->param[0] - gsi_base;
+	*type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK;
+
+	WARN_ON(*type == IRQ_TYPE_NONE);
+
+	return 0;
+}
+
+void aplic_init_hw_global(struct aplic_priv *priv, bool msi_mode)
+{
+	u32 val;
+#ifdef CONFIG_RISCV_M_MODE
+	u32 valh;
+
+	if (msi_mode) {
+		val = lower_32_bits(priv->msicfg.base_ppn);
+		valh = FIELD_PREP(APLIC_xMSICFGADDRH_BAPPN, upper_32_bits(priv->msicfg.base_ppn));
+		valh |= FIELD_PREP(APLIC_xMSICFGADDRH_LHXW, priv->msicfg.lhxw);
+		valh |= FIELD_PREP(APLIC_xMSICFGADDRH_HHXW, priv->msicfg.hhxw);
+		valh |= FIELD_PREP(APLIC_xMSICFGADDRH_LHXS, priv->msicfg.lhxs);
+		valh |= FIELD_PREP(APLIC_xMSICFGADDRH_HHXS, priv->msicfg.hhxs);
+		writel(val, priv->regs + APLIC_xMSICFGADDR);
+		writel(valh, priv->regs + APLIC_xMSICFGADDRH);
+	}
+#endif
+
+	/* Setup APLIC domaincfg register */
+	val = readl(priv->regs + APLIC_DOMAINCFG);
+	val |= APLIC_DOMAINCFG_IE;
+	if (msi_mode)
+		val |= APLIC_DOMAINCFG_DM;
+	writel(val, priv->regs + APLIC_DOMAINCFG);
+	if (readl(priv->regs + APLIC_DOMAINCFG) != val)
+		dev_warn(priv->dev, "unable to write 0x%x in domaincfg\n", val);
+}
+
+static void aplic_init_hw_irqs(struct aplic_priv *priv)
+{
+	int i;
+
+	/* Disable all interrupts */
+	for (i = 0; i <= priv->nr_irqs; i += 32)
+		writel(-1U, priv->regs + APLIC_CLRIE_BASE + (i / 32) * sizeof(u32));
+
+	/* Set interrupt type and default priority for all interrupts */
+	for (i = 1; i <= priv->nr_irqs; i++) {
+		writel(0, priv->regs + APLIC_SOURCECFG_BASE + (i - 1) * sizeof(u32));
+		writel(APLIC_DEFAULT_PRIORITY,
+		       priv->regs + APLIC_TARGET_BASE + (i - 1) * sizeof(u32));
+	}
+
+	/* Clear APLIC domaincfg */
+	writel(0, priv->regs + APLIC_DOMAINCFG);
+}
+
+int aplic_setup_priv(struct aplic_priv *priv, struct device *dev, void __iomem *regs)
+{
+	struct of_phandle_args parent;
+	int rc;
+
+	/*
+	 * Currently, only OF fwnode is supported so extend this
+	 * function for ACPI support.
+	 */
+	if (!is_of_node(dev->fwnode))
+		return -EINVAL;
+
+	/* Save device pointer and register base */
+	priv->dev = dev;
+	priv->regs = regs;
+
+	/* Find out number of interrupt sources */
+	rc = of_property_read_u32(to_of_node(dev->fwnode), "riscv,num-sources",
+				  &priv->nr_irqs);
+	if (rc) {
+		dev_err(dev, "failed to get number of interrupt sources\n");
+		return rc;
+	}
+
+	/*
+	 * Find out number of IDCs based on parent interrupts
+	 *
+	 * If "msi-parent" property is present then we ignore the
+	 * APLIC IDCs which forces the APLIC driver to use MSI mode.
+	 */
+	if (!of_property_present(to_of_node(dev->fwnode), "msi-parent")) {
+		while (!of_irq_parse_one(to_of_node(dev->fwnode), priv->nr_idcs, &parent))
+			priv->nr_idcs++;
+	}
+
+	/* Setup initial state APLIC interrupts */
+	aplic_init_hw_irqs(priv);
+
+	return 0;
+}
+
+static int aplic_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	bool msi_mode = false;
+	void __iomem *regs;
+	int rc;
+
+	/* Map the MMIO registers */
+	regs = devm_platform_ioremap_resource(pdev, 0);
+	if (!regs) {
+		dev_err(dev, "failed map MMIO registers\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * If msi-parent property is present then setup APLIC MSI
+	 * mode otherwise setup APLIC direct mode.
+	 */
+	if (is_of_node(dev->fwnode))
+		msi_mode = of_property_present(to_of_node(dev->fwnode), "msi-parent");
+	if (msi_mode)
+		rc = -ENODEV;
+	else
+		rc = aplic_direct_setup(dev, regs);
+	if (rc)
+		dev_err(dev, "failed to setup APLIC in %s mode\n", msi_mode ? "MSI" : "direct");
+
+	return rc;
+}
+
+static const struct of_device_id aplic_match[] = {
+	{ .compatible = "riscv,aplic" },
+	{}
+};
+
+static struct platform_driver aplic_driver = {
+	.driver = {
+		.name		= "riscv-aplic",
+		.of_match_table	= aplic_match,
+	},
+	.probe = aplic_probe,
+};
+builtin_platform_driver(aplic_driver);
diff --git a/drivers/irqchip/irq-riscv-aplic-main.h b/drivers/irqchip/irq-riscv-aplic-main.h
new file mode 100644
index 000000000000..4cfbadf37ddc
--- /dev/null
+++ b/drivers/irqchip/irq-riscv-aplic-main.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+
+#ifndef _IRQ_RISCV_APLIC_MAIN_H
+#define _IRQ_RISCV_APLIC_MAIN_H
+
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/fwnode.h>
+
+#define APLIC_DEFAULT_PRIORITY		1
+
+struct aplic_msicfg {
+	phys_addr_t		base_ppn;
+	u32			hhxs;
+	u32			hhxw;
+	u32			lhxs;
+	u32			lhxw;
+};
+
+struct aplic_priv {
+	struct device		*dev;
+	u32			gsi_base;
+	u32			nr_irqs;
+	u32			nr_idcs;
+	void __iomem		*regs;
+	struct aplic_msicfg	msicfg;
+};
+
+void aplic_irq_unmask(struct irq_data *d);
+void aplic_irq_mask(struct irq_data *d);
+int aplic_irq_set_type(struct irq_data *d, unsigned int type);
+int aplic_irqdomain_translate(struct irq_fwspec *fwspec, u32 gsi_base,
+			      unsigned long *hwirq, unsigned int *type);
+void aplic_init_hw_global(struct aplic_priv *priv, bool msi_mode);
+int aplic_setup_priv(struct aplic_priv *priv, struct device *dev, void __iomem *regs);
+int aplic_direct_setup(struct device *dev, void __iomem *regs);
+
+#endif
diff --git a/include/linux/irqchip/riscv-aplic.h b/include/linux/irqchip/riscv-aplic.h
new file mode 100644
index 000000000000..ec8f7df50583
--- /dev/null
+++ b/include/linux/irqchip/riscv-aplic.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Western Digital Corporation or its affiliates.
+ * Copyright (C) 2022 Ventana Micro Systems Inc.
+ */
+#ifndef __LINUX_IRQCHIP_RISCV_APLIC_H
+#define __LINUX_IRQCHIP_RISCV_APLIC_H
+
+#include <linux/bitops.h>
+
+#define APLIC_MAX_IDC			BIT(14)
+#define APLIC_MAX_SOURCE		1024
+
+#define APLIC_DOMAINCFG			0x0000
+#define APLIC_DOMAINCFG_RDONLY		0x80000000
+#define APLIC_DOMAINCFG_IE		BIT(8)
+#define APLIC_DOMAINCFG_DM		BIT(2)
+#define APLIC_DOMAINCFG_BE		BIT(0)
+
+#define APLIC_SOURCECFG_BASE		0x0004
+#define APLIC_SOURCECFG_D		BIT(10)
+#define APLIC_SOURCECFG_CHILDIDX_MASK	0x000003ff
+#define APLIC_SOURCECFG_SM_MASK	0x00000007
+#define APLIC_SOURCECFG_SM_INACTIVE	0x0
+#define APLIC_SOURCECFG_SM_DETACH	0x1
+#define APLIC_SOURCECFG_SM_EDGE_RISE	0x4
+#define APLIC_SOURCECFG_SM_EDGE_FALL	0x5
+#define APLIC_SOURCECFG_SM_LEVEL_HIGH	0x6
+#define APLIC_SOURCECFG_SM_LEVEL_LOW	0x7
+
+#define APLIC_MMSICFGADDR		0x1bc0
+#define APLIC_MMSICFGADDRH		0x1bc4
+#define APLIC_SMSICFGADDR		0x1bc8
+#define APLIC_SMSICFGADDRH		0x1bcc
+
+#ifdef CONFIG_RISCV_M_MODE
+#define APLIC_xMSICFGADDR		APLIC_MMSICFGADDR
+#define APLIC_xMSICFGADDRH		APLIC_MMSICFGADDRH
+#else
+#define APLIC_xMSICFGADDR		APLIC_SMSICFGADDR
+#define APLIC_xMSICFGADDRH		APLIC_SMSICFGADDRH
+#endif
+
+#define APLIC_xMSICFGADDRH_L		BIT(31)
+#define APLIC_xMSICFGADDRH_HHXS_MASK	0x1f
+#define APLIC_xMSICFGADDRH_HHXS_SHIFT	24
+#define APLIC_xMSICFGADDRH_HHXS		(APLIC_xMSICFGADDRH_HHXS_MASK << \
+					 APLIC_xMSICFGADDRH_HHXS_SHIFT)
+#define APLIC_xMSICFGADDRH_LHXS_MASK	0x7
+#define APLIC_xMSICFGADDRH_LHXS_SHIFT	20
+#define APLIC_xMSICFGADDRH_LHXS		(APLIC_xMSICFGADDRH_LHXS_MASK << \
+					 APLIC_xMSICFGADDRH_LHXS_SHIFT)
+#define APLIC_xMSICFGADDRH_HHXW_MASK	0x7
+#define APLIC_xMSICFGADDRH_HHXW_SHIFT	16
+#define APLIC_xMSICFGADDRH_HHXW		(APLIC_xMSICFGADDRH_HHXW_MASK << \
+					 APLIC_xMSICFGADDRH_HHXW_SHIFT)
+#define APLIC_xMSICFGADDRH_LHXW_MASK	0xf
+#define APLIC_xMSICFGADDRH_LHXW_SHIFT	12
+#define APLIC_xMSICFGADDRH_LHXW		(APLIC_xMSICFGADDRH_LHXW_MASK << \
+					 APLIC_xMSICFGADDRH_LHXW_SHIFT)
+#define APLIC_xMSICFGADDRH_BAPPN_MASK	0xfff
+#define APLIC_xMSICFGADDRH_BAPPN_SHIFT	0
+#define APLIC_xMSICFGADDRH_BAPPN	(APLIC_xMSICFGADDRH_BAPPN_MASK << \
+					 APLIC_xMSICFGADDRH_BAPPN_SHIFT)
+
+#define APLIC_xMSICFGADDR_PPN_SHIFT	12
+
+#define APLIC_xMSICFGADDR_PPN_HART(__lhxs) \
+	(BIT(__lhxs) - 1)
+
+#define APLIC_xMSICFGADDR_PPN_LHX_MASK(__lhxw) \
+	(BIT(__lhxw) - 1)
+#define APLIC_xMSICFGADDR_PPN_LHX_SHIFT(__lhxs) \
+	((__lhxs))
+#define APLIC_xMSICFGADDR_PPN_LHX(__lhxw, __lhxs) \
+	(APLIC_xMSICFGADDR_PPN_LHX_MASK(__lhxw) << \
+	 APLIC_xMSICFGADDR_PPN_LHX_SHIFT(__lhxs))
+
+#define APLIC_xMSICFGADDR_PPN_HHX_MASK(__hhxw) \
+	(BIT(__hhxw) - 1)
+#define APLIC_xMSICFGADDR_PPN_HHX_SHIFT(__hhxs) \
+	((__hhxs) + APLIC_xMSICFGADDR_PPN_SHIFT)
+#define APLIC_xMSICFGADDR_PPN_HHX(__hhxw, __hhxs) \
+	(APLIC_xMSICFGADDR_PPN_HHX_MASK(__hhxw) << \
+	 APLIC_xMSICFGADDR_PPN_HHX_SHIFT(__hhxs))
+
+#define APLIC_IRQBITS_PER_REG		32
+
+#define APLIC_SETIP_BASE		0x1c00
+#define APLIC_SETIPNUM			0x1cdc
+
+#define APLIC_CLRIP_BASE		0x1d00
+#define APLIC_CLRIPNUM			0x1ddc
+
+#define APLIC_SETIE_BASE		0x1e00
+#define APLIC_SETIENUM			0x1edc
+
+#define APLIC_CLRIE_BASE		0x1f00
+#define APLIC_CLRIENUM			0x1fdc
+
+#define APLIC_SETIPNUM_LE		0x2000
+#define APLIC_SETIPNUM_BE		0x2004
+
+#define APLIC_GENMSI			0x3000
+
+#define APLIC_TARGET_BASE		0x3004
+#define APLIC_TARGET_HART_IDX_SHIFT	18
+#define APLIC_TARGET_HART_IDX_MASK	0x3fff
+#define APLIC_TARGET_HART_IDX		(APLIC_TARGET_HART_IDX_MASK << \
+					 APLIC_TARGET_HART_IDX_SHIFT)
+#define APLIC_TARGET_GUEST_IDX_SHIFT	12
+#define APLIC_TARGET_GUEST_IDX_MASK	0x3f
+#define APLIC_TARGET_GUEST_IDX		(APLIC_TARGET_GUEST_IDX_MASK << \
+					 APLIC_TARGET_GUEST_IDX_SHIFT)
+#define APLIC_TARGET_IPRIO_SHIFT	0
+#define APLIC_TARGET_IPRIO_MASK		0xff
+#define APLIC_TARGET_IPRIO		(APLIC_TARGET_IPRIO_MASK << \
+					 APLIC_TARGET_IPRIO_SHIFT)
+#define APLIC_TARGET_EIID_SHIFT		0
+#define APLIC_TARGET_EIID_MASK		0x7ff
+#define APLIC_TARGET_EIID		(APLIC_TARGET_EIID_MASK << \
+					 APLIC_TARGET_EIID_SHIFT)
+
+#define APLIC_IDC_BASE			0x4000
+#define APLIC_IDC_SIZE			32
+
+#define APLIC_IDC_IDELIVERY		0x00
+
+#define APLIC_IDC_IFORCE		0x04
+
+#define APLIC_IDC_ITHRESHOLD		0x08
+
+#define APLIC_IDC_TOPI			0x18
+#define APLIC_IDC_TOPI_ID_SHIFT		16
+#define APLIC_IDC_TOPI_ID_MASK		0x3ff
+#define APLIC_IDC_TOPI_ID		(APLIC_IDC_TOPI_ID_MASK << \
+					 APLIC_IDC_TOPI_ID_SHIFT)
+#define APLIC_IDC_TOPI_PRIO_SHIFT	0
+#define APLIC_IDC_TOPI_PRIO_MASK	0xff
+#define APLIC_IDC_TOPI_PRIO		(APLIC_IDC_TOPI_PRIO_MASK << \
+					 APLIC_IDC_TOPI_PRIO_SHIFT)
+
+#define APLIC_IDC_CLAIMI		0x1c
+
+#endif
-- 
cgit v1.2.3


From bdeef5dcea6b164f4bd614655821b1ef12ebec9a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Mar 2024 16:20:04 +0200
Subject: spi: rspi: Get rid of unused struct rspi_plat_data

No in-kernel users of struct rspi_plat_data. If required,
the software nodes should be used for such users. For now
just get rid of it.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20240325142118.3210915-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-rspi.c   | 12 +-----------
 include/linux/spi/rspi.h | 18 ------------------
 2 files changed, 1 insertion(+), 29 deletions(-)
 delete mode 100644 include/linux/spi/rspi.h

(limited to 'include')

diff --git a/drivers/spi/spi-rspi.c b/drivers/spi/spi-rspi.c
index 8e81f1a8623f..7f95d22fb1ac 100644
--- a/drivers/spi/spi-rspi.c
+++ b/drivers/spi/spi-rspi.c
@@ -24,7 +24,6 @@
 #include <linux/reset.h>
 #include <linux/sh_dma.h>
 #include <linux/spi/spi.h>
-#include <linux/spi/rspi.h>
 #include <linux/spinlock.h>
 
 #define RSPI_SPCR		0x00	/* Control Register */
@@ -1131,16 +1130,12 @@ static struct dma_chan *rspi_request_dma_chan(struct device *dev,
 static int rspi_request_dma(struct device *dev, struct spi_controller *ctlr,
 			    const struct resource *res)
 {
-	const struct rspi_plat_data *rspi_pd = dev_get_platdata(dev);
 	unsigned int dma_tx_id, dma_rx_id;
 
 	if (dev->of_node) {
 		/* In the OF case we will get the slave IDs from the DT */
 		dma_tx_id = 0;
 		dma_rx_id = 0;
-	} else if (rspi_pd && rspi_pd->dma_tx_id && rspi_pd->dma_rx_id) {
-		dma_tx_id = rspi_pd->dma_tx_id;
-		dma_rx_id = rspi_pd->dma_rx_id;
 	} else {
 		/* The driver assumes no error. */
 		return 0;
@@ -1290,7 +1285,6 @@ static int rspi_probe(struct platform_device *pdev)
 	struct spi_controller *ctlr;
 	struct rspi_data *rspi;
 	int ret;
-	const struct rspi_plat_data *rspi_pd;
 	const struct spi_ops *ops;
 	unsigned long clksrc;
 
@@ -1305,11 +1299,7 @@ static int rspi_probe(struct platform_device *pdev)
 			goto error1;
 	} else {
 		ops = (struct spi_ops *)pdev->id_entry->driver_data;
-		rspi_pd = dev_get_platdata(&pdev->dev);
-		if (rspi_pd && rspi_pd->num_chipselect)
-			ctlr->num_chipselect = rspi_pd->num_chipselect;
-		else
-			ctlr->num_chipselect = 2; /* default */
+		ctlr->num_chipselect = 2; /* default */
 	}
 
 	rspi = spi_controller_get_devdata(ctlr);
diff --git a/include/linux/spi/rspi.h b/include/linux/spi/rspi.h
deleted file mode 100644
index dbdfcc7a3db2..000000000000
--- a/include/linux/spi/rspi.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Renesas SPI driver
- *
- * Copyright (C) 2012  Renesas Solutions Corp.
- */
-
-#ifndef __LINUX_SPI_RENESAS_SPI_H__
-#define __LINUX_SPI_RENESAS_SPI_H__
-
-struct rspi_plat_data {
-	unsigned int dma_tx_id;
-	unsigned int dma_rx_id;
-
-	u16 num_chipselect;
-};
-
-#endif
-- 
cgit v1.2.3


From 1211f3b21c2aa0d22d8d7f050e3a5930a91cd0e4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Mar 2024 07:21:02 -1000
Subject: workqueue: Preserve OFFQ bits in cancel[_sync] paths

The cancel[_sync] paths acquire and release WORK_STRUCT_PENDING, and
manipulate WORK_OFFQ_CANCELING. However, they assume that all the OFFQ bit
values except for the pool ID are statically known and don't preserve them,
which is not wrong in the current code as the pool ID and CANCELING are the
only information carried. However, the planned disable/enable support will
add more fields and need them to be preserved.

This patch updates work data handling so that only the bits which need
updating are updated.

- struct work_offq_data is added along with work_offqd_unpack() and
  work_offqd_pack_flags() to help manipulating multiple fields contained in
  work->data. Note that the helpers look a bit silly right now as there
  isn't that much to pack. The next patch will add more.

- mark_work_canceling() which is used only by __cancel_work_sync() is
  replaced by open-coded usage of work_offq_data and
  set_work_pool_and_keep_pending() in __cancel_work_sync().

- __cancel_work[_sync]() uses offq_data helpers to preserve other OFFQ bits
  when clearing WORK_STRUCT_PENDING and WORK_OFFQ_CANCELING at the end.

- This removes all users of get_work_pool_id() which is dropped. Note that
  get_work_pool_id() could handle both WORK_STRUCT_PWQ and !WORK_STRUCT_PWQ
  cases; however, it was only being called after try_to_grab_pending()
  succeeded, in which case WORK_STRUCT_PWQ is never set and thus it's safe
  to use work_offqd_unpack() instead.

No behavior changes intended.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 51 ++++++++++++++++++++++++++++-------------------
 2 files changed, 32 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 158784dd189a..ae7ae4a51499 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -97,6 +97,7 @@ enum wq_misc_consts {
 
 /* Convenience constants - of type 'unsigned long', not 'enum'! */
 #define WORK_OFFQ_CANCELING	(1ul << WORK_OFFQ_CANCELING_BIT)
+#define WORK_OFFQ_FLAG_MASK	(((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
 #define WORK_OFFQ_POOL_NONE	((1ul << WORK_OFFQ_POOL_BITS) - 1)
 #define WORK_STRUCT_NO_POOL	(WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)
 #define WORK_STRUCT_PWQ_MASK	(~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1))
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0066c8f6c154..d8f37cfa9935 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -392,6 +392,11 @@ struct wq_pod_type {
 	int			*cpu_pod;	/* cpu -> pod */
 };
 
+struct work_offq_data {
+	u32			pool_id;
+	u32			flags;
+};
+
 static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
 	[WQ_AFFN_DFL]		= "default",
 	[WQ_AFFN_CPU]		= "cpu",
@@ -892,29 +897,23 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
 	return idr_find(&worker_pool_idr, pool_id);
 }
 
-/**
- * get_work_pool_id - return the worker pool ID a given work is associated with
- * @work: the work item of interest
- *
- * Return: The worker_pool ID @work was last associated with.
- * %WORK_OFFQ_POOL_NONE if none.
- */
-static int get_work_pool_id(struct work_struct *work)
+static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
 {
-	unsigned long data = atomic_long_read(&work->data);
+	return (v >> shift) & ((1 << bits) - 1);
+}
 
-	if (data & WORK_STRUCT_PWQ)
-		return work_struct_pwq(data)->pool->id;
+static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
+{
+	WARN_ON_ONCE(data & WORK_STRUCT_PWQ);
 
-	return data >> WORK_OFFQ_POOL_SHIFT;
+	offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
+					WORK_OFFQ_POOL_BITS);
+	offqd->flags = data & WORK_OFFQ_FLAG_MASK;
 }
 
-static void mark_work_canceling(struct work_struct *work)
+static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
 {
-	unsigned long pool_id = get_work_pool_id(work);
-
-	pool_id <<= WORK_OFFQ_POOL_SHIFT;
-	set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING);
+	return (unsigned long)offqd->flags;
 }
 
 static bool work_is_canceling(struct work_struct *work)
@@ -4271,6 +4270,7 @@ EXPORT_SYMBOL(flush_rcu_work);
 
 static bool __cancel_work(struct work_struct *work, u32 cflags)
 {
+	struct work_offq_data offqd;
 	unsigned long irq_flags;
 	int ret;
 
@@ -4281,19 +4281,26 @@ static bool __cancel_work(struct work_struct *work, u32 cflags)
 	if (unlikely(ret < 0))
 		return false;
 
-	set_work_pool_and_clear_pending(work, get_work_pool_id(work), 0);
+	work_offqd_unpack(&offqd, *work_data_bits(work));
+	set_work_pool_and_clear_pending(work, offqd.pool_id,
+					work_offqd_pack_flags(&offqd));
 	local_irq_restore(irq_flags);
 	return ret;
 }
 
 static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
 {
+	struct work_offq_data offqd;
 	unsigned long irq_flags;
 	bool ret;
 
 	/* claim @work and tell other tasks trying to grab @work to back off */
 	ret = work_grab_pending(work, cflags, &irq_flags);
-	mark_work_canceling(work);
+
+	work_offqd_unpack(&offqd, *work_data_bits(work));
+	offqd.flags |= WORK_OFFQ_CANCELING;
+	set_work_pool_and_keep_pending(work, offqd.pool_id,
+				       work_offqd_pack_flags(&offqd));
 	local_irq_restore(irq_flags);
 
 	/*
@@ -4303,12 +4310,16 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
 	if (wq_online)
 		__flush_work(work, true);
 
+	work_offqd_unpack(&offqd, *work_data_bits(work));
+
 	/*
 	 * smp_mb() at the end of set_work_pool_and_clear_pending() is paired
 	 * with prepare_to_wait() above so that either waitqueue_active() is
 	 * visible here or !work_is_canceling() is visible there.
 	 */
-	set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE, 0);
+	offqd.flags &= ~WORK_OFFQ_CANCELING;
+	set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE,
+					work_offqd_pack_flags(&offqd));
 
 	if (waitqueue_active(&wq_cancel_waitq))
 		__wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work);
-- 
cgit v1.2.3


From 86898fa6b8cd942505860556f3a0bf52eae57fe8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Mar 2024 07:21:03 -1000
Subject: workqueue: Implement disable/enable for (delayed) work items

While (delayed) work items could be flushed and canceled, there was no way
to prevent them from being queued in the future. While this didn't lead to
functional deficiencies, it sometimes required a bit more effort from the
workqueue users to e.g. sequence shutdown steps with more care.

Workqueue is currently in the process of replacing tasklet which does
support disabling and enabling. The feature is used relatively widely to,
for example, temporarily suppress main path while a control plane operation
(reset or config change) is in progress.

To enable easy conversion of tasklet users and as it seems like an inherent
useful feature, this patch implements disabling and enabling of work items.

- A work item carries 16bit disable count in work->data while not queued.
  The access to the count is synchronized by the PENDING bit like all other
  parts of work->data.

- If the count is non-zero, the work item cannot be queued. Any attempt to
  queue the work item fails and returns %false.

- disable_work[_sync](), enable_work(), disable_delayed_work[_sync]() and
  enable_delayed_work() are added.

v3: enable_work() was using local_irq_enable() instead of
    local_irq_restore() to undo IRQ-disable by work_grab_pending(). This is
    awkward now and will become incorrect as enable_work() will later be
    used from IRQ context too. (Lai)

v2: Lai noticed that queue_work_node() wasn't checking the disable count.
    Fixed. queue_rcu_work() is updated to trigger warning if the inner work
    item is disabled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
---
 include/linux/workqueue.h |  18 ++++-
 kernel/workqueue.c        | 177 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 182 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index ae7ae4a51499..bd80e66298a0 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -51,20 +51,23 @@ enum work_bits {
 	 * data contains off-queue information when !WORK_STRUCT_PWQ.
 	 *
 	 * MSB
-	 * [ pool ID ] [ OFFQ flags ] [ STRUCT flags ]
-	 *                 1 bit        4 or 5 bits
+	 * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ]
+	 *                  16 bits          1 bit        4 or 5 bits
 	 */
 	WORK_OFFQ_FLAG_SHIFT	= WORK_STRUCT_FLAG_BITS,
 	WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT,
 	WORK_OFFQ_FLAG_END,
 	WORK_OFFQ_FLAG_BITS	= WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT,
 
+	WORK_OFFQ_DISABLE_SHIFT	= WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS,
+	WORK_OFFQ_DISABLE_BITS	= 16,
+
 	/*
 	 * When a work item is off queue, the high bits encode off-queue flags
 	 * and the last pool it was on. Cap pool ID to 31 bits and use the
 	 * highest number to indicate that no pool is associated.
 	 */
-	WORK_OFFQ_POOL_SHIFT	= WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS,
+	WORK_OFFQ_POOL_SHIFT	= WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS,
 	WORK_OFFQ_LEFT		= BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
 	WORK_OFFQ_POOL_BITS	= WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,
 };
@@ -98,6 +101,7 @@ enum wq_misc_consts {
 /* Convenience constants - of type 'unsigned long', not 'enum'! */
 #define WORK_OFFQ_CANCELING	(1ul << WORK_OFFQ_CANCELING_BIT)
 #define WORK_OFFQ_FLAG_MASK	(((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
+#define WORK_OFFQ_DISABLE_MASK	(((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT)
 #define WORK_OFFQ_POOL_NONE	((1ul << WORK_OFFQ_POOL_BITS) - 1)
 #define WORK_STRUCT_NO_POOL	(WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)
 #define WORK_STRUCT_PWQ_MASK	(~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1))
@@ -560,6 +564,14 @@ extern bool flush_delayed_work(struct delayed_work *dwork);
 extern bool cancel_delayed_work(struct delayed_work *dwork);
 extern bool cancel_delayed_work_sync(struct delayed_work *dwork);
 
+extern bool disable_work(struct work_struct *work);
+extern bool disable_work_sync(struct work_struct *work);
+extern bool enable_work(struct work_struct *work);
+
+extern bool disable_delayed_work(struct delayed_work *dwork);
+extern bool disable_delayed_work_sync(struct delayed_work *dwork);
+extern bool enable_delayed_work(struct delayed_work *dwork);
+
 extern bool flush_rcu_work(struct rcu_work *rwork);
 
 extern void workqueue_set_max_active(struct workqueue_struct *wq,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d8f37cfa9935..5c53dde877fd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -99,6 +99,7 @@ enum worker_flags {
 
 enum work_cancel_flags {
 	WORK_CANCEL_DELAYED	= 1 << 0,	/* canceling a delayed_work */
+	WORK_CANCEL_DISABLE	= 1 << 1,	/* canceling to disable */
 };
 
 enum wq_internal_consts {
@@ -394,6 +395,7 @@ struct wq_pod_type {
 
 struct work_offq_data {
 	u32			pool_id;
+	u32			disable;
 	u32			flags;
 };
 
@@ -908,12 +910,15 @@ static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
 
 	offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
 					WORK_OFFQ_POOL_BITS);
+	offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
+					WORK_OFFQ_DISABLE_BITS);
 	offqd->flags = data & WORK_OFFQ_FLAG_MASK;
 }
 
 static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
 {
-	return (unsigned long)offqd->flags;
+	return ((unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT) |
+		((unsigned long)offqd->flags);
 }
 
 static bool work_is_canceling(struct work_struct *work)
@@ -2408,6 +2413,21 @@ out:
 	rcu_read_unlock();
 }
 
+static bool clear_pending_if_disabled(struct work_struct *work)
+{
+	unsigned long data = *work_data_bits(work);
+	struct work_offq_data offqd;
+
+	if (likely((data & WORK_STRUCT_PWQ) ||
+		   !(data & WORK_OFFQ_DISABLE_MASK)))
+		return false;
+
+	work_offqd_unpack(&offqd, data);
+	set_work_pool_and_clear_pending(work, offqd.pool_id,
+					work_offqd_pack_flags(&offqd));
+	return true;
+}
+
 /**
  * queue_work_on - queue work on specific cpu
  * @cpu: CPU number to execute work on
@@ -2430,7 +2450,8 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 
 	local_irq_save(irq_flags);
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
+	    !clear_pending_if_disabled(work)) {
 		__queue_work(cpu, wq, work);
 		ret = true;
 	}
@@ -2508,7 +2529,8 @@ bool queue_work_node(int node, struct workqueue_struct *wq,
 
 	local_irq_save(irq_flags);
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
+	    !clear_pending_if_disabled(work)) {
 		int cpu = select_numa_node_cpu(node);
 
 		__queue_work(cpu, wq, work);
@@ -2590,7 +2612,8 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 	/* read the comment in __queue_work() */
 	local_irq_save(irq_flags);
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
+	    !clear_pending_if_disabled(work)) {
 		__queue_delayed_work(cpu, wq, dwork, delay);
 		ret = true;
 	}
@@ -2663,7 +2686,12 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
 {
 	struct work_struct *work = &rwork->work;
 
-	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
+	/*
+	 * rcu_work can't be canceled or disabled. Warn if the user reached
+	 * inside @rwork and disabled the inner work.
+	 */
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
+	    !WARN_ON_ONCE(clear_pending_if_disabled(work))) {
 		rwork->wq = wq;
 		call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
 		return true;
@@ -4268,20 +4296,46 @@ bool flush_rcu_work(struct rcu_work *rwork)
 }
 EXPORT_SYMBOL(flush_rcu_work);
 
+static void work_offqd_disable(struct work_offq_data *offqd)
+{
+	const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1;
+
+	if (likely(offqd->disable < max))
+		offqd->disable++;
+	else
+		WARN_ONCE(true, "workqueue: work disable count overflowed\n");
+}
+
+static void work_offqd_enable(struct work_offq_data *offqd)
+{
+	if (likely(offqd->disable > 0))
+		offqd->disable--;
+	else
+		WARN_ONCE(true, "workqueue: work disable count underflowed\n");
+}
+
 static bool __cancel_work(struct work_struct *work, u32 cflags)
 {
 	struct work_offq_data offqd;
 	unsigned long irq_flags;
 	int ret;
 
-	do {
-		ret = try_to_grab_pending(work, cflags, &irq_flags);
-	} while (unlikely(ret == -EAGAIN));
+	if (cflags & WORK_CANCEL_DISABLE) {
+		ret = work_grab_pending(work, cflags, &irq_flags);
+	} else {
+		do {
+			ret = try_to_grab_pending(work, cflags, &irq_flags);
+		} while (unlikely(ret == -EAGAIN));
 
-	if (unlikely(ret < 0))
-		return false;
+		if (unlikely(ret < 0))
+			return false;
+	}
 
 	work_offqd_unpack(&offqd, *work_data_bits(work));
+
+	if (cflags & WORK_CANCEL_DISABLE)
+		work_offqd_disable(&offqd);
+
 	set_work_pool_and_clear_pending(work, offqd.pool_id,
 					work_offqd_pack_flags(&offqd));
 	local_irq_restore(irq_flags);
@@ -4298,6 +4352,10 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
 	ret = work_grab_pending(work, cflags, &irq_flags);
 
 	work_offqd_unpack(&offqd, *work_data_bits(work));
+
+	if (cflags & WORK_CANCEL_DISABLE)
+		work_offqd_disable(&offqd);
+
 	offqd.flags |= WORK_OFFQ_CANCELING;
 	set_work_pool_and_keep_pending(work, offqd.pool_id,
 				       work_offqd_pack_flags(&offqd));
@@ -4397,6 +4455,105 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
 }
 EXPORT_SYMBOL(cancel_delayed_work_sync);
 
+/**
+ * disable_work - Disable and cancel a work item
+ * @work: work item to disable
+ *
+ * Disable @work by incrementing its disable count and cancel it if currently
+ * pending. As long as the disable count is non-zero, any attempt to queue @work
+ * will fail and return %false. The maximum supported disable depth is 2 to the
+ * power of %WORK_OFFQ_DISABLE_BITS, currently 65536.
+ *
+ * Must be called from a sleepable context. Returns %true if @work was pending,
+ * %false otherwise.
+ */
+bool disable_work(struct work_struct *work)
+{
+	return __cancel_work(work, WORK_CANCEL_DISABLE);
+}
+EXPORT_SYMBOL_GPL(disable_work);
+
+/**
+ * disable_work_sync - Disable, cancel and drain a work item
+ * @work: work item to disable
+ *
+ * Similar to disable_work() but also wait for @work to finish if currently
+ * executing.
+ *
+ * Must be called from a sleepable context. Returns %true if @work was pending,
+ * %false otherwise.
+ */
+bool disable_work_sync(struct work_struct *work)
+{
+	return __cancel_work_sync(work, WORK_CANCEL_DISABLE);
+}
+EXPORT_SYMBOL_GPL(disable_work_sync);
+
+/**
+ * enable_work - Enable a work item
+ * @work: work item to enable
+ *
+ * Undo disable_work[_sync]() by decrementing @work's disable count. @work can
+ * only be queued if its disable count is 0.
+ *
+ * Must be called from a sleepable context. Returns %true if the disable count
+ * reached 0. Otherwise, %false.
+ */
+bool enable_work(struct work_struct *work)
+{
+	struct work_offq_data offqd;
+	unsigned long irq_flags;
+
+	work_grab_pending(work, 0, &irq_flags);
+
+	work_offqd_unpack(&offqd, *work_data_bits(work));
+	work_offqd_enable(&offqd);
+	set_work_pool_and_clear_pending(work, offqd.pool_id,
+					work_offqd_pack_flags(&offqd));
+	local_irq_restore(irq_flags);
+
+	return !offqd.disable;
+}
+EXPORT_SYMBOL_GPL(enable_work);
+
+/**
+ * disable_delayed_work - Disable and cancel a delayed work item
+ * @dwork: delayed work item to disable
+ *
+ * disable_work() for delayed work items.
+ */
+bool disable_delayed_work(struct delayed_work *dwork)
+{
+	return __cancel_work(&dwork->work,
+			     WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
+}
+EXPORT_SYMBOL_GPL(disable_delayed_work);
+
+/**
+ * disable_delayed_work_sync - Disable, cancel and drain a delayed work item
+ * @dwork: delayed work item to disable
+ *
+ * disable_work_sync() for delayed work items.
+ */
+bool disable_delayed_work_sync(struct delayed_work *dwork)
+{
+	return __cancel_work_sync(&dwork->work,
+				  WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
+}
+EXPORT_SYMBOL_GPL(disable_delayed_work_sync);
+
+/**
+ * enable_delayed_work - Enable a delayed work item
+ * @dwork: delayed work item to enable
+ *
+ * enable_work() for delayed work items.
+ */
+bool enable_delayed_work(struct delayed_work *dwork)
+{
+	return enable_work(&dwork->work);
+}
+EXPORT_SYMBOL_GPL(enable_delayed_work);
+
 /**
  * schedule_on_each_cpu - execute a function synchronously on each online CPU
  * @func: the function to call
-- 
cgit v1.2.3


From f09b10b6f442656524d2ee26e45966401a14f54b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Mar 2024 07:21:03 -1000
Subject: workqueue: Remove WORK_OFFQ_CANCELING

cancel[_delayed]_work_sync() guarantees that it can shut down
self-requeueing work items. To achieve that, it grabs and then holds
WORK_STRUCT_PENDING bit set while flushing the currently executing instance.
As the PENDING bit is set, all queueing attempts including the
self-requeueing ones fail and once the currently executing instance is
flushed, the work item should be idle as long as someone else isn't actively
queueing it.

This means that the cancel_work_sync path may hold the PENDING bit set while
flushing the target work item. This isn't a problem for the queueing path -
it can just fail which is the desired effect. It doesn't affect flush. It
doesn't matter to cancel_work either as it can just report that the work
item has successfully canceled. However, if there's another cancel_work_sync
attempt on the work item, it can't simply fail or report success and that
would breach the guarantee that it should provide. cancel_work_sync has to
wait for and grab that PENDING bit and go through the motions.

WORK_OFFQ_CANCELING and wq_cancel_waitq are what implement this
cancel_work_sync to cancel_work_sync wait mechanism. When a work item is
being canceled, WORK_OFFQ_CANCELING is also set on it and other
cancel_work_sync attempts wait on the bit to be cleared using the wait
queue.

While this works, it's an isolated wart which doesn't jive with the rest of
flush and cancel mechanisms and forces enable_work() and disable_work() to
require a sleepable context, which hampers their usability.

Now that a work item can be disabled, we can use that to block queueing
while cancel_work_sync is in progress. Instead of holding PENDING the bit,
it can temporarily disable the work item, flush and then re-enable it as
that'd achieve the same end result of blocking queueings while canceling and
thus enable canceling of self-requeueing work items.

- WORK_OFFQ_CANCELING and the surrounding mechanims are removed.

- work_grab_pending() is now simpler, no longer has to wait for a blocking
  operation and thus can be called from any context.

- With work_grab_pending() simplified, no need to use try_to_grab_pending()
  directly. All users are converted to use work_grab_pending().

- __cancel_work_sync() is updated to __cancel_work() with
  WORK_CANCEL_DISABLE to cancel and plug racing queueing attempts. It then
  flushes and re-enables the work item if necessary.

- These changes allow disable_work() and enable_work() to be called from any
  context.

v2: Lai pointed out that mod_delayed_work_on() needs to check the disable
    count before queueing the delayed work item. Added
    clear_pending_if_disabled() call.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
---
 include/linux/workqueue.h |   4 +-
 kernel/workqueue.c        | 140 +++++++---------------------------------------
 2 files changed, 20 insertions(+), 124 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bd80e66298a0..a5075969931b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -52,10 +52,9 @@ enum work_bits {
 	 *
 	 * MSB
 	 * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ]
-	 *                  16 bits          1 bit        4 or 5 bits
+	 *                  16 bits          0 bits       4 or 5 bits
 	 */
 	WORK_OFFQ_FLAG_SHIFT	= WORK_STRUCT_FLAG_BITS,
-	WORK_OFFQ_CANCELING_BIT = WORK_OFFQ_FLAG_SHIFT,
 	WORK_OFFQ_FLAG_END,
 	WORK_OFFQ_FLAG_BITS	= WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT,
 
@@ -99,7 +98,6 @@ enum wq_misc_consts {
 };
 
 /* Convenience constants - of type 'unsigned long', not 'enum'! */
-#define WORK_OFFQ_CANCELING	(1ul << WORK_OFFQ_CANCELING_BIT)
 #define WORK_OFFQ_FLAG_MASK	(((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
 #define WORK_OFFQ_DISABLE_MASK	(((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT)
 #define WORK_OFFQ_POOL_NONE	((1ul << WORK_OFFQ_POOL_BITS) - 1)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5c53dde877fd..e80a815ec172 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -496,12 +496,6 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
 /* I: attributes used when instantiating ordered pools on demand */
 static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
 
-/*
- * Used to synchronize multiple cancel_sync attempts on the same work item. See
- * work_grab_pending() and __cancel_work_sync().
- */
-static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq);
-
 /*
  * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
  * process context while holding a pool lock. Bounce to a dedicated kthread
@@ -783,11 +777,6 @@ static int work_next_color(int color)
  * corresponding to a work.  Pool is available once the work has been
  * queued anywhere after initialization until it is sync canceled.  pwq is
  * available only while the work item is queued.
- *
- * %WORK_OFFQ_CANCELING is used to mark a work item which is being
- * canceled.  While being canceled, a work item may have its PENDING set
- * but stay off timer and worklist for arbitrarily long and nobody should
- * try to steal the PENDING bit.
  */
 static inline void set_work_data(struct work_struct *work, unsigned long data)
 {
@@ -921,13 +910,6 @@ static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
 		((unsigned long)offqd->flags);
 }
 
-static bool work_is_canceling(struct work_struct *work)
-{
-	unsigned long data = atomic_long_read(&work->data);
-
-	return !(data & WORK_STRUCT_PWQ) && (data & WORK_OFFQ_CANCELING);
-}
-
 /*
  * Policy functions.  These define the policies on how the global worker
  * pools are managed.  Unless noted otherwise, these functions assume that
@@ -2058,8 +2040,6 @@ out_put:
  *  1		if @work was pending and we successfully stole PENDING
  *  0		if @work was idle and we claimed PENDING
  *  -EAGAIN	if PENDING couldn't be grabbed at the moment, safe to busy-retry
- *  -ENOENT	if someone else is canceling @work, this state may persist
- *		for arbitrarily long
  *  ========	================================================================
  *
  * Note:
@@ -2155,26 +2135,9 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
 fail:
 	rcu_read_unlock();
 	local_irq_restore(*irq_flags);
-	if (work_is_canceling(work))
-		return -ENOENT;
-	cpu_relax();
 	return -EAGAIN;
 }
 
-struct cwt_wait {
-	wait_queue_entry_t	wait;
-	struct work_struct	*work;
-};
-
-static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
-{
-	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
-
-	if (cwait->work != key)
-		return 0;
-	return autoremove_wake_function(wait, mode, sync, key);
-}
-
 /**
  * work_grab_pending - steal work item from worklist and disable irq
  * @work: work item to steal
@@ -2184,7 +2147,7 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k
  * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer
  * or on worklist.
  *
- * Must be called in process context. IRQ is disabled on return with IRQ state
+ * Can be called from any context. IRQ is disabled on return with IRQ state
  * stored in *@irq_flags. The caller is responsible for re-enabling it using
  * local_irq_restore().
  *
@@ -2193,41 +2156,14 @@ static int cwt_wakefn(wait_queue_entry_t *wait, unsigned mode, int sync, void *k
 static bool work_grab_pending(struct work_struct *work, u32 cflags,
 			      unsigned long *irq_flags)
 {
-	struct cwt_wait cwait;
 	int ret;
 
-	might_sleep();
-repeat:
-	ret = try_to_grab_pending(work, cflags, irq_flags);
-	if (likely(ret >= 0))
-		return ret;
-	if (ret != -ENOENT)
-		goto repeat;
-
-	/*
-	 * Someone is already canceling. Wait for it to finish. flush_work()
-	 * doesn't work for PREEMPT_NONE because we may get woken up between
-	 * @work's completion and the other canceling task resuming and clearing
-	 * CANCELING - flush_work() will return false immediately as @work is no
-	 * longer busy, try_to_grab_pending() will return -ENOENT as @work is
-	 * still being canceled and the other canceling task won't be able to
-	 * clear CANCELING as we're hogging the CPU.
-	 *
-	 * Let's wait for completion using a waitqueue. As this may lead to the
-	 * thundering herd problem, use a custom wake function which matches
-	 * @work along with exclusive wait and wakeup.
-	 */
-	init_wait(&cwait.wait);
-	cwait.wait.func = cwt_wakefn;
-	cwait.work = work;
-
-	prepare_to_wait_exclusive(&wq_cancel_waitq, &cwait.wait,
-				  TASK_UNINTERRUPTIBLE);
-	if (work_is_canceling(work))
-		schedule();
-	finish_wait(&wq_cancel_waitq, &cwait.wait);
-
-	goto repeat;
+	while (true) {
+		ret = try_to_grab_pending(work, cflags, irq_flags);
+		if (ret >= 0)
+			return ret;
+		cpu_relax();
+	}
 }
 
 /**
@@ -2645,19 +2581,14 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
 			 struct delayed_work *dwork, unsigned long delay)
 {
 	unsigned long irq_flags;
-	int ret;
+	bool ret;
 
-	do {
-		ret = try_to_grab_pending(&dwork->work, WORK_CANCEL_DELAYED,
-					  &irq_flags);
-	} while (unlikely(ret == -EAGAIN));
+	ret = work_grab_pending(&dwork->work, WORK_CANCEL_DELAYED, &irq_flags);
 
-	if (likely(ret >= 0)) {
+	if (!clear_pending_if_disabled(&dwork->work))
 		__queue_delayed_work(cpu, wq, dwork, delay);
-		local_irq_restore(irq_flags);
-	}
 
-	/* -ENOENT from try_to_grab_pending() becomes %true */
+	local_irq_restore(irq_flags);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mod_delayed_work_on);
@@ -4320,16 +4251,7 @@ static bool __cancel_work(struct work_struct *work, u32 cflags)
 	unsigned long irq_flags;
 	int ret;
 
-	if (cflags & WORK_CANCEL_DISABLE) {
-		ret = work_grab_pending(work, cflags, &irq_flags);
-	} else {
-		do {
-			ret = try_to_grab_pending(work, cflags, &irq_flags);
-		} while (unlikely(ret == -EAGAIN));
-
-		if (unlikely(ret < 0))
-			return false;
-	}
+	ret = work_grab_pending(work, cflags, &irq_flags);
 
 	work_offqd_unpack(&offqd, *work_data_bits(work));
 
@@ -4344,22 +4266,9 @@ static bool __cancel_work(struct work_struct *work, u32 cflags)
 
 static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
 {
-	struct work_offq_data offqd;
-	unsigned long irq_flags;
 	bool ret;
 
-	/* claim @work and tell other tasks trying to grab @work to back off */
-	ret = work_grab_pending(work, cflags, &irq_flags);
-
-	work_offqd_unpack(&offqd, *work_data_bits(work));
-
-	if (cflags & WORK_CANCEL_DISABLE)
-		work_offqd_disable(&offqd);
-
-	offqd.flags |= WORK_OFFQ_CANCELING;
-	set_work_pool_and_keep_pending(work, offqd.pool_id,
-				       work_offqd_pack_flags(&offqd));
-	local_irq_restore(irq_flags);
+	ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);
 
 	/*
 	 * Skip __flush_work() during early boot when we know that @work isn't
@@ -4368,19 +4277,8 @@ static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
 	if (wq_online)
 		__flush_work(work, true);
 
-	work_offqd_unpack(&offqd, *work_data_bits(work));
-
-	/*
-	 * smp_mb() at the end of set_work_pool_and_clear_pending() is paired
-	 * with prepare_to_wait() above so that either waitqueue_active() is
-	 * visible here or !work_is_canceling() is visible there.
-	 */
-	offqd.flags &= ~WORK_OFFQ_CANCELING;
-	set_work_pool_and_clear_pending(work, WORK_OFFQ_POOL_NONE,
-					work_offqd_pack_flags(&offqd));
-
-	if (waitqueue_active(&wq_cancel_waitq))
-		__wake_up(&wq_cancel_waitq, TASK_NORMAL, 1, work);
+	if (!(cflags & WORK_CANCEL_DISABLE))
+		enable_work(work);
 
 	return ret;
 }
@@ -4464,8 +4362,8 @@ EXPORT_SYMBOL(cancel_delayed_work_sync);
  * will fail and return %false. The maximum supported disable depth is 2 to the
  * power of %WORK_OFFQ_DISABLE_BITS, currently 65536.
  *
- * Must be called from a sleepable context. Returns %true if @work was pending,
- * %false otherwise.
+ * Can be called from any context. Returns %true if @work was pending, %false
+ * otherwise.
  */
 bool disable_work(struct work_struct *work)
 {
@@ -4496,8 +4394,8 @@ EXPORT_SYMBOL_GPL(disable_work_sync);
  * Undo disable_work[_sync]() by decrementing @work's disable count. @work can
  * only be queued if its disable count is 0.
  *
- * Must be called from a sleepable context. Returns %true if the disable count
- * reached 0. Otherwise, %false.
+ * Can be called from any context. Returns %true if the disable count reached 0.
+ * Otherwise, %false.
  */
 bool enable_work(struct work_struct *work)
 {
-- 
cgit v1.2.3


From 456a78eef2670d0e9521e87f35a056de8fec7fb2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Mar 2024 07:21:03 -1000
Subject: workqueue: Remember whether a work item was on a BH workqueue

Add an off-queue flag, WORK_OFFQ_BH, that indicates whether the last
workqueue the work item was on was a BH one. This will be used to test
whether a work item is BH in cancel_sync path to implement atomic
cancel_sync'ing for BH work items.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
---
 include/linux/workqueue.h |  4 +++-
 kernel/workqueue.c        | 10 ++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a5075969931b..777b0186317e 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -52,9 +52,10 @@ enum work_bits {
 	 *
 	 * MSB
 	 * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ]
-	 *                  16 bits          0 bits       4 or 5 bits
+	 *                  16 bits          1 bit        4 or 5 bits
 	 */
 	WORK_OFFQ_FLAG_SHIFT	= WORK_STRUCT_FLAG_BITS,
+	WORK_OFFQ_BH_BIT	= WORK_OFFQ_FLAG_SHIFT,
 	WORK_OFFQ_FLAG_END,
 	WORK_OFFQ_FLAG_BITS	= WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT,
 
@@ -98,6 +99,7 @@ enum wq_misc_consts {
 };
 
 /* Convenience constants - of type 'unsigned long', not 'enum'! */
+#define WORK_OFFQ_BH		(1ul << WORK_OFFQ_BH_BIT)
 #define WORK_OFFQ_FLAG_MASK	(((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
 #define WORK_OFFQ_DISABLE_MASK	(((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT)
 #define WORK_OFFQ_POOL_NONE	((1ul << WORK_OFFQ_POOL_BITS) - 1)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e80a815ec172..baf7495338bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -764,6 +764,11 @@ static int work_next_color(int color)
 	return (color + 1) % WORK_NR_COLORS;
 }
 
+static unsigned long pool_offq_flags(struct worker_pool *pool)
+{
+	return (pool->flags & POOL_BH) ? WORK_OFFQ_BH : 0;
+}
+
 /*
  * While queued, %WORK_STRUCT_PWQ is set and non flag bits of a work's data
  * contain the pointer to the queued pwq.  Once execution starts, the flag
@@ -2122,7 +2127,8 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
 		 * this destroys work->data needed by the next step, stash it.
 		 */
 		work_data = *work_data_bits(work);
-		set_work_pool_and_keep_pending(work, pool->id, 0);
+		set_work_pool_and_keep_pending(work, pool->id,
+					       pool_offq_flags(pool));
 
 		/* must be the last step, see the function comment */
 		pwq_dec_nr_in_flight(pwq, work_data);
@@ -3175,7 +3181,7 @@ __acquires(&pool->lock)
 	 * PENDING and queued state changes happen together while IRQ is
 	 * disabled.
 	 */
-	set_work_pool_and_clear_pending(work, pool->id, 0);
+	set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool));
 
 	pwq->stats[PWQ_STAT_STARTED]++;
 	raw_spin_unlock_irq(&pool->lock);
-- 
cgit v1.2.3


From ae1296a7bfe4f8e446677ccb761d9419926557bc Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Date: Fri, 8 Mar 2024 17:42:52 +0800
Subject: workqueue: Move attrs->cpumask out of worker_pool's properties when
 attrs->affn_strict

Allow more pools can be shared when attrs->affn_strict.

Signed-off-by: Lai Jiangshan <jiangshan.ljs@antgroup.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  3 +++
 kernel/workqueue.c        | 13 ++++++++-----
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 777b0186317e..bfcf8d38f4b1 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -185,6 +185,9 @@ struct workqueue_attrs {
 	 * Below fields aren't properties of a worker_pool. They only modify how
 	 * :c:func:`apply_workqueue_attrs` select pools and thus don't
 	 * participate in pool hash calculations or equality comparisons.
+	 *
+	 * If @affn_strict is set, @cpumask isn't a property of a worker_pool
+	 * either.
 	 */
 
 	/**
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 45d2aae73c96..f03960f094fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4625,6 +4625,8 @@ static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
 {
 	attrs->affn_scope = WQ_AFFN_NR_TYPES;
 	attrs->ordered = false;
+	if (attrs->affn_strict)
+		cpumask_copy(attrs->cpumask, cpu_possible_mask);
 }
 
 /* hash value of the content of @attr */
@@ -4633,11 +4635,12 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
 	u32 hash = 0;
 
 	hash = jhash_1word(attrs->nice, hash);
-	hash = jhash(cpumask_bits(attrs->cpumask),
-		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+	hash = jhash_1word(attrs->affn_strict, hash);
 	hash = jhash(cpumask_bits(attrs->__pod_cpumask),
 		     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
-	hash = jhash_1word(attrs->affn_strict, hash);
+	if (!attrs->affn_strict)
+		hash = jhash(cpumask_bits(attrs->cpumask),
+			     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
 	return hash;
 }
 
@@ -4647,11 +4650,11 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
 {
 	if (a->nice != b->nice)
 		return false;
-	if (!cpumask_equal(a->cpumask, b->cpumask))
+	if (a->affn_strict != b->affn_strict)
 		return false;
 	if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
 		return false;
-	if (a->affn_strict != b->affn_strict)
+	if (!a->affn_strict && !cpumask_equal(a->cpumask, b->cpumask))
 		return false;
 	return true;
 }
-- 
cgit v1.2.3


From d6a7bbdde67227127e5e33fb9500bcc4abc40fb3 Mon Sep 17 00:00:00 2001
From: Kassey Li <quic_yingangl@quicinc.com>
Date: Fri, 8 Mar 2024 10:18:18 +0800
Subject: workqueue: add function in event of workqueue_activate_work

The trace event "workqueue_activate_work" only print work struct.
However, function is the region of interest in a full sequence of work.
Current workqueue_activate_work trace event output:

    workqueue_activate_work: work struct ffffff88b4a0f450

With this change, workqueue_activate_work will print the function name,
align with workqueue_queue_work/execute_start/execute_end event.

    workqueue_activate_work: work struct ffffff80413a78b8 function=vmstat_update

Signed-off-by: Kassey Li <quic_yingangl@quicinc.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/trace/events/workqueue.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
index 262d52021c23..6ef5b7254070 100644
--- a/include/trace/events/workqueue.h
+++ b/include/trace/events/workqueue.h
@@ -64,13 +64,15 @@ TRACE_EVENT(workqueue_activate_work,
 
 	TP_STRUCT__entry(
 		__field( void *,	work	)
+		__field( void *,	function)
 	),
 
 	TP_fast_assign(
 		__entry->work		= work;
+		__entry->function	= work->func;
 	),
 
-	TP_printk("work struct %p", __entry->work)
+	TP_printk("work struct %p function=%ps ", __entry->work, __entry->function)
 );
 
 /**
-- 
cgit v1.2.3


From 474a549ff4c989427a14fdab851e562c8a63fe24 Mon Sep 17 00:00:00 2001
From: Allen Pais <apais@linux.microsoft.com>
Date: Mon, 25 Mar 2024 18:02:01 +0000
Subject: workqueue: Introduce enable_and_queue_work() convenience function

The enable_and_queue_work() function is introduced to streamline
the process of enabling and queuing a work item on a specific
workqueue. This function combines the functionalities of
enable_work() and queue_work() in a single call, providing a
concise and convenient API for enabling and queuing work items.

The function accepts a target workqueue and a work item as parameters.
It first attempts to enable the work item using enable_work(). A successful
enable operation means that the work item was previously disabled
and is now marked as eligible for execution. If the enable operation
is successful, the work item is then queued on the specified workqueue
using queue_work(). The function returns true if the work item was
successfully enabled and queued, and false otherwise.

Note: This function may lead to unnecessary spurious wake-ups in cases
where the work item is expected to be dormant but enable/disable are called
frequently. Spurious wake-ups refer to the condition where worker threads
are woken up without actual work to be done. Callers should be aware of
this behavior and may need to employ additional synchronization mechanisms
to avoid these overheads if such wake-ups are not desired.

This addition aims to enhance code readability and maintainability by
providing a unified interface for the common use case of enabling and
queuing work items on a workqueue.

tj: Made the function comment more compact.

Signed-off-by: Allen Pais <allen.lkml@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bfcf8d38f4b1..2df1188c0f96 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -682,6 +682,32 @@ static inline bool schedule_work(struct work_struct *work)
 	return queue_work(system_wq, work);
 }
 
+/**
+ * enable_and_queue_work - Enable and queue a work item on a specific workqueue
+ * @wq: The target workqueue
+ * @work: The work item to be enabled and queued
+ *
+ * This function combines the operations of enable_work() and queue_work(),
+ * providing a convenient way to enable and queue a work item in a single call.
+ * It invokes enable_work() on @work and then queues it if the disable depth
+ * reached 0. Returns %true if the disable depth reached 0 and @work is queued,
+ * and %false otherwise.
+ *
+ * Note that @work is always queued when disable depth reaches zero. If the
+ * desired behavior is queueing only if certain events took place while @work is
+ * disabled, the user should implement the necessary state tracking and perform
+ * explicit conditional queueing after enable_work().
+ */
+static inline bool enable_and_queue_work(struct workqueue_struct *wq,
+					 struct work_struct *work)
+{
+	if (enable_work(work)) {
+		queue_work(wq, work);
+		return true;
+	}
+	return false;
+}
+
 /*
  * Detect attempt to flush system-wide workqueues at compile time when possible.
  * Warn attempt to flush system-wide workqueues at runtime.
-- 
cgit v1.2.3


From 77fc5151f9c0e6068f1567b73d33e75a0c35333d Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sat, 17 Feb 2024 16:42:35 +0000
Subject: device property: Move fwnode_handle_put() into property.h

By having this function as static inline in the header, the compiler
is able to see if can optmize the call out if (IS_ERR_OR_NULL(fwnode))
This will allow a simpler DEFINE_FREE() call in the following patch.

Suggested-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20240217164249.921878-2-jic23@kernel.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/base/property.c  | 14 --------------
 include/linux/property.h | 14 +++++++++++++-
 2 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/base/property.c b/drivers/base/property.c
index 7324a704a9a1..6a3a434d0d6f 100644
--- a/drivers/base/property.c
+++ b/drivers/base/property.c
@@ -868,20 +868,6 @@ struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode)
 }
 EXPORT_SYMBOL_GPL(fwnode_handle_get);
 
-/**
- * fwnode_handle_put - Drop reference to a device node
- * @fwnode: Pointer to the device node to drop the reference to.
- *
- * This has to be used when terminating device_for_each_child_node() iteration
- * with break or return to prevent stale device node references from being left
- * behind.
- */
-void fwnode_handle_put(struct fwnode_handle *fwnode)
-{
-	fwnode_call_void_op(fwnode, put);
-}
-EXPORT_SYMBOL_GPL(fwnode_handle_put);
-
 /**
  * fwnode_device_is_available - check if a device is available for use
  * @fwnode: Pointer to the fwnode of the device.
diff --git a/include/linux/property.h b/include/linux/property.h
index 3a1045eb786c..93d992a92f59 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -180,7 +180,19 @@ struct fwnode_handle *device_get_named_child_node(const struct device *dev,
 						  const char *childname);
 
 struct fwnode_handle *fwnode_handle_get(struct fwnode_handle *fwnode);
-void fwnode_handle_put(struct fwnode_handle *fwnode);
+
+/**
+ * fwnode_handle_put - Drop reference to a device node
+ * @fwnode: Pointer to the device node to drop the reference to.
+ *
+ * This has to be used when terminating device_for_each_child_node() iteration
+ * with break or return to prevent stale device node references from being left
+ * behind.
+ */
+static inline void fwnode_handle_put(struct fwnode_handle *fwnode)
+{
+	fwnode_call_void_op(fwnode, put);
+}
 
 int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index);
 int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name);
-- 
cgit v1.2.3


From 59ed5e2d505bf5f9b4af64d0021cd0c96aec1f7c Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sat, 17 Feb 2024 16:42:36 +0000
Subject: device property: Add cleanup.h based fwnode_handle_put() scope based
 cleanup.

Useful where the fwnode_handle was obtained from a call such as
fwnode_find_reference() as it will safely do nothing if IS_ERR() is true
and will automatically release the reference on the variable leaving
scope.

Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://lore.kernel.org/r/20240217164249.921878-3-jic23@kernel.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/property.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/property.h b/include/linux/property.h
index 93d992a92f59..1322f0cce77a 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -13,6 +13,7 @@
 #include <linux/args.h>
 #include <linux/array_size.h>
 #include <linux/bits.h>
+#include <linux/cleanup.h>
 #include <linux/fwnode.h>
 #include <linux/stddef.h>
 #include <linux/types.h>
@@ -194,6 +195,8 @@ static inline void fwnode_handle_put(struct fwnode_handle *fwnode)
 	fwnode_call_void_op(fwnode, put);
 }
 
+DEFINE_FREE(fwnode_handle, struct fwnode_handle *, fwnode_handle_put(_T))
+
 int fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index);
 int fwnode_irq_get_byname(const struct fwnode_handle *fwnode, const char *name);
 
-- 
cgit v1.2.3


From 365130fd47af6d4317aa16a407874b699ab8d8cb Mon Sep 17 00:00:00 2001
From: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Date: Sat, 17 Feb 2024 16:42:38 +0000
Subject: device property: Introduce device_for_each_child_node_scoped()

Similar to recently propose for_each_child_of_node_scoped() this
new version of the loop macro instantiates a new local
struct fwnode_handle * that uses the __free(fwnode_handle) auto
cleanup handling so that if a reference to a node is held on early
exit from the loop the reference will be released. If the loop
runs to completion, the child pointer will be NULL and no action will
be taken.

The reason this is useful is that it removes the need for
fwnode_handle_put() on early loop exits.  If there is a need
to retain the reference, then return_ptr(child) or no_free_ptr(child)
may be used to safely disable the auto cleanup.

Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Link: https://lore.kernel.org/r/20240217164249.921878-5-jic23@kernel.org
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 include/linux/property.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/property.h b/include/linux/property.h
index 1322f0cce77a..61fc20e5f81f 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -175,6 +175,11 @@ struct fwnode_handle *device_get_next_child_node(const struct device *dev,
 	for (child = device_get_next_child_node(dev, NULL); child;	\
 	     child = device_get_next_child_node(dev, child))
 
+#define device_for_each_child_node_scoped(dev, child)			\
+	for (struct fwnode_handle *child __free(fwnode_handle) =	\
+		device_get_next_child_node(dev, NULL);			\
+	     child; child = device_get_next_child_node(dev, child))
+
 struct fwnode_handle *fwnode_get_named_child_node(const struct fwnode_handle *fwnode,
 						  const char *childname);
 struct fwnode_handle *device_get_named_child_node(const struct device *dev,
-- 
cgit v1.2.3


From abeded46bc1922e2249f6665654b5d81e7c5e1f1 Mon Sep 17 00:00:00 2001
From: Igor Pylypiv <ipylypiv@google.com>
Date: Thu, 7 Mar 2024 13:44:12 -0800
Subject: scsi: ata: libata-sata: Factor out NCQ Priority configuration helpers

Export libata NCQ Priority configuration helpers to be reused for libsas
managed SATA devices.

Switched locking from spin_lock_irq() to spin_lock_irqsave().  In the
future someone might call these helper functions when interrupts are
disabled. spin_unlock_irq() could lead to a premature re-enabling of
interrupts, whereas spin_unlock_irqrestore() restores the interrupt state
to its condition prior to the spin_lock_irqsave() call.

Acked-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Igor Pylypiv <ipylypiv@google.com>
Link: https://lore.kernel.org/r/20240307214418.3812290-2-ipylypiv@google.com
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/libata-sata.c | 160 +++++++++++++++++++++++++++++++++++-----------
 include/linux/libata.h    |   6 ++
 2 files changed, 128 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index 0fb1934875f2..a8d773003d74 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -848,80 +848,143 @@ DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR,
 	    ata_scsi_lpm_show, ata_scsi_lpm_store);
 EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy);
 
-static ssize_t ata_ncq_prio_supported_show(struct device *device,
-					   struct device_attribute *attr,
-					   char *buf)
+/**
+ *	ata_ncq_prio_supported - Check if device supports NCQ Priority
+ *	@ap: ATA port of the target device
+ *	@sdev: SCSI device
+ *	@supported: Address of a boolean to store the result
+ *
+ *	Helper to check if device supports NCQ Priority feature.
+ *
+ *	Context: Any context. Takes and releases @ap->lock.
+ *
+ *	Return:
+ *	* %0		- OK. Status is stored into @supported
+ *	* %-ENODEV	- Failed to find the ATA device
+ */
+int ata_ncq_prio_supported(struct ata_port *ap, struct scsi_device *sdev,
+			   bool *supported)
 {
-	struct scsi_device *sdev = to_scsi_device(device);
-	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev;
-	bool ncq_prio_supported;
+	unsigned long flags;
 	int rc = 0;
 
-	spin_lock_irq(ap->lock);
+	spin_lock_irqsave(ap->lock, flags);
 	dev = ata_scsi_find_dev(ap, sdev);
 	if (!dev)
 		rc = -ENODEV;
 	else
-		ncq_prio_supported = dev->flags & ATA_DFLAG_NCQ_PRIO;
-	spin_unlock_irq(ap->lock);
+		*supported = dev->flags & ATA_DFLAG_NCQ_PRIO;
+	spin_unlock_irqrestore(ap->lock, flags);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ata_ncq_prio_supported);
+
+static ssize_t ata_ncq_prio_supported_show(struct device *device,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct scsi_device *sdev = to_scsi_device(device);
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
+	bool supported;
+	int rc;
+
+	rc = ata_ncq_prio_supported(ap, sdev, &supported);
+	if (rc)
+		return rc;
 
-	return rc ? rc : sysfs_emit(buf, "%u\n", ncq_prio_supported);
+	return sysfs_emit(buf, "%d\n", supported);
 }
 
 DEVICE_ATTR(ncq_prio_supported, S_IRUGO, ata_ncq_prio_supported_show, NULL);
 EXPORT_SYMBOL_GPL(dev_attr_ncq_prio_supported);
 
-static ssize_t ata_ncq_prio_enable_show(struct device *device,
-					struct device_attribute *attr,
-					char *buf)
+/**
+ *	ata_ncq_prio_enabled - Check if NCQ Priority is enabled
+ *	@ap: ATA port of the target device
+ *	@sdev: SCSI device
+ *	@enabled: Address of a boolean to store the result
+ *
+ *	Helper to check if NCQ Priority feature is enabled.
+ *
+ *	Context: Any context. Takes and releases @ap->lock.
+ *
+ *	Return:
+ *	* %0		- OK. Status is stored into @enabled
+ *	* %-ENODEV	- Failed to find the ATA device
+ */
+int ata_ncq_prio_enabled(struct ata_port *ap, struct scsi_device *sdev,
+			 bool *enabled)
 {
-	struct scsi_device *sdev = to_scsi_device(device);
-	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev;
-	bool ncq_prio_enable;
+	unsigned long flags;
 	int rc = 0;
 
-	spin_lock_irq(ap->lock);
+	spin_lock_irqsave(ap->lock, flags);
 	dev = ata_scsi_find_dev(ap, sdev);
 	if (!dev)
 		rc = -ENODEV;
 	else
-		ncq_prio_enable = dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLED;
-	spin_unlock_irq(ap->lock);
+		*enabled = dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLED;
+	spin_unlock_irqrestore(ap->lock, flags);
 
-	return rc ? rc : sysfs_emit(buf, "%u\n", ncq_prio_enable);
+	return rc;
 }
+EXPORT_SYMBOL_GPL(ata_ncq_prio_enabled);
 
-static ssize_t ata_ncq_prio_enable_store(struct device *device,
-					 struct device_attribute *attr,
-					 const char *buf, size_t len)
+static ssize_t ata_ncq_prio_enable_show(struct device *device,
+					struct device_attribute *attr,
+					char *buf)
 {
 	struct scsi_device *sdev = to_scsi_device(device);
-	struct ata_port *ap;
-	struct ata_device *dev;
-	long int input;
-	int rc = 0;
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
+	bool enabled;
+	int rc;
 
-	rc = kstrtol(buf, 10, &input);
+	rc = ata_ncq_prio_enabled(ap, sdev, &enabled);
 	if (rc)
 		return rc;
-	if ((input < 0) || (input > 1))
-		return -EINVAL;
 
-	ap = ata_shost_to_port(sdev->host);
-	dev = ata_scsi_find_dev(ap, sdev);
-	if (unlikely(!dev))
-		return  -ENODEV;
+	return sysfs_emit(buf, "%d\n", enabled);
+}
+
+/**
+ *	ata_ncq_prio_enable - Enable/disable NCQ Priority
+ *	@ap: ATA port of the target device
+ *	@sdev: SCSI device
+ *	@enable: true - enable NCQ Priority, false - disable NCQ Priority
+ *
+ *	Helper to enable/disable NCQ Priority feature.
+ *
+ *	Context: Any context. Takes and releases @ap->lock.
+ *
+ *	Return:
+ *	* %0		- OK. Status is stored into @enabled
+ *	* %-ENODEV	- Failed to find the ATA device
+ *	* %-EINVAL	- NCQ Priority is not supported or CDL is enabled
+ */
+int ata_ncq_prio_enable(struct ata_port *ap, struct scsi_device *sdev,
+			bool enable)
+{
+	struct ata_device *dev;
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(ap->lock, flags);
 
-	spin_lock_irq(ap->lock);
+	dev = ata_scsi_find_dev(ap, sdev);
+	if (!dev) {
+		rc = -ENODEV;
+		goto unlock;
+	}
 
 	if (!(dev->flags & ATA_DFLAG_NCQ_PRIO)) {
 		rc = -EINVAL;
 		goto unlock;
 	}
 
-	if (input) {
+	if (enable) {
 		if (dev->flags & ATA_DFLAG_CDL_ENABLED) {
 			ata_dev_err(dev,
 				"CDL must be disabled to enable NCQ priority\n");
@@ -934,9 +997,30 @@ static ssize_t ata_ncq_prio_enable_store(struct device *device,
 	}
 
 unlock:
-	spin_unlock_irq(ap->lock);
+	spin_unlock_irqrestore(ap->lock, flags);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(ata_ncq_prio_enable);
+
+static ssize_t ata_ncq_prio_enable_store(struct device *device,
+					 struct device_attribute *attr,
+					 const char *buf, size_t len)
+{
+	struct scsi_device *sdev = to_scsi_device(device);
+	struct ata_port *ap = ata_shost_to_port(sdev->host);
+	bool enable;
+	int rc;
+
+	rc = kstrtobool(buf, &enable);
+	if (rc)
+		return rc;
+
+	rc = ata_ncq_prio_enable(ap, sdev, enable);
+	if (rc)
+		return rc;
 
-	return rc ? rc : len;
+	return len;
 }
 
 DEVICE_ATTR(ncq_prio_enable, S_IRUGO | S_IWUSR,
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 26d68115afb8..6dd9a4f9ca7c 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1157,6 +1157,12 @@ extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
 				       int queue_depth);
 extern int ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev,
 				  int queue_depth);
+extern int ata_ncq_prio_supported(struct ata_port *ap, struct scsi_device *sdev,
+				  bool *supported);
+extern int ata_ncq_prio_enabled(struct ata_port *ap, struct scsi_device *sdev,
+				bool *enabled);
+extern int ata_ncq_prio_enable(struct ata_port *ap, struct scsi_device *sdev,
+			       bool enable);
 extern struct ata_device *ata_dev_pair(struct ata_device *adev);
 extern int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev);
 extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap);
-- 
cgit v1.2.3


From b4d3ddd2df7531e35c5f4e0b048d0999851747ae Mon Sep 17 00:00:00 2001
From: Igor Pylypiv <ipylypiv@google.com>
Date: Thu, 7 Mar 2024 13:44:13 -0800
Subject: scsi: libsas: Define NCQ Priority sysfs attributes for SATA devices

libata sysfs attributes cannot be used for libsas-managed SATA devices
because the ata_port location is different for libsas.

Defined sysfs attributes (visible for SATA devices only):

 - /sys/block/sda/device/ncq_prio_enable
 - /sys/block/sda/device/ncq_prio_supported

The newly defined attributes will pass the correct ata_port to libata
helper functions.

Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: Igor Pylypiv <ipylypiv@google.com>
Link: https://lore.kernel.org/r/20240307214418.3812290-3-ipylypiv@google.com
Reviewed-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/libsas/sas_ata.c | 82 +++++++++++++++++++++++++++++++++++++++++++
 include/scsi/sas_ata.h        |  6 ++++
 2 files changed, 88 insertions(+)

(limited to 'include')

diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 12e2653846e3..b57c041a5544 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -964,3 +964,85 @@ int sas_execute_ata_cmd(struct domain_device *device, u8 *fis, int force_phy_id)
 			       force_phy_id, &tmf_task);
 }
 EXPORT_SYMBOL_GPL(sas_execute_ata_cmd);
+
+static ssize_t sas_ncq_prio_supported_show(struct device *device,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct scsi_device *sdev = to_scsi_device(device);
+	struct domain_device *ddev = sdev_to_domain_dev(sdev);
+	bool supported;
+	int rc;
+
+	rc = ata_ncq_prio_supported(ddev->sata_dev.ap, sdev, &supported);
+	if (rc)
+		return rc;
+
+	return sysfs_emit(buf, "%d\n", supported);
+}
+
+DEVICE_ATTR(ncq_prio_supported, S_IRUGO, sas_ncq_prio_supported_show, NULL);
+
+static ssize_t sas_ncq_prio_enable_show(struct device *device,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct scsi_device *sdev = to_scsi_device(device);
+	struct domain_device *ddev = sdev_to_domain_dev(sdev);
+	bool enabled;
+	int rc;
+
+	rc = ata_ncq_prio_enabled(ddev->sata_dev.ap, sdev, &enabled);
+	if (rc)
+		return rc;
+
+	return sysfs_emit(buf, "%d\n", enabled);
+}
+
+static ssize_t sas_ncq_prio_enable_store(struct device *device,
+					 struct device_attribute *attr,
+					 const char *buf, size_t len)
+{
+	struct scsi_device *sdev = to_scsi_device(device);
+	struct domain_device *ddev = sdev_to_domain_dev(sdev);
+	bool enable;
+	int rc;
+
+	rc = kstrtobool(buf, &enable);
+	if (rc)
+		return rc;
+
+	rc = ata_ncq_prio_enable(ddev->sata_dev.ap, sdev, enable);
+	if (rc)
+		return rc;
+
+	return len;
+}
+
+DEVICE_ATTR(ncq_prio_enable, S_IRUGO | S_IWUSR,
+	    sas_ncq_prio_enable_show, sas_ncq_prio_enable_store);
+
+static struct attribute *sas_ata_sdev_attrs[] = {
+	&dev_attr_ncq_prio_supported.attr,
+	&dev_attr_ncq_prio_enable.attr,
+	NULL
+};
+
+static umode_t sas_ata_attr_is_visible(struct kobject *kobj,
+				       struct attribute *attr, int i)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct scsi_device *sdev = to_scsi_device(dev);
+	struct domain_device *ddev = sdev_to_domain_dev(sdev);
+
+	if (!dev_is_sata(ddev))
+		return 0;
+
+	return attr->mode;
+}
+
+const struct attribute_group sas_ata_sdev_attr_group = {
+	.attrs = sas_ata_sdev_attrs,
+	.is_visible = sas_ata_attr_is_visible,
+};
+EXPORT_SYMBOL_GPL(sas_ata_sdev_attr_group);
diff --git a/include/scsi/sas_ata.h b/include/scsi/sas_ata.h
index 2f8c719840a6..92e27e7bf088 100644
--- a/include/scsi/sas_ata.h
+++ b/include/scsi/sas_ata.h
@@ -39,6 +39,9 @@ int smp_ata_check_ready_type(struct ata_link *link);
 int sas_discover_sata(struct domain_device *dev);
 int sas_ata_add_dev(struct domain_device *parent, struct ex_phy *phy,
 		    struct domain_device *child, int phy_id);
+
+extern const struct attribute_group sas_ata_sdev_attr_group;
+
 #else
 
 static inline void sas_ata_disabled_notice(void)
@@ -123,6 +126,9 @@ static inline int sas_ata_add_dev(struct domain_device *parent, struct ex_phy *p
 	sas_ata_disabled_notice();
 	return -ENODEV;
 }
+
+#define sas_ata_sdev_attr_group ((struct attribute_group) {})
+
 #endif
 
 #endif /* _SAS_ATA_H_ */
-- 
cgit v1.2.3


From bcd39f639e9e10e2b2d63604ec7c011cd21e52c1 Mon Sep 17 00:00:00 2001
From: John Garry <john.g.garry@oracle.com>
Date: Fri, 8 Mar 2024 11:43:34 +0000
Subject: scsi: libsas: Add LIBSAS_SHT_BASE

There is much duplication in the scsi_host_template structure for the
drivers which use libsas.

Similar to how a standard template is used in libata with __ATA_BASE_SHT,
create a standard template in LIBSAS_SHT_BASE.

Don't set a default for max_sectors at SCSI_DEFAULT_MAX_SECTORS, as
scsi_host_alloc() will default to this value automatically.

Even though some drivers don't set proc_name, it won't make much difference
to set as DRV_NAME.

Also add LIBSAS_SHT_BASE_NO_SLAVE_INIT for the hisi_sas drivers which have
custom .slave_alloc and .slave_configure methods.

Reviewed-by: Jason Yan <yanaijie@huawei.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
Link: https://lore.kernel.org/r/20240308114339.1340549-2-john.g.garry@oracle.com
Reviewed-by: Igor Pylypiv <ipylypiv@google.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/libsas.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index f5257103fdb6..d06a0570f4c5 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -726,4 +726,33 @@ void sas_notify_port_event(struct asd_sas_phy *phy, enum port_event event,
 void sas_notify_phy_event(struct asd_sas_phy *phy, enum phy_event event,
 			   gfp_t gfp_flags);
 
+#define __LIBSAS_SHT_BASE						\
+	.module				= THIS_MODULE,			\
+	.name				= DRV_NAME,			\
+	.proc_name			= DRV_NAME,			\
+	.queuecommand			= sas_queuecommand,		\
+	.dma_need_drain			= ata_scsi_dma_need_drain,	\
+	.target_alloc			= sas_target_alloc,		\
+	.change_queue_depth		= sas_change_queue_depth,	\
+	.bios_param			= sas_bios_param,		\
+	.this_id			= -1,				\
+	.eh_device_reset_handler	= sas_eh_device_reset_handler,	\
+	.eh_target_reset_handler	= sas_eh_target_reset_handler,	\
+	.target_destroy			= sas_target_destroy,		\
+	.ioctl				= sas_ioctl,			\
+
+#ifdef CONFIG_COMPAT
+#define _LIBSAS_SHT_BASE		__LIBSAS_SHT_BASE		\
+	.compat_ioctl			= sas_ioctl,
+#else
+#define _LIBSAS_SHT_BASE		__LIBSAS_SHT_BASE
+#endif
+
+#define LIBSAS_SHT_BASE			_LIBSAS_SHT_BASE		\
+	.slave_configure		= sas_slave_configure,		\
+	.slave_alloc			= sas_slave_alloc,		\
+
+#define LIBSAS_SHT_BASE_NO_SLAVE_INIT	_LIBSAS_SHT_BASE
+
+
 #endif /* _SASLIB_H_ */
-- 
cgit v1.2.3


From 7b0c9f8fa3d25622c21eeaa2c095c8a36b8b08b7 Mon Sep 17 00:00:00 2001
From: Dumitru Ceclan <mitrutzceclan@gmail.com>
Date: Wed, 28 Feb 2024 13:06:19 +0200
Subject: iio: adc: ad_sigma_delta: Add optional irq selection

Add optional irq_num attribute to ad_sigma_delta_info structure for
selecting the used interrupt line for ADC's conversion completion.

Signed-off-by: Dumitru Ceclan <mitrutzceclan@gmail.com>
Reviewed-by: Nuno Sa <nuno.sa@analog.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Link: https://lore.kernel.org/r/20240228110622.25114-2-mitrutzceclan@gmail.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/ad_sigma_delta.c       | 23 ++++++++++++++---------
 include/linux/iio/adc/ad_sigma_delta.h |  3 +++
 2 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c
index a602429cdde4..97a05f325df7 100644
--- a/drivers/iio/adc/ad_sigma_delta.c
+++ b/drivers/iio/adc/ad_sigma_delta.c
@@ -222,11 +222,11 @@ int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta,
 		goto out;
 
 	sigma_delta->irq_dis = false;
-	enable_irq(sigma_delta->spi->irq);
+	enable_irq(sigma_delta->irq_line);
 	timeout = wait_for_completion_timeout(&sigma_delta->completion, 2 * HZ);
 	if (timeout == 0) {
 		sigma_delta->irq_dis = true;
-		disable_irq_nosync(sigma_delta->spi->irq);
+		disable_irq_nosync(sigma_delta->irq_line);
 		ret = -EIO;
 	} else {
 		ret = 0;
@@ -295,7 +295,7 @@ int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev,
 	ad_sigma_delta_set_mode(sigma_delta, AD_SD_MODE_SINGLE);
 
 	sigma_delta->irq_dis = false;
-	enable_irq(sigma_delta->spi->irq);
+	enable_irq(sigma_delta->irq_line);
 	ret = wait_for_completion_interruptible_timeout(
 			&sigma_delta->completion, HZ);
 
@@ -315,7 +315,7 @@ int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev,
 
 out:
 	if (!sigma_delta->irq_dis) {
-		disable_irq_nosync(sigma_delta->spi->irq);
+		disable_irq_nosync(sigma_delta->irq_line);
 		sigma_delta->irq_dis = true;
 	}
 
@@ -396,7 +396,7 @@ static int ad_sd_buffer_postenable(struct iio_dev *indio_dev)
 		goto err_unlock;
 
 	sigma_delta->irq_dis = false;
-	enable_irq(sigma_delta->spi->irq);
+	enable_irq(sigma_delta->irq_line);
 
 	return 0;
 
@@ -414,7 +414,7 @@ static int ad_sd_buffer_postdisable(struct iio_dev *indio_dev)
 	wait_for_completion_timeout(&sigma_delta->completion, HZ);
 
 	if (!sigma_delta->irq_dis) {
-		disable_irq_nosync(sigma_delta->spi->irq);
+		disable_irq_nosync(sigma_delta->irq_line);
 		sigma_delta->irq_dis = true;
 	}
 
@@ -516,7 +516,7 @@ static irqreturn_t ad_sd_trigger_handler(int irq, void *p)
 irq_handled:
 	iio_trigger_notify_done(indio_dev->trig);
 	sigma_delta->irq_dis = false;
-	enable_irq(sigma_delta->spi->irq);
+	enable_irq(sigma_delta->irq_line);
 
 	return IRQ_HANDLED;
 }
@@ -587,13 +587,13 @@ static int devm_ad_sd_probe_trigger(struct device *dev, struct iio_dev *indio_de
 	sigma_delta->irq_dis = true;
 
 	/* the IRQ core clears IRQ_DISABLE_UNLAZY flag when freeing an IRQ */
-	irq_set_status_flags(sigma_delta->spi->irq, IRQ_DISABLE_UNLAZY);
+	irq_set_status_flags(sigma_delta->irq_line, IRQ_DISABLE_UNLAZY);
 
 	/* Allow overwriting the flags from firmware */
 	if (!irq_flags)
 		irq_flags = sigma_delta->info->irq_flags;
 
-	ret = devm_request_irq(dev, sigma_delta->spi->irq,
+	ret = devm_request_irq(dev, sigma_delta->irq_line,
 			       ad_sd_data_rdy_trig_poll,
 			       irq_flags | IRQF_NO_AUTOEN,
 			       indio_dev->name,
@@ -673,6 +673,11 @@ int ad_sd_init(struct ad_sigma_delta *sigma_delta, struct iio_dev *indio_dev,
 		}
 	}
 
+	if (info->irq_line)
+		sigma_delta->irq_line = info->irq_line;
+	else
+		sigma_delta->irq_line = spi->irq;
+
 	iio_device_set_drvdata(indio_dev, sigma_delta);
 
 	return 0;
diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h
index 719cf9cc6e1a..383614ebd760 100644
--- a/include/linux/iio/adc/ad_sigma_delta.h
+++ b/include/linux/iio/adc/ad_sigma_delta.h
@@ -48,6 +48,7 @@ struct iio_dev;
  *   be used.
  * @irq_flags: flags for the interrupt used by the triggered buffer
  * @num_slots: Number of sequencer slots
+ * @irq_line: IRQ for reading conversions. If 0, spi->irq will be used
  */
 struct ad_sigma_delta_info {
 	int (*set_channel)(struct ad_sigma_delta *, unsigned int channel);
@@ -62,6 +63,7 @@ struct ad_sigma_delta_info {
 	unsigned int data_reg;
 	unsigned long irq_flags;
 	unsigned int num_slots;
+	int irq_line;
 };
 
 /**
@@ -89,6 +91,7 @@ struct ad_sigma_delta {
 	unsigned int		active_slots;
 	unsigned int		current_slot;
 	unsigned int		num_slots;
+	int		irq_line;
 	bool			status_appended;
 	/* map slots to channels in order to know what to expect from devices */
 	unsigned int		*slots;
-- 
cgit v1.2.3


From 9b163e0d330debbf7dcc14b2c3e2dc19a3b50a1d Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Mon, 25 Mar 2024 14:22:53 -0500
Subject: spi: remove struct spi_message::is_dma_mapped

There are no more users of the deprecated is_dma_mapped in struct
spi_message so it can be removed.

References in documentation and comments are also removed.

A few similar checks if xfer->tx_dma or xfer->rx_dma are not NULL are
also removed since these are now guaranteed to be NULL because they
were previously set only if is_dma_mapped was true.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://msgid.link/r/20240325-spi-remove-is_dma_mapped-v2-1-d08d62b61f1c@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/spi/pxa2xx.rst      |  3 ---
 Documentation/spi/spi-summary.rst |  4 ----
 drivers/spi/spi-atmel.c           |  8 ++------
 drivers/spi/spi-pxa2xx.c          | 11 -----------
 drivers/spi/spi.c                 |  7 -------
 include/linux/spi/spi.h           | 11 +++--------
 6 files changed, 5 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/Documentation/spi/pxa2xx.rst b/Documentation/spi/pxa2xx.rst
index 19479b801826..43e0b758803a 100644
--- a/Documentation/spi/pxa2xx.rst
+++ b/Documentation/spi/pxa2xx.rst
@@ -194,9 +194,6 @@ The following logic is used to determine the type of I/O to be used on
 a per "spi_transfer" basis::
 
   if spi_message.len > 65536 then
-	if spi_message.is_dma_mapped or rx_dma_buf != 0 or tx_dma_buf != 0 then
-		reject premapped transfers
-
 	print "rate limited" warning
 	use PIO transfers
 
diff --git a/Documentation/spi/spi-summary.rst b/Documentation/spi/spi-summary.rst
index 546de37d6caf..f7f8b1573f25 100644
--- a/Documentation/spi/spi-summary.rst
+++ b/Documentation/spi/spi-summary.rst
@@ -419,10 +419,6 @@ any more such messages.
     to make extra copies unless the hardware requires it (e.g. working
     around hardware errata that force the use of bounce buffering).
 
-    If standard dma_map_single() handling of these buffers is inappropriate,
-    you can use spi_message.is_dma_mapped to tell the controller driver
-    that you've already provided the relevant DMA addresses.
-
   - The basic I/O primitive is spi_async().  Async requests may be
     issued in any context (irq handler, task, etc) and completion
     is reported using a callback provided with the message.
diff --git a/drivers/spi/spi-atmel.c b/drivers/spi/spi-atmel.c
index bad34998454a..b62f57390d8f 100644
--- a/drivers/spi/spi-atmel.c
+++ b/drivers/spi/spi-atmel.c
@@ -987,8 +987,6 @@ static void atmel_spi_pdc_next_xfer(struct spi_controller *host,
  * For DMA, tx_buf/tx_dma have the same relationship as rx_buf/rx_dma:
  *  - The buffer is either valid for CPU access, else NULL
  *  - If the buffer is valid, so is its DMA address
- *
- * This driver manages the dma address unless message->is_dma_mapped.
  */
 static int
 atmel_spi_dma_map_xfer(struct atmel_spi *as, struct spi_transfer *xfer)
@@ -1374,8 +1372,7 @@ static int atmel_spi_one_transfer(struct spi_controller *host,
 	 * DMA map early, for performance (empties dcache ASAP) and
 	 * better fault reporting.
 	 */
-	if ((!host->cur_msg->is_dma_mapped)
-		&& as->use_pdc) {
+	if (as->use_pdc) {
 		if (atmel_spi_dma_map_xfer(as, xfer) < 0)
 			return -ENOMEM;
 	}
@@ -1454,8 +1451,7 @@ static int atmel_spi_one_transfer(struct spi_controller *host,
 		}
 	}
 
-	if (!host->cur_msg->is_dma_mapped
-		&& as->use_pdc)
+	if (as->use_pdc)
 		atmel_spi_dma_unmap_xfer(host, xfer);
 
 	if (as->use_pdc)
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index f2a856f6a99e..6c2a14418972 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -944,7 +944,6 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 				   struct spi_transfer *transfer)
 {
 	struct driver_data *drv_data = spi_controller_get_devdata(controller);
-	struct spi_message *message = controller->cur_msg;
 	struct chip_data *chip = spi_get_ctldata(spi);
 	u32 dma_thresh = chip->dma_threshold;
 	u32 dma_burst = chip->dma_burst_size;
@@ -959,16 +958,6 @@ static int pxa2xx_spi_transfer_one(struct spi_controller *controller,
 
 	/* Check if we can DMA this transfer */
 	if (transfer->len > MAX_DMA_LEN && chip->enable_dma) {
-
-		/* Reject already-mapped transfers; PIO won't always work */
-		if (message->is_dma_mapped
-				|| transfer->rx_dma || transfer->tx_dma) {
-			dev_err(&spi->dev,
-				"Mapped transfer length of %u is greater than %d\n",
-				transfer->len, MAX_DMA_LEN);
-			return -EINVAL;
-		}
-
 		/* Warn ... we force this to PIO mode */
 		dev_warn_ratelimited(&spi->dev,
 				     "DMA disabled for transfer length %u greater than %d\n",
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index ff75838c1b5d..a2f01116ba09 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -3709,9 +3709,6 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 	 * to the same values as *xferp, so tx_buf, rx_buf and len
 	 * are all identical (as well as most others)
 	 * so we just have to fix up len and the pointers.
-	 *
-	 * This also includes support for the depreciated
-	 * spi_message.is_dma_mapped interface.
 	 */
 
 	/*
@@ -3725,12 +3722,8 @@ static int __spi_split_transfer_maxsize(struct spi_controller *ctlr,
 		/* Update rx_buf, tx_buf and DMA */
 		if (xfers[i].rx_buf)
 			xfers[i].rx_buf += offset;
-		if (xfers[i].rx_dma)
-			xfers[i].rx_dma += offset;
 		if (xfers[i].tx_buf)
 			xfers[i].tx_buf += offset;
-		if (xfers[i].tx_dma)
-			xfers[i].tx_dma += offset;
 
 		/* Update length */
 		xfers[i].len = min(maxsize, xfers[i].len - offset);
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index c459809efee4..b589e2547439 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -955,8 +955,8 @@ struct spi_res {
  * struct spi_transfer - a read/write buffer pair
  * @tx_buf: data to be written (DMA-safe memory), or NULL
  * @rx_buf: data to be read (DMA-safe memory), or NULL
- * @tx_dma: DMA address of tx_buf, if @spi_message.is_dma_mapped
- * @rx_dma: DMA address of rx_buf, if @spi_message.is_dma_mapped
+ * @tx_dma: DMA address of tx_buf, currently not for client use
+ * @rx_dma: DMA address of rx_buf, currently not for client use
  * @tx_nbits: number of bits used for writing. If 0 the default
  *      (SPI_NBITS_SINGLE) is used.
  * @rx_nbits: number of bits used for reading. If 0 the default
@@ -1066,8 +1066,7 @@ struct spi_transfer {
 	/*
 	 * It's okay if tx_buf == rx_buf (right?).
 	 * For MicroWire, one buffer must be NULL.
-	 * Buffers must work with dma_*map_single() calls, unless
-	 * spi_message.is_dma_mapped reports a pre-existing mapping.
+	 * Buffers must work with dma_*map_single() calls.
 	 */
 	const void	*tx_buf;
 	void		*rx_buf;
@@ -1111,8 +1110,6 @@ struct spi_transfer {
  * struct spi_message - one multi-segment SPI transaction
  * @transfers: list of transfer segments in this transaction
  * @spi: SPI device to which the transaction is queued
- * @is_dma_mapped: if true, the caller provided both DMA and CPU virtual
- *	addresses for each transfer buffer
  * @pre_optimized: peripheral driver pre-optimized the message
  * @optimized: the message is in the optimized state
  * @prepared: spi_prepare_message was called for the this message
@@ -1146,8 +1143,6 @@ struct spi_message {
 
 	struct spi_device	*spi;
 
-	unsigned		is_dma_mapped:1;
-
 	/* spi_optimize_message() was called for this message */
 	bool			pre_optimized;
 	/* __spi_optimize_message() was called for this message */
-- 
cgit v1.2.3


From 996a24b99d63df08caf89fc03f4783279424788f Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Sat, 9 Mar 2024 10:11:02 +0200
Subject: scsi: ufs: Reuse compose_devman_upiu

Move some code fragments into ufshcd_prepare_req_desc_hdr() so it can be
used throughout.

Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20240309081104.5006-5-avri.altman@wdc.com
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Bean Huo <beanhuo@micron.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c | 49 ++++++++++++++++-------------------------------
 include/ufs/ufshci.h      |  2 +-
 2 files changed, 18 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index a089981c7496..823bc28c0cb0 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -2710,18 +2710,27 @@ static void ufshcd_disable_intr(struct ufs_hba *hba, u32 intrs)
 /**
  * ufshcd_prepare_req_desc_hdr - Fill UTP Transfer request descriptor header according to request
  * descriptor according to request
+ * @hba: per adapter instance
  * @lrbp: pointer to local reference block
  * @upiu_flags: flags required in the header
  * @cmd_dir: requests data direction
  * @ehs_length: Total EHS Length (in 32‐bytes units of all Extra Header Segments)
+ * @legacy_type: UTP_CMD_TYPE_SCSI or UTP_CMD_TYPE_DEV_MANAGE
  */
-static void ufshcd_prepare_req_desc_hdr(struct ufshcd_lrb *lrbp, u8 *upiu_flags,
-					enum dma_data_direction cmd_dir, int ehs_length)
+static void
+ufshcd_prepare_req_desc_hdr(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
+			    u8 *upiu_flags, enum dma_data_direction cmd_dir,
+			    int ehs_length, enum utp_cmd_type legacy_type)
 {
 	struct utp_transfer_req_desc *req_desc = lrbp->utr_descriptor_ptr;
 	struct request_desc_header *h = &req_desc->header;
 	enum utp_data_direction data_direction;
 
+	if (hba->ufs_version <= ufshci_version(1, 1))
+		lrbp->command_type = legacy_type;
+	else
+		lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
+
 	*h = (typeof(*h)){ };
 
 	if (cmd_dir == DMA_FROM_DEVICE) {
@@ -2854,12 +2863,8 @@ static int ufshcd_compose_devman_upiu(struct ufs_hba *hba,
 	u8 upiu_flags;
 	int ret = 0;
 
-	if (hba->ufs_version <= ufshci_version(1, 1))
-		lrbp->command_type = UTP_CMD_TYPE_DEV_MANAGE;
-	else
-		lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0, UTP_CMD_TYPE_DEV_MANAGE);
 
-	ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE, 0);
 	if (hba->dev_cmd.type == DEV_CMD_TYPE_QUERY)
 		ufshcd_prepare_utp_query_req_upiu(hba, lrbp, upiu_flags);
 	else if (hba->dev_cmd.type == DEV_CMD_TYPE_NOP)
@@ -2882,13 +2887,8 @@ static void ufshcd_comp_scsi_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 	unsigned int ioprio_class = IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
 	u8 upiu_flags;
 
-	if (hba->ufs_version <= ufshci_version(1, 1))
-		lrbp->command_type = UTP_CMD_TYPE_SCSI;
-	else
-		lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
-
-	ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags,
-				    lrbp->cmd->sc_data_direction, 0);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags,
+				    lrbp->cmd->sc_data_direction, 0, UTP_CMD_TYPE_SCSI);
 	if (ioprio_class == IOPRIO_CLASS_RT)
 		upiu_flags |= UPIU_CMD_FLAGS_CP;
 	ufshcd_prepare_utp_scsi_cmd_upiu(lrbp, upiu_flags);
@@ -7229,16 +7229,11 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
 	ufshcd_setup_dev_cmd(hba, lrbp, cmd_type, 0, tag);
 
-	if (hba->ufs_version <= ufshci_version(1, 1))
-		lrbp->command_type = UTP_CMD_TYPE_DEV_MANAGE;
-	else
-		lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0, UTP_CMD_TYPE_DEV_MANAGE);
 
 	/* update the task tag in the request upiu */
 	req_upiu->header.task_tag = tag;
 
-	ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, DMA_NONE, 0);
-
 	/* just copy the upiu request as it is */
 	memcpy(lrbp->ucd_req_ptr, req_upiu, sizeof(*lrbp->ucd_req_ptr));
 	if (desc_buff && desc_op == UPIU_QUERY_OPCODE_WRITE_DESC) {
@@ -7379,24 +7374,14 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 	u8 upiu_flags;
 	u8 *ehs_data;
 	u16 ehs_len;
+	int ehs = (hba->capabilities & MASK_EHSLUTRD_SUPPORTED) ? 2 : 0;
 
 	/* Protects use of hba->reserved_slot. */
 	ufshcd_dev_man_lock(hba);
 
 	ufshcd_setup_dev_cmd(hba, lrbp, DEV_CMD_TYPE_RPMB, UFS_UPIU_RPMB_WLUN, tag);
 
-	/* Advanced RPMB starts from UFS 4.0, so its command type is UTP_CMD_TYPE_UFS_STORAGE */
-	lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
-
-	/*
-	 * According to UFSHCI 4.0 specification page 24, if EHSLUTRDS is 0, host controller takes
-	 * EHS length from CMD UPIU, and SW driver use EHS Length field in CMD UPIU. if it is 1,
-	 * HW controller takes EHS length from UTRD.
-	 */
-	if (hba->capabilities & MASK_EHSLUTRD_SUPPORTED)
-		ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, dir, 2);
-	else
-		ufshcd_prepare_req_desc_hdr(lrbp, &upiu_flags, dir, 0);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, ehs, UTP_CMD_TYPE_DEV_MANAGE);
 
 	/* update the task tag */
 	req_upiu->header.task_tag = tag;
diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h
index a196e1c4c3bb..88193f5540e5 100644
--- a/include/ufs/ufshci.h
+++ b/include/ufs/ufshci.h
@@ -426,7 +426,7 @@ union ufs_crypto_cfg_entry {
  */
 
 /* Transfer request command type */
-enum {
+enum utp_cmd_type {
 	UTP_CMD_TYPE_SCSI		= 0x0,
 	UTP_CMD_TYPE_UFS		= 0x1,
 	UTP_CMD_TYPE_DEV_MANAGE		= 0x2,
-- 
cgit v1.2.3


From 2a82bb02941fb53d1f8df2a360e7798ae3d9d962 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 7 Mar 2024 21:29:12 -0500
Subject: statx: stx_subvol

Add a new statx field for (sub)volume identifiers, as implemented by
btrfs and bcachefs.

This includes bcachefs support; we'll definitely want btrfs support as
well.

Link: https://lore.kernel.org/linux-fsdevel/2uvhm6gweyl7iyyp2xpfryvcu2g3padagaeqcbiavjyiis6prl@yjm725bizncq/
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Link: https://lore.kernel.org/r/20240308022914.196982-1-kent.overstreet@linux.dev
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/bcachefs/fs.c          | 3 +++
 fs/btrfs/inode.c          | 3 +++
 fs/stat.c                 | 1 +
 include/linux/stat.h      | 1 +
 include/uapi/linux/stat.h | 4 +++-
 5 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 0ccee05f6887..99a0abeadbe2 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -841,6 +841,9 @@ static int bch2_getattr(struct mnt_idmap *idmap,
 	stat->blksize	= block_bytes(c);
 	stat->blocks	= inode->v.i_blocks;
 
+	stat->subvol	= inode->ei_subvol;
+	stat->result_mask |= STATX_SUBVOL;
+
 	if (request_mask & STATX_BTIME) {
 		stat->result_mask |= STATX_BTIME;
 		stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 37701531eeb1..8cf692c708d7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8779,6 +8779,9 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
 	generic_fillattr(idmap, request_mask, inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 
+	stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+	stat->result_mask |= STATX_SUBVOL;
+
 	spin_lock(&BTRFS_I(inode)->lock);
 	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
 	inode_bytes = inode_get_bytes(inode);
diff --git a/fs/stat.c b/fs/stat.c
index 77cdc69eb422..70bd3e888cfa 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -658,6 +658,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
 	tmp.stx_mnt_id = stat->mnt_id;
 	tmp.stx_dio_mem_align = stat->dio_mem_align;
 	tmp.stx_dio_offset_align = stat->dio_offset_align;
+	tmp.stx_subvol = stat->subvol;
 
 	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
 }
diff --git a/include/linux/stat.h b/include/linux/stat.h
index 52150570d37a..bf92441dbad2 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -53,6 +53,7 @@ struct kstat {
 	u32		dio_mem_align;
 	u32		dio_offset_align;
 	u64		change_cookie;
+	u64		subvol;
 };
 
 /* These definitions are internal to the kernel for now. Mainly used by nfsd. */
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index 2f2ee82d5517..67626d535316 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -126,8 +126,9 @@ struct statx {
 	__u64	stx_mnt_id;
 	__u32	stx_dio_mem_align;	/* Memory buffer alignment for direct I/O */
 	__u32	stx_dio_offset_align;	/* File offset alignment for direct I/O */
+	__u64	stx_subvol;	/* Subvolume identifier */
 	/* 0xa0 */
-	__u64	__spare3[12];	/* Spare space for future expansion */
+	__u64	__spare3[11];	/* Spare space for future expansion */
 	/* 0x100 */
 };
 
@@ -155,6 +156,7 @@ struct statx {
 #define STATX_MNT_ID		0x00001000U	/* Got stx_mnt_id */
 #define STATX_DIOALIGN		0x00002000U	/* Want/got direct I/O alignment info */
 #define STATX_MNT_ID_UNIQUE	0x00004000U	/* Want/got extended stx_mount_id */
+#define STATX_SUBVOL		0x00008000U	/* Want/got stx_subvol */
 
 #define STATX__RESERVED		0x80000000U	/* Reserved for future struct statx expansion */
 
-- 
cgit v1.2.3


From 7b30851a70645400ba37a184b9a2f10e1d6987a6 Mon Sep 17 00:00:00 2001
From: "Luis Henriques (SUSE)" <luis.henriques@linux.dev>
Date: Tue, 12 Mar 2024 10:47:57 +0000
Subject: fs_parser: move fsparam_string_empty() helper into header

Since both ext4 and overlayfs define the same macro to specify string
parameters that may allow empty values, define it in an header file so
that this helper can be shared.

Signed-off-by: Luis Henriques (SUSE) <luis.henriques@linux.dev>
Link: https://lore.kernel.org/r/20240312104757.27333-1-luis.henriques@linux.dev
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/ext4/super.c           | 4 ----
 fs/overlayfs/params.c     | 4 ----
 include/linux/fs_parser.h | 4 ++++
 3 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cfb8449c731f..2e906dbbe015 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1723,10 +1723,6 @@ static const struct constant_table ext4_param_dax[] = {
 	{}
 };
 
-/* String parameter that allows empty argument */
-#define fsparam_string_empty(NAME, OPT) \
-	__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
-
 /*
  * Mount option specification
  * We don't use fsparam_flag_no because of the way we set the
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 36dcc530ac28..4860fcc4611b 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -139,10 +139,6 @@ static int ovl_verity_mode_def(void)
 	return OVL_VERITY_OFF;
 }
 
-#define fsparam_string_empty(NAME, OPT) \
-	__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
-
-
 const struct fs_parameter_spec ovl_parameter_spec[] = {
 	fsparam_string_empty("lowerdir",    Opt_lowerdir),
 	fsparam_string("lowerdir+",         Opt_lowerdir_add),
diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h
index 01542c4b87a2..d3350979115f 100644
--- a/include/linux/fs_parser.h
+++ b/include/linux/fs_parser.h
@@ -132,4 +132,8 @@ static inline bool fs_validate_description(const char *name,
 #define fsparam_path(NAME, OPT)	__fsparam(fs_param_is_path, NAME, OPT, 0, NULL)
 #define fsparam_fd(NAME, OPT)	__fsparam(fs_param_is_fd, NAME, OPT, 0, NULL)
 
+/* String parameter that allows empty argument */
+#define fsparam_string_empty(NAME, OPT) \
+	__fsparam(fs_param_is_string, NAME, OPT, fs_param_can_be_empty, NULL)
+
 #endif /* _LINUX_FS_PARSER_H */
-- 
cgit v1.2.3


From ecc79ab919ec54c658fb14f955c76872119829b8 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 19 Mar 2024 17:29:05 +0100
Subject: ARM: dts: renesas: r8a73a4: Add TMU nodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add device nodes for the Timer Units (TMU) on the R-Mobile APE6 SoC,
and the clocks serving them.

Note that TMU channels 1 and 2 are not added, as their interrupts are
not wired to the interrupt controller for the AP-System Core (INTC-SYS),
only to the interrupt controller for the AP-Realtime Core (INTC-RT).

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
Link: https://lore.kernel.org/r/1a60832f3ba37afb4a5791f4e5db4610ab31beb3.1710864964.git.geert+renesas@glider.be
---
 arch/arm/boot/dts/renesas/r8a73a4.dtsi    | 37 +++++++++++++++++++++++++++++++
 include/dt-bindings/clock/r8a73a4-clock.h |  4 ++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/arch/arm/boot/dts/renesas/r8a73a4.dtsi b/arch/arm/boot/dts/renesas/r8a73a4.dtsi
index ac654ff45d0e..9a2ae282a46b 100644
--- a/arch/arm/boot/dts/renesas/r8a73a4.dtsi
+++ b/arch/arm/boot/dts/renesas/r8a73a4.dtsi
@@ -60,6 +60,32 @@
 			     <GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>;
 	};
 
+	tmu0: timer@e61e0000 {
+		compatible = "renesas,tmu-r8a73a4", "renesas,tmu";
+		reg = <0 0xe61e0000 0 0x30>;
+		interrupts = <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "tuni0", "tuni1", "tuni2";
+		clocks = <&mstp1_clks R8A73A4_CLK_TMU0>;
+		clock-names = "fck";
+		power-domains = <&pd_c5>;
+		status = "disabled";
+	};
+
+	tmu3: timer@fff80000 {
+		compatible = "renesas,tmu-r8a73a4", "renesas,tmu";
+		reg = <0 0xfff80000 0 0x30>;
+		interrupts = <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>,
+			     <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>;
+		interrupt-names = "tuni0", "tuni1", "tuni2";
+		clocks = <&mstp1_clks R8A73A4_CLK_TMU3>;
+		clock-names = "fck";
+		power-domains = <&pd_a3r>;
+		status = "disabled";
+	};
+
 	dbsc1: memory-controller@e6790000 {
 		compatible = "renesas,dbsc-r8a73a4";
 		reg = <0 0xe6790000 0 0x10000>;
@@ -654,6 +680,17 @@
 		};
 
 		/* Gate clocks */
+		mstp1_clks: mstp1_clks@e6150134 {
+			compatible = "renesas,r8a73a4-mstp-clocks", "renesas,cpg-mstp-clocks";
+			reg = <0 0xe6150134 0 4>, <0 0xe6150038 0 4>;
+			clocks = <&cp_clk>, <&mp_clk>;
+			#clock-cells = <1>;
+			clock-indices = <
+				R8A73A4_CLK_TMU0 R8A73A4_CLK_TMU3
+			>;
+			clock-output-names =
+				"tmu0", "tmu3";
+		};
 		mstp2_clks: mstp2_clks@e6150138 {
 			compatible = "renesas,r8a73a4-mstp-clocks", "renesas,cpg-mstp-clocks";
 			reg = <0 0xe6150138 0 4>, <0 0xe6150040 0 4>;
diff --git a/include/dt-bindings/clock/r8a73a4-clock.h b/include/dt-bindings/clock/r8a73a4-clock.h
index 1ec4827b8091..655440a3e7c6 100644
--- a/include/dt-bindings/clock/r8a73a4-clock.h
+++ b/include/dt-bindings/clock/r8a73a4-clock.h
@@ -24,6 +24,10 @@
 #define R8A73A4_CLK_ZS		14
 #define R8A73A4_CLK_HP		15
 
+/* MSTP1 */
+#define R8A73A4_CLK_TMU0	25
+#define R8A73A4_CLK_TMU3	21
+
 /* MSTP2 */
 #define R8A73A4_CLK_DMAC	18
 #define R8A73A4_CLK_SCIFB3	17
-- 
cgit v1.2.3


From 002b831076722f3a966a58defd1dc6976a46c109 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Mon, 26 Feb 2024 11:59:45 +0100
Subject: dt-bindings: clock: r9a07g043-cpg: Annotate RZ/G2UL-only core clocks

The M2 (CRU main clock), M3 (LCDC Video Clock), and AT (Cortex-A55 Debug
clock) core clocks are only present on RZ/G2UL, not on RZ/Five.

Annotate this in the comments, like is already done for module clocks
and resets.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/ffcdcd479c76b92f67481836a33ec86e97f85634.1708944903.git.geert+renesas@glider.be
---
 include/dt-bindings/clock/r9a07g043-cpg.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a07g043-cpg.h b/include/dt-bindings/clock/r9a07g043-cpg.h
index 77cde8effdc7..a64139fec815 100644
--- a/include/dt-bindings/clock/r9a07g043-cpg.h
+++ b/include/dt-bindings/clock/r9a07g043-cpg.h
@@ -16,15 +16,15 @@
 #define R9A07G043_CLK_SD0		5
 #define R9A07G043_CLK_SD1		6
 #define R9A07G043_CLK_M0		7
-#define R9A07G043_CLK_M2		8
-#define R9A07G043_CLK_M3		9
+#define R9A07G043_CLK_M2		8	/* RZ/G2UL Only */
+#define R9A07G043_CLK_M3		9	/* RZ/G2UL Only */
 #define R9A07G043_CLK_HP		10
 #define R9A07G043_CLK_TSU		11
 #define R9A07G043_CLK_ZT		12
 #define R9A07G043_CLK_P0		13
 #define R9A07G043_CLK_P1		14
 #define R9A07G043_CLK_P2		15
-#define R9A07G043_CLK_AT		16
+#define R9A07G043_CLK_AT		16	/* RZ/G2UL Only */
 #define R9A07G043_OSCCLK		17
 #define R9A07G043_CLK_P0_DIV2		18
 
-- 
cgit v1.2.3


From b3af9045b482925c770f56f81d68c0b9bc6c5049 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 25 Mar 2024 11:43:45 +0800
Subject: trace: move to TP_STORE_ADDRS related macro to net_probe_common.h

Put the macro into another standalone file for better extension.
Some tracepoints can use this common part in the future.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/net_probe_common.h | 29 +++++++++++++++++++++++++++++
 include/trace/events/tcp.h              | 29 -----------------------------
 2 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h
index 3930119cab08..b1f9a4d3ee13 100644
--- a/include/trace/events/net_probe_common.h
+++ b/include/trace/events/net_probe_common.h
@@ -41,4 +41,33 @@
 
 #endif
 
+#define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
+	do {							\
+		struct in6_addr *pin6;				\
+								\
+		pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+		ipv6_addr_set_v4mapped(saddr, pin6);		\
+		pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+		ipv6_addr_set_v4mapped(daddr, pin6);		\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)		\
+	do {								\
+		if (sk->sk_family == AF_INET6) {			\
+			struct in6_addr *pin6;				\
+									\
+			pin6 = (struct in6_addr *)__entry->saddr_v6;	\
+			*pin6 = saddr6;					\
+			pin6 = (struct in6_addr *)__entry->daddr_v6;	\
+			*pin6 = daddr6;					\
+		} else {						\
+			TP_STORE_V4MAPPED(__entry, saddr, daddr);	\
+		}							\
+	} while (0)
+#else
+#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)	\
+	TP_STORE_V4MAPPED(__entry, saddr, daddr)
+#endif
+
 #endif
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 699dafd204ea..3c08a0846c47 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -12,35 +12,6 @@
 #include <net/tcp.h>
 #include <linux/sock_diag.h>
 
-#define TP_STORE_V4MAPPED(__entry, saddr, daddr)		\
-	do {							\
-		struct in6_addr *pin6;				\
-								\
-		pin6 = (struct in6_addr *)__entry->saddr_v6;	\
-		ipv6_addr_set_v4mapped(saddr, pin6);		\
-		pin6 = (struct in6_addr *)__entry->daddr_v6;	\
-		ipv6_addr_set_v4mapped(daddr, pin6);		\
-	} while (0)
-
-#if IS_ENABLED(CONFIG_IPV6)
-#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)		\
-	do {								\
-		if (sk->sk_family == AF_INET6) {			\
-			struct in6_addr *pin6;				\
-									\
-			pin6 = (struct in6_addr *)__entry->saddr_v6;	\
-			*pin6 = saddr6;					\
-			pin6 = (struct in6_addr *)__entry->daddr_v6;	\
-			*pin6 = daddr6;					\
-		} else {						\
-			TP_STORE_V4MAPPED(__entry, saddr, daddr);	\
-		}							\
-	} while (0)
-#else
-#define TP_STORE_ADDRS(__entry, saddr, daddr, saddr6, daddr6)	\
-	TP_STORE_V4MAPPED(__entry, saddr, daddr)
-#endif
-
 /*
  * tcp event with arguments sk and skb
  *
-- 
cgit v1.2.3


From a24c855a5ef2cd07dece2ccf131c4f40986c584d Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 25 Mar 2024 11:43:46 +0800
Subject: trace: use TP_STORE_ADDRS() macro in inet_sk_error_report()

As the title said, use the macro directly like the patch[1] did
to avoid those duplications. No functional change.

[1]
commit 6a6b0b9914e7 ("tcp: Avoid preprocessor directives in tracepoint macro args")

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/sock.h | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index fd206a6ab5b8..4397f7bfa406 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -10,6 +10,7 @@
 #include <linux/tracepoint.h>
 #include <linux/ipv6.h>
 #include <linux/tcp.h>
+#include <trace/events/net_probe_common.h>
 
 #define family_names			\
 		EM(AF_INET)				\
@@ -223,7 +224,6 @@ TRACE_EVENT(inet_sk_error_report,
 
 	TP_fast_assign(
 		const struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->error = sk->sk_err;
@@ -238,20 +238,8 @@ TRACE_EVENT(inet_sk_error_report,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d",
-- 
cgit v1.2.3


From 646700ce23f4a3f5e967c394d74ed4ea8998b2af Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 25 Mar 2024 11:43:47 +0800
Subject: trace: use TP_STORE_ADDRS() macro in inet_sock_set_state()

As the title said, use the macro directly like the patch[1] did
to avoid those duplications. No functional change.

[1]
commit 6a6b0b9914e7 ("tcp: Avoid preprocessor directives in tracepoint macro args")

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/sock.h | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 4397f7bfa406..0d1c5ce4e6a6 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -160,7 +160,6 @@ TRACE_EVENT(inet_sock_set_state,
 
 	TP_fast_assign(
 		const struct inet_sock *inet = inet_sk(sk);
-		struct in6_addr *pin6;
 		__be32 *p32;
 
 		__entry->skaddr = sk;
@@ -178,20 +177,8 @@ TRACE_EVENT(inet_sock_set_state,
 		p32 = (__be32 *) __entry->daddr;
 		*p32 =  inet->inet_daddr;
 
-#if IS_ENABLED(CONFIG_IPV6)
-		if (sk->sk_family == AF_INET6) {
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			*pin6 = sk->sk_v6_rcv_saddr;
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			*pin6 = sk->sk_v6_daddr;
-		} else
-#endif
-		{
-			pin6 = (struct in6_addr *)__entry->saddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_saddr, pin6);
-			pin6 = (struct in6_addr *)__entry->daddr_v6;
-			ipv6_addr_set_v4mapped(inet->inet_daddr, pin6);
-		}
+		TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
+			       sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
 	),
 
 	TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
-- 
cgit v1.2.3


From 462a7c0f8e3f833e5ca3dd3f427882b60b3f38e9 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Tue, 26 Mar 2024 10:30:55 +0100
Subject: drm/panthor: Fix wrong kernel-doc format in the uAPI header

The kernel doc prefix is /** not /*.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Liviu Dudau <liviu.dudau@arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240326093055.411932-1-boris.brezillon@collabora.com
---
 include/uapi/drm/panthor_drm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 373df80f41ed..dadb05ab1235 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -336,10 +336,10 @@ struct drm_panthor_gpu_info {
 	/** @tiler_present: Bitmask encoding the tiler units exposed by the GPU. */
 	__u64 tiler_present;
 
-	/* @core_features: Used to discriminate core variants when they exist. */
+	/** @core_features: Used to discriminate core variants when they exist. */
 	__u32 core_features;
 
-	/* @pad: MBZ. */
+	/** @pad: MBZ. */
 	__u32 pad;
 };
 
-- 
cgit v1.2.3


From 80d2eefcb4c84aa9018b2a997ab3a4c567bc821a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Mar 2024 08:40:30 +0100
Subject: net: Use backlog-NAPI to clean up the defer_list.

The defer_list is a per-CPU list which is used to free skbs outside of
the socket lock and on the CPU on which they have been allocated.
The list is processed during NAPI callbacks so ideally the list is
cleaned up.
Should the amount of skbs on the list exceed a certain water mark then
the softirq is triggered remotely on the target CPU by invoking a remote
function call. The raise of the softirqs via a remote function call
leads to waking the ksoftirqd on PREEMPT_RT which is undesired.
The backlog-NAPI threads already provide the infrastructure which can be
utilized to perform the cleanup of the defer_list.

The NAPI state is updated with the input_pkt_queue.lock acquired. It
order not to break the state, it is needed to also wake the backlog-NAPI
thread with the lock held. This requires to acquire the use the lock in
rps_lock_irq*() if the backlog-NAPI threads are used even with RPS
disabled.

Move the logic of remotely starting softirqs to clean up the defer_list
into kick_defer_list_purge(). Make sure a lock is held in
rps_lock_irq*() if backlog-NAPI threads are used. Schedule backlog-NAPI
for defer_list cleanup if backlog-NAPI is available.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 25 +++++++++++++++++++++----
 net/core/skbuff.c         |  4 ++--
 3 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb37817d6382..e41d30ebaca6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3287,6 +3287,7 @@ static inline void dev_xmit_recursion_dec(void)
 	__this_cpu_dec(softnet_data.xmit.recursion);
 }
 
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ea574df49640..85e11dabab48 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -226,7 +226,7 @@ static bool use_backlog_threads(void)
 static inline void rps_lock_irqsave(struct softnet_data *sd,
 				    unsigned long *flags)
 {
-	if (IS_ENABLED(CONFIG_RPS))
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		local_irq_save(*flags);
@@ -234,7 +234,7 @@ static inline void rps_lock_irqsave(struct softnet_data *sd,
 
 static inline void rps_lock_irq_disable(struct softnet_data *sd)
 {
-	if (IS_ENABLED(CONFIG_RPS))
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_lock_irq(&sd->input_pkt_queue.lock);
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		local_irq_disable();
@@ -243,7 +243,7 @@ static inline void rps_lock_irq_disable(struct softnet_data *sd)
 static inline void rps_unlock_irq_restore(struct softnet_data *sd,
 					  unsigned long *flags)
 {
-	if (IS_ENABLED(CONFIG_RPS))
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		local_irq_restore(*flags);
@@ -251,7 +251,7 @@ static inline void rps_unlock_irq_restore(struct softnet_data *sd,
 
 static inline void rps_unlock_irq_enable(struct softnet_data *sd)
 {
-	if (IS_ENABLED(CONFIG_RPS))
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 		spin_unlock_irq(&sd->input_pkt_queue.lock);
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		local_irq_enable();
@@ -4722,6 +4722,23 @@ static void napi_schedule_rps(struct softnet_data *sd)
 	__napi_schedule_irqoff(&mysd->backlog);
 }
 
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+{
+	unsigned long flags;
+
+	if (use_backlog_threads()) {
+		rps_lock_irqsave(sd, &flags);
+
+		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+			__napi_schedule_irqoff(&sd->backlog);
+
+		rps_unlock_irq_restore(sd, &flags);
+
+	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+		smp_call_function_single_async(cpu, &sd->defer_csd);
+	}
+}
+
 #ifdef CONFIG_NET_FLOW_LIMIT
 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 #endif
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b99127712e67..17617c29be2d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -7039,8 +7039,8 @@ nodefer:	__kfree_skb(skb);
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
-	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
-		smp_call_function_single_async(cpu, &sd->defer_csd);
+	if (unlikely(kick))
+		kick_defer_list_purge(sd, cpu);
 }
 
 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
-- 
cgit v1.2.3


From da251ce210617fa31aa2a65ba7f28e1c584a1938 Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 25 Mar 2024 20:46:16 +0000
Subject: include: trace: Widen the tag buffer in trace_scmi_dump_msg

A bigger buffer allow for more diverse tag names.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20240325204620.1437237-2-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 include/trace/events/scmi.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/scmi.h b/include/trace/events/scmi.h
index 422c1ad9484d..127300481123 100644
--- a/include/trace/events/scmi.h
+++ b/include/trace/events/scmi.h
@@ -7,6 +7,8 @@
 
 #include <linux/tracepoint.h>
 
+#define TRACE_SCMI_MAX_TAG_LEN	6
+
 TRACE_EVENT(scmi_fc_call,
 	TP_PROTO(u8 protocol_id, u8 msg_id, u32 res_id, u32 val1, u32 val2),
 	TP_ARGS(protocol_id, msg_id, res_id, val1, val2),
@@ -150,7 +152,7 @@ TRACE_EVENT(scmi_msg_dump,
 		__field(u8, channel_id)
 		__field(u8, protocol_id)
 		__field(u8, msg_id)
-		__array(char, tag, 5)
+		__array(char, tag, TRACE_SCMI_MAX_TAG_LEN)
 		__field(u16, seq)
 		__field(int, status)
 		__field(size_t, len)
@@ -162,7 +164,7 @@ TRACE_EVENT(scmi_msg_dump,
 		__entry->channel_id = channel_id;
 		__entry->protocol_id = protocol_id;
 		__entry->msg_id = msg_id;
-		strscpy(__entry->tag, tag, 5);
+		strscpy(__entry->tag, tag, TRACE_SCMI_MAX_TAG_LEN);
 		__entry->seq = seq;
 		__entry->status = status;
 		__entry->len = len;
-- 
cgit v1.2.3


From 264a2c52062802ea6052011015cb3d919dc14d9f Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Mon, 25 Mar 2024 20:46:19 +0000
Subject: firmware: arm_scmi: Simplify scmi_devm_notifier_unregister

Unregistering SCMI notifications using the managed devres interface can be
done providing as a reference simply the previously successfully registered
notification block since it could have been registered only on one kernel
notification_chain: drop any reference to SCMI protocol, events and
sources.

Devres internal helpers can search for the provided notification block
reference and, once found, the associated devres object will already
provide the above SCMI references for the event.

Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Link: https://lore.kernel.org/r/20240325204620.1437237-5-cristian.marussi@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/notify.c | 30 ++++--------------------------
 include/linux/scmi_protocol.h      |  2 --
 2 files changed, 4 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/notify.c b/drivers/firmware/arm_scmi/notify.c
index 27c52531194d..e160ecb22948 100644
--- a/drivers/firmware/arm_scmi/notify.c
+++ b/drivers/firmware/arm_scmi/notify.c
@@ -1513,17 +1513,12 @@ static int scmi_devm_notifier_register(struct scmi_device *sdev,
 static int scmi_devm_notifier_match(struct device *dev, void *res, void *data)
 {
 	struct scmi_notifier_devres *dres = res;
-	struct scmi_notifier_devres *xres = data;
+	struct notifier_block *nb = data;
 
-	if (WARN_ON(!dres || !xres))
+	if (WARN_ON(!dres || !nb))
 		return 0;
 
-	return dres->proto_id == xres->proto_id &&
-		dres->evt_id == xres->evt_id &&
-		dres->nb == xres->nb &&
-		((!dres->src_id && !xres->src_id) ||
-		  (dres->src_id && xres->src_id &&
-		   dres->__src_id == xres->__src_id));
+	return dres->nb == nb;
 }
 
 /**
@@ -1531,10 +1526,6 @@ static int scmi_devm_notifier_match(struct device *dev, void *res, void *data)
  * notifier_block for an event
  * @sdev: A reference to an scmi_device whose embedded struct device is to
  *	  be used for devres accounting.
- * @proto_id: Protocol ID
- * @evt_id: Event ID
- * @src_id: Source ID, when NULL register for events coming form ALL possible
- *	    sources
  * @nb: A standard notifier block to register for the specified event
  *
  * Generic devres managed helper to explicitly un-register a notifier_block
@@ -1544,25 +1535,12 @@ static int scmi_devm_notifier_match(struct device *dev, void *res, void *data)
  * Return: 0 on Success
  */
 static int scmi_devm_notifier_unregister(struct scmi_device *sdev,
-					 u8 proto_id, u8 evt_id,
-					 const u32 *src_id,
 					 struct notifier_block *nb)
 {
 	int ret;
-	struct scmi_notifier_devres dres;
-
-	dres.handle = sdev->handle;
-	dres.proto_id = proto_id;
-	dres.evt_id = evt_id;
-	if (src_id) {
-		dres.__src_id = *src_id;
-		dres.src_id = &dres.__src_id;
-	} else {
-		dres.src_id = NULL;
-	}
 
 	ret = devres_release(&sdev->dev, scmi_devm_release_notifier,
-			     scmi_devm_notifier_match, &dres);
+			     scmi_devm_notifier_match, nb);
 
 	WARN_ON(ret);
 
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index b807141acc14..a3addb07e00a 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -783,8 +783,6 @@ struct scmi_notify_ops {
 					    const u32 *src_id,
 					    struct notifier_block *nb);
 	int (*devm_event_notifier_unregister)(struct scmi_device *sdev,
-					      u8 proto_id, u8 evt_id,
-					      const u32 *src_id,
 					      struct notifier_block *nb);
 	int (*event_notifier_register)(const struct scmi_handle *handle,
 				       u8 proto_id, u8 evt_id,
-- 
cgit v1.2.3


From 6e06312035032924fc97f2050bfe85e63ca26514 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 25 Mar 2024 13:41:55 +0000
Subject: net: remove skb_free_datagram_locked()

Last user of skb_free_datagram_locked() went away in 2016
with commit 850cbaddb52d ("udp: use it's own memory
accounting schema").

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240325134155.620531-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/skbuff.h |  6 ------
 net/core/datagram.c    | 19 -------------------
 2 files changed, 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0c7c67b3a87b..b945af8a6208 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4063,12 +4063,6 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
-void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len);
-static inline void skb_free_datagram_locked(struct sock *sk,
-					    struct sk_buff *skb)
-{
-	__skb_free_datagram_locked(sk, skb, 0);
-}
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
 int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
 int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a8b625abe242..e614cfd8e14a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -324,25 +324,6 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(skb_free_datagram);
 
-void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
-{
-	bool slow;
-
-	if (!skb_unref(skb)) {
-		sk_peek_offset_bwd(sk, len);
-		return;
-	}
-
-	slow = lock_sock_fast(sk);
-	sk_peek_offset_bwd(sk, len);
-	skb_orphan(skb);
-	unlock_sock_fast(sk, slow);
-
-	/* skb is now orphaned, can be freed outside of locked section */
-	__kfree_skb(skb);
-}
-EXPORT_SYMBOL(__skb_free_datagram_locked);
-
 int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
 			struct sk_buff *skb, unsigned int flags,
 			void (*destructor)(struct sock *sk,
-- 
cgit v1.2.3


From d75a21611a6e723d81db3f827e131ad39b69186c Mon Sep 17 00:00:00 2001
From: Luca Ceresoli <luca.ceresoli@bootlin.com>
Date: Tue, 5 Mar 2024 15:36:30 +0100
Subject: ASoC: core: add SOC_DOUBLE_RANGE_TLV() helper macro

No macro currently allows handling a stereo control that has left and right
in the same register and whose minimum register value is not zero. Add one
that does that.

Note that even though the snd_soc_*_volsw_range() look more appropriate
given the _range suffix, they are not suitable because they don't honor the
two shift values. The snd_soc_*_volsw() look more generic and are suitable
for the task.

Signed-off-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Link: https://msgid.link/r/20240305-rk3308-audio-codec-v4-3-312acdbe628f@bootlin.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 39613b406b1d..0376f7e4c15d 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -149,6 +149,18 @@
 		{.reg = xreg, .rreg = xreg, \
 		.shift = shift_left, .rshift = shift_right, \
 		.max = xmax, .min = xmin} }
+#define SOC_DOUBLE_RANGE_TLV(xname, xreg, xshift_left, xshift_right, xmin, xmax, \
+			     xinvert, tlv_array) \
+{	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname),\
+	.access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |\
+		  SNDRV_CTL_ELEM_ACCESS_READWRITE,\
+	.tlv.p = (tlv_array), \
+	.info = snd_soc_info_volsw, \
+	.get = snd_soc_get_volsw, .put = snd_soc_put_volsw, \
+	.private_value = (unsigned long)&(struct soc_mixer_control) \
+		{.reg = xreg, .rreg = xreg, \
+		 .shift = xshift_left, .rshift = xshift_right, \
+		 .min = xmin, .max = xmax, .invert = xinvert} }
 #define SOC_DOUBLE_R_TLV(xname, reg_left, reg_right, xshift, xmax, xinvert, tlv_array) \
 {	.iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = (xname),\
 	.access = SNDRV_CTL_ELEM_ACCESS_TLV_READ |\
-- 
cgit v1.2.3


From 6376eb8b911534735fec104c1a0d780e4cf3116a Mon Sep 17 00:00:00 2001
From: Douglas Anderson <dianders@chromium.org>
Date: Tue, 19 Mar 2024 13:58:39 -0700
Subject: drm/dp: Clarify that wait_hpd_asserted() is not optional for panels

In response to my patch removing the "wait for HPD" logic at the
beginning of the MSM DP transfer() callback [1], we had some debate
about what the "This is an optional function" meant in the
documentation of the wait_hpd_asserted() callback. Let's clarify.

As talked about in the MSM DP patch [1], before wait_hpd_asserted()
was introduced there was no great way for panel drivers to wait for
HPD in the case that the "built-in" HPD signal was used. Panel drivers
could only wait for HPD if a GPIO was used. At the time, we ended up
just saying that if we were using the "built-in" HPD signal that DP
AUX controllers needed to wait for HPD themselves at the beginning of
their transfer() callback. The fact that the wait for HPD at the
beginning of transfer() was awkward/problematic was the whole reason
wait_hpd_asserted() was added.

Let's make it obvious that if a DP AUX controller implements
wait_hpd_asserted() that they don't need a loop waiting for HPD at the
start of their transfer() function. We'll still allow DP controllers
to work the old way but mark it as deprecated.

[1] https://lore.kernel.org/r/20240315143621.v2.3.I535606f6d4f7e3e5588bb75c55996f61980183cd@changeid

Reviewed-by: Abhinav Kumar <quic_abhinavk@quicinc.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Douglas Anderson <dianders@chromium.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240319135836.v2.1.I521dad0693cc24fe4dd14cba0c7048d94f5b6b41@changeid
---
 include/drm/display/drm_dp_helper.h | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/display/drm_dp_helper.h b/include/drm/display/drm_dp_helper.h
index a62fcd051d4d..6c6a96496df6 100644
--- a/include/drm/display/drm_dp_helper.h
+++ b/include/drm/display/drm_dp_helper.h
@@ -422,7 +422,18 @@ struct drm_dp_aux {
 	 * @wait_hpd_asserted: wait for HPD to be asserted
 	 *
 	 * This is mainly useful for eDP panels drivers to wait for an eDP
-	 * panel to finish powering on. This is an optional function.
+	 * panel to finish powering on. It is optional for DP AUX controllers
+	 * to implement this function. It is required for DP AUX endpoints
+	 * (panel drivers) to call this function after powering up but before
+	 * doing AUX transfers unless the DP AUX endpoint driver knows that
+	 * we're not using the AUX controller's HPD. One example of the panel
+	 * driver not needing to call this is if HPD is hooked up to a GPIO
+	 * that the panel driver can read directly.
+	 *
+	 * If a DP AUX controller does not implement this function then it
+	 * may still support eDP panels that use the AUX controller's built-in
+	 * HPD signal by implementing a long wait for HPD in the transfer()
+	 * callback, though this is deprecated.
 	 *
 	 * This function will efficiently wait for the HPD signal to be
 	 * asserted. The `wait_us` parameter that is passed in says that we
-- 
cgit v1.2.3


From 7b95ee0db7e0a7f99077f1b926323c7bf0d2e8f8 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Mar 2024 19:09:48 +0200
Subject: ASoC: soc-jack: Get rid of legacy GPIO support

No more users.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20240326171134.1414462-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-jack.h              |  2 --
 sound/soc/generic/simple-card-utils.c |  2 --
 sound/soc/soc-jack.c                  | 23 ++++-------------------
 3 files changed, 4 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-jack.h b/include/sound/soc-jack.h
index a0abb1ee5110..3a81d4b8ca8a 100644
--- a/include/sound/soc-jack.h
+++ b/include/sound/soc-jack.h
@@ -44,7 +44,6 @@ struct snd_soc_jack_zone {
 /**
  * struct snd_soc_jack_gpio - Describes a gpio pin for jack detection
  *
- * @gpio:         legacy gpio number
  * @idx:          gpio descriptor index within the function of the GPIO
  *                consumer device
  * @gpiod_dev:    GPIO consumer device
@@ -59,7 +58,6 @@ struct snd_soc_jack_zone {
  *		       ADC).
  */
 struct snd_soc_jack_gpio {
-	unsigned int gpio;
 	unsigned int idx;
 	struct device *gpiod_dev;
 	const char *name;
diff --git a/sound/soc/generic/simple-card-utils.c b/sound/soc/generic/simple-card-utils.c
index 81077d16d22f..b4876b4f259d 100644
--- a/sound/soc/generic/simple-card-utils.c
+++ b/sound/soc/generic/simple-card-utils.c
@@ -752,8 +752,6 @@ int simple_util_init_jack(struct snd_soc_card *card,
 	if (!prefix)
 		prefix = "";
 
-	sjack->gpio.gpio = -ENOENT;
-
 	if (is_hp) {
 		snprintf(prop, sizeof(prop), "%shp-det", prefix);
 		pin_name	= pin ? pin : "Headphones";
diff --git a/sound/soc/soc-jack.c b/sound/soc/soc-jack.c
index b2cc13b9c77b..63971396b708 100644
--- a/sound/soc/soc-jack.c
+++ b/sound/soc/soc-jack.c
@@ -8,7 +8,6 @@
 
 #include <sound/jack.h>
 #include <sound/soc.h>
-#include <linux/gpio.h>
 #include <linux/gpio/consumer.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
@@ -345,21 +344,9 @@ int snd_soc_jack_add_gpios(struct snd_soc_jack *jack, int count,
 				goto undo;
 			}
 		} else {
-			/* legacy GPIO number */
-			if (!gpio_is_valid(gpios[i].gpio)) {
-				dev_err(jack->card->dev,
-					"ASoC: Invalid gpio %d\n",
-					gpios[i].gpio);
-				ret = -EINVAL;
-				goto undo;
-			}
-
-			ret = gpio_request_one(gpios[i].gpio, GPIOF_IN,
-					       gpios[i].name);
-			if (ret)
-				goto undo;
-
-			gpios[i].desc = gpio_to_desc(gpios[i].gpio);
+			dev_err(jack->card->dev, "ASoC: Invalid gpio at index %d\n", i);
+		        ret = -EINVAL;
+		        goto undo;
 		}
 got_gpio:
 		INIT_DELAYED_WORK(&gpios[i].work, gpio_work);
@@ -373,7 +360,7 @@ got_gpio:
 					      gpios[i].name,
 					      &gpios[i]);
 		if (ret < 0)
-			goto err;
+			goto undo;
 
 		if (gpios[i].wake) {
 			ret = irq_set_irq_wake(gpiod_to_irq(gpios[i].desc), 1);
@@ -401,8 +388,6 @@ got_gpio:
 	devres_add(jack->card->dev, tbl);
 	return 0;
 
-err:
-	gpio_free(gpios[i].gpio);
 undo:
 	jack_free_gpios(jack, i, gpios);
 	devres_free(tbl);
-- 
cgit v1.2.3


From dad983d8812975b53db83f02ae6b0ad15f018a9e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 26 Mar 2024 20:07:52 +0200
Subject: spi: pxa2xx: Keep PXA*_SSP types together

Keep the PXA*_SSP types together in enum pxa_ssp_type
for better maintenance.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://msgid.link/r/20240326181027.1418989-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/pxa2xx_ssp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pxa2xx_ssp.h b/include/linux/pxa2xx_ssp.h
index cd1973e6ac4b..844a2743ca94 100644
--- a/include/linux/pxa2xx_ssp.h
+++ b/include/linux/pxa2xx_ssp.h
@@ -217,9 +217,9 @@ enum pxa_ssp_type {
 	PXA27x_SSP,
 	PXA3xx_SSP,
 	PXA168_SSP,
-	MMP2_SSP,
 	PXA910_SSP,
 	CE4100_SSP,
+	MMP2_SSP,
 	MRFLD_SSP,
 	QUARK_X1000_SSP,
 	/* Keep LPSS types sorted with lpss_platforms[] */
-- 
cgit v1.2.3


From 0bd25f78c4fc97ec61ebaf6147973ddc2e15c7b6 Mon Sep 17 00:00:00 2001
From: Ravi Kumar Vodapalli <ravi.kumar.vodapalli@intel.com>
Date: Tue, 26 Mar 2024 16:08:25 +0530
Subject: drm/xe: Add new PCI IDs to DG2 platform

New PCI IDs are added in Bspec for DG2 platform, add them in driver

Bspec: 44477
Signed-off-by: Ravi Kumar Vodapalli <ravi.kumar.vodapalli@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240326103825.3832879-1-ravi.kumar.vodapalli@intel.com
---
 include/drm/xe_pciids.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/xe_pciids.h b/include/drm/xe_pciids.h
index bc7cbef6e9d8..adb37bc541e4 100644
--- a/include/drm/xe_pciids.h
+++ b/include/drm/xe_pciids.h
@@ -134,7 +134,9 @@
 	MACRO__(0x5692, ## __VA_ARGS__),	\
 	MACRO__(0x56A0, ## __VA_ARGS__),	\
 	MACRO__(0x56A1, ## __VA_ARGS__),	\
-	MACRO__(0x56A2, ## __VA_ARGS__)
+	MACRO__(0x56A2, ## __VA_ARGS__),	\
+	MACRO__(0x56BE, ## __VA_ARGS__),	\
+	MACRO__(0x56BF, ## __VA_ARGS__)
 
 #define XE_DG2_G11_IDS(MACRO__, ...)		\
 	MACRO__(0x5693, ## __VA_ARGS__),	\
-- 
cgit v1.2.3


From 31e7f6c015d9eb35e77ae9868801c53ab0ff19ac Mon Sep 17 00:00:00 2001
From: Javier Carrasco <javier.carrasco@wolfvision.net>
Date: Mon, 25 Mar 2024 10:15:12 +0100
Subject: usb: misc: onboard_hub: rename to onboard_dev

This patch prepares onboad_hub to support non-hub devices by renaming
the driver files and their content, the headers and their references.

The comments and descriptions have been slightly modified to keep
coherence and account for the specific cases that only affect onboard
hubs (e.g. peer-hub).

The "hub" variables in functions where "dev" (and similar names) variables
already exist have been renamed to onboard_dev for clarity, which adds a
few lines in cases where more than 80 characters are used.

No new functionality has been added.

Acked-by: Matthias Kaehlcke <mka@chromium.org>
Signed-off-by: Javier Carrasco <javier.carrasco@wolfvision.net>
Link: https://lore.kernel.org/r/20240325-onboard_xvf3500-v8-2-29e3f9222922@wolfvision.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../ABI/testing/sysfs-bus-platform-onboard-usb-dev |   9 +
 .../ABI/testing/sysfs-bus-platform-onboard-usb-hub |   8 -
 MAINTAINERS                                        |   4 +-
 drivers/usb/core/Makefile                          |   4 +-
 drivers/usb/core/hub.c                             |   8 +-
 drivers/usb/core/hub.h                             |   2 +-
 drivers/usb/misc/Kconfig                           |  16 +-
 drivers/usb/misc/Makefile                          |   2 +-
 drivers/usb/misc/onboard_usb_dev.c                 | 521 +++++++++++++++++++++
 drivers/usb/misc/onboard_usb_dev.h                 | 103 ++++
 drivers/usb/misc/onboard_usb_dev_pdevs.c           | 144 ++++++
 drivers/usb/misc/onboard_usb_hub.c                 | 506 --------------------
 drivers/usb/misc/onboard_usb_hub.h                 | 103 ----
 drivers/usb/misc/onboard_usb_hub_pdevs.c           | 143 ------
 include/linux/usb/onboard_dev.h                    |  18 +
 include/linux/usb/onboard_hub.h                    |  18 -
 16 files changed, 813 insertions(+), 796 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-dev
 delete mode 100644 Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
 create mode 100644 drivers/usb/misc/onboard_usb_dev.c
 create mode 100644 drivers/usb/misc/onboard_usb_dev.h
 create mode 100644 drivers/usb/misc/onboard_usb_dev_pdevs.c
 delete mode 100644 drivers/usb/misc/onboard_usb_hub.c
 delete mode 100644 drivers/usb/misc/onboard_usb_hub.h
 delete mode 100644 drivers/usb/misc/onboard_usb_hub_pdevs.c
 create mode 100644 include/linux/usb/onboard_dev.h
 delete mode 100644 include/linux/usb/onboard_hub.h

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-dev b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-dev
new file mode 100644
index 000000000000..b06a48c3c85a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-dev
@@ -0,0 +1,9 @@
+What:		/sys/bus/platform/devices/<dev>/always_powered_in_suspend
+Date:		June 2022
+KernelVersion:	5.20
+Contact:	Matthias Kaehlcke <matthias@kaehlcke.net>
+		linux-usb@vger.kernel.org
+Description:
+		(RW) Controls whether the USB hub remains always powered
+		during system suspend or not. This attribute is not
+		available for non-hub devices.
diff --git a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub b/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
deleted file mode 100644
index 42deb0552065..000000000000
--- a/Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
+++ /dev/null
@@ -1,8 +0,0 @@
-What:		/sys/bus/platform/devices/<dev>/always_powered_in_suspend
-Date:		June 2022
-KernelVersion:	5.20
-Contact:	Matthias Kaehlcke <matthias@kaehlcke.net>
-		linux-usb@vger.kernel.org
-Description:
-		(RW) Controls whether the USB hub remains always powered
-		during system suspend or not.
\ No newline at end of file
diff --git a/MAINTAINERS b/MAINTAINERS
index aa3b947fb080..48cbcab391c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16439,8 +16439,8 @@ ONBOARD USB HUB DRIVER
 M:	Matthias Kaehlcke <mka@chromium.org>
 L:	linux-usb@vger.kernel.org
 S:	Maintained
-F:	Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-hub
-F:	drivers/usb/misc/onboard_usb_hub.c
+F:	Documentation/ABI/testing/sysfs-bus-platform-onboard-usb-dev
+F:	drivers/usb/misc/onboard_usb_dev.c
 
 ONENAND FLASH DRIVER
 M:	Kyungmin Park <kyungmin.park@samsung.com>
diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile
index 7d338e9c0657..ac006abd13b3 100644
--- a/drivers/usb/core/Makefile
+++ b/drivers/usb/core/Makefile
@@ -12,8 +12,8 @@ usbcore-$(CONFIG_OF)		+= of.o
 usbcore-$(CONFIG_USB_PCI)		+= hcd-pci.o
 usbcore-$(CONFIG_ACPI)		+= usb-acpi.o
 
-ifdef CONFIG_USB_ONBOARD_HUB
-usbcore-y			+= ../misc/onboard_usb_hub_pdevs.o
+ifdef CONFIG_USB_ONBOARD_DEV
+usbcore-y			+= ../misc/onboard_usb_dev_pdevs.o
 endif
 
 obj-$(CONFIG_USB)		+= usbcore.o
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 3ee8455585b6..1dcae5db85b2 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -23,7 +23,7 @@
 #include <linux/usb.h>
 #include <linux/usbdevice_fs.h>
 #include <linux/usb/hcd.h>
-#include <linux/usb/onboard_hub.h>
+#include <linux/usb/onboard_dev.h>
 #include <linux/usb/otg.h>
 #include <linux/usb/quirks.h>
 #include <linux/workqueue.h>
@@ -1805,7 +1805,7 @@ static void hub_disconnect(struct usb_interface *intf)
 	if (hub->quirk_disable_autosuspend)
 		usb_autopm_put_interface(intf);
 
-	onboard_hub_destroy_pdevs(&hub->onboard_hub_devs);
+	onboard_dev_destroy_pdevs(&hub->onboard_devs);
 
 	kref_put(&hub->kref, hub_release);
 }
@@ -1924,7 +1924,7 @@ static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	INIT_DELAYED_WORK(&hub->leds, led_work);
 	INIT_DELAYED_WORK(&hub->init_work, NULL);
 	INIT_WORK(&hub->events, hub_event);
-	INIT_LIST_HEAD(&hub->onboard_hub_devs);
+	INIT_LIST_HEAD(&hub->onboard_devs);
 	spin_lock_init(&hub->irq_urb_lock);
 	timer_setup(&hub->irq_urb_retry, hub_retry_irq_urb, 0);
 	usb_get_intf(intf);
@@ -1954,7 +1954,7 @@ static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	}
 
 	if (hub_configure(hub, &desc->endpoint[0].desc) >= 0) {
-		onboard_hub_create_pdevs(hdev, &hub->onboard_hub_devs);
+		onboard_dev_create_pdevs(hdev, &hub->onboard_devs);
 
 		return 0;
 	}
diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index 43ce21c96a51..3820703b11d8 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h
@@ -74,7 +74,7 @@ struct usb_hub {
 	spinlock_t		irq_urb_lock;
 	struct timer_list	irq_urb_retry;
 	struct usb_port		**ports;
-	struct list_head        onboard_hub_devs;
+	struct list_head        onboard_devs;
 };
 
 /**
diff --git a/drivers/usb/misc/Kconfig b/drivers/usb/misc/Kconfig
index c510af7baa0d..50b86d531701 100644
--- a/drivers/usb/misc/Kconfig
+++ b/drivers/usb/misc/Kconfig
@@ -316,18 +316,18 @@ config BRCM_USB_PINMAP
 	  signals, which are typically on dedicated pins on the chip,
 	  to any gpio.
 
-config USB_ONBOARD_HUB
-	tristate "Onboard USB hub support"
+config USB_ONBOARD_DEV
+	tristate "Onboard USB device support"
 	depends on OF
 	help
-	  Say Y here if you want to support discrete onboard USB hubs that
-	  don't require an additional control bus for initialization, but
-	  need some non-trivial form of initialization, such as enabling a
-	  power regulator. An example for such a hub is the Realtek
-	  RTS5411.
+	  Say Y here if you want to support discrete onboard USB devices
+	  that don't require an additional control bus for initialization,
+	  but need some non-trivial form of initialization, such as
+	  enabling a power regulator. An example for such device is the
+	  Realtek RTS5411 hub.
 
 	  This driver can be used as a module but its state (module vs
 	  builtin) must match the state of the USB subsystem. Enabling
 	  this config will enable the driver and it will automatically
 	  match the state of the USB subsystem. If this driver is a
-	  module it will be called onboard_usb_hub.
+	  module it will be called onboard_usb_dev.
diff --git a/drivers/usb/misc/Makefile b/drivers/usb/misc/Makefile
index 0bc732bcb162..0cd5bc8f52fe 100644
--- a/drivers/usb/misc/Makefile
+++ b/drivers/usb/misc/Makefile
@@ -33,4 +33,4 @@ obj-$(CONFIG_USB_CHAOSKEY)		+= chaoskey.o
 obj-$(CONFIG_USB_SISUSBVGA)		+= sisusbvga/
 obj-$(CONFIG_USB_LINK_LAYER_TEST)	+= lvstest.o
 obj-$(CONFIG_BRCM_USB_PINMAP)		+= brcmstb-usb-pinmap.o
-obj-$(CONFIG_USB_ONBOARD_HUB)		+= onboard_usb_hub.o
+obj-$(CONFIG_USB_ONBOARD_DEV)		+= onboard_usb_dev.o
diff --git a/drivers/usb/misc/onboard_usb_dev.c b/drivers/usb/misc/onboard_usb_dev.c
new file mode 100644
index 000000000000..6aee43896216
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_dev.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Driver for onboard USB devices
+ *
+ * Copyright (c) 2022, Google LLC
+ */
+
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/err.h>
+#include <linux/gpio/consumer.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/regulator/consumer.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/sysfs.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/onboard_dev.h>
+#include <linux/workqueue.h>
+
+#include "onboard_usb_dev.h"
+
+static void onboard_dev_attach_usb_driver(struct work_struct *work);
+
+static struct usb_device_driver onboard_dev_usbdev_driver;
+static DECLARE_WORK(attach_usb_driver_work, onboard_dev_attach_usb_driver);
+
+/************************** Platform driver **************************/
+
+struct usbdev_node {
+	struct usb_device *udev;
+	struct list_head list;
+};
+
+struct onboard_dev {
+	struct regulator_bulk_data supplies[MAX_SUPPLIES];
+	struct device *dev;
+	const struct onboard_dev_pdata *pdata;
+	struct gpio_desc *reset_gpio;
+	bool always_powered_in_suspend;
+	bool is_powered_on;
+	bool going_away;
+	struct list_head udev_list;
+	struct mutex lock;
+	struct clk *clk;
+};
+
+static int onboard_dev_get_regulators(struct onboard_dev *onboard_dev)
+{
+	const char * const *supply_names = onboard_dev->pdata->supply_names;
+	unsigned int num_supplies = onboard_dev->pdata->num_supplies;
+	struct device *dev = onboard_dev->dev;
+	unsigned int i;
+	int err;
+
+	if (num_supplies > MAX_SUPPLIES)
+		return dev_err_probe(dev, -EINVAL, "max %d supplies supported!\n",
+				     MAX_SUPPLIES);
+
+	for (i = 0; i < num_supplies; i++)
+		onboard_dev->supplies[i].supply = supply_names[i];
+
+	err = devm_regulator_bulk_get(dev, num_supplies, onboard_dev->supplies);
+	if (err)
+		dev_err(dev, "Failed to get regulator supplies: %pe\n",
+			ERR_PTR(err));
+
+	return err;
+}
+
+static int onboard_dev_power_on(struct onboard_dev *onboard_dev)
+{
+	int err;
+
+	err = clk_prepare_enable(onboard_dev->clk);
+	if (err) {
+		dev_err(onboard_dev->dev, "failed to enable clock: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	err = regulator_bulk_enable(onboard_dev->pdata->num_supplies,
+				    onboard_dev->supplies);
+	if (err) {
+		dev_err(onboard_dev->dev, "failed to enable supplies: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	fsleep(onboard_dev->pdata->reset_us);
+	gpiod_set_value_cansleep(onboard_dev->reset_gpio, 0);
+
+	onboard_dev->is_powered_on = true;
+
+	return 0;
+}
+
+static int onboard_dev_power_off(struct onboard_dev *onboard_dev)
+{
+	int err;
+
+	gpiod_set_value_cansleep(onboard_dev->reset_gpio, 1);
+
+	err = regulator_bulk_disable(onboard_dev->pdata->num_supplies,
+				     onboard_dev->supplies);
+	if (err) {
+		dev_err(onboard_dev->dev, "failed to disable supplies: %pe\n",
+			ERR_PTR(err));
+		return err;
+	}
+
+	clk_disable_unprepare(onboard_dev->clk);
+
+	onboard_dev->is_powered_on = false;
+
+	return 0;
+}
+
+static int __maybe_unused onboard_dev_suspend(struct device *dev)
+{
+	struct onboard_dev *onboard_dev = dev_get_drvdata(dev);
+	struct usbdev_node *node;
+	bool power_off = true;
+
+	if (onboard_dev->always_powered_in_suspend)
+		return 0;
+
+	mutex_lock(&onboard_dev->lock);
+
+	list_for_each_entry(node, &onboard_dev->udev_list, list) {
+		if (!device_may_wakeup(node->udev->bus->controller))
+			continue;
+
+		if (usb_wakeup_enabled_descendants(node->udev)) {
+			power_off = false;
+			break;
+		}
+	}
+
+	mutex_unlock(&onboard_dev->lock);
+
+	if (!power_off)
+		return 0;
+
+	return onboard_dev_power_off(onboard_dev);
+}
+
+static int __maybe_unused onboard_dev_resume(struct device *dev)
+{
+	struct onboard_dev *onboard_dev = dev_get_drvdata(dev);
+
+	if (onboard_dev->is_powered_on)
+		return 0;
+
+	return onboard_dev_power_on(onboard_dev);
+}
+
+static inline void get_udev_link_name(const struct usb_device *udev, char *buf,
+				      size_t size)
+{
+	snprintf(buf, size, "usb_dev.%s", dev_name(&udev->dev));
+}
+
+static int onboard_dev_add_usbdev(struct onboard_dev *onboard_dev,
+				  struct usb_device *udev)
+{
+	struct usbdev_node *node;
+	char link_name[64];
+	int err;
+
+	mutex_lock(&onboard_dev->lock);
+
+	if (onboard_dev->going_away) {
+		err = -EINVAL;
+		goto error;
+	}
+
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	node->udev = udev;
+
+	list_add(&node->list, &onboard_dev->udev_list);
+
+	mutex_unlock(&onboard_dev->lock);
+
+	get_udev_link_name(udev, link_name, sizeof(link_name));
+	WARN_ON(sysfs_create_link(&onboard_dev->dev->kobj, &udev->dev.kobj,
+				  link_name));
+
+	return 0;
+
+error:
+	mutex_unlock(&onboard_dev->lock);
+
+	return err;
+}
+
+static void onboard_dev_remove_usbdev(struct onboard_dev *onboard_dev,
+				      const struct usb_device *udev)
+{
+	struct usbdev_node *node;
+	char link_name[64];
+
+	get_udev_link_name(udev, link_name, sizeof(link_name));
+	sysfs_remove_link(&onboard_dev->dev->kobj, link_name);
+
+	mutex_lock(&onboard_dev->lock);
+
+	list_for_each_entry(node, &onboard_dev->udev_list, list) {
+		if (node->udev == udev) {
+			list_del(&node->list);
+			kfree(node);
+			break;
+		}
+	}
+
+	mutex_unlock(&onboard_dev->lock);
+}
+
+static ssize_t always_powered_in_suspend_show(struct device *dev,
+					      struct device_attribute *attr,
+					      char *buf)
+{
+	const struct onboard_dev *onboard_dev = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%d\n", onboard_dev->always_powered_in_suspend);
+}
+
+static ssize_t always_powered_in_suspend_store(struct device *dev,
+					       struct device_attribute *attr,
+					       const char *buf, size_t count)
+{
+	struct onboard_dev *onboard_dev = dev_get_drvdata(dev);
+	bool val;
+	int ret;
+
+	ret = kstrtobool(buf, &val);
+	if (ret < 0)
+		return ret;
+
+	onboard_dev->always_powered_in_suspend = val;
+
+	return count;
+}
+static DEVICE_ATTR_RW(always_powered_in_suspend);
+
+static struct attribute *onboard_dev_attrs[] = {
+	&dev_attr_always_powered_in_suspend.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(onboard_dev);
+
+static void onboard_dev_attach_usb_driver(struct work_struct *work)
+{
+	int err;
+
+	err = driver_attach(&onboard_dev_usbdev_driver.driver);
+	if (err)
+		pr_err("Failed to attach USB driver: %pe\n", ERR_PTR(err));
+}
+
+static int onboard_dev_probe(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct onboard_dev *onboard_dev;
+	int err;
+
+	onboard_dev = devm_kzalloc(dev, sizeof(*onboard_dev), GFP_KERNEL);
+	if (!onboard_dev)
+		return -ENOMEM;
+
+	onboard_dev->pdata = device_get_match_data(dev);
+	if (!onboard_dev->pdata)
+		return -EINVAL;
+
+	onboard_dev->dev = dev;
+
+	err = onboard_dev_get_regulators(onboard_dev);
+	if (err)
+		return err;
+
+	onboard_dev->clk = devm_clk_get_optional(dev, NULL);
+	if (IS_ERR(onboard_dev->clk))
+		return dev_err_probe(dev, PTR_ERR(onboard_dev->clk),
+				     "failed to get clock\n");
+
+	onboard_dev->reset_gpio = devm_gpiod_get_optional(dev, "reset",
+							  GPIOD_OUT_HIGH);
+	if (IS_ERR(onboard_dev->reset_gpio))
+		return dev_err_probe(dev, PTR_ERR(onboard_dev->reset_gpio),
+				     "failed to get reset GPIO\n");
+
+	mutex_init(&onboard_dev->lock);
+	INIT_LIST_HEAD(&onboard_dev->udev_list);
+
+	dev_set_drvdata(dev, onboard_dev);
+
+	err = onboard_dev_power_on(onboard_dev);
+	if (err)
+		return err;
+
+	/*
+	 * The USB driver might have been detached from the USB devices by
+	 * onboard_dev_remove() (e.g. through an 'unbind' by userspace),
+	 * make sure to re-attach it if needed.
+	 *
+	 * This needs to be done deferred to avoid self-deadlocks on systems
+	 * with nested onboard hubs.
+	 */
+	schedule_work(&attach_usb_driver_work);
+
+	return 0;
+}
+
+static void onboard_dev_remove(struct platform_device *pdev)
+{
+	struct onboard_dev *onboard_dev = dev_get_drvdata(&pdev->dev);
+	struct usbdev_node *node;
+	struct usb_device *udev;
+
+	onboard_dev->going_away = true;
+
+	mutex_lock(&onboard_dev->lock);
+
+	/* unbind the USB devices to avoid dangling references to this device */
+	while (!list_empty(&onboard_dev->udev_list)) {
+		node = list_first_entry(&onboard_dev->udev_list,
+					struct usbdev_node, list);
+		udev = node->udev;
+
+		/*
+		 * Unbinding the driver will call onboard_dev_remove_usbdev(),
+		 * which acquires onboard_dev->lock. We must release the lock
+		 * first.
+		 */
+		get_device(&udev->dev);
+		mutex_unlock(&onboard_dev->lock);
+		device_release_driver(&udev->dev);
+		put_device(&udev->dev);
+		mutex_lock(&onboard_dev->lock);
+	}
+
+	mutex_unlock(&onboard_dev->lock);
+
+	onboard_dev_power_off(onboard_dev);
+}
+
+MODULE_DEVICE_TABLE(of, onboard_dev_match);
+
+static const struct dev_pm_ops __maybe_unused onboard_dev_pm_ops = {
+	SET_LATE_SYSTEM_SLEEP_PM_OPS(onboard_dev_suspend, onboard_dev_resume)
+};
+
+static struct platform_driver onboard_dev_driver = {
+	.probe = onboard_dev_probe,
+	.remove_new = onboard_dev_remove,
+
+	.driver = {
+		.name = "onboard-usb-dev",
+		.of_match_table = onboard_dev_match,
+		.pm = pm_ptr(&onboard_dev_pm_ops),
+		.dev_groups = onboard_dev_groups,
+	},
+};
+
+/************************** USB driver **************************/
+
+#define VENDOR_ID_CYPRESS	0x04b4
+#define VENDOR_ID_GENESYS	0x05e3
+#define VENDOR_ID_MICROCHIP	0x0424
+#define VENDOR_ID_REALTEK	0x0bda
+#define VENDOR_ID_TI		0x0451
+#define VENDOR_ID_VIA		0x2109
+
+/*
+ * Returns the onboard_dev platform device that is associated with the USB
+ * device passed as parameter.
+ */
+static struct onboard_dev *_find_onboard_dev(struct device *dev)
+{
+	struct platform_device *pdev;
+	struct device_node *np;
+	struct onboard_dev *onboard_dev;
+
+	pdev = of_find_device_by_node(dev->of_node);
+	if (!pdev) {
+		np = of_parse_phandle(dev->of_node, "peer-hub", 0);
+		if (!np) {
+			dev_err(dev, "failed to find device node for peer hub\n");
+			return ERR_PTR(-EINVAL);
+		}
+
+		pdev = of_find_device_by_node(np);
+		of_node_put(np);
+
+		if (!pdev)
+			return ERR_PTR(-ENODEV);
+	}
+
+	onboard_dev = dev_get_drvdata(&pdev->dev);
+	put_device(&pdev->dev);
+
+	/*
+	 * The presence of drvdata indicates that the platform driver finished
+	 * probing. This handles the case where (conceivably) we could be
+	 * running at the exact same time as the platform driver's probe. If
+	 * we detect the race we request probe deferral and we'll come back and
+	 * try again.
+	 */
+	if (!onboard_dev)
+		return ERR_PTR(-EPROBE_DEFER);
+
+	return onboard_dev;
+}
+
+static int onboard_dev_usbdev_probe(struct usb_device *udev)
+{
+	struct device *dev = &udev->dev;
+	struct onboard_dev *onboard_dev;
+	int err;
+
+	/* ignore supported devices without device tree node */
+	if (!dev->of_node)
+		return -ENODEV;
+
+	onboard_dev = _find_onboard_dev(dev);
+	if (IS_ERR(onboard_dev))
+		return PTR_ERR(onboard_dev);
+
+	dev_set_drvdata(dev, onboard_dev);
+
+	err = onboard_dev_add_usbdev(onboard_dev, udev);
+	if (err)
+		return err;
+
+	return 0;
+}
+
+static void onboard_dev_usbdev_disconnect(struct usb_device *udev)
+{
+	struct onboard_dev *onboard_dev = dev_get_drvdata(&udev->dev);
+
+	onboard_dev_remove_usbdev(onboard_dev, udev);
+}
+
+static const struct usb_device_id onboard_dev_id_table[] = {
+	{ USB_DEVICE(VENDOR_ID_CYPRESS, 0x6504) }, /* CYUSB33{0,1,2}x/CYUSB230x 3.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_CYPRESS, 0x6506) }, /* CYUSB33{0,1,2}x/CYUSB230x 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_CYPRESS, 0x6570) }, /* CY7C6563x 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_GENESYS, 0x0608) }, /* Genesys Logic GL850G USB 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_GENESYS, 0x0610) }, /* Genesys Logic GL852G USB 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_GENESYS, 0x0620) }, /* Genesys Logic GL3523 USB 3.1 HUB */
+	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2412) }, /* USB2412 USB 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2514) }, /* USB2514B USB 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2517) }, /* USB2517 USB 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2744) }, /* USB5744 USB 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x5744) }, /* USB5744 USB 3.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0411) }, /* RTS5411 USB 3.1 HUB */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5411) }, /* RTS5411 USB 2.1 HUB */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0414) }, /* RTS5414 USB 3.2 HUB */
+	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5414) }, /* RTS5414 USB 2.1 HUB */
+	{ USB_DEVICE(VENDOR_ID_TI, 0x8025) }, /* TI USB8020B 3.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_TI, 0x8027) }, /* TI USB8020B 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_TI, 0x8140) }, /* TI USB8041 3.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_TI, 0x8142) }, /* TI USB8041 2.0 HUB */
+	{ USB_DEVICE(VENDOR_ID_VIA, 0x0817) }, /* VIA VL817 3.1 HUB */
+	{ USB_DEVICE(VENDOR_ID_VIA, 0x2817) }, /* VIA VL817 2.0 HUB */
+	{}
+};
+MODULE_DEVICE_TABLE(usb, onboard_dev_id_table);
+
+static struct usb_device_driver onboard_dev_usbdev_driver = {
+	.name = "onboard-usb-dev",
+	.probe = onboard_dev_usbdev_probe,
+	.disconnect = onboard_dev_usbdev_disconnect,
+	.generic_subclass = 1,
+	.supports_autosuspend =	1,
+	.id_table = onboard_dev_id_table,
+};
+
+static int __init onboard_dev_init(void)
+{
+	int ret;
+
+	ret = usb_register_device_driver(&onboard_dev_usbdev_driver, THIS_MODULE);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&onboard_dev_driver);
+	if (ret)
+		usb_deregister_device_driver(&onboard_dev_usbdev_driver);
+
+	return ret;
+}
+module_init(onboard_dev_init);
+
+static void __exit onboard_dev_exit(void)
+{
+	usb_deregister_device_driver(&onboard_dev_usbdev_driver);
+	platform_driver_unregister(&onboard_dev_driver);
+
+	cancel_work_sync(&attach_usb_driver_work);
+}
+module_exit(onboard_dev_exit);
+
+MODULE_AUTHOR("Matthias Kaehlcke <mka@chromium.org>");
+MODULE_DESCRIPTION("Driver for discrete onboard USB devices");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/usb/misc/onboard_usb_dev.h b/drivers/usb/misc/onboard_usb_dev.h
new file mode 100644
index 000000000000..debab2895a53
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_dev.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright (c) 2022, Google LLC
+ */
+
+#ifndef _USB_MISC_ONBOARD_USB_DEV_H
+#define _USB_MISC_ONBOARD_USB_DEV_H
+
+#define MAX_SUPPLIES 2
+
+struct onboard_dev_pdata {
+	unsigned long reset_us;		/* reset pulse width in us */
+	unsigned int num_supplies;	/* number of supplies */
+	const char * const supply_names[MAX_SUPPLIES];
+};
+
+static const struct onboard_dev_pdata microchip_usb424_data = {
+	.reset_us = 1,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata microchip_usb5744_data = {
+	.reset_us = 0,
+	.num_supplies = 2,
+	.supply_names = { "vdd", "vdd2" },
+};
+
+static const struct onboard_dev_pdata realtek_rts5411_data = {
+	.reset_us = 0,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata ti_tusb8020b_data = {
+	.reset_us = 3000,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata ti_tusb8041_data = {
+	.reset_us = 3000,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata cypress_hx3_data = {
+	.reset_us = 10000,
+	.num_supplies = 2,
+	.supply_names = { "vdd", "vdd2" },
+};
+
+static const struct onboard_dev_pdata cypress_hx2vl_data = {
+	.reset_us = 1,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata genesys_gl850g_data = {
+	.reset_us = 3,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata genesys_gl852g_data = {
+	.reset_us = 50,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct onboard_dev_pdata vialab_vl817_data = {
+	.reset_us = 10,
+	.num_supplies = 1,
+	.supply_names = { "vdd" },
+};
+
+static const struct of_device_id onboard_dev_match[] = {
+	{ .compatible = "usb424,2412", .data = &microchip_usb424_data, },
+	{ .compatible = "usb424,2514", .data = &microchip_usb424_data, },
+	{ .compatible = "usb424,2517", .data = &microchip_usb424_data, },
+	{ .compatible = "usb424,2744", .data = &microchip_usb5744_data, },
+	{ .compatible = "usb424,5744", .data = &microchip_usb5744_data, },
+	{ .compatible = "usb451,8025", .data = &ti_tusb8020b_data, },
+	{ .compatible = "usb451,8027", .data = &ti_tusb8020b_data, },
+	{ .compatible = "usb451,8140", .data = &ti_tusb8041_data, },
+	{ .compatible = "usb451,8142", .data = &ti_tusb8041_data, },
+	{ .compatible = "usb4b4,6504", .data = &cypress_hx3_data, },
+	{ .compatible = "usb4b4,6506", .data = &cypress_hx3_data, },
+	{ .compatible = "usb4b4,6570", .data = &cypress_hx2vl_data, },
+	{ .compatible = "usb5e3,608", .data = &genesys_gl850g_data, },
+	{ .compatible = "usb5e3,610", .data = &genesys_gl852g_data, },
+	{ .compatible = "usb5e3,620", .data = &genesys_gl852g_data, },
+	{ .compatible = "usb5e3,626", .data = &genesys_gl852g_data, },
+	{ .compatible = "usbbda,411", .data = &realtek_rts5411_data, },
+	{ .compatible = "usbbda,5411", .data = &realtek_rts5411_data, },
+	{ .compatible = "usbbda,414", .data = &realtek_rts5411_data, },
+	{ .compatible = "usbbda,5414", .data = &realtek_rts5411_data, },
+	{ .compatible = "usb2109,817", .data = &vialab_vl817_data, },
+	{ .compatible = "usb2109,2817", .data = &vialab_vl817_data, },
+	{}
+};
+
+#endif /* _USB_MISC_ONBOARD_USB_DEV_H */
diff --git a/drivers/usb/misc/onboard_usb_dev_pdevs.c b/drivers/usb/misc/onboard_usb_dev_pdevs.c
new file mode 100644
index 000000000000..722504752ce3
--- /dev/null
+++ b/drivers/usb/misc/onboard_usb_dev_pdevs.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * API for creating and destroying USB onboard platform devices
+ *
+ * Copyright (c) 2022, Google LLC
+ */
+
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
+#include <linux/usb/of.h>
+#include <linux/usb/onboard_dev.h>
+
+#include "onboard_usb_dev.h"
+
+struct pdev_list_entry {
+	struct platform_device *pdev;
+	struct list_head node;
+};
+
+static bool of_is_onboard_usb_dev(struct device_node *np)
+{
+	return !!of_match_node(onboard_dev_match, np);
+}
+
+/**
+ * onboard_dev_create_pdevs -- create platform devices for onboard USB devices
+ * @parent_hub	: parent hub to scan for connected onboard devices
+ * @pdev_list	: list of onboard platform devices owned by the parent hub
+ *
+ * Creates a platform device for each supported onboard device that is connected
+ * to the given parent hub. The platform device is in charge of initializing the
+ * device (enable regulators, take the device out of reset, ...). For onboard
+ * hubs, it can optionally control whether the device remains powered during
+ * system suspend or not.
+ *
+ * To keep track of the platform devices they are added to a list that is owned
+ * by the parent hub.
+ *
+ * Some background about the logic in this function, which can be a bit hard
+ * to follow:
+ *
+ * Root hubs don't have dedicated device tree nodes, but use the node of their
+ * HCD. The primary and secondary HCD are usually represented by a single DT
+ * node. That means the root hubs of the primary and secondary HCD share the
+ * same device tree node (the HCD node). As a result this function can be called
+ * twice with the same DT node for root hubs. We only want to create a single
+ * platform device for each physical onboard device, hence for root hubs the
+ * loop is only executed for the root hub of the primary HCD. Since the function
+ * scans through all child nodes it still creates pdevs for onboard devices
+ * connected to the root hub of the secondary HCD if needed.
+ *
+ * Further there must be only one platform device for onboard hubs with a peer
+ * hub (the hub is a single physical device). To achieve this two measures are
+ * taken: pdevs for onboard hubs with a peer are only created when the function
+ * is called on behalf of the parent hub that is connected to the primary HCD
+ * (directly or through other hubs). For onboard hubs connected to root hubs
+ * the function processes the nodes of both peers. A platform device is only
+ * created if the peer hub doesn't have one already.
+ */
+void onboard_dev_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list)
+{
+	int i;
+	struct usb_hcd *hcd = bus_to_hcd(parent_hub->bus);
+	struct device_node *np, *npc;
+	struct platform_device *pdev;
+	struct pdev_list_entry *pdle;
+
+	if (!parent_hub->dev.of_node)
+		return;
+
+	if (!parent_hub->parent && !usb_hcd_is_primary_hcd(hcd))
+		return;
+
+	for (i = 1; i <= parent_hub->maxchild; i++) {
+		np = usb_of_get_device_node(parent_hub, i);
+		if (!np)
+			continue;
+
+		if (!of_is_onboard_usb_dev(np))
+			goto node_put;
+
+		npc = of_parse_phandle(np, "peer-hub", 0);
+		if (npc) {
+			if (!usb_hcd_is_primary_hcd(hcd)) {
+				of_node_put(npc);
+				goto node_put;
+			}
+
+			pdev = of_find_device_by_node(npc);
+			of_node_put(npc);
+
+			if (pdev) {
+				put_device(&pdev->dev);
+				goto node_put;
+			}
+		}
+
+		pdev = of_platform_device_create(np, NULL, &parent_hub->dev);
+		if (!pdev) {
+			dev_err(&parent_hub->dev,
+				"failed to create platform device for onboard dev '%pOF'\n", np);
+			goto node_put;
+		}
+
+		pdle = kzalloc(sizeof(*pdle), GFP_KERNEL);
+		if (!pdle) {
+			of_platform_device_destroy(&pdev->dev, NULL);
+			goto node_put;
+		}
+
+		pdle->pdev = pdev;
+		list_add(&pdle->node, pdev_list);
+
+node_put:
+		of_node_put(np);
+	}
+}
+EXPORT_SYMBOL_GPL(onboard_dev_create_pdevs);
+
+/**
+ * onboard_dev_destroy_pdevs -- free resources of onboard platform devices
+ * @pdev_list	: list of onboard platform devices
+ *
+ * Destroys the platform devices in the given list and frees the memory associated
+ * with the list entry.
+ */
+void onboard_dev_destroy_pdevs(struct list_head *pdev_list)
+{
+	struct pdev_list_entry *pdle, *tmp;
+
+	list_for_each_entry_safe(pdle, tmp, pdev_list, node) {
+		list_del(&pdle->node);
+		of_platform_device_destroy(&pdle->pdev->dev, NULL);
+		kfree(pdle);
+	}
+}
+EXPORT_SYMBOL_GPL(onboard_dev_destroy_pdevs);
diff --git a/drivers/usb/misc/onboard_usb_hub.c b/drivers/usb/misc/onboard_usb_hub.c
deleted file mode 100644
index 5308c54aaabe..000000000000
--- a/drivers/usb/misc/onboard_usb_hub.c
+++ /dev/null
@@ -1,506 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Driver for onboard USB hubs
- *
- * Copyright (c) 2022, Google LLC
- */
-
-#include <linux/clk.h>
-#include <linux/device.h>
-#include <linux/export.h>
-#include <linux/err.h>
-#include <linux/gpio/consumer.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/of.h>
-#include <linux/of_platform.h>
-#include <linux/platform_device.h>
-#include <linux/regulator/consumer.h>
-#include <linux/slab.h>
-#include <linux/suspend.h>
-#include <linux/sysfs.h>
-#include <linux/usb.h>
-#include <linux/usb/hcd.h>
-#include <linux/usb/onboard_hub.h>
-#include <linux/workqueue.h>
-
-#include "onboard_usb_hub.h"
-
-static void onboard_hub_attach_usb_driver(struct work_struct *work);
-
-static struct usb_device_driver onboard_hub_usbdev_driver;
-static DECLARE_WORK(attach_usb_driver_work, onboard_hub_attach_usb_driver);
-
-/************************** Platform driver **************************/
-
-struct usbdev_node {
-	struct usb_device *udev;
-	struct list_head list;
-};
-
-struct onboard_hub {
-	struct regulator_bulk_data supplies[MAX_SUPPLIES];
-	struct device *dev;
-	const struct onboard_hub_pdata *pdata;
-	struct gpio_desc *reset_gpio;
-	bool always_powered_in_suspend;
-	bool is_powered_on;
-	bool going_away;
-	struct list_head udev_list;
-	struct mutex lock;
-	struct clk *clk;
-};
-
-static int onboard_hub_get_regulators(struct onboard_hub *hub)
-{
-	const char * const *supply_names = hub->pdata->supply_names;
-	unsigned int num_supplies = hub->pdata->num_supplies;
-	struct device *dev = hub->dev;
-	unsigned int i;
-	int err;
-
-	if (num_supplies > MAX_SUPPLIES)
-		return dev_err_probe(dev, -EINVAL, "max %d supplies supported!\n",
-				     MAX_SUPPLIES);
-
-	for (i = 0; i < num_supplies; i++)
-		hub->supplies[i].supply = supply_names[i];
-
-	err = devm_regulator_bulk_get(dev, num_supplies, hub->supplies);
-	if (err)
-		dev_err(dev, "Failed to get regulator supplies: %pe\n",
-			ERR_PTR(err));
-
-	return err;
-}
-
-static int onboard_hub_power_on(struct onboard_hub *hub)
-{
-	int err;
-
-	err = clk_prepare_enable(hub->clk);
-	if (err) {
-		dev_err(hub->dev, "failed to enable clock: %pe\n", ERR_PTR(err));
-		return err;
-	}
-
-	err = regulator_bulk_enable(hub->pdata->num_supplies, hub->supplies);
-	if (err) {
-		dev_err(hub->dev, "failed to enable supplies: %pe\n", ERR_PTR(err));
-		return err;
-	}
-
-	fsleep(hub->pdata->reset_us);
-	gpiod_set_value_cansleep(hub->reset_gpio, 0);
-
-	hub->is_powered_on = true;
-
-	return 0;
-}
-
-static int onboard_hub_power_off(struct onboard_hub *hub)
-{
-	int err;
-
-	gpiod_set_value_cansleep(hub->reset_gpio, 1);
-
-	err = regulator_bulk_disable(hub->pdata->num_supplies, hub->supplies);
-	if (err) {
-		dev_err(hub->dev, "failed to disable supplies: %pe\n", ERR_PTR(err));
-		return err;
-	}
-
-	clk_disable_unprepare(hub->clk);
-
-	hub->is_powered_on = false;
-
-	return 0;
-}
-
-static int __maybe_unused onboard_hub_suspend(struct device *dev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-	struct usbdev_node *node;
-	bool power_off = true;
-
-	if (hub->always_powered_in_suspend)
-		return 0;
-
-	mutex_lock(&hub->lock);
-
-	list_for_each_entry(node, &hub->udev_list, list) {
-		if (!device_may_wakeup(node->udev->bus->controller))
-			continue;
-
-		if (usb_wakeup_enabled_descendants(node->udev)) {
-			power_off = false;
-			break;
-		}
-	}
-
-	mutex_unlock(&hub->lock);
-
-	if (!power_off)
-		return 0;
-
-	return onboard_hub_power_off(hub);
-}
-
-static int __maybe_unused onboard_hub_resume(struct device *dev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-
-	if (hub->is_powered_on)
-		return 0;
-
-	return onboard_hub_power_on(hub);
-}
-
-static inline void get_udev_link_name(const struct usb_device *udev, char *buf, size_t size)
-{
-	snprintf(buf, size, "usb_dev.%s", dev_name(&udev->dev));
-}
-
-static int onboard_hub_add_usbdev(struct onboard_hub *hub, struct usb_device *udev)
-{
-	struct usbdev_node *node;
-	char link_name[64];
-	int err;
-
-	mutex_lock(&hub->lock);
-
-	if (hub->going_away) {
-		err = -EINVAL;
-		goto error;
-	}
-
-	node = kzalloc(sizeof(*node), GFP_KERNEL);
-	if (!node) {
-		err = -ENOMEM;
-		goto error;
-	}
-
-	node->udev = udev;
-
-	list_add(&node->list, &hub->udev_list);
-
-	mutex_unlock(&hub->lock);
-
-	get_udev_link_name(udev, link_name, sizeof(link_name));
-	WARN_ON(sysfs_create_link(&hub->dev->kobj, &udev->dev.kobj, link_name));
-
-	return 0;
-
-error:
-	mutex_unlock(&hub->lock);
-
-	return err;
-}
-
-static void onboard_hub_remove_usbdev(struct onboard_hub *hub, const struct usb_device *udev)
-{
-	struct usbdev_node *node;
-	char link_name[64];
-
-	get_udev_link_name(udev, link_name, sizeof(link_name));
-	sysfs_remove_link(&hub->dev->kobj, link_name);
-
-	mutex_lock(&hub->lock);
-
-	list_for_each_entry(node, &hub->udev_list, list) {
-		if (node->udev == udev) {
-			list_del(&node->list);
-			kfree(node);
-			break;
-		}
-	}
-
-	mutex_unlock(&hub->lock);
-}
-
-static ssize_t always_powered_in_suspend_show(struct device *dev, struct device_attribute *attr,
-			   char *buf)
-{
-	const struct onboard_hub *hub = dev_get_drvdata(dev);
-
-	return sysfs_emit(buf, "%d\n", hub->always_powered_in_suspend);
-}
-
-static ssize_t always_powered_in_suspend_store(struct device *dev, struct device_attribute *attr,
-			    const char *buf, size_t count)
-{
-	struct onboard_hub *hub = dev_get_drvdata(dev);
-	bool val;
-	int ret;
-
-	ret = kstrtobool(buf, &val);
-	if (ret < 0)
-		return ret;
-
-	hub->always_powered_in_suspend = val;
-
-	return count;
-}
-static DEVICE_ATTR_RW(always_powered_in_suspend);
-
-static struct attribute *onboard_hub_attrs[] = {
-	&dev_attr_always_powered_in_suspend.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(onboard_hub);
-
-static void onboard_hub_attach_usb_driver(struct work_struct *work)
-{
-	int err;
-
-	err = driver_attach(&onboard_hub_usbdev_driver.driver);
-	if (err)
-		pr_err("Failed to attach USB driver: %pe\n", ERR_PTR(err));
-}
-
-static int onboard_hub_probe(struct platform_device *pdev)
-{
-	struct device *dev = &pdev->dev;
-	struct onboard_hub *hub;
-	int err;
-
-	hub = devm_kzalloc(dev, sizeof(*hub), GFP_KERNEL);
-	if (!hub)
-		return -ENOMEM;
-
-	hub->pdata = device_get_match_data(dev);
-	if (!hub->pdata)
-		return -EINVAL;
-
-	hub->dev = dev;
-
-	err = onboard_hub_get_regulators(hub);
-	if (err)
-		return err;
-
-	hub->clk = devm_clk_get_optional(dev, NULL);
-	if (IS_ERR(hub->clk))
-		return dev_err_probe(dev, PTR_ERR(hub->clk), "failed to get clock\n");
-
-	hub->reset_gpio = devm_gpiod_get_optional(dev, "reset",
-						  GPIOD_OUT_HIGH);
-	if (IS_ERR(hub->reset_gpio))
-		return dev_err_probe(dev, PTR_ERR(hub->reset_gpio), "failed to get reset GPIO\n");
-
-	mutex_init(&hub->lock);
-	INIT_LIST_HEAD(&hub->udev_list);
-
-	dev_set_drvdata(dev, hub);
-
-	err = onboard_hub_power_on(hub);
-	if (err)
-		return err;
-
-	/*
-	 * The USB driver might have been detached from the USB devices by
-	 * onboard_hub_remove() (e.g. through an 'unbind' by userspace),
-	 * make sure to re-attach it if needed.
-	 *
-	 * This needs to be done deferred to avoid self-deadlocks on systems
-	 * with nested onboard hubs.
-	 */
-	schedule_work(&attach_usb_driver_work);
-
-	return 0;
-}
-
-static void onboard_hub_remove(struct platform_device *pdev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(&pdev->dev);
-	struct usbdev_node *node;
-	struct usb_device *udev;
-
-	hub->going_away = true;
-
-	mutex_lock(&hub->lock);
-
-	/* unbind the USB devices to avoid dangling references to this device */
-	while (!list_empty(&hub->udev_list)) {
-		node = list_first_entry(&hub->udev_list, struct usbdev_node, list);
-		udev = node->udev;
-
-		/*
-		 * Unbinding the driver will call onboard_hub_remove_usbdev(),
-		 * which acquires hub->lock.  We must release the lock first.
-		 */
-		get_device(&udev->dev);
-		mutex_unlock(&hub->lock);
-		device_release_driver(&udev->dev);
-		put_device(&udev->dev);
-		mutex_lock(&hub->lock);
-	}
-
-	mutex_unlock(&hub->lock);
-
-	onboard_hub_power_off(hub);
-}
-
-MODULE_DEVICE_TABLE(of, onboard_hub_match);
-
-static const struct dev_pm_ops __maybe_unused onboard_hub_pm_ops = {
-	SET_LATE_SYSTEM_SLEEP_PM_OPS(onboard_hub_suspend, onboard_hub_resume)
-};
-
-static struct platform_driver onboard_hub_driver = {
-	.probe = onboard_hub_probe,
-	.remove_new = onboard_hub_remove,
-
-	.driver = {
-		.name = "onboard-usb-hub",
-		.of_match_table = onboard_hub_match,
-		.pm = pm_ptr(&onboard_hub_pm_ops),
-		.dev_groups = onboard_hub_groups,
-	},
-};
-
-/************************** USB driver **************************/
-
-#define VENDOR_ID_CYPRESS	0x04b4
-#define VENDOR_ID_GENESYS	0x05e3
-#define VENDOR_ID_MICROCHIP	0x0424
-#define VENDOR_ID_REALTEK	0x0bda
-#define VENDOR_ID_TI		0x0451
-#define VENDOR_ID_VIA		0x2109
-
-/*
- * Returns the onboard_hub platform device that is associated with the USB
- * device passed as parameter.
- */
-static struct onboard_hub *_find_onboard_hub(struct device *dev)
-{
-	struct platform_device *pdev;
-	struct device_node *np;
-	struct onboard_hub *hub;
-
-	pdev = of_find_device_by_node(dev->of_node);
-	if (!pdev) {
-		np = of_parse_phandle(dev->of_node, "peer-hub", 0);
-		if (!np) {
-			dev_err(dev, "failed to find device node for peer hub\n");
-			return ERR_PTR(-EINVAL);
-		}
-
-		pdev = of_find_device_by_node(np);
-		of_node_put(np);
-
-		if (!pdev)
-			return ERR_PTR(-ENODEV);
-	}
-
-	hub = dev_get_drvdata(&pdev->dev);
-	put_device(&pdev->dev);
-
-	/*
-	 * The presence of drvdata ('hub') indicates that the platform driver
-	 * finished probing. This handles the case where (conceivably) we could
-	 * be running at the exact same time as the platform driver's probe. If
-	 * we detect the race we request probe deferral and we'll come back and
-	 * try again.
-	 */
-	if (!hub)
-		return ERR_PTR(-EPROBE_DEFER);
-
-	return hub;
-}
-
-static int onboard_hub_usbdev_probe(struct usb_device *udev)
-{
-	struct device *dev = &udev->dev;
-	struct onboard_hub *hub;
-	int err;
-
-	/* ignore supported hubs without device tree node */
-	if (!dev->of_node)
-		return -ENODEV;
-
-	hub = _find_onboard_hub(dev);
-	if (IS_ERR(hub))
-		return PTR_ERR(hub);
-
-	dev_set_drvdata(dev, hub);
-
-	err = onboard_hub_add_usbdev(hub, udev);
-	if (err)
-		return err;
-
-	return 0;
-}
-
-static void onboard_hub_usbdev_disconnect(struct usb_device *udev)
-{
-	struct onboard_hub *hub = dev_get_drvdata(&udev->dev);
-
-	onboard_hub_remove_usbdev(hub, udev);
-}
-
-static const struct usb_device_id onboard_hub_id_table[] = {
-	{ USB_DEVICE(VENDOR_ID_CYPRESS, 0x6504) }, /* CYUSB33{0,1,2}x/CYUSB230x 3.0 */
-	{ USB_DEVICE(VENDOR_ID_CYPRESS, 0x6506) }, /* CYUSB33{0,1,2}x/CYUSB230x 2.0 */
-	{ USB_DEVICE(VENDOR_ID_CYPRESS, 0x6570) }, /* CY7C6563x 2.0 */
-	{ USB_DEVICE(VENDOR_ID_GENESYS, 0x0608) }, /* Genesys Logic GL850G USB 2.0 */
-	{ USB_DEVICE(VENDOR_ID_GENESYS, 0x0610) }, /* Genesys Logic GL852G USB 2.0 */
-	{ USB_DEVICE(VENDOR_ID_GENESYS, 0x0620) }, /* Genesys Logic GL3523 USB 3.1 */
-	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2412) }, /* USB2412 USB 2.0 */
-	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2514) }, /* USB2514B USB 2.0 */
-	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2517) }, /* USB2517 USB 2.0 */
-	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x2744) }, /* USB5744 USB 2.0 */
-	{ USB_DEVICE(VENDOR_ID_MICROCHIP, 0x5744) }, /* USB5744 USB 3.0 */
-	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0411) }, /* RTS5411 USB 3.1 */
-	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5411) }, /* RTS5411 USB 2.1 */
-	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x0414) }, /* RTS5414 USB 3.2 */
-	{ USB_DEVICE(VENDOR_ID_REALTEK, 0x5414) }, /* RTS5414 USB 2.1 */
-	{ USB_DEVICE(VENDOR_ID_TI, 0x8025) }, /* TI USB8020B 3.0 */
-	{ USB_DEVICE(VENDOR_ID_TI, 0x8027) }, /* TI USB8020B 2.0 */
-	{ USB_DEVICE(VENDOR_ID_TI, 0x8140) }, /* TI USB8041 3.0 */
-	{ USB_DEVICE(VENDOR_ID_TI, 0x8142) }, /* TI USB8041 2.0 */
-	{ USB_DEVICE(VENDOR_ID_VIA, 0x0817) }, /* VIA VL817 3.1 */
-	{ USB_DEVICE(VENDOR_ID_VIA, 0x2817) }, /* VIA VL817 2.0 */
-	{}
-};
-MODULE_DEVICE_TABLE(usb, onboard_hub_id_table);
-
-static struct usb_device_driver onboard_hub_usbdev_driver = {
-	.name = "onboard-usb-hub",
-	.probe = onboard_hub_usbdev_probe,
-	.disconnect = onboard_hub_usbdev_disconnect,
-	.generic_subclass = 1,
-	.supports_autosuspend =	1,
-	.id_table = onboard_hub_id_table,
-};
-
-static int __init onboard_hub_init(void)
-{
-	int ret;
-
-	ret = usb_register_device_driver(&onboard_hub_usbdev_driver, THIS_MODULE);
-	if (ret)
-		return ret;
-
-	ret = platform_driver_register(&onboard_hub_driver);
-	if (ret)
-		usb_deregister_device_driver(&onboard_hub_usbdev_driver);
-
-	return ret;
-}
-module_init(onboard_hub_init);
-
-static void __exit onboard_hub_exit(void)
-{
-	usb_deregister_device_driver(&onboard_hub_usbdev_driver);
-	platform_driver_unregister(&onboard_hub_driver);
-
-	cancel_work_sync(&attach_usb_driver_work);
-}
-module_exit(onboard_hub_exit);
-
-MODULE_AUTHOR("Matthias Kaehlcke <mka@chromium.org>");
-MODULE_DESCRIPTION("Driver for discrete onboard USB hubs");
-MODULE_LICENSE("GPL v2");
diff --git a/drivers/usb/misc/onboard_usb_hub.h b/drivers/usb/misc/onboard_usb_hub.h
deleted file mode 100644
index edbca8aa6ea0..000000000000
--- a/drivers/usb/misc/onboard_usb_hub.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright (c) 2022, Google LLC
- */
-
-#ifndef _USB_MISC_ONBOARD_USB_HUB_H
-#define _USB_MISC_ONBOARD_USB_HUB_H
-
-#define MAX_SUPPLIES 2
-
-struct onboard_hub_pdata {
-	unsigned long reset_us;		/* reset pulse width in us */
-	unsigned int num_supplies;	/* number of supplies */
-	const char * const supply_names[MAX_SUPPLIES];
-};
-
-static const struct onboard_hub_pdata microchip_usb424_data = {
-	.reset_us = 1,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata microchip_usb5744_data = {
-	.reset_us = 0,
-	.num_supplies = 2,
-	.supply_names = { "vdd", "vdd2" },
-};
-
-static const struct onboard_hub_pdata realtek_rts5411_data = {
-	.reset_us = 0,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata ti_tusb8020b_data = {
-	.reset_us = 3000,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata ti_tusb8041_data = {
-	.reset_us = 3000,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata cypress_hx3_data = {
-	.reset_us = 10000,
-	.num_supplies = 2,
-	.supply_names = { "vdd", "vdd2" },
-};
-
-static const struct onboard_hub_pdata cypress_hx2vl_data = {
-	.reset_us = 1,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata genesys_gl850g_data = {
-	.reset_us = 3,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata genesys_gl852g_data = {
-	.reset_us = 50,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct onboard_hub_pdata vialab_vl817_data = {
-	.reset_us = 10,
-	.num_supplies = 1,
-	.supply_names = { "vdd" },
-};
-
-static const struct of_device_id onboard_hub_match[] = {
-	{ .compatible = "usb424,2412", .data = &microchip_usb424_data, },
-	{ .compatible = "usb424,2514", .data = &microchip_usb424_data, },
-	{ .compatible = "usb424,2517", .data = &microchip_usb424_data, },
-	{ .compatible = "usb424,2744", .data = &microchip_usb5744_data, },
-	{ .compatible = "usb424,5744", .data = &microchip_usb5744_data, },
-	{ .compatible = "usb451,8025", .data = &ti_tusb8020b_data, },
-	{ .compatible = "usb451,8027", .data = &ti_tusb8020b_data, },
-	{ .compatible = "usb451,8140", .data = &ti_tusb8041_data, },
-	{ .compatible = "usb451,8142", .data = &ti_tusb8041_data, },
-	{ .compatible = "usb4b4,6504", .data = &cypress_hx3_data, },
-	{ .compatible = "usb4b4,6506", .data = &cypress_hx3_data, },
-	{ .compatible = "usb4b4,6570", .data = &cypress_hx2vl_data, },
-	{ .compatible = "usb5e3,608", .data = &genesys_gl850g_data, },
-	{ .compatible = "usb5e3,610", .data = &genesys_gl852g_data, },
-	{ .compatible = "usb5e3,620", .data = &genesys_gl852g_data, },
-	{ .compatible = "usb5e3,626", .data = &genesys_gl852g_data, },
-	{ .compatible = "usbbda,411", .data = &realtek_rts5411_data, },
-	{ .compatible = "usbbda,5411", .data = &realtek_rts5411_data, },
-	{ .compatible = "usbbda,414", .data = &realtek_rts5411_data, },
-	{ .compatible = "usbbda,5414", .data = &realtek_rts5411_data, },
-	{ .compatible = "usb2109,817", .data = &vialab_vl817_data, },
-	{ .compatible = "usb2109,2817", .data = &vialab_vl817_data, },
-	{}
-};
-
-#endif /* _USB_MISC_ONBOARD_USB_HUB_H */
diff --git a/drivers/usb/misc/onboard_usb_hub_pdevs.c b/drivers/usb/misc/onboard_usb_hub_pdevs.c
deleted file mode 100644
index ed22a18f4ab7..000000000000
--- a/drivers/usb/misc/onboard_usb_hub_pdevs.c
+++ /dev/null
@@ -1,143 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * API for creating and destroying USB onboard hub platform devices
- *
- * Copyright (c) 2022, Google LLC
- */
-
-#include <linux/device.h>
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/of.h>
-#include <linux/of_platform.h>
-#include <linux/platform_device.h>
-#include <linux/usb.h>
-#include <linux/usb/hcd.h>
-#include <linux/usb/of.h>
-#include <linux/usb/onboard_hub.h>
-
-#include "onboard_usb_hub.h"
-
-struct pdev_list_entry {
-	struct platform_device *pdev;
-	struct list_head node;
-};
-
-static bool of_is_onboard_usb_hub(const struct device_node *np)
-{
-	return !!of_match_node(onboard_hub_match, np);
-}
-
-/**
- * onboard_hub_create_pdevs -- create platform devices for onboard USB hubs
- * @parent_hub	: parent hub to scan for connected onboard hubs
- * @pdev_list	: list of onboard hub platform devices owned by the parent hub
- *
- * Creates a platform device for each supported onboard hub that is connected to
- * the given parent hub. The platform device is in charge of initializing the
- * hub (enable regulators, take the hub out of reset, ...) and can optionally
- * control whether the hub remains powered during system suspend or not.
- *
- * To keep track of the platform devices they are added to a list that is owned
- * by the parent hub.
- *
- * Some background about the logic in this function, which can be a bit hard
- * to follow:
- *
- * Root hubs don't have dedicated device tree nodes, but use the node of their
- * HCD. The primary and secondary HCD are usually represented by a single DT
- * node. That means the root hubs of the primary and secondary HCD share the
- * same device tree node (the HCD node). As a result this function can be called
- * twice with the same DT node for root hubs. We only want to create a single
- * platform device for each physical onboard hub, hence for root hubs the loop
- * is only executed for the root hub of the primary HCD. Since the function
- * scans through all child nodes it still creates pdevs for onboard hubs
- * connected to the root hub of the secondary HCD if needed.
- *
- * Further there must be only one platform device for onboard hubs with a peer
- * hub (the hub is a single physical device). To achieve this two measures are
- * taken: pdevs for onboard hubs with a peer are only created when the function
- * is called on behalf of the parent hub that is connected to the primary HCD
- * (directly or through other hubs). For onboard hubs connected to root hubs
- * the function processes the nodes of both peers. A platform device is only
- * created if the peer hub doesn't have one already.
- */
-void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list)
-{
-	int i;
-	struct usb_hcd *hcd = bus_to_hcd(parent_hub->bus);
-	struct device_node *np, *npc;
-	struct platform_device *pdev;
-	struct pdev_list_entry *pdle;
-
-	if (!parent_hub->dev.of_node)
-		return;
-
-	if (!parent_hub->parent && !usb_hcd_is_primary_hcd(hcd))
-		return;
-
-	for (i = 1; i <= parent_hub->maxchild; i++) {
-		np = usb_of_get_device_node(parent_hub, i);
-		if (!np)
-			continue;
-
-		if (!of_is_onboard_usb_hub(np))
-			goto node_put;
-
-		npc = of_parse_phandle(np, "peer-hub", 0);
-		if (npc) {
-			if (!usb_hcd_is_primary_hcd(hcd)) {
-				of_node_put(npc);
-				goto node_put;
-			}
-
-			pdev = of_find_device_by_node(npc);
-			of_node_put(npc);
-
-			if (pdev) {
-				put_device(&pdev->dev);
-				goto node_put;
-			}
-		}
-
-		pdev = of_platform_device_create(np, NULL, &parent_hub->dev);
-		if (!pdev) {
-			dev_err(&parent_hub->dev,
-				"failed to create platform device for onboard hub '%pOF'\n", np);
-			goto node_put;
-		}
-
-		pdle = kzalloc(sizeof(*pdle), GFP_KERNEL);
-		if (!pdle) {
-			of_platform_device_destroy(&pdev->dev, NULL);
-			goto node_put;
-		}
-
-		pdle->pdev = pdev;
-		list_add(&pdle->node, pdev_list);
-
-node_put:
-		of_node_put(np);
-	}
-}
-EXPORT_SYMBOL_GPL(onboard_hub_create_pdevs);
-
-/**
- * onboard_hub_destroy_pdevs -- free resources of onboard hub platform devices
- * @pdev_list	: list of onboard hub platform devices
- *
- * Destroys the platform devices in the given list and frees the memory associated
- * with the list entry.
- */
-void onboard_hub_destroy_pdevs(struct list_head *pdev_list)
-{
-	struct pdev_list_entry *pdle, *tmp;
-
-	list_for_each_entry_safe(pdle, tmp, pdev_list, node) {
-		list_del(&pdle->node);
-		of_platform_device_destroy(&pdle->pdev->dev, NULL);
-		kfree(pdle);
-	}
-}
-EXPORT_SYMBOL_GPL(onboard_hub_destroy_pdevs);
diff --git a/include/linux/usb/onboard_dev.h b/include/linux/usb/onboard_dev.h
new file mode 100644
index 000000000000..b79db6d193c8
--- /dev/null
+++ b/include/linux/usb/onboard_dev.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __LINUX_USB_ONBOARD_DEV_H
+#define __LINUX_USB_ONBOARD_DEV_H
+
+struct usb_device;
+struct list_head;
+
+#if IS_ENABLED(CONFIG_USB_ONBOARD_DEV)
+void onboard_dev_create_pdevs(struct usb_device *parent_dev, struct list_head *pdev_list);
+void onboard_dev_destroy_pdevs(struct list_head *pdev_list);
+#else
+static inline void onboard_dev_create_pdevs(struct usb_device *parent_dev,
+					    struct list_head *pdev_list) {}
+static inline void onboard_dev_destroy_pdevs(struct list_head *pdev_list) {}
+#endif
+
+#endif /* __LINUX_USB_ONBOARD_DEV_H */
diff --git a/include/linux/usb/onboard_hub.h b/include/linux/usb/onboard_hub.h
deleted file mode 100644
index d9373230556e..000000000000
--- a/include/linux/usb/onboard_hub.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __LINUX_USB_ONBOARD_HUB_H
-#define __LINUX_USB_ONBOARD_HUB_H
-
-struct usb_device;
-struct list_head;
-
-#if IS_ENABLED(CONFIG_USB_ONBOARD_HUB)
-void onboard_hub_create_pdevs(struct usb_device *parent_hub, struct list_head *pdev_list);
-void onboard_hub_destroy_pdevs(struct list_head *pdev_list);
-#else
-static inline void onboard_hub_create_pdevs(struct usb_device *parent_hub,
-					    struct list_head *pdev_list) {}
-static inline void onboard_hub_destroy_pdevs(struct list_head *pdev_list) {}
-#endif
-
-#endif /* __LINUX_USB_ONBOARD_HUB_H */
-- 
cgit v1.2.3


From 0bb322be5d389c00740d884351d4ba08fca938aa Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Mon, 25 Mar 2024 17:14:09 -0500
Subject: driver core: Remove unused platform_notify, platform_notify_remove

The "platform_notify" and "platform_notify_remove" hooks have been unused
since 00ba9357d189 ("ARM: ixp4xx: Drop custom DMA coherency and bouncing").

Remove "platform_notify" and "platform_notify_remove".  No functional
change intended.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Heikki Krogerus <heikki.krogerus@linux.intel.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Link: https://lore.kernel.org/r/20240325221409.1457036-1-helgaas@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    |  8 --------
 include/linux/device.h | 11 -----------
 2 files changed, 19 deletions(-)

(limited to 'include')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index b93f3c5716ae..78dfa74ee18b 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2331,8 +2331,6 @@ static void fw_devlink_link_device(struct device *dev)
 
 /* Device links support end. */
 
-int (*platform_notify)(struct device *dev) = NULL;
-int (*platform_notify_remove)(struct device *dev) = NULL;
 static struct kobject *dev_kobj;
 
 /* /sys/dev/char */
@@ -2380,16 +2378,10 @@ static void device_platform_notify(struct device *dev)
 	acpi_device_notify(dev);
 
 	software_node_notify(dev);
-
-	if (platform_notify)
-		platform_notify(dev);
 }
 
 static void device_platform_notify_remove(struct device *dev)
 {
-	if (platform_notify_remove)
-		platform_notify_remove(dev);
-
 	software_node_notify_remove(dev);
 
 	acpi_device_notify_remove(dev);
diff --git a/include/linux/device.h b/include/linux/device.h
index 97c4b046c09d..c515ba5756e4 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -1206,17 +1206,6 @@ int __must_check devm_device_add_groups(struct device *dev,
 int __must_check devm_device_add_group(struct device *dev,
 				       const struct attribute_group *grp);
 
-/*
- * Platform "fixup" functions - allow the platform to have their say
- * about devices and actions that the general device layer doesn't
- * know about.
- */
-/* Notify platform of device discovery */
-extern int (*platform_notify)(struct device *dev);
-
-extern int (*platform_notify_remove)(struct device *dev);
-
-
 /*
  * get_device - atomically increment the reference count for the device.
  *
-- 
cgit v1.2.3


From 36d37f11f555812b0ded5d15aa686a6b9da57f61 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Sat, 23 Mar 2024 18:06:15 +0900
Subject: export.h: remove include/asm-generic/export.h

Commit 3a6dd5f614a1 ("riscv: remove unneeded #include
<asm-generic/export.h>") removed the last use of
include/asm-generic/export.h.

This deprecated header can go away.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Link: https://lore.kernel.org/r/20240323090615.1244904-1-masahiroy@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 include/asm-generic/export.h | 11 -----------
 1 file changed, 11 deletions(-)
 delete mode 100644 include/asm-generic/export.h

(limited to 'include')

diff --git a/include/asm-generic/export.h b/include/asm-generic/export.h
deleted file mode 100644
index 570cd4da7210..000000000000
--- a/include/asm-generic/export.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef __ASM_GENERIC_EXPORT_H
-#define __ASM_GENERIC_EXPORT_H
-
-/*
- * <asm/export.h> and <asm-generic/export.h> are deprecated.
- * Please include <linux/export.h> directly.
- */
-#include <linux/export.h>
-
-#endif
-- 
cgit v1.2.3


From 2f7d7ea44adbe7497b225bbb7bfc29e3c792094d Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Tue, 19 Mar 2024 09:30:15 +0100
Subject: ACPI: NHLT: Reintroduce types the table consists of

ACPICA commit 32260f5ce519e854546ce907fc0cc449e1fe51fe

Non HDAudio Link Table (NHLT) is designed to separate hardware-related
description (registers) from AudioDSP firmware-related one i.e.:
pipelines and modules that together make up the audio stream on Intel
DSPs. This task is important as same set of hardware registers can be
used with different topologies and vice versa, same topology could be
utilized with different set of hardware. As the hardware registers
description is directly tied to specific platform, intention is to have
such description part of low-level firmware e.g.: BIOS.

The initial design has been provided in early Sky Lake (SKL) days. The
audio architecture goes by the name cAVS. SKL is a representative of
cAVS 1.5. The table helps describe endpoint capabilities ever since.
While Raptor Lake (RPL) is the last of cAVS architecture - cAVS 2.5 to
be precise - its successor, the ACE architecture which begun with
Meteor Lake (MTL) inherited the design for all I2S and PDM
configurations. These two configurations are the primary targets for
NHLT table.

Due to naming conflicts with existing code, several structs are named
'nhlt2' rather than 'nhlt'. Follow up changes clean this up once
existing code has no users and is removed.

Link: https://github.com/acpica/acpica/pull/912
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 9775384d61c6..8030a1743100 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -2141,6 +2141,195 @@ struct acpi_nhlt_device_info {
 	u8 device_port_id;
 };
 
+/*******************************************************************************
+ *
+ * NHLT - Non HDAudio Link Table
+ *        Version 1
+ *
+ ******************************************************************************/
+
+struct acpi_table_nhlt2 {
+	struct acpi_table_header header;	/* Common ACPI table header */
+	u8 endpoints_count;
+	/*
+	 * struct acpi_nhlt_endpoint endpoints[];
+	 * struct acpi_nhlt_config oed_config;
+	 */
+};
+
+struct acpi_nhlt2_endpoint {
+	u32 length;
+	u8 link_type;
+	u8 instance_id;
+	u16 vendor_id;
+	u16 device_id;
+	u16 revision_id;
+	u32 subsystem_id;
+	u8 device_type;
+	u8 direction;
+	u8 virtual_bus_id;
+	/*
+	 * struct acpi_nhlt_config device_config;
+	 * struct acpi_nhlt_formats_config formats_config;
+	 * struct acpi_nhlt_devices_info devices_info;
+	 */
+};
+
+/*
+ * Values for link_type field above
+ *
+ * Only types PDM and SSP are used
+ */
+#define ACPI_NHLT_LINKTYPE_HDA			0
+#define ACPI_NHLT_LINKTYPE_DSP			1
+#define ACPI_NHLT_LINKTYPE_PDM			2
+#define ACPI_NHLT_LINKTYPE_SSP			3
+#define ACPI_NHLT_LINKTYPE_SLIMBUS		4
+#define ACPI_NHLT_LINKTYPE_SDW			5
+#define ACPI_NHLT_LINKTYPE_UAOL			6
+
+/* Values for device_id field above */
+
+#define ACPI_NHLT_DEVICEID_DMIC			0xAE20
+#define ACPI_NHLT_DEVICEID_BT			0xAE30
+#define ACPI_NHLT_DEVICEID_I2S			0xAE34
+
+/* Values for device_type field above */
+
+/*
+ * Device types unique to endpoint of link_type=PDM
+ *
+ * Type PDM used for all SKL+ platforms
+ */
+#define ACPI_NHLT_DEVICETYPE_PDM		0
+#define ACPI_NHLT_DEVICETYPE_PDM_SKL		1
+/* Device types unique to endpoint of link_type=SSP */
+#define ACPI_NHLT_DEVICETYPE_BT			0
+#define ACPI_NHLT_DEVICETYPE_FM			1
+#define ACPI_NHLT_DEVICETYPE_MODEM		2
+#define ACPI_NHLT_DEVICETYPE_CODEC		4
+
+/* Values for Direction field above */
+
+#define ACPI_NHLT_DIR_RENDER			0
+#define ACPI_NHLT_DIR_CAPTURE			1
+
+struct acpi_nhlt_config {
+	u32 capabilities_size;
+	u8 capabilities[];
+};
+
+struct acpi_nhlt_gendevice_config {
+	u8 virtual_slot;
+	u8 config_type;
+};
+
+/* Values for config_type field above */
+
+#define ACPI_NHLT_CONFIGTYPE_GENERIC		0
+#define ACPI_NHLT_CONFIGTYPE_MICARRAY		1
+
+struct acpi_nhlt_micdevice_config {
+	u8 virtual_slot;
+	u8 config_type;
+	u8 array_type;
+};
+
+/* Values for array_type field above */
+
+#define ACPI_NHLT_ARRAYTYPE_LINEAR2_SMALL	0xA
+#define ACPI_NHLT_ARRAYTYPE_LINEAR2_BIG		0xB
+#define ACPI_NHLT_ARRAYTYPE_LINEAR4_GEO1	0xC
+#define ACPI_NHLT_ARRAYTYPE_PLANAR4_LSHAPED	0xD
+#define ACPI_NHLT_ARRAYTYPE_LINEAR4_GEO2	0xE
+#define ACPI_NHLT_ARRAYTYPE_VENDOR		0xF
+
+struct acpi_nhlt2_vendor_mic_config {
+	u8 type;
+	u8 panel;
+	u16 speaker_position_distance;		/* mm */
+	u16 horizontal_offset;			/* mm */
+	u16 vertical_offset;			/* mm */
+	u8 frequency_low_band;			/* 5*Hz */
+	u8 frequency_high_band;			/* 500*Hz */
+	u16 direction_angle;			/* -180 - +180 */
+	u16 elevation_angle;			/* -180 - +180 */
+	u16 work_vertical_angle_begin;		/* -180 - +180 with 2 deg step */
+	u16 work_vertical_angle_end;		/* -180 - +180 with 2 deg step */
+	u16 work_horizontal_angle_begin;	/* -180 - +180 with 2 deg step */
+	u16 work_horizontal_angle_end;		/* -180 - +180 with 2 deg step */
+};
+
+/* Values for Type field above */
+
+#define ACPI_NHLT_MICTYPE_OMNIDIRECTIONAL	0
+#define ACPI_NHLT_MICTYPE_SUBCARDIOID		1
+#define ACPI_NHLT_MICTYPE_CARDIOID		2
+#define ACPI_NHLT_MICTYPE_SUPERCARDIOID		3
+#define ACPI_NHLT_MICTYPE_HYPERCARDIOID		4
+#define ACPI_NHLT_MICTYPE_8SHAPED		5
+#define ACPI_NHLT_MICTYPE_RESERVED		6
+#define ACPI_NHLT_MICTYPE_VENDORDEFINED		7
+
+/* Values for Panel field above */
+
+#define ACPI_NHLT_MICLOCATION_TOP		0
+#define ACPI_NHLT_MICLOCATION_BOTTOM		1
+#define ACPI_NHLT_MICLOCATION_LEFT		2
+#define ACPI_NHLT_MICLOCATION_RIGHT		3
+#define ACPI_NHLT_MICLOCATION_FRONT		4
+#define ACPI_NHLT_MICLOCATION_REAR		5
+
+struct acpi_nhlt_vendor_micdevice_config {
+	u8 virtual_slot;
+	u8 config_type;
+	u8 array_type;
+	u8 mics_count;
+	struct acpi_nhlt2_vendor_mic_config mics[];
+};
+
+union acpi_nhlt_device_config {
+	u8 virtual_slot;
+	struct acpi_nhlt_gendevice_config gen;
+	struct acpi_nhlt_micdevice_config mic;
+	struct acpi_nhlt_vendor_micdevice_config vendor_mic;
+};
+
+/* Inherited from Microsoft's WAVEFORMATEXTENSIBLE. */
+struct acpi_nhlt2_wave_formatext {
+	u16 format_tag;
+	u16 channel_count;
+	u32 samples_per_sec;
+	u32 avg_bytes_per_sec;
+	u16 block_align;
+	u16 bits_per_sample;
+	u16 extra_format_size;
+	u16 valid_bits_per_sample;
+	u32 channel_mask;
+	u8 subformat[16];
+};
+
+struct acpi_nhlt2_format_config {
+	struct acpi_nhlt2_wave_formatext format;
+	struct acpi_nhlt_config config;
+};
+
+struct acpi_nhlt2_formats_config {
+	u8 formats_count;
+	struct acpi_nhlt2_format_config formats[];
+};
+
+struct acpi_nhlt2_device_info {
+	u8 id[16];
+	u8 instance_id;
+	u8 port_id;
+};
+
+struct acpi_nhlt_devices_info {
+	u8 devices_count;
+	struct acpi_nhlt2_device_info devices[];
+};
+
 /*******************************************************************************
  *
  * PCCT - Platform Communications Channel Table (ACPI 5.0)
-- 
cgit v1.2.3


From 82b8acc06ea48b69c2482ac2ac994656c3740d08 Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Tue, 19 Mar 2024 09:30:16 +0100
Subject: ACPI: NHLT: Introduce API for the table

The table is composed of a range of endpoints with each describing
audio formats they support. Most of the operations involve iterating
over elements of the table and filtering them. Simplify the process by
implementing range of getters.

While the acpi_nhlt_endpoint_mic_count() stands out a bit, it is a
critical component for any AudioDSP driver to know how many digital
microphones it is dealing with.

Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/Kconfig  |   3 +
 drivers/acpi/Makefile |   1 +
 drivers/acpi/nhlt.c   | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/acpi/nhlt.h   | 181 +++++++++++++++++++++++++++++++
 4 files changed, 474 insertions(+)
 create mode 100644 drivers/acpi/nhlt.c
 create mode 100644 include/acpi/nhlt.h

(limited to 'include')

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index ff1689bb3124..e3a7c2aedd5f 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -469,6 +469,9 @@ config ACPI_REDUCED_HARDWARE_ONLY
 
 	  If you are unsure what to do, do not enable this option.
 
+config ACPI_NHLT
+	bool
+
 source "drivers/acpi/nfit/Kconfig"
 source "drivers/acpi/numa/Kconfig"
 source "drivers/acpi/apei/Kconfig"
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 8cc8c0d9c873..d69d5444acdb 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_ACPI_THERMAL_LIB)	+= thermal_lib.o
 obj-$(CONFIG_ACPI_THERMAL)	+= thermal.o
 obj-$(CONFIG_ACPI_PLATFORM_PROFILE) 	+= platform_profile.o
 obj-$(CONFIG_ACPI_NFIT)		+= nfit/
+obj-$(CONFIG_ACPI_NHLT)		+= nhlt.o
 obj-$(CONFIG_ACPI_NUMA)		+= numa/
 obj-$(CONFIG_ACPI)		+= acpi_memhotplug.o
 obj-$(CONFIG_ACPI_HOTPLUG_IOAPIC) += ioapic.o
diff --git a/drivers/acpi/nhlt.c b/drivers/acpi/nhlt.c
new file mode 100644
index 000000000000..ab722a95cbb5
--- /dev/null
+++ b/drivers/acpi/nhlt.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright(c) 2023-2024 Intel Corporation
+ *
+ * Authors: Cezary Rojewski <cezary.rojewski@intel.com>
+ *          Amadeusz Slawinski <amadeuszx.slawinski@linux.intel.com>
+ */
+
+#define pr_fmt(fmt) "ACPI: NHLT: " fmt
+
+#include <linux/acpi.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/minmax.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <acpi/nhlt.h>
+
+static struct acpi_table_nhlt2 *acpi_gbl_nhlt;
+
+static struct acpi_table_nhlt2 empty_nhlt = {
+	.header = {
+		.signature = ACPI_SIG_NHLT,
+	},
+};
+
+/**
+ * acpi_nhlt_get_gbl_table - Retrieve a pointer to the first NHLT table.
+ *
+ * If there is no NHLT in the system, acpi_gbl_nhlt will instead point to an
+ * empty table.
+ *
+ * Return: ACPI status code of the operation.
+ */
+acpi_status acpi_nhlt_get_gbl_table(void)
+{
+	acpi_status status;
+
+	status = acpi_get_table(ACPI_SIG_NHLT, 0, (struct acpi_table_header **)(&acpi_gbl_nhlt));
+	if (!acpi_gbl_nhlt)
+		acpi_gbl_nhlt = &empty_nhlt;
+	return status;
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_get_gbl_table);
+
+/**
+ * acpi_nhlt_put_gbl_table - Release the global NHLT table.
+ */
+void acpi_nhlt_put_gbl_table(void)
+{
+	acpi_put_table((struct acpi_table_header *)acpi_gbl_nhlt);
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_put_gbl_table);
+
+/**
+ * acpi_nhlt_endpoint_match - Verify if an endpoint matches criteria.
+ * @ep:			the endpoint to check.
+ * @link_type:		the hardware link type, e.g.: PDM or SSP.
+ * @dev_type:		the device type.
+ * @dir:		stream direction.
+ * @bus_id:		the ID of virtual bus hosting the endpoint.
+ *
+ * Either of @link_type, @dev_type, @dir or @bus_id may be set to a negative
+ * value to ignore the parameter when matching.
+ *
+ * Return: %true if endpoint matches specified criteria or %false otherwise.
+ */
+bool acpi_nhlt_endpoint_match(const struct acpi_nhlt2_endpoint *ep,
+			      int link_type, int dev_type, int dir, int bus_id)
+{
+	return ep &&
+	       (link_type < 0 || ep->link_type == link_type) &&
+	       (dev_type < 0 || ep->device_type == dev_type) &&
+	       (bus_id < 0 || ep->virtual_bus_id == bus_id) &&
+	       (dir < 0 || ep->direction == dir);
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_endpoint_match);
+
+/**
+ * acpi_nhlt_tb_find_endpoint - Search a NHLT table for an endpoint.
+ * @tb:			the table to search.
+ * @link_type:		the hardware link type, e.g.: PDM or SSP.
+ * @dev_type:		the device type.
+ * @dir:		stream direction.
+ * @bus_id:		the ID of virtual bus hosting the endpoint.
+ *
+ * Either of @link_type, @dev_type, @dir or @bus_id may be set to a negative
+ * value to ignore the parameter during the search.
+ *
+ * Return: A pointer to endpoint matching the criteria, %NULL if not found or
+ * an ERR_PTR() otherwise.
+ */
+struct acpi_nhlt2_endpoint *
+acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt2 *tb,
+			   int link_type, int dev_type, int dir, int bus_id)
+{
+	struct acpi_nhlt2_endpoint *ep;
+
+	for_each_nhlt_endpoint(tb, ep)
+		if (acpi_nhlt_endpoint_match(ep, link_type, dev_type, dir, bus_id))
+			return ep;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_tb_find_endpoint);
+
+/**
+ * acpi_nhlt_find_endpoint - Search all NHLT tables for an endpoint.
+ * @link_type:		the hardware link type, e.g.: PDM or SSP.
+ * @dev_type:		the device type.
+ * @dir:		stream direction.
+ * @bus_id:		the ID of virtual bus hosting the endpoint.
+ *
+ * Either of @link_type, @dev_type, @dir or @bus_id may be set to a negative
+ * value to ignore the parameter during the search.
+ *
+ * Return: A pointer to endpoint matching the criteria, %NULL if not found or
+ * an ERR_PTR() otherwise.
+ */
+struct acpi_nhlt2_endpoint *
+acpi_nhlt_find_endpoint(int link_type, int dev_type, int dir, int bus_id)
+{
+	/* TODO: Currently limited to table of index 0. */
+	return acpi_nhlt_tb_find_endpoint(acpi_gbl_nhlt, link_type, dev_type, dir, bus_id);
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_find_endpoint);
+
+/**
+ * acpi_nhlt_endpoint_find_fmtcfg - Search endpoint's formats configuration space
+ *                                  for a specific format.
+ * @ep:			the endpoint to search.
+ * @ch:			number of channels.
+ * @rate:		samples per second.
+ * @vbps:		valid bits per sample.
+ * @bps:		bits per sample.
+ *
+ * Return: A pointer to format matching the criteria, %NULL if not found or
+ * an ERR_PTR() otherwise.
+ */
+struct acpi_nhlt2_format_config *
+acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt2_endpoint *ep,
+			       u16 ch, u32 rate, u16 vbps, u16 bps)
+{
+	struct acpi_nhlt2_wave_formatext *wav;
+	struct acpi_nhlt2_format_config *fmt;
+
+	for_each_nhlt_endpoint_fmtcfg(ep, fmt) {
+		wav = &fmt->format;
+
+		if (wav->valid_bits_per_sample == vbps &&
+		    wav->samples_per_sec == rate &&
+		    wav->bits_per_sample == bps &&
+		    wav->channel_count == ch)
+			return fmt;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_endpoint_find_fmtcfg);
+
+/**
+ * acpi_nhlt_tb_find_fmtcfg - Search a NHLT table for a specific format.
+ * @tb:			the table to search.
+ * @link_type:		the hardware link type, e.g.: PDM or SSP.
+ * @dev_type:		the device type.
+ * @dir:		stream direction.
+ * @bus_id:		the ID of virtual bus hosting the endpoint.
+ *
+ * @ch:			number of channels.
+ * @rate:		samples per second.
+ * @vbps:		valid bits per sample.
+ * @bps:		bits per sample.
+ *
+ * Either of @link_type, @dev_type, @dir or @bus_id may be set to a negative
+ * value to ignore the parameter during the search.
+ *
+ * Return: A pointer to format matching the criteria, %NULL if not found or
+ * an ERR_PTR() otherwise.
+ */
+struct acpi_nhlt2_format_config *
+acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt2 *tb,
+			 int link_type, int dev_type, int dir, int bus_id,
+			 u16 ch, u32 rate, u16 vbps, u16 bps)
+{
+	struct acpi_nhlt2_format_config *fmt;
+	struct acpi_nhlt2_endpoint *ep;
+
+	for_each_nhlt_endpoint(tb, ep) {
+		if (!acpi_nhlt_endpoint_match(ep, link_type, dev_type, dir, bus_id))
+			continue;
+
+		fmt = acpi_nhlt_endpoint_find_fmtcfg(ep, ch, rate, vbps, bps);
+		if (fmt)
+			return fmt;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_tb_find_fmtcfg);
+
+/**
+ * acpi_nhlt_find_fmtcfg - Search all NHLT tables for a specific format.
+ * @link_type:		the hardware link type, e.g.: PDM or SSP.
+ * @dev_type:		the device type.
+ * @dir:		stream direction.
+ * @bus_id:		the ID of virtual bus hosting the endpoint.
+ *
+ * @ch:			number of channels.
+ * @rate:		samples per second.
+ * @vbps:		valid bits per sample.
+ * @bps:		bits per sample.
+ *
+ * Either of @link_type, @dev_type, @dir or @bus_id may be set to a negative
+ * value to ignore the parameter during the search.
+ *
+ * Return: A pointer to format matching the criteria, %NULL if not found or
+ * an ERR_PTR() otherwise.
+ */
+struct acpi_nhlt2_format_config *
+acpi_nhlt_find_fmtcfg(int link_type, int dev_type, int dir, int bus_id,
+		      u16 ch, u32 rate, u16 vbps, u16 bps)
+{
+	/* TODO: Currently limited to table of index 0. */
+	return acpi_nhlt_tb_find_fmtcfg(acpi_gbl_nhlt, link_type, dev_type, dir, bus_id,
+					ch, rate, vbps, bps);
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_find_fmtcfg);
+
+static bool acpi_nhlt_config_is_micdevice(struct acpi_nhlt_config *cfg)
+{
+	return cfg->capabilities_size >= sizeof(struct acpi_nhlt_micdevice_config);
+}
+
+static bool acpi_nhlt_config_is_vendor_micdevice(struct acpi_nhlt_config *cfg)
+{
+	struct acpi_nhlt_vendor_micdevice_config *devcfg = __acpi_nhlt_config_caps(cfg);
+
+	return cfg->capabilities_size >= sizeof(*devcfg) &&
+	       cfg->capabilities_size == struct_size(devcfg, mics, devcfg->mics_count);
+}
+
+/**
+ * acpi_nhlt_endpoint_mic_count - Retrieve number of digital microphones for a PDM endpoint.
+ * @ep:			the endpoint to return microphones count for.
+ *
+ * Return: A number of microphones or an error code if an invalid endpoint is provided.
+ */
+int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt2_endpoint *ep)
+{
+	union acpi_nhlt_device_config *devcfg;
+	struct acpi_nhlt2_format_config *fmt;
+	struct acpi_nhlt_config *cfg;
+	u16 max_ch = 0;
+
+	if (!ep || ep->link_type != ACPI_NHLT_LINKTYPE_PDM)
+		return -EINVAL;
+
+	/* Find max number of channels based on formats configuration. */
+	for_each_nhlt_endpoint_fmtcfg(ep, fmt)
+		max_ch = max(fmt->format.channel_count, max_ch);
+
+	cfg = __acpi_nhlt_endpoint_config(ep);
+	devcfg = __acpi_nhlt_config_caps(cfg);
+
+	/* If @ep is not a mic array, fallback to channels count. */
+	if (!acpi_nhlt_config_is_micdevice(cfg) ||
+	    devcfg->gen.config_type != ACPI_NHLT_CONFIGTYPE_MICARRAY)
+		return max_ch;
+
+	switch (devcfg->mic.array_type) {
+	case ACPI_NHLT_ARRAYTYPE_LINEAR2_SMALL:
+	case ACPI_NHLT_ARRAYTYPE_LINEAR2_BIG:
+		return 2;
+
+	case ACPI_NHLT_ARRAYTYPE_LINEAR4_GEO1:
+	case ACPI_NHLT_ARRAYTYPE_PLANAR4_LSHAPED:
+	case ACPI_NHLT_ARRAYTYPE_LINEAR4_GEO2:
+		return 4;
+
+	case ACPI_NHLT_ARRAYTYPE_VENDOR:
+		if (!acpi_nhlt_config_is_vendor_micdevice(cfg))
+			return -EINVAL;
+		return devcfg->vendor_mic.mics_count;
+
+	default:
+		pr_warn("undefined mic array type: %#x\n", devcfg->mic.array_type);
+		return max_ch;
+	}
+}
+EXPORT_SYMBOL_GPL(acpi_nhlt_endpoint_mic_count);
diff --git a/include/acpi/nhlt.h b/include/acpi/nhlt.h
new file mode 100644
index 000000000000..7c394e7bc2bb
--- /dev/null
+++ b/include/acpi/nhlt.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright(c) 2023-2024 Intel Corporation
+ *
+ * Authors: Cezary Rojewski <cezary.rojewski@intel.com>
+ *          Amadeusz Slawinski <amadeuszx.slawinski@linux.intel.com>
+ */
+
+#ifndef __ACPI_NHLT_H__
+#define __ACPI_NHLT_H__
+
+#include <linux/acpi.h>
+#include <linux/kconfig.h>
+#include <linux/overflow.h>
+#include <linux/types.h>
+
+#define __acpi_nhlt_endpoint_config(ep)		((void *)((ep) + 1))
+#define __acpi_nhlt_config_caps(cfg)		((void *)((cfg) + 1))
+
+/**
+ * acpi_nhlt_endpoint_fmtscfg - Get the formats configuration space.
+ * @ep:		the endpoint to retrieve the space for.
+ *
+ * Return: A pointer to the formats configuration space.
+ */
+static inline struct acpi_nhlt2_formats_config *
+acpi_nhlt_endpoint_fmtscfg(const struct acpi_nhlt2_endpoint *ep)
+{
+	struct acpi_nhlt_config *cfg = __acpi_nhlt_endpoint_config(ep);
+
+	return (struct acpi_nhlt2_formats_config *)((u8 *)(cfg + 1) + cfg->capabilities_size);
+}
+
+#define __acpi_nhlt_first_endpoint(tb) \
+	((void *)(tb + 1))
+
+#define __acpi_nhlt_next_endpoint(ep) \
+	((void *)((u8 *)(ep) + (ep)->length))
+
+#define __acpi_nhlt_get_endpoint(tb, ep, i) \
+	((i) ? __acpi_nhlt_next_endpoint(ep) : __acpi_nhlt_first_endpoint(tb))
+
+#define __acpi_nhlt_first_fmtcfg(fmts) \
+	((void *)(fmts + 1))
+
+#define __acpi_nhlt_next_fmtcfg(fmt) \
+	((void *)((u8 *)((fmt) + 1) + (fmt)->config.capabilities_size))
+
+#define __acpi_nhlt_get_fmtcfg(fmts, fmt, i) \
+	((i) ? __acpi_nhlt_next_fmtcfg(fmt) : __acpi_nhlt_first_fmtcfg(fmts))
+
+/*
+ * The for_each_nhlt_*() macros rely on an iterator to deal with the
+ * variable length of each endpoint structure and the possible presence
+ * of an OED-Config used by Windows only.
+ */
+
+/**
+ * for_each_nhlt_endpoint - Iterate over endpoints in a NHLT table.
+ * @tb:		the pointer to a NHLT table.
+ * @ep:		the pointer to endpoint to use as loop cursor.
+ */
+#define for_each_nhlt_endpoint(tb, ep)					\
+	for (unsigned int __i = 0;					\
+	     __i < (tb)->endpoints_count &&				\
+		(ep = __acpi_nhlt_get_endpoint(tb, ep, __i));		\
+	     __i++)
+
+/**
+ * for_each_nhlt_fmtcfg - Iterate over format configurations.
+ * @fmts:	the pointer to formats configuration space.
+ * @fmt:	the pointer to format to use as loop cursor.
+ */
+#define for_each_nhlt_fmtcfg(fmts, fmt)					\
+	for (unsigned int __i = 0;					\
+	     __i < (fmts)->formats_count &&				\
+		(fmt = __acpi_nhlt_get_fmtcfg(fmts, fmt, __i));	\
+	     __i++)
+
+/**
+ * for_each_nhlt_endpoint_fmtcfg - Iterate over format configurations in an endpoint.
+ * @ep:		the pointer to an endpoint.
+ * @fmt:	the pointer to format to use as loop cursor.
+ */
+#define for_each_nhlt_endpoint_fmtcfg(ep, fmt) \
+	for_each_nhlt_fmtcfg(acpi_nhlt_endpoint_fmtscfg(ep), fmt)
+
+#if IS_ENABLED(CONFIG_ACPI_NHLT)
+
+/*
+ * System-wide pointer to the first NHLT table.
+ *
+ * A sound driver may utilize acpi_nhlt_get/put_gbl_table() on its
+ * initialization and removal respectively to avoid excessive mapping
+ * and unmapping of the memory occupied by the table between streaming
+ * operations.
+ */
+
+acpi_status acpi_nhlt_get_gbl_table(void);
+void acpi_nhlt_put_gbl_table(void);
+
+bool acpi_nhlt_endpoint_match(const struct acpi_nhlt2_endpoint *ep,
+			      int link_type, int dev_type, int dir, int bus_id);
+struct acpi_nhlt2_endpoint *
+acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt2 *tb,
+			   int link_type, int dev_type, int dir, int bus_id);
+struct acpi_nhlt2_endpoint *
+acpi_nhlt_find_endpoint(int link_type, int dev_type, int dir, int bus_id);
+struct acpi_nhlt2_format_config *
+acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt2_endpoint *ep,
+			       u16 ch, u32 rate, u16 vbps, u16 bps);
+struct acpi_nhlt2_format_config *
+acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt2 *tb,
+			 int link_type, int dev_type, int dir, int bus_id,
+			 u16 ch, u32 rate, u16 vpbs, u16 bps);
+struct acpi_nhlt2_format_config *
+acpi_nhlt_find_fmtcfg(int link_type, int dev_type, int dir, int bus_id,
+		      u16 ch, u32 rate, u16 vpbs, u16 bps);
+int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt2_endpoint *ep);
+
+#else /* !CONFIG_ACPI_NHLT */
+
+static inline acpi_status acpi_nhlt_get_gbl_table(void)
+{
+	return AE_NOT_FOUND;
+}
+
+static inline void acpi_nhlt_put_gbl_table(void)
+{
+}
+
+static inline bool
+acpi_nhlt_endpoint_match(const struct acpi_nhlt2_endpoint *ep,
+			 int link_type, int dev_type, int dir, int bus_id)
+{
+	return false;
+}
+
+static inline struct acpi_nhlt2_endpoint *
+acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt2 *tb,
+			   int link_type, int dev_type, int dir, int bus_id)
+{
+	return NULL;
+}
+
+static inline struct acpi_nhlt2_format_config *
+acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt2_endpoint *ep,
+			       u16 ch, u32 rate, u16 vbps, u16 bps)
+{
+	return NULL;
+}
+
+static inline struct acpi_nhlt2_format_config *
+acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt2 *tb,
+			 int link_type, int dev_type, int dir, int bus_id,
+			 u16 ch, u32 rate, u16 vpbs, u16 bps)
+{
+	return NULL;
+}
+
+static inline int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt2_endpoint *ep)
+{
+	return 0;
+}
+
+static inline struct acpi_nhlt2_endpoint *
+acpi_nhlt_find_endpoint(int link_type, int dev_type, int dir, int bus_id)
+{
+	return NULL;
+}
+
+static inline struct acpi_nhlt2_format_config *
+acpi_nhlt_find_fmtcfg(int link_type, int dev_type, int dir, int bus_id,
+		      u16 ch, u32 rate, u16 vpbs, u16 bps)
+{
+	return NULL;
+}
+
+#endif /* CONFIG_ACPI_NHLT */
+
+#endif /* __ACPI_NHLT_H__ */
-- 
cgit v1.2.3


From 659a9490ccfbfad4aa5ab351f8a6f5f83ce5ed6c Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Tue, 19 Mar 2024 09:30:17 +0100
Subject: ACPI: NHLT: Drop redundant types

ACPICA commit 0c7379eae2a0342bfc36d6b7db0bb90ad13a5a3e

There are no users for the duplicated NHLT table components.

Link: https://github.com/acpica/acpica/pull/890
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 254 --------------------------------------------------
 1 file changed, 254 deletions(-)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 8030a1743100..31a716a74340 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -1887,260 +1887,6 @@ struct nfit_device_handle {
 #define ACPI_NFIT_GET_NODE_ID(handle) \
 	(((handle) & ACPI_NFIT_NODE_ID_MASK)        >> ACPI_NFIT_NODE_ID_OFFSET)
 
-/*******************************************************************************
- *
- * NHLT - Non HD Audio Link Table
- *
- * Conforms to: Intel Smart Sound Technology NHLT Specification
- * Version 0.8.1, January 2020.
- *
- ******************************************************************************/
-
-/* Main table */
-
-struct acpi_table_nhlt {
-	struct acpi_table_header header;	/* Common ACPI table header */
-	u8 endpoint_count;
-};
-
-struct acpi_nhlt_endpoint {
-	u32 descriptor_length;
-	u8 link_type;
-	u8 instance_id;
-	u16 vendor_id;
-	u16 device_id;
-	u16 revision_id;
-	u32 subsystem_id;
-	u8 device_type;
-	u8 direction;
-	u8 virtual_bus_id;
-};
-
-/* Types for link_type field above */
-
-#define ACPI_NHLT_RESERVED_HD_AUDIO         0
-#define ACPI_NHLT_RESERVED_DSP              1
-#define ACPI_NHLT_PDM                       2
-#define ACPI_NHLT_SSP                       3
-#define ACPI_NHLT_RESERVED_SLIMBUS          4
-#define ACPI_NHLT_RESERVED_SOUNDWIRE        5
-#define ACPI_NHLT_TYPE_RESERVED             6	/* 6 and above are reserved */
-
-/* All other values above are reserved */
-
-/* Values for device_id field above */
-
-#define ACPI_NHLT_PDM_DMIC                  0xAE20
-#define ACPI_NHLT_BT_SIDEBAND               0xAE30
-#define ACPI_NHLT_I2S_TDM_CODECS            0xAE23
-
-/* Values for device_type field above */
-
-/* SSP Link */
-
-#define ACPI_NHLT_LINK_BT_SIDEBAND          0
-#define ACPI_NHLT_LINK_FM                   1
-#define ACPI_NHLT_LINK_MODEM                2
-/* 3 is reserved */
-#define ACPI_NHLT_LINK_SSP_ANALOG_CODEC     4
-
-/* PDM Link */
-
-#define ACPI_NHLT_PDM_ON_CAVS_1P8           0
-#define ACPI_NHLT_PDM_ON_CAVS_1P5           1
-
-/* Values for Direction field above */
-
-#define ACPI_NHLT_DIR_RENDER                0
-#define ACPI_NHLT_DIR_CAPTURE               1
-#define ACPI_NHLT_DIR_RENDER_LOOPBACK       2
-#define ACPI_NHLT_DIR_RENDER_FEEDBACK       3
-#define ACPI_NHLT_DIR_RESERVED              4	/* 4 and above are reserved */
-
-struct acpi_nhlt_device_specific_config {
-	u32 capabilities_size;
-	u8 virtual_slot;
-	u8 config_type;
-};
-
-struct acpi_nhlt_device_specific_config_a {
-	u32 capabilities_size;
-	u8 virtual_slot;
-	u8 config_type;
-	u8 array_type;
-};
-
-/* Values for Config Type above */
-
-#define ACPI_NHLT_CONFIG_TYPE_GENERIC              0x00
-#define ACPI_NHLT_CONFIG_TYPE_MIC_ARRAY            0x01
-#define ACPI_NHLT_CONFIG_TYPE_RENDER_FEEDBACK      0x03
-#define ACPI_NHLT_CONFIG_TYPE_RESERVED             0x04	/* 4 and above are reserved */
-
-struct acpi_nhlt_device_specific_config_b {
-	u32 capabilities_size;
-};
-
-struct acpi_nhlt_device_specific_config_c {
-	u32 capabilities_size;
-	u8 virtual_slot;
-};
-
-struct acpi_nhlt_render_device_specific_config {
-	u32 capabilities_size;
-	u8 virtual_slot;
-};
-
-struct acpi_nhlt_wave_extensible {
-	u16 format_tag;
-	u16 channel_count;
-	u32 samples_per_sec;
-	u32 avg_bytes_per_sec;
-	u16 block_align;
-	u16 bits_per_sample;
-	u16 extra_format_size;
-	u16 valid_bits_per_sample;
-	u32 channel_mask;
-	u8 sub_format_guid[16];
-};
-
-/* Values for channel_mask above */
-
-#define ACPI_NHLT_SPKR_FRONT_LEFT             0x1
-#define ACPI_NHLT_SPKR_FRONT_RIGHT            0x2
-#define ACPI_NHLT_SPKR_FRONT_CENTER           0x4
-#define ACPI_NHLT_SPKR_LOW_FREQ               0x8
-#define ACPI_NHLT_SPKR_BACK_LEFT              0x10
-#define ACPI_NHLT_SPKR_BACK_RIGHT             0x20
-#define ACPI_NHLT_SPKR_FRONT_LEFT_OF_CENTER   0x40
-#define ACPI_NHLT_SPKR_FRONT_RIGHT_OF_CENTER  0x80
-#define ACPI_NHLT_SPKR_BACK_CENTER            0x100
-#define ACPI_NHLT_SPKR_SIDE_LEFT              0x200
-#define ACPI_NHLT_SPKR_SIDE_RIGHT             0x400
-#define ACPI_NHLT_SPKR_TOP_CENTER             0x800
-#define ACPI_NHLT_SPKR_TOP_FRONT_LEFT         0x1000
-#define ACPI_NHLT_SPKR_TOP_FRONT_CENTER       0x2000
-#define ACPI_NHLT_SPKR_TOP_FRONT_RIGHT        0x4000
-#define ACPI_NHLT_SPKR_TOP_BACK_LEFT          0x8000
-#define ACPI_NHLT_SPKR_TOP_BACK_CENTER        0x10000
-#define ACPI_NHLT_SPKR_TOP_BACK_RIGHT         0x20000
-
-struct acpi_nhlt_format_config {
-	struct acpi_nhlt_wave_extensible format;
-	u32 capability_size;
-	u8 capabilities[];
-};
-
-struct acpi_nhlt_formats_config {
-	u8 formats_count;
-};
-
-struct acpi_nhlt_device_specific_hdr {
-	u8 virtual_slot;
-	u8 config_type;
-};
-
-/* Types for config_type above */
-
-#define ACPI_NHLT_GENERIC                   0
-#define ACPI_NHLT_MIC                       1
-#define ACPI_NHLT_RENDER                    3
-
-struct acpi_nhlt_mic_device_specific_config {
-	struct acpi_nhlt_device_specific_hdr device_config;
-	u8 array_type_ext;
-};
-
-/* Values for array_type_ext above */
-
-#define ACPI_NHLT_ARRAY_TYPE_RESERVED               0x09	/* 9 and below are reserved */
-#define ACPI_NHLT_SMALL_LINEAR_2ELEMENT             0x0A
-#define ACPI_NHLT_BIG_LINEAR_2ELEMENT               0x0B
-#define ACPI_NHLT_FIRST_GEOMETRY_LINEAR_4ELEMENT    0x0C
-#define ACPI_NHLT_PLANAR_LSHAPED_4ELEMENT           0x0D
-#define ACPI_NHLT_SECOND_GEOMETRY_LINEAR_4ELEMENT   0x0E
-#define ACPI_NHLT_VENDOR_DEFINED                    0x0F
-#define ACPI_NHLT_ARRAY_TYPE_MASK                   0x0F
-#define ACPI_NHLT_ARRAY_TYPE_EXT_MASK               0x10
-
-#define ACPI_NHLT_NO_EXTENSION                      0x0
-#define ACPI_NHLT_MIC_SNR_SENSITIVITY_EXT           (1<<4)
-
-struct acpi_nhlt_vendor_mic_count {
-	u8 microphone_count;
-};
-
-struct acpi_nhlt_vendor_mic_config {
-	u8 type;
-	u8 panel;
-	u16 speaker_position_distance;	/* mm */
-	u16 horizontal_offset;	/* mm */
-	u16 vertical_offset;	/* mm */
-	u8 frequency_low_band;	/* 5*Hz */
-	u8 frequency_high_band;	/* 500*Hz */
-	u16 direction_angle;	/* -180 - + 180 */
-	u16 elevation_angle;	/* -180 - + 180 */
-	u16 work_vertical_angle_begin;	/* -180 - + 180 with 2 deg step */
-	u16 work_vertical_angle_end;	/* -180 - + 180 with 2 deg step */
-	u16 work_horizontal_angle_begin;	/* -180 - + 180 with 2 deg step */
-	u16 work_horizontal_angle_end;	/* -180 - + 180 with 2 deg step */
-};
-
-/* Values for Type field above */
-
-#define ACPI_NHLT_MIC_OMNIDIRECTIONAL       0
-#define ACPI_NHLT_MIC_SUBCARDIOID           1
-#define ACPI_NHLT_MIC_CARDIOID              2
-#define ACPI_NHLT_MIC_SUPER_CARDIOID        3
-#define ACPI_NHLT_MIC_HYPER_CARDIOID        4
-#define ACPI_NHLT_MIC_8_SHAPED              5
-#define ACPI_NHLT_MIC_RESERVED6             6	/* 6 is reserved */
-#define ACPI_NHLT_MIC_VENDOR_DEFINED        7
-#define ACPI_NHLT_MIC_RESERVED              8	/* 8 and above are reserved */
-
-/* Values for Panel field above */
-
-#define ACPI_NHLT_MIC_POSITION_TOP          0
-#define ACPI_NHLT_MIC_POSITION_BOTTOM       1
-#define ACPI_NHLT_MIC_POSITION_LEFT         2
-#define ACPI_NHLT_MIC_POSITION_RIGHT        3
-#define ACPI_NHLT_MIC_POSITION_FRONT        4
-#define ACPI_NHLT_MIC_POSITION_BACK         5
-#define ACPI_NHLT_MIC_POSITION_RESERVED     6	/* 6 and above are reserved */
-
-struct acpi_nhlt_vendor_mic_device_specific_config {
-	struct acpi_nhlt_mic_device_specific_config mic_array_device_config;
-	u8 number_of_microphones;
-	struct acpi_nhlt_vendor_mic_config mic_config[];	/* Indexed by number_of_microphones */
-};
-
-/* Microphone SNR and Sensitivity extension */
-
-struct acpi_nhlt_mic_snr_sensitivity_extension {
-	u32 SNR;
-	u32 sensitivity;
-};
-
-/* Render device with feedback */
-
-struct acpi_nhlt_render_feedback_device_specific_config {
-	u8 feedback_virtual_slot;	/* Render slot in case of capture */
-	u16 feedback_channels;	/* Informative only */
-	u16 feedback_valid_bits_per_sample;
-};
-
-/* Non documented structures */
-
-struct acpi_nhlt_device_info_count {
-	u8 structure_count;
-};
-
-struct acpi_nhlt_device_info {
-	u8 device_id[16];
-	u8 device_instance_id;
-	u8 device_port_id;
-};
-
 /*******************************************************************************
  *
  * NHLT - Non HDAudio Link Table
-- 
cgit v1.2.3


From a640acab545b21ed1f347376f34d34e461ea92ba Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Tue, 19 Mar 2024 09:30:18 +0100
Subject: ACPI: NHLT: Streamline struct naming

Few recently introduced structs are named 'nhlt2' instead of 'nhlt' to
avoid naming conflicts. With duplicate types gone, the conflicts are no
more.

Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/nhlt.c   | 36 ++++++++++++++++++------------------
 include/acpi/actbl2.h | 22 +++++++++++-----------
 include/acpi/nhlt.h   | 46 +++++++++++++++++++++++-----------------------
 3 files changed, 52 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/nhlt.c b/drivers/acpi/nhlt.c
index ab722a95cbb5..dc1bd0df9228 100644
--- a/drivers/acpi/nhlt.c
+++ b/drivers/acpi/nhlt.c
@@ -16,9 +16,9 @@
 #include <linux/types.h>
 #include <acpi/nhlt.h>
 
-static struct acpi_table_nhlt2 *acpi_gbl_nhlt;
+static struct acpi_table_nhlt *acpi_gbl_nhlt;
 
-static struct acpi_table_nhlt2 empty_nhlt = {
+static struct acpi_table_nhlt empty_nhlt = {
 	.header = {
 		.signature = ACPI_SIG_NHLT,
 	},
@@ -65,7 +65,7 @@ EXPORT_SYMBOL_GPL(acpi_nhlt_put_gbl_table);
  *
  * Return: %true if endpoint matches specified criteria or %false otherwise.
  */
-bool acpi_nhlt_endpoint_match(const struct acpi_nhlt2_endpoint *ep,
+bool acpi_nhlt_endpoint_match(const struct acpi_nhlt_endpoint *ep,
 			      int link_type, int dev_type, int dir, int bus_id)
 {
 	return ep &&
@@ -90,11 +90,11 @@ EXPORT_SYMBOL_GPL(acpi_nhlt_endpoint_match);
  * Return: A pointer to endpoint matching the criteria, %NULL if not found or
  * an ERR_PTR() otherwise.
  */
-struct acpi_nhlt2_endpoint *
-acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt2 *tb,
+struct acpi_nhlt_endpoint *
+acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt *tb,
 			   int link_type, int dev_type, int dir, int bus_id)
 {
-	struct acpi_nhlt2_endpoint *ep;
+	struct acpi_nhlt_endpoint *ep;
 
 	for_each_nhlt_endpoint(tb, ep)
 		if (acpi_nhlt_endpoint_match(ep, link_type, dev_type, dir, bus_id))
@@ -116,7 +116,7 @@ EXPORT_SYMBOL_GPL(acpi_nhlt_tb_find_endpoint);
  * Return: A pointer to endpoint matching the criteria, %NULL if not found or
  * an ERR_PTR() otherwise.
  */
-struct acpi_nhlt2_endpoint *
+struct acpi_nhlt_endpoint *
 acpi_nhlt_find_endpoint(int link_type, int dev_type, int dir, int bus_id)
 {
 	/* TODO: Currently limited to table of index 0. */
@@ -136,12 +136,12 @@ EXPORT_SYMBOL_GPL(acpi_nhlt_find_endpoint);
  * Return: A pointer to format matching the criteria, %NULL if not found or
  * an ERR_PTR() otherwise.
  */
-struct acpi_nhlt2_format_config *
-acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt2_endpoint *ep,
+struct acpi_nhlt_format_config *
+acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt_endpoint *ep,
 			       u16 ch, u32 rate, u16 vbps, u16 bps)
 {
-	struct acpi_nhlt2_wave_formatext *wav;
-	struct acpi_nhlt2_format_config *fmt;
+	struct acpi_nhlt_wave_formatext *wav;
+	struct acpi_nhlt_format_config *fmt;
 
 	for_each_nhlt_endpoint_fmtcfg(ep, fmt) {
 		wav = &fmt->format;
@@ -176,13 +176,13 @@ EXPORT_SYMBOL_GPL(acpi_nhlt_endpoint_find_fmtcfg);
  * Return: A pointer to format matching the criteria, %NULL if not found or
  * an ERR_PTR() otherwise.
  */
-struct acpi_nhlt2_format_config *
-acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt2 *tb,
+struct acpi_nhlt_format_config *
+acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt *tb,
 			 int link_type, int dev_type, int dir, int bus_id,
 			 u16 ch, u32 rate, u16 vbps, u16 bps)
 {
-	struct acpi_nhlt2_format_config *fmt;
-	struct acpi_nhlt2_endpoint *ep;
+	struct acpi_nhlt_format_config *fmt;
+	struct acpi_nhlt_endpoint *ep;
 
 	for_each_nhlt_endpoint(tb, ep) {
 		if (!acpi_nhlt_endpoint_match(ep, link_type, dev_type, dir, bus_id))
@@ -215,7 +215,7 @@ EXPORT_SYMBOL_GPL(acpi_nhlt_tb_find_fmtcfg);
  * Return: A pointer to format matching the criteria, %NULL if not found or
  * an ERR_PTR() otherwise.
  */
-struct acpi_nhlt2_format_config *
+struct acpi_nhlt_format_config *
 acpi_nhlt_find_fmtcfg(int link_type, int dev_type, int dir, int bus_id,
 		      u16 ch, u32 rate, u16 vbps, u16 bps)
 {
@@ -244,10 +244,10 @@ static bool acpi_nhlt_config_is_vendor_micdevice(struct acpi_nhlt_config *cfg)
  *
  * Return: A number of microphones or an error code if an invalid endpoint is provided.
  */
-int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt2_endpoint *ep)
+int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt_endpoint *ep)
 {
 	union acpi_nhlt_device_config *devcfg;
-	struct acpi_nhlt2_format_config *fmt;
+	struct acpi_nhlt_format_config *fmt;
 	struct acpi_nhlt_config *cfg;
 	u16 max_ch = 0;
 
diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 31a716a74340..f237269bd1cb 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -1894,7 +1894,7 @@ struct nfit_device_handle {
  *
  ******************************************************************************/
 
-struct acpi_table_nhlt2 {
+struct acpi_table_nhlt {
 	struct acpi_table_header header;	/* Common ACPI table header */
 	u8 endpoints_count;
 	/*
@@ -1903,7 +1903,7 @@ struct acpi_table_nhlt2 {
 	 */
 };
 
-struct acpi_nhlt2_endpoint {
+struct acpi_nhlt_endpoint {
 	u32 length;
 	u8 link_type;
 	u8 instance_id;
@@ -1990,7 +1990,7 @@ struct acpi_nhlt_micdevice_config {
 #define ACPI_NHLT_ARRAYTYPE_LINEAR4_GEO2	0xE
 #define ACPI_NHLT_ARRAYTYPE_VENDOR		0xF
 
-struct acpi_nhlt2_vendor_mic_config {
+struct acpi_nhlt_vendor_mic_config {
 	u8 type;
 	u8 panel;
 	u16 speaker_position_distance;		/* mm */
@@ -2031,7 +2031,7 @@ struct acpi_nhlt_vendor_micdevice_config {
 	u8 config_type;
 	u8 array_type;
 	u8 mics_count;
-	struct acpi_nhlt2_vendor_mic_config mics[];
+	struct acpi_nhlt_vendor_mic_config mics[];
 };
 
 union acpi_nhlt_device_config {
@@ -2042,7 +2042,7 @@ union acpi_nhlt_device_config {
 };
 
 /* Inherited from Microsoft's WAVEFORMATEXTENSIBLE. */
-struct acpi_nhlt2_wave_formatext {
+struct acpi_nhlt_wave_formatext {
 	u16 format_tag;
 	u16 channel_count;
 	u32 samples_per_sec;
@@ -2055,17 +2055,17 @@ struct acpi_nhlt2_wave_formatext {
 	u8 subformat[16];
 };
 
-struct acpi_nhlt2_format_config {
-	struct acpi_nhlt2_wave_formatext format;
+struct acpi_nhlt_format_config {
+	struct acpi_nhlt_wave_formatext format;
 	struct acpi_nhlt_config config;
 };
 
-struct acpi_nhlt2_formats_config {
+struct acpi_nhlt_formats_config {
 	u8 formats_count;
-	struct acpi_nhlt2_format_config formats[];
+	struct acpi_nhlt_format_config formats[];
 };
 
-struct acpi_nhlt2_device_info {
+struct acpi_nhlt_device_info {
 	u8 id[16];
 	u8 instance_id;
 	u8 port_id;
@@ -2073,7 +2073,7 @@ struct acpi_nhlt2_device_info {
 
 struct acpi_nhlt_devices_info {
 	u8 devices_count;
-	struct acpi_nhlt2_device_info devices[];
+	struct acpi_nhlt_device_info devices[];
 };
 
 /*******************************************************************************
diff --git a/include/acpi/nhlt.h b/include/acpi/nhlt.h
index 7c394e7bc2bb..2108aa6d0207 100644
--- a/include/acpi/nhlt.h
+++ b/include/acpi/nhlt.h
@@ -23,12 +23,12 @@
  *
  * Return: A pointer to the formats configuration space.
  */
-static inline struct acpi_nhlt2_formats_config *
-acpi_nhlt_endpoint_fmtscfg(const struct acpi_nhlt2_endpoint *ep)
+static inline struct acpi_nhlt_formats_config *
+acpi_nhlt_endpoint_fmtscfg(const struct acpi_nhlt_endpoint *ep)
 {
 	struct acpi_nhlt_config *cfg = __acpi_nhlt_endpoint_config(ep);
 
-	return (struct acpi_nhlt2_formats_config *)((u8 *)(cfg + 1) + cfg->capabilities_size);
+	return (struct acpi_nhlt_formats_config *)((u8 *)(cfg + 1) + cfg->capabilities_size);
 }
 
 #define __acpi_nhlt_first_endpoint(tb) \
@@ -99,24 +99,24 @@ acpi_nhlt_endpoint_fmtscfg(const struct acpi_nhlt2_endpoint *ep)
 acpi_status acpi_nhlt_get_gbl_table(void);
 void acpi_nhlt_put_gbl_table(void);
 
-bool acpi_nhlt_endpoint_match(const struct acpi_nhlt2_endpoint *ep,
+bool acpi_nhlt_endpoint_match(const struct acpi_nhlt_endpoint *ep,
 			      int link_type, int dev_type, int dir, int bus_id);
-struct acpi_nhlt2_endpoint *
-acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt2 *tb,
+struct acpi_nhlt_endpoint *
+acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt *tb,
 			   int link_type, int dev_type, int dir, int bus_id);
-struct acpi_nhlt2_endpoint *
+struct acpi_nhlt_endpoint *
 acpi_nhlt_find_endpoint(int link_type, int dev_type, int dir, int bus_id);
-struct acpi_nhlt2_format_config *
-acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt2_endpoint *ep,
+struct acpi_nhlt_format_config *
+acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt_endpoint *ep,
 			       u16 ch, u32 rate, u16 vbps, u16 bps);
-struct acpi_nhlt2_format_config *
-acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt2 *tb,
+struct acpi_nhlt_format_config *
+acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt *tb,
 			 int link_type, int dev_type, int dir, int bus_id,
 			 u16 ch, u32 rate, u16 vpbs, u16 bps);
-struct acpi_nhlt2_format_config *
+struct acpi_nhlt_format_config *
 acpi_nhlt_find_fmtcfg(int link_type, int dev_type, int dir, int bus_id,
 		      u16 ch, u32 rate, u16 vpbs, u16 bps);
-int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt2_endpoint *ep);
+int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt_endpoint *ep);
 
 #else /* !CONFIG_ACPI_NHLT */
 
@@ -130,46 +130,46 @@ static inline void acpi_nhlt_put_gbl_table(void)
 }
 
 static inline bool
-acpi_nhlt_endpoint_match(const struct acpi_nhlt2_endpoint *ep,
+acpi_nhlt_endpoint_match(const struct acpi_nhlt_endpoint *ep,
 			 int link_type, int dev_type, int dir, int bus_id)
 {
 	return false;
 }
 
-static inline struct acpi_nhlt2_endpoint *
-acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt2 *tb,
+static inline struct acpi_nhlt_endpoint *
+acpi_nhlt_tb_find_endpoint(const struct acpi_table_nhlt *tb,
 			   int link_type, int dev_type, int dir, int bus_id)
 {
 	return NULL;
 }
 
-static inline struct acpi_nhlt2_format_config *
-acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt2_endpoint *ep,
+static inline struct acpi_nhlt_format_config *
+acpi_nhlt_endpoint_find_fmtcfg(const struct acpi_nhlt_endpoint *ep,
 			       u16 ch, u32 rate, u16 vbps, u16 bps)
 {
 	return NULL;
 }
 
-static inline struct acpi_nhlt2_format_config *
-acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt2 *tb,
+static inline struct acpi_nhlt_format_config *
+acpi_nhlt_tb_find_fmtcfg(const struct acpi_table_nhlt *tb,
 			 int link_type, int dev_type, int dir, int bus_id,
 			 u16 ch, u32 rate, u16 vpbs, u16 bps)
 {
 	return NULL;
 }
 
-static inline int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt2_endpoint *ep)
+static inline int acpi_nhlt_endpoint_mic_count(const struct acpi_nhlt_endpoint *ep)
 {
 	return 0;
 }
 
-static inline struct acpi_nhlt2_endpoint *
+static inline struct acpi_nhlt_endpoint *
 acpi_nhlt_find_endpoint(int link_type, int dev_type, int dir, int bus_id)
 {
 	return NULL;
 }
 
-static inline struct acpi_nhlt2_format_config *
+static inline struct acpi_nhlt_format_config *
 acpi_nhlt_find_fmtcfg(int link_type, int dev_type, int dir, int bus_id,
 		      u16 ch, u32 rate, u16 vpbs, u16 bps)
 {
-- 
cgit v1.2.3


From 95d43290f1e476b3be782dd17642e452d0436266 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 9 Mar 2024 21:13:06 +0100
Subject: ACPI: bus: Indicate support for _TFP thru _OSC

The ACPI thermal driver already uses the _TPF ACPI method to retrieve
precise sampling time values, but this is not reported thru _OSC.

Fix this by setting bit 9 ("Fast Thermal Sampling support") when
evaluating _OSC.

Fixes: a2ee7581afd5 ("ACPI: thermal: Add Thermal fast Sampling Period (_TFP) support")
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c   | 2 ++
 include/linux/acpi.h | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index d9fa730416f1..9c13a4e43fa8 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -316,6 +316,8 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PAD_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_PROCESSOR))
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PPC_OST_SUPPORT;
+	if (IS_ENABLED(CONFIG_ACPI_THERMAL))
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT;
 
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 34829f2c517a..51bdd9e08f6d 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -573,6 +573,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_CPCV2_SUPPORT			0x00000040
 #define OSC_SB_PCLPI_SUPPORT			0x00000080
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
+#define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT	0x00000200
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE		0x00004000
-- 
cgit v1.2.3


From 6e8345f23ca37d6d41bb76be5d6a705ddf542817 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 9 Mar 2024 21:13:07 +0100
Subject: ACPI: bus: Indicate support for more than 16 p-states thru _OSC

The code responsible for parsing the available p-states should
have no problems handling more than 16 p-states.

Indicate this by setting bit 10 ("Greater Than 16 p-state support")
when evaluating _OSC.

Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c   | 1 +
 include/linux/acpi.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 9c13a4e43fa8..d5b0e80dc48e 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -321,6 +321,7 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
+	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_OVER_16_PSTATES_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_PRMT))
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_FFH))
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 51bdd9e08f6d..0e0b027e27e2 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -574,6 +574,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_PCLPI_SUPPORT			0x00000080
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
 #define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT	0x00000200
+#define OSC_SB_OVER_16_PSTATES_SUPPORT		0x00000400
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE		0x00004000
-- 
cgit v1.2.3


From a8a967a243d71dd635ede076020f665a4df51c63 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 9 Mar 2024 21:13:08 +0100
Subject: ACPI: bus: Indicate support for the Generic Event Device thru _OSC

A device driver for the Generic Event Device (ACPI0013) already
exists for quite some time, but support for it was never reported
thru _OSC.

Fix this by setting bit 11 ("Generic Event Device support") when
evaluating _OSC.

Fixes: 3db80c230da1 ("ACPI: implement Generic Event Device")
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c   | 1 +
 include/linux/acpi.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index d5b0e80dc48e..0c48b603098a 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -322,6 +322,7 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_OVER_16_PSTATES_SUPPORT;
+	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GED_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_PRMT))
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_FFH))
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 0e0b027e27e2..b0d909d1f5fc 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -575,6 +575,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
 #define OSC_SB_FAST_THERMAL_SAMPLING_SUPPORT	0x00000200
 #define OSC_SB_OVER_16_PSTATES_SUPPORT		0x00000400
+#define OSC_SB_GED_SUPPORT			0x00000800
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE		0x00004000
-- 
cgit v1.2.3


From d0d4f1474e36b195eaad477373127ae621334c01 Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 9 Mar 2024 21:13:09 +0100
Subject: ACPI: Fix Generic Initiator Affinity _OSC bit

The ACPI spec says bit 17 should be used to indicate support
for Generic Initiator Affinity Structure in SRAT, but we currently
set bit 13 ("Interrupt ResourceSource support").

Fix this by actually setting bit 17 when evaluating _OSC.

Fixes: 01aabca2fd54 ("ACPI: Let ACPI know we support Generic Initiator Affinity Structures")
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b0d909d1f5fc..e77783e101c3 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -577,8 +577,8 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_OVER_16_PSTATES_SUPPORT		0x00000400
 #define OSC_SB_GED_SUPPORT			0x00000800
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
-#define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00002000
 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE		0x00004000
+#define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00020000
 #define OSC_SB_NATIVE_USB4_SUPPORT		0x00040000
 #define OSC_SB_PRM_SUPPORT			0x00200000
 #define OSC_SB_FFH_OPR_SUPPORT			0x00400000
-- 
cgit v1.2.3


From 403ad17c06509794fdf6e4d4b3070bd5b56e2a8e Mon Sep 17 00:00:00 2001
From: Armin Wolf <W_Armin@gmx.de>
Date: Sat, 9 Mar 2024 21:13:10 +0100
Subject: ACPI: bus: Indicate support for IRQ ResourceSource thru _OSC

The ACPI IRQ mapping code supports parsing of ResourceSource,
but this is not reported thru _OSC.

Fix this by setting bit 13 ("Interrupt ResourceSource support")
when evaluating _OSC.

Fixes: d44fa3d46079 ("ACPI: Add support for ResourceSource/IRQ domain mapping")
Signed-off-by: Armin Wolf <W_Armin@gmx.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c   | 1 +
 include/linux/acpi.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 0c48b603098a..a87b10eef77d 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -323,6 +323,7 @@ static void acpi_bus_osc_negotiate_platform_control(void)
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_OVER_16_PSTATES_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GED_SUPPORT;
+	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_PRMT))
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT;
 	if (IS_ENABLED(CONFIG_ACPI_FFH))
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index e77783e101c3..168201e4c782 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -577,6 +577,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_OVER_16_PSTATES_SUPPORT		0x00000400
 #define OSC_SB_GED_SUPPORT			0x00000800
 #define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
+#define OSC_SB_IRQ_RESOURCE_SOURCE_SUPPORT	0x00002000
 #define OSC_SB_CPC_FLEXIBLE_ADR_SPACE		0x00004000
 #define OSC_SB_GENERIC_INITIATOR_SUPPORT	0x00020000
 #define OSC_SB_NATIVE_USB4_SUPPORT		0x00040000
-- 
cgit v1.2.3


From 2bcf0f9bb33d7199a7ba02c2a38c25a4f7f22c3a Mon Sep 17 00:00:00 2001
From: Ravi Kumar Vodapalli <ravi.kumar.vodapalli@intel.com>
Date: Tue, 26 Mar 2024 16:02:41 +0530
Subject: drm/i915: Add new PCI IDs to DG2 platform in driver

New PCI IDs are added in Bspec for DG2 platform, add them in driver

Bspec: 44477
Signed-off-by: Ravi Kumar Vodapalli <ravi.kumar.vodapalli@intel.com>
Reviewed-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240326103241.3832494-1-ravi.kumar.vodapalli@intel.com
---
 include/drm/i915_pciids.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/i915_pciids.h b/include/drm/i915_pciids.h
index 28a96aa1e08f..85ce33ad6e26 100644
--- a/include/drm/i915_pciids.h
+++ b/include/drm/i915_pciids.h
@@ -711,7 +711,9 @@
 	INTEL_VGA_DEVICE(0x5692, info), \
 	INTEL_VGA_DEVICE(0x56A0, info), \
 	INTEL_VGA_DEVICE(0x56A1, info), \
-	INTEL_VGA_DEVICE(0x56A2, info)
+	INTEL_VGA_DEVICE(0x56A2, info), \
+	INTEL_VGA_DEVICE(0x56BE, info), \
+	INTEL_VGA_DEVICE(0x56BF, info)
 
 #define INTEL_DG2_G11_IDS(info) \
 	INTEL_VGA_DEVICE(0x5693, info), \
-- 
cgit v1.2.3


From 4628b804555773daa982d9578d89fe04fcdde374 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Sun, 10 Mar 2024 01:02:08 +0000
Subject: regulator: axp20x: fix typo-ed identifier

The registers to set the X-Powers AXP313 regulators are of course
"CONTROL" registers, not "CONRTOL" ones.

Fix the typo in the header file and in its users. No functional change.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Tested-by: Ryan Walklin <ryan@testtoast.com>
Link: https://lore.kernel.org/r/20240310010211.28653-2-andre.przywara@arm.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/regulator/axp20x-regulator.c | 10 +++++-----
 include/linux/mfd/axp20x.h           | 10 +++++-----
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c
index c657820b0bbb..7bcc2d508b65 100644
--- a/drivers/regulator/axp20x-regulator.c
+++ b/drivers/regulator/axp20x-regulator.c
@@ -733,21 +733,21 @@ static const struct linear_range axp313a_dcdc3_ranges[] = {
 static const struct regulator_desc axp313a_regulators[] = {
 	AXP_DESC_RANGES(AXP313A, DCDC1, "dcdc1", "vin1",
 			axp313a_dcdc1_ranges, AXP313A_DCDC1_NUM_VOLTAGES,
-			AXP313A_DCDC1_CONRTOL, AXP313A_DCDC_V_OUT_MASK,
+			AXP313A_DCDC1_CONTROL, AXP313A_DCDC_V_OUT_MASK,
 			AXP313A_OUTPUT_CONTROL, BIT(0)),
 	AXP_DESC_RANGES(AXP313A, DCDC2, "dcdc2", "vin2",
 			axp313a_dcdc2_ranges, AXP313A_DCDC23_NUM_VOLTAGES,
-			AXP313A_DCDC2_CONRTOL, AXP313A_DCDC_V_OUT_MASK,
+			AXP313A_DCDC2_CONTROL, AXP313A_DCDC_V_OUT_MASK,
 			AXP313A_OUTPUT_CONTROL, BIT(1)),
 	AXP_DESC_RANGES(AXP313A, DCDC3, "dcdc3", "vin3",
 			axp313a_dcdc3_ranges, AXP313A_DCDC23_NUM_VOLTAGES,
-			AXP313A_DCDC3_CONRTOL, AXP313A_DCDC_V_OUT_MASK,
+			AXP313A_DCDC3_CONTROL, AXP313A_DCDC_V_OUT_MASK,
 			AXP313A_OUTPUT_CONTROL, BIT(2)),
 	AXP_DESC(AXP313A, ALDO1, "aldo1", "vin1", 500, 3500, 100,
-		 AXP313A_ALDO1_CONRTOL, AXP313A_LDO_V_OUT_MASK,
+		 AXP313A_ALDO1_CONTROL, AXP313A_LDO_V_OUT_MASK,
 		 AXP313A_OUTPUT_CONTROL, BIT(3)),
 	AXP_DESC(AXP313A, DLDO1, "dldo1", "vin1", 500, 3500, 100,
-		 AXP313A_DLDO1_CONRTOL, AXP313A_LDO_V_OUT_MASK,
+		 AXP313A_DLDO1_CONTROL, AXP313A_LDO_V_OUT_MASK,
 		 AXP313A_OUTPUT_CONTROL, BIT(4)),
 	AXP_DESC_FIXED(AXP313A, RTC_LDO, "rtc-ldo", "vin1", 1800),
 };
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index f1755163dd9f..9b2b7ac2107f 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -104,11 +104,11 @@ enum axp20x_variants {
 
 #define AXP313A_ON_INDICATE		0x00
 #define AXP313A_OUTPUT_CONTROL		0x10
-#define AXP313A_DCDC1_CONRTOL		0x13
-#define AXP313A_DCDC2_CONRTOL		0x14
-#define AXP313A_DCDC3_CONRTOL		0x15
-#define AXP313A_ALDO1_CONRTOL		0x16
-#define AXP313A_DLDO1_CONRTOL		0x17
+#define AXP313A_DCDC1_CONTROL		0x13
+#define AXP313A_DCDC2_CONTROL		0x14
+#define AXP313A_DCDC3_CONTROL		0x15
+#define AXP313A_ALDO1_CONTROL		0x16
+#define AXP313A_DLDO1_CONTROL		0x17
 #define AXP313A_SHUTDOWN_CTRL		0x1a
 #define AXP313A_IRQ_EN			0x20
 #define AXP313A_IRQ_STATE		0x21
-- 
cgit v1.2.3


From b5bfc8ab2484e8e62ae3ffccaecfe5d82c19f51f Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Sun, 10 Mar 2024 01:02:10 +0000
Subject: mfd: axp20x: Add support for AXP717 PMIC

The AXP717a is a PMIC chip produced by X-Powers, it can be connected to
an I2C or RSB bus.

It's a rather complete PMIC, with many regulators, interrupts, an ADC and
battery charging functionality. It also offer USB type-C CC pin
handling.

Describe the regmap and the MFD bits, along with the registers exposed
via I2C or RSB. This covers the regulator, interrupts and power key
devices for now.
Advertise the device using the new compatible string.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
Tested-by: Ryan Walklin <ryan@testtoast.com>
Link: https://lore.kernel.org/r/20240310010211.28653-4-andre.przywara@arm.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/axp20x-i2c.c   |  2 ++
 drivers/mfd/axp20x-rsb.c   |  1 +
 drivers/mfd/axp20x.c       | 90 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h | 67 ++++++++++++++++++++++++++++++++++
 4 files changed, 160 insertions(+)

(limited to 'include')

diff --git a/drivers/mfd/axp20x-i2c.c b/drivers/mfd/axp20x-i2c.c
index 68d3560cfe4a..b8e7ac89f697 100644
--- a/drivers/mfd/axp20x-i2c.c
+++ b/drivers/mfd/axp20x-i2c.c
@@ -65,6 +65,7 @@ static const struct of_device_id axp20x_i2c_of_match[] = {
 	{ .compatible = "x-powers,axp221", .data = (void *)AXP221_ID },
 	{ .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
 	{ .compatible = "x-powers,axp313a", .data = (void *)AXP313A_ID },
+	{ .compatible = "x-powers,axp717", .data = (void *)AXP717_ID },
 	{ .compatible = "x-powers,axp803", .data = (void *)AXP803_ID },
 	{ .compatible = "x-powers,axp806", .data = (void *)AXP806_ID },
 	{ .compatible = "x-powers,axp15060", .data = (void *)AXP15060_ID },
@@ -81,6 +82,7 @@ static const struct i2c_device_id axp20x_i2c_id[] = {
 	{ "axp221", 0 },
 	{ "axp223", 0 },
 	{ "axp313a", 0 },
+	{ "axp717", 0 },
 	{ "axp803", 0 },
 	{ "axp806", 0 },
 	{ "axp15060", 0 },
diff --git a/drivers/mfd/axp20x-rsb.c b/drivers/mfd/axp20x-rsb.c
index 214bc0d84d44..059656f2a1bd 100644
--- a/drivers/mfd/axp20x-rsb.c
+++ b/drivers/mfd/axp20x-rsb.c
@@ -58,6 +58,7 @@ static void axp20x_rsb_remove(struct sunxi_rsb_device *rdev)
 
 static const struct of_device_id axp20x_rsb_of_match[] = {
 	{ .compatible = "x-powers,axp223", .data = (void *)AXP223_ID },
+	{ .compatible = "x-powers,axp717", .data = (void *)AXP717_ID },
 	{ .compatible = "x-powers,axp803", .data = (void *)AXP803_ID },
 	{ .compatible = "x-powers,axp806", .data = (void *)AXP806_ID },
 	{ .compatible = "x-powers,axp809", .data = (void *)AXP809_ID },
diff --git a/drivers/mfd/axp20x.c b/drivers/mfd/axp20x.c
index d8daa593ebd5..48ce6ea693ce 100644
--- a/drivers/mfd/axp20x.c
+++ b/drivers/mfd/axp20x.c
@@ -42,6 +42,7 @@ static const char * const axp20x_model_names[] = {
 	"AXP223",
 	"AXP288",
 	"AXP313a",
+	"AXP717",
 	"AXP803",
 	"AXP806",
 	"AXP809",
@@ -207,6 +208,25 @@ static const struct regmap_access_table axp313a_volatile_table = {
 	.n_yes_ranges = ARRAY_SIZE(axp313a_volatile_ranges),
 };
 
+static const struct regmap_range axp717_writeable_ranges[] = {
+	regmap_reg_range(AXP717_IRQ0_EN, AXP717_IRQ4_EN),
+	regmap_reg_range(AXP717_DCDC_OUTPUT_CONTROL, AXP717_CPUSLDO_CONTROL),
+};
+
+static const struct regmap_range axp717_volatile_ranges[] = {
+	regmap_reg_range(AXP717_IRQ0_STATE, AXP717_IRQ4_STATE),
+};
+
+static const struct regmap_access_table axp717_writeable_table = {
+	.yes_ranges = axp717_writeable_ranges,
+	.n_yes_ranges = ARRAY_SIZE(axp717_writeable_ranges),
+};
+
+static const struct regmap_access_table axp717_volatile_table = {
+	.yes_ranges = axp717_volatile_ranges,
+	.n_yes_ranges = ARRAY_SIZE(axp717_volatile_ranges),
+};
+
 static const struct regmap_range axp806_volatile_ranges[] = {
 	regmap_reg_range(AXP20X_IRQ1_STATE, AXP20X_IRQ2_STATE),
 };
@@ -317,6 +337,11 @@ static const struct resource axp313a_pek_resources[] = {
 	DEFINE_RES_IRQ_NAMED(AXP313A_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
 };
 
+static const struct resource axp717_pek_resources[] = {
+	DEFINE_RES_IRQ_NAMED(AXP717_IRQ_PEK_RIS_EDGE, "PEK_DBR"),
+	DEFINE_RES_IRQ_NAMED(AXP717_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
+};
+
 static const struct resource axp803_pek_resources[] = {
 	DEFINE_RES_IRQ_NAMED(AXP803_IRQ_PEK_RIS_EDGE, "PEK_DBR"),
 	DEFINE_RES_IRQ_NAMED(AXP803_IRQ_PEK_FAL_EDGE, "PEK_DBF"),
@@ -391,6 +416,15 @@ static const struct regmap_config axp313a_regmap_config = {
 	.cache_type = REGCACHE_MAPLE,
 };
 
+static const struct regmap_config axp717_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.wr_table = &axp717_writeable_table,
+	.volatile_table = &axp717_volatile_table,
+	.max_register = AXP717_CPUSLDO_CONTROL,
+	.cache_type = REGCACHE_RBTREE,
+};
+
 static const struct regmap_config axp806_regmap_config = {
 	.reg_bits	= 8,
 	.val_bits	= 8,
@@ -589,6 +623,40 @@ static const struct regmap_irq axp313a_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP313A, DIE_TEMP_HIGH,		0, 0),
 };
 
+static const struct regmap_irq axp717_regmap_irqs[] = {
+	INIT_REGMAP_IRQ(AXP717, SOC_DROP_LVL2,		0, 7),
+	INIT_REGMAP_IRQ(AXP717, SOC_DROP_LVL1,		0, 6),
+	INIT_REGMAP_IRQ(AXP717, GAUGE_NEW_SOC,		0, 4),
+	INIT_REGMAP_IRQ(AXP717, BOOST_OVER_V,		0, 2),
+	INIT_REGMAP_IRQ(AXP717, VBUS_OVER_V,		0, 1),
+	INIT_REGMAP_IRQ(AXP717, VBUS_FAULT,		0, 0),
+	INIT_REGMAP_IRQ(AXP717, VBUS_PLUGIN,		1, 7),
+	INIT_REGMAP_IRQ(AXP717, VBUS_REMOVAL,		1, 6),
+	INIT_REGMAP_IRQ(AXP717, BATT_PLUGIN,		1, 5),
+	INIT_REGMAP_IRQ(AXP717, BATT_REMOVAL,		1, 4),
+	INIT_REGMAP_IRQ(AXP717, PEK_SHORT,		1, 3),
+	INIT_REGMAP_IRQ(AXP717, PEK_LONG,		1, 2),
+	INIT_REGMAP_IRQ(AXP717, PEK_FAL_EDGE,		1, 1),
+	INIT_REGMAP_IRQ(AXP717, PEK_RIS_EDGE,		1, 0),
+	INIT_REGMAP_IRQ(AXP717, WDOG_EXPIRE,		2, 7),
+	INIT_REGMAP_IRQ(AXP717, LDO_OVER_CURR,		2, 6),
+	INIT_REGMAP_IRQ(AXP717, BATT_OVER_CURR,		2, 5),
+	INIT_REGMAP_IRQ(AXP717, CHARG_DONE,		2, 4),
+	INIT_REGMAP_IRQ(AXP717, CHARG,			2, 3),
+	INIT_REGMAP_IRQ(AXP717, DIE_TEMP_HIGH,		2, 2),
+	INIT_REGMAP_IRQ(AXP717, CHARG_TIMER,		2, 1),
+	INIT_REGMAP_IRQ(AXP717, BATT_OVER_V,		2, 0),
+	INIT_REGMAP_IRQ(AXP717, BC_USB_DONE,		3, 7),
+	INIT_REGMAP_IRQ(AXP717, BC_USB_CHNG,		3, 6),
+	INIT_REGMAP_IRQ(AXP717, BATT_QUIT_TEMP_HIGH,	3, 4),
+	INIT_REGMAP_IRQ(AXP717, BATT_CHG_TEMP_HIGH,	3, 3),
+	INIT_REGMAP_IRQ(AXP717, BATT_CHG_TEMP_LOW,	3, 2),
+	INIT_REGMAP_IRQ(AXP717, BATT_ACT_TEMP_HIGH,	3, 1),
+	INIT_REGMAP_IRQ(AXP717, BATT_ACT_TEMP_LOW,	3, 0),
+	INIT_REGMAP_IRQ(AXP717, TYPEC_REMOVE,		4, 6),
+	INIT_REGMAP_IRQ(AXP717, TYPEC_PLUGIN,		4, 5),
+};
+
 static const struct regmap_irq axp803_regmap_irqs[] = {
 	INIT_REGMAP_IRQ(AXP803, ACIN_OVER_V,		0, 7),
 	INIT_REGMAP_IRQ(AXP803, ACIN_PLUGIN,		0, 6),
@@ -776,6 +844,17 @@ static const struct regmap_irq_chip axp313a_regmap_irq_chip = {
 	.num_regs		= 1,
 };
 
+static const struct regmap_irq_chip axp717_regmap_irq_chip = {
+	.name			= "axp717_irq_chip",
+	.status_base		= AXP717_IRQ0_STATE,
+	.ack_base		= AXP717_IRQ0_STATE,
+	.unmask_base		= AXP717_IRQ0_EN,
+	.init_ack_masked	= true,
+	.irqs			= axp717_regmap_irqs,
+	.num_irqs		= ARRAY_SIZE(axp717_regmap_irqs),
+	.num_regs		= 5,
+};
+
 static const struct regmap_irq_chip axp803_regmap_irq_chip = {
 	.name			= "axp803",
 	.status_base		= AXP20X_IRQ1_STATE,
@@ -941,6 +1020,11 @@ static struct mfd_cell axp313a_cells[] = {
 	MFD_CELL_RES("axp313a-pek", axp313a_pek_resources),
 };
 
+static struct mfd_cell axp717_cells[] = {
+	MFD_CELL_NAME("axp20x-regulator"),
+	MFD_CELL_RES("axp20x-pek", axp717_pek_resources),
+};
+
 static const struct resource axp288_adc_resources[] = {
 	DEFINE_RES_IRQ_NAMED(AXP288_IRQ_GPADC, "GPADC"),
 };
@@ -1181,6 +1265,12 @@ int axp20x_match_device(struct axp20x_dev *axp20x)
 		axp20x->regmap_cfg = &axp313a_regmap_config;
 		axp20x->regmap_irq_chip = &axp313a_regmap_irq_chip;
 		break;
+	case AXP717_ID:
+		axp20x->nr_cells = ARRAY_SIZE(axp717_cells);
+		axp20x->cells = axp717_cells;
+		axp20x->regmap_cfg = &axp717_regmap_config;
+		axp20x->regmap_irq_chip = &axp717_regmap_irq_chip;
+		break;
 	case AXP803_ID:
 		axp20x->nr_cells = ARRAY_SIZE(axp803_cells);
 		axp20x->cells = axp803_cells;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index 9b2b7ac2107f..fd7a41a69526 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -19,6 +19,7 @@ enum axp20x_variants {
 	AXP223_ID,
 	AXP288_ID,
 	AXP313A_ID,
+	AXP717_ID,
 	AXP803_ID,
 	AXP806_ID,
 	AXP809_ID,
@@ -113,6 +114,38 @@ enum axp20x_variants {
 #define AXP313A_IRQ_EN			0x20
 #define AXP313A_IRQ_STATE		0x21
 
+#define AXP717_ON_INDICATE		0x00
+#define AXP717_IRQ0_EN			0x40
+#define AXP717_IRQ1_EN			0x41
+#define AXP717_IRQ2_EN			0x42
+#define AXP717_IRQ3_EN			0x43
+#define AXP717_IRQ4_EN			0x44
+#define AXP717_IRQ0_STATE		0x48
+#define AXP717_IRQ1_STATE		0x49
+#define AXP717_IRQ2_STATE		0x4a
+#define AXP717_IRQ3_STATE		0x4b
+#define AXP717_IRQ4_STATE		0x4c
+#define AXP717_DCDC_OUTPUT_CONTROL	0x80
+#define AXP717_DCDC1_CONTROL		0x83
+#define AXP717_DCDC2_CONTROL		0x84
+#define AXP717_DCDC3_CONTROL		0x85
+#define AXP717_DCDC4_CONTROL		0x86
+#define AXP717_LDO0_OUTPUT_CONTROL	0x90
+#define AXP717_LDO1_OUTPUT_CONTROL	0x91
+#define AXP717_ALDO1_CONTROL		0x93
+#define AXP717_ALDO2_CONTROL		0x94
+#define AXP717_ALDO3_CONTROL		0x95
+#define AXP717_ALDO4_CONTROL		0x96
+#define AXP717_BLDO1_CONTROL		0x97
+#define AXP717_BLDO2_CONTROL		0x98
+#define AXP717_BLDO3_CONTROL		0x99
+#define AXP717_BLDO4_CONTROL		0x9a
+#define AXP717_CLDO1_CONTROL		0x9b
+#define AXP717_CLDO2_CONTROL		0x9c
+#define AXP717_CLDO3_CONTROL		0x9d
+#define AXP717_CLDO4_CONTROL		0x9e
+#define AXP717_CPUSLDO_CONTROL		0x9f
+
 #define AXP806_STARTUP_SRC		0x00
 #define AXP806_CHIP_ID			0x03
 #define AXP806_PWR_OUT_CTRL1		0x10
@@ -732,6 +765,40 @@ enum axp313a_irqs {
 	AXP313A_IRQ_PEK_RIS_EDGE,
 };
 
+enum axp717_irqs {
+	AXP717_IRQ_VBUS_FAULT,
+	AXP717_IRQ_VBUS_OVER_V,
+	AXP717_IRQ_BOOST_OVER_V,
+	AXP717_IRQ_GAUGE_NEW_SOC = 4,
+	AXP717_IRQ_SOC_DROP_LVL1 = 6,
+	AXP717_IRQ_SOC_DROP_LVL2,
+	AXP717_IRQ_PEK_RIS_EDGE,
+	AXP717_IRQ_PEK_FAL_EDGE,
+	AXP717_IRQ_PEK_LONG,
+	AXP717_IRQ_PEK_SHORT,
+	AXP717_IRQ_BATT_REMOVAL,
+	AXP717_IRQ_BATT_PLUGIN,
+	AXP717_IRQ_VBUS_REMOVAL,
+	AXP717_IRQ_VBUS_PLUGIN,
+	AXP717_IRQ_BATT_OVER_V,
+	AXP717_IRQ_CHARG_TIMER,
+	AXP717_IRQ_DIE_TEMP_HIGH,
+	AXP717_IRQ_CHARG,
+	AXP717_IRQ_CHARG_DONE,
+	AXP717_IRQ_BATT_OVER_CURR,
+	AXP717_IRQ_LDO_OVER_CURR,
+	AXP717_IRQ_WDOG_EXPIRE,
+	AXP717_IRQ_BATT_ACT_TEMP_LOW,
+	AXP717_IRQ_BATT_ACT_TEMP_HIGH,
+	AXP717_IRQ_BATT_CHG_TEMP_LOW,
+	AXP717_IRQ_BATT_CHG_TEMP_HIGH,
+	AXP717_IRQ_BATT_QUIT_TEMP_HIGH,
+	AXP717_IRQ_BC_USB_CHNG = 30,
+	AXP717_IRQ_BC_USB_DONE,
+	AXP717_IRQ_TYPEC_PLUGIN = 37,
+	AXP717_IRQ_TYPEC_REMOVE,
+};
+
 enum axp803_irqs {
 	AXP803_IRQ_ACIN_OVER_V = 1,
 	AXP803_IRQ_ACIN_PLUGIN,
-- 
cgit v1.2.3


From d2ac3df75c3a995064cfac0171e082a30d8c4c66 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@arm.com>
Date: Sun, 10 Mar 2024 01:02:11 +0000
Subject: regulator: axp20x: add support for the AXP717

The X-Powers AXP717 is a typical PMIC from X-Powers, featuring four
DC/DC converters and 15 LDOs, on the regulator side.

Describe the chip's voltage settings and switch registers, how the
voltages are encoded, and connect this to the MFD device via its
regulator ID.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Mark Brown <broonie@kernel.org>
Tested-by: Ryan Walklin <ryan@testtoast.com>
Link: https://lore.kernel.org/r/20240310010211.28653-5-andre.przywara@arm.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/regulator/axp20x-regulator.c | 84 ++++++++++++++++++++++++++++++++++++
 include/linux/mfd/axp20x.h           | 21 +++++++++
 2 files changed, 105 insertions(+)

(limited to 'include')

diff --git a/drivers/regulator/axp20x-regulator.c b/drivers/regulator/axp20x-regulator.c
index 7bcc2d508b65..34fcdd82b2ea 100644
--- a/drivers/regulator/axp20x-regulator.c
+++ b/drivers/regulator/axp20x-regulator.c
@@ -138,6 +138,12 @@
 #define AXP313A_DCDC_V_OUT_MASK		GENMASK(6, 0)
 #define AXP313A_LDO_V_OUT_MASK		GENMASK(4, 0)
 
+#define AXP717_DCDC1_NUM_VOLTAGES	88
+#define AXP717_DCDC2_NUM_VOLTAGES	107
+#define AXP717_DCDC3_NUM_VOLTAGES	104
+#define AXP717_DCDC_V_OUT_MASK		GENMASK(6, 0)
+#define AXP717_LDO_V_OUT_MASK		GENMASK(4, 0)
+
 #define AXP803_PWR_OUT_DCDC1_MASK	BIT_MASK(0)
 #define AXP803_PWR_OUT_DCDC2_MASK	BIT_MASK(1)
 #define AXP803_PWR_OUT_DCDC3_MASK	BIT_MASK(2)
@@ -752,6 +758,79 @@ static const struct regulator_desc axp313a_regulators[] = {
 	AXP_DESC_FIXED(AXP313A, RTC_LDO, "rtc-ldo", "vin1", 1800),
 };
 
+static const struct linear_range axp717_dcdc1_ranges[] = {
+	REGULATOR_LINEAR_RANGE(500000,   0, 70, 10000),
+	REGULATOR_LINEAR_RANGE(1220000, 71, 87, 20000),
+};
+
+static const struct linear_range axp717_dcdc2_ranges[] = {
+	REGULATOR_LINEAR_RANGE(500000,   0,  70,  10000),
+	REGULATOR_LINEAR_RANGE(1220000, 71,  87,  20000),
+	REGULATOR_LINEAR_RANGE(1600000, 88, 107, 100000),
+};
+
+static const struct linear_range axp717_dcdc3_ranges[] = {
+	REGULATOR_LINEAR_RANGE(500000,   0,  70, 10000),
+	REGULATOR_LINEAR_RANGE(1220000, 71, 102, 20000),
+};
+
+static const struct regulator_desc axp717_regulators[] = {
+	AXP_DESC_RANGES(AXP717, DCDC1, "dcdc1", "vin1",
+			axp717_dcdc1_ranges, AXP717_DCDC1_NUM_VOLTAGES,
+			AXP717_DCDC1_CONTROL, AXP717_DCDC_V_OUT_MASK,
+			AXP717_DCDC_OUTPUT_CONTROL, BIT(0)),
+	AXP_DESC_RANGES(AXP717, DCDC2, "dcdc2", "vin2",
+			axp717_dcdc2_ranges, AXP717_DCDC2_NUM_VOLTAGES,
+			AXP717_DCDC2_CONTROL, AXP717_DCDC_V_OUT_MASK,
+			AXP717_DCDC_OUTPUT_CONTROL, BIT(1)),
+	AXP_DESC_RANGES(AXP717, DCDC3, "dcdc3", "vin3",
+			axp717_dcdc3_ranges, AXP717_DCDC3_NUM_VOLTAGES,
+			AXP717_DCDC3_CONTROL, AXP717_DCDC_V_OUT_MASK,
+			AXP717_DCDC_OUTPUT_CONTROL, BIT(2)),
+	AXP_DESC(AXP717, DCDC4, "dcdc4", "vin4", 1000, 3700, 100,
+		 AXP717_DCDC4_CONTROL, AXP717_DCDC_V_OUT_MASK,
+		 AXP717_DCDC_OUTPUT_CONTROL, BIT(3)),
+	AXP_DESC(AXP717, ALDO1, "aldo1", "vin1", 500, 3500, 100,
+		 AXP717_ALDO1_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(0)),
+	AXP_DESC(AXP717, ALDO2, "aldo2", "vin1", 500, 3500, 100,
+		 AXP717_ALDO2_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(1)),
+	AXP_DESC(AXP717, ALDO3, "aldo3", "vin1", 500, 3500, 100,
+		 AXP717_ALDO3_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(2)),
+	AXP_DESC(AXP717, ALDO4, "aldo4", "vin1", 500, 3500, 100,
+		 AXP717_ALDO4_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(3)),
+	AXP_DESC(AXP717, BLDO1, "bldo1", "vin1", 500, 3500, 100,
+		 AXP717_BLDO1_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(4)),
+	AXP_DESC(AXP717, BLDO2, "bldo2", "vin1", 500, 3500, 100,
+		 AXP717_BLDO2_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(5)),
+	AXP_DESC(AXP717, BLDO3, "bldo3", "vin1", 500, 3500, 100,
+		 AXP717_BLDO3_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(6)),
+	AXP_DESC(AXP717, BLDO4, "bldo4", "vin1", 500, 3500, 100,
+		 AXP717_BLDO4_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO0_OUTPUT_CONTROL, BIT(7)),
+	AXP_DESC(AXP717, CLDO1, "cldo1", "vin1", 500, 3500, 100,
+		 AXP717_CLDO1_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO1_OUTPUT_CONTROL, BIT(0)),
+	AXP_DESC(AXP717, CLDO2, "cldo2", "vin1", 500, 3500, 100,
+		 AXP717_CLDO2_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO1_OUTPUT_CONTROL, BIT(1)),
+	AXP_DESC(AXP717, CLDO3, "cldo3", "vin1", 500, 3500, 100,
+		 AXP717_CLDO3_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO1_OUTPUT_CONTROL, BIT(2)),
+	AXP_DESC(AXP717, CLDO4, "cldo4", "vin1", 500, 3500, 100,
+		 AXP717_CLDO4_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO1_OUTPUT_CONTROL, BIT(3)),
+	AXP_DESC(AXP717, CPUSLDO, "cpusldo", "vin1", 500, 1400, 50,
+		 AXP717_CPUSLDO_CONTROL, AXP717_LDO_V_OUT_MASK,
+		 AXP717_LDO1_OUTPUT_CONTROL, BIT(4)),
+};
+
 /* DCDC ranges shared with AXP813 */
 static const struct linear_range axp803_dcdc234_ranges[] = {
 	REGULATOR_LINEAR_RANGE(500000,
@@ -1253,6 +1332,7 @@ static int axp20x_set_dcdc_freq(struct platform_device *pdev, u32 dcdcfreq)
 		step = 150;
 		break;
 	case AXP313A_ID:
+	case AXP717_ID:
 	case AXP15060_ID:
 		/* The DCDC PWM frequency seems to be fixed to 3 MHz. */
 		if (dcdcfreq != 0) {
@@ -1479,6 +1559,10 @@ static int axp20x_regulator_probe(struct platform_device *pdev)
 		regulators = axp313a_regulators;
 		nregulators = AXP313A_REG_ID_MAX;
 		break;
+	case AXP717_ID:
+		regulators = axp717_regulators;
+		nregulators = AXP717_REG_ID_MAX;
+		break;
 	case AXP803_ID:
 		regulators = axp803_regulators;
 		nregulators = AXP803_REG_ID_MAX;
diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h
index fd7a41a69526..8c0a33a2e9ce 100644
--- a/include/linux/mfd/axp20x.h
+++ b/include/linux/mfd/axp20x.h
@@ -466,6 +466,27 @@ enum {
 	AXP313A_REG_ID_MAX,
 };
 
+enum {
+	AXP717_DCDC1 = 0,
+	AXP717_DCDC2,
+	AXP717_DCDC3,
+	AXP717_DCDC4,
+	AXP717_ALDO1,
+	AXP717_ALDO2,
+	AXP717_ALDO3,
+	AXP717_ALDO4,
+	AXP717_BLDO1,
+	AXP717_BLDO2,
+	AXP717_BLDO3,
+	AXP717_BLDO4,
+	AXP717_CLDO1,
+	AXP717_CLDO2,
+	AXP717_CLDO3,
+	AXP717_CLDO4,
+	AXP717_CPUSLDO,
+	AXP717_REG_ID_MAX,
+};
+
 enum {
 	AXP806_DCDCA = 0,
 	AXP806_DCDCB,
-- 
cgit v1.2.3


From e6c6fea5c314d90dbfca4983f2ea46388aab3bb0 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 5 Mar 2024 17:22:34 +0100
Subject: backlight: Match backlight device against struct fb_info.bl_dev

Framebuffer drivers for devices with dedicated backlight are supposed
to set struct fb_info.bl_dev to the backlight's respective device. Use
the value to match backlight and framebuffer in the backlight core code.

The code first tests against struct backlight_ops.check_ops. If this
test succeeds, it performs the test against fbdev. So backlight drivers
can override the later test as before.

Fbdev's backlight support depends on CONFIG_FB_BACKLIGHT. To avoid
ifdef in the code, the new helper fb_bl_device() returns the backlight
device, or NULL if the config option has been disabled. The test in
the backlight code will then do nothing.

v4:
	* declare empty fb_bl_device() as static inline
	* export fb_bl_device()
v3:
	* hide ifdef in fb_bl_device() (Lee)
	* no if-else blocks (Andy)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/20240305162425.23845-2-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/backlight.c     | 8 ++++++--
 drivers/video/fbdev/core/fb_backlight.c | 6 ++++++
 include/linux/fb.h                      | 9 +++++++++
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 86e1cdc8e369..4f7973c6fcc7 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -98,7 +98,9 @@ static int fb_notifier_callback(struct notifier_block *self,
 {
 	struct backlight_device *bd;
 	struct fb_event *evdata = data;
-	int node = evdata->info->node;
+	struct fb_info *info = evdata->info;
+	struct backlight_device *fb_bd = fb_bl_device(info);
+	int node = info->node;
 	int fb_blank = 0;
 
 	/* If we aren't interested in this event, skip it immediately ... */
@@ -110,7 +112,9 @@ static int fb_notifier_callback(struct notifier_block *self,
 
 	if (!bd->ops)
 		goto out;
-	if (bd->ops->check_fb && !bd->ops->check_fb(bd, evdata->info))
+	if (bd->ops->check_fb && !bd->ops->check_fb(bd, info))
+		goto out;
+	if (fb_bd && fb_bd != bd)
 		goto out;
 
 	fb_blank = *(int *)evdata->data;
diff --git a/drivers/video/fbdev/core/fb_backlight.c b/drivers/video/fbdev/core/fb_backlight.c
index e2d3b3adc870..6fdaa9f81be9 100644
--- a/drivers/video/fbdev/core/fb_backlight.c
+++ b/drivers/video/fbdev/core/fb_backlight.c
@@ -30,4 +30,10 @@ void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max)
 	mutex_unlock(&fb_info->bl_curve_mutex);
 }
 EXPORT_SYMBOL_GPL(fb_bl_default_curve);
+
+struct backlight_device *fb_bl_device(struct fb_info *info)
+{
+	return info->bl_dev;
+}
+EXPORT_SYMBOL(fb_bl_device);
 #endif
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 0dd27364d56f..75c5f800467c 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -738,6 +738,15 @@ extern struct fb_info *framebuffer_alloc(size_t size, struct device *dev);
 extern void framebuffer_release(struct fb_info *info);
 extern void fb_bl_default_curve(struct fb_info *fb_info, u8 off, u8 min, u8 max);
 
+#if IS_ENABLED(CONFIG_FB_BACKLIGHT)
+struct backlight_device *fb_bl_device(struct fb_info *info);
+#else
+static inline struct backlight_device *fb_bl_device(struct fb_info *info)
+{
+	return NULL;
+}
+#endif
+
 /* fbmon.c */
 #define FB_MAXTIMINGS		0
 #define FB_VSYNCTIMINGS		1
-- 
cgit v1.2.3


From 397b7493729288ac308c5f4ff3496eaeb1080293 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 5 Mar 2024 17:22:39 +0100
Subject: backlight: pwm-backlight: Remove struct backlight_ops.check_fb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The internal check_fb callback from struct pwm_bl_data is never
implemented. The driver's implementation of check_fb always
returns true, which is the backlight core's default if no
implementation has been set. So remove the code from the driver.

v2:
	* reword commit message

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/20240305162425.23845-7-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/pwm_bl.c | 12 ------------
 include/linux/pwm_backlight.h    |  1 -
 2 files changed, 13 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index ffcebf6aa76a..61d30bc98eea 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
-#include <linux/fb.h>
 #include <linux/backlight.h>
 #include <linux/err.h>
 #include <linux/pwm.h>
@@ -34,7 +33,6 @@ struct pwm_bl_data {
 					  int brightness);
 	void			(*notify_after)(struct device *,
 					int brightness);
-	int			(*check_fb)(struct device *, struct fb_info *);
 	void			(*exit)(struct device *);
 };
 
@@ -129,17 +127,8 @@ static int pwm_backlight_update_status(struct backlight_device *bl)
 	return 0;
 }
 
-static int pwm_backlight_check_fb(struct backlight_device *bl,
-				  struct fb_info *info)
-{
-	struct pwm_bl_data *pb = bl_get_data(bl);
-
-	return !pb->check_fb || pb->check_fb(pb->dev, info);
-}
-
 static const struct backlight_ops pwm_backlight_ops = {
 	.update_status	= pwm_backlight_update_status,
-	.check_fb	= pwm_backlight_check_fb,
 };
 
 #ifdef CONFIG_OF
@@ -482,7 +471,6 @@ static int pwm_backlight_probe(struct platform_device *pdev)
 
 	pb->notify = data->notify;
 	pb->notify_after = data->notify_after;
-	pb->check_fb = data->check_fb;
 	pb->exit = data->exit;
 	pb->dev = &pdev->dev;
 	pb->enabled = false;
diff --git a/include/linux/pwm_backlight.h b/include/linux/pwm_backlight.h
index cdd2ac366bc7..0bf80e98d5b4 100644
--- a/include/linux/pwm_backlight.h
+++ b/include/linux/pwm_backlight.h
@@ -19,7 +19,6 @@ struct platform_pwm_backlight_data {
 	int (*notify)(struct device *dev, int brightness);
 	void (*notify_after)(struct device *dev, int brightness);
 	void (*exit)(struct device *dev);
-	int (*check_fb)(struct device *dev, struct fb_info *info);
 };
 
 #endif
-- 
cgit v1.2.3


From 0a4be7263749945a3882f7a0e2e5b1c45c31064e Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 5 Mar 2024 17:22:43 +0100
Subject: backlight: Add controls_device callback to struct backlight_ops

Replace check_fb with controls_device in struct backlight_ops. The
new callback interface takes a Linux device instead of a framebuffer.
Resolves one of the dependencies of backlight.h on fb.h.

The few drivers that had custom implementations of check_fb can easily
use the framebuffer's Linux device instead. Update them accordingly.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://lore.kernel.org/r/20240305162425.23845-11-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/backlight.c      |  2 +-
 drivers/video/backlight/bd6107.c         | 12 ++++++------
 drivers/video/backlight/gpio_backlight.c | 12 ++++++------
 drivers/video/backlight/lv5207lp.c       | 12 ++++++------
 include/linux/backlight.h                | 16 ++++++++--------
 5 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 4f7973c6fcc7..2bd4299206ae 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -112,7 +112,7 @@ static int fb_notifier_callback(struct notifier_block *self,
 
 	if (!bd->ops)
 		goto out;
-	if (bd->ops->check_fb && !bd->ops->check_fb(bd, info))
+	if (bd->ops->controls_device && !bd->ops->controls_device(bd, info->device))
 		goto out;
 	if (fb_bd && fb_bd != bd)
 		goto out;
diff --git a/drivers/video/backlight/bd6107.c b/drivers/video/backlight/bd6107.c
index b1e7126380ef..6be2c67ba85c 100644
--- a/drivers/video/backlight/bd6107.c
+++ b/drivers/video/backlight/bd6107.c
@@ -99,18 +99,18 @@ static int bd6107_backlight_update_status(struct backlight_device *backlight)
 	return 0;
 }
 
-static int bd6107_backlight_check_fb(struct backlight_device *backlight,
-				       struct fb_info *info)
+static bool bd6107_backlight_controls_device(struct backlight_device *backlight,
+					     struct device *display_dev)
 {
 	struct bd6107 *bd = bl_get_data(backlight);
 
-	return !bd->pdata->dev || bd->pdata->dev == info->device;
+	return !bd->pdata->dev || bd->pdata->dev == display_dev;
 }
 
 static const struct backlight_ops bd6107_backlight_ops = {
-	.options	= BL_CORE_SUSPENDRESUME,
-	.update_status	= bd6107_backlight_update_status,
-	.check_fb	= bd6107_backlight_check_fb,
+	.options	 = BL_CORE_SUSPENDRESUME,
+	.update_status	 = bd6107_backlight_update_status,
+	.controls_device = bd6107_backlight_controls_device,
 };
 
 static int bd6107_probe(struct i2c_client *client)
diff --git a/drivers/video/backlight/gpio_backlight.c b/drivers/video/backlight/gpio_backlight.c
index e0c8c2a3f5dc..4476c317ce29 100644
--- a/drivers/video/backlight/gpio_backlight.c
+++ b/drivers/video/backlight/gpio_backlight.c
@@ -30,18 +30,18 @@ static int gpio_backlight_update_status(struct backlight_device *bl)
 	return 0;
 }
 
-static int gpio_backlight_check_fb(struct backlight_device *bl,
-				   struct fb_info *info)
+static bool gpio_backlight_controls_device(struct backlight_device *bl,
+					   struct device *display_dev)
 {
 	struct gpio_backlight *gbl = bl_get_data(bl);
 
-	return !gbl->dev || gbl->dev == info->device;
+	return !gbl->dev || gbl->dev == display_dev;
 }
 
 static const struct backlight_ops gpio_backlight_ops = {
-	.options	= BL_CORE_SUSPENDRESUME,
-	.update_status	= gpio_backlight_update_status,
-	.check_fb	= gpio_backlight_check_fb,
+	.options	 = BL_CORE_SUSPENDRESUME,
+	.update_status	 = gpio_backlight_update_status,
+	.controls_device = gpio_backlight_controls_device,
 };
 
 static int gpio_backlight_probe(struct platform_device *pdev)
diff --git a/drivers/video/backlight/lv5207lp.c b/drivers/video/backlight/lv5207lp.c
index 1f1d06b4e119..0cf00fee0f60 100644
--- a/drivers/video/backlight/lv5207lp.c
+++ b/drivers/video/backlight/lv5207lp.c
@@ -62,18 +62,18 @@ static int lv5207lp_backlight_update_status(struct backlight_device *backlight)
 	return 0;
 }
 
-static int lv5207lp_backlight_check_fb(struct backlight_device *backlight,
-				       struct fb_info *info)
+static bool lv5207lp_backlight_controls_device(struct backlight_device *backlight,
+					       struct device *display_dev)
 {
 	struct lv5207lp *lv = bl_get_data(backlight);
 
-	return !lv->pdata->dev || lv->pdata->dev == info->device;
+	return !lv->pdata->dev || lv->pdata->dev == display_dev;
 }
 
 static const struct backlight_ops lv5207lp_backlight_ops = {
-	.options	= BL_CORE_SUSPENDRESUME,
-	.update_status	= lv5207lp_backlight_update_status,
-	.check_fb	= lv5207lp_backlight_check_fb,
+	.options	 = BL_CORE_SUSPENDRESUME,
+	.update_status	 = lv5207lp_backlight_update_status,
+	.controls_device = lv5207lp_backlight_controls_device,
 };
 
 static int lv5207lp_probe(struct i2c_client *client)
diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index 614653e07e3a..2db4c70053c4 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -13,6 +13,7 @@
 #include <linux/fb.h>
 #include <linux/mutex.h>
 #include <linux/notifier.h>
+#include <linux/types.h>
 
 /**
  * enum backlight_update_reason - what method was used to update backlight
@@ -110,7 +111,6 @@ enum backlight_scale {
 };
 
 struct backlight_device;
-struct fb_info;
 
 /**
  * struct backlight_ops - backlight operations
@@ -160,18 +160,18 @@ struct backlight_ops {
 	int (*get_brightness)(struct backlight_device *);
 
 	/**
-	 * @check_fb: Check the framebuffer device.
+	 * @controls_device: Check against the display device
 	 *
-	 * Check if given framebuffer device is the one bound to this backlight.
-	 * This operation is optional and if not implemented it is assumed that the
-	 * fbdev is always the one bound to the backlight.
+	 * Check if the backlight controls the given display device. This
+	 * operation is optional and if not implemented it is assumed that
+	 * the display is always the one controlled by the backlight.
 	 *
 	 * RETURNS:
 	 *
-	 * If info is NULL or the info matches the fbdev bound to the backlight return true.
-	 * If info does not match the fbdev bound to the backlight return false.
+	 * If display_dev is NULL or display_dev matches the device controlled by
+	 * the backlight, return true. Otherwise return false.
 	 */
-	int (*check_fb)(struct backlight_device *bd, struct fb_info *info);
+	bool (*controls_device)(struct backlight_device *bd, struct device *display_dev);
 };
 
 /**
-- 
cgit v1.2.3


From b7ad4c67ed945524ab38bd61676e684eff84fc2a Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 19 Mar 2024 10:37:21 +0100
Subject: backlight: omap1: Remove unused struct
 omap_backlight_config.set_power

The callback set_power in struct omap_backlight_config is not
implemented anywhere. Remove it from the structure and driver.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/20240319093915.31778-3-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/omap1_bl.c     | 3 ---
 include/linux/platform_data/omap1_bl.h | 1 -
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c
index 69a49384b3de..84d148f38595 100644
--- a/drivers/video/backlight/omap1_bl.c
+++ b/drivers/video/backlight/omap1_bl.c
@@ -39,9 +39,6 @@ static inline void omapbl_send_enable(int enable)
 
 static void omapbl_blank(struct omap_backlight *bl, int mode)
 {
-	if (bl->pdata->set_power)
-		bl->pdata->set_power(bl->dev, mode);
-
 	switch (mode) {
 	case FB_BLANK_NORMAL:
 	case FB_BLANK_VSYNC_SUSPEND:
diff --git a/include/linux/platform_data/omap1_bl.h b/include/linux/platform_data/omap1_bl.h
index 5e8b17d77a5f..3d0bab31a0a9 100644
--- a/include/linux/platform_data/omap1_bl.h
+++ b/include/linux/platform_data/omap1_bl.h
@@ -6,7 +6,6 @@
 
 struct omap_backlight_config {
 	int default_intensity;
-	int (*set_power)(struct device *dev, int state);
 };
 
 #endif
-- 
cgit v1.2.3


From 4551978bb50a8d59b49629deebacd73478a8b1e1 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Tue, 19 Mar 2024 10:37:25 +0100
Subject: backlight: Remove fb_blank from struct backlight_properties

Remove the field fb_blank from struct backlight_properties and remove
all code that still sets or reads it. Backlight blank status is now
tracked exclusively in struct backlight_properties.state.

The core backlight code keeps the fb_blank and state fields in sync,
but doesn't do anything else with fb_blank. Several drivers initialize
fb_blank to FB_BLANK_UNBLANK to enable the backlight. This is already
the default for the state field. So we can delete the fb_blank code
from core and drivers and rely on the state field.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Flavio Suligoi <f.suligoi@asem.it>
Cc: Nicolas Ferre <nicolas.ferre@microchip.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Claudiu Beznea <claudiu.beznea@tuxon.dev>
Tested-by: Flavio Suligoi <f.suligoi@asem.it>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/20240319093915.31778-7-tzimmermann@suse.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/backlight.c                |  2 --
 drivers/video/backlight/mp3309c.c                  |  1 -
 drivers/video/backlight/omap1_bl.c                 |  1 -
 drivers/video/fbdev/atmel_lcdfb.c                  |  1 -
 .../fbdev/omap2/omapfb/displays/panel-dsi-cm.c     |  1 -
 .../omap2/omapfb/displays/panel-sony-acx565akm.c   |  1 -
 include/linux/backlight.h                          | 25 +---------------------
 7 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 86e1cdc8e369..691f1f3030e9 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -118,14 +118,12 @@ static int fb_notifier_callback(struct notifier_block *self,
 		bd->fb_bl_on[node] = true;
 		if (!bd->use_count++) {
 			bd->props.state &= ~BL_CORE_FBBLANK;
-			bd->props.fb_blank = FB_BLANK_UNBLANK;
 			backlight_update_status(bd);
 		}
 	} else if (fb_blank != FB_BLANK_UNBLANK && bd->fb_bl_on[node]) {
 		bd->fb_bl_on[node] = false;
 		if (!(--bd->use_count)) {
 			bd->props.state |= BL_CORE_FBBLANK;
-			bd->props.fb_blank = fb_blank;
 			backlight_update_status(bd);
 		}
 	}
diff --git a/drivers/video/backlight/mp3309c.c b/drivers/video/backlight/mp3309c.c
index c80a1481e742..7d73e85b8c14 100644
--- a/drivers/video/backlight/mp3309c.c
+++ b/drivers/video/backlight/mp3309c.c
@@ -363,7 +363,6 @@ static int mp3309c_probe(struct i2c_client *client)
 	props.scale = BACKLIGHT_SCALE_LINEAR;
 	props.type = BACKLIGHT_RAW;
 	props.power = FB_BLANK_UNBLANK;
-	props.fb_blank = FB_BLANK_UNBLANK;
 	chip->bl = devm_backlight_device_register(dev, "mp3309c", dev, chip,
 						  &mp3309c_bl_ops, &props);
 	if (IS_ERR(chip->bl))
diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c
index 636b390f78f0..e461e19231ae 100644
--- a/drivers/video/backlight/omap1_bl.c
+++ b/drivers/video/backlight/omap1_bl.c
@@ -139,7 +139,6 @@ static int omapbl_probe(struct platform_device *pdev)
 
 	omap_cfg_reg(PWL);	/* Conflicts with UART3 */
 
-	dev->props.fb_blank = FB_BLANK_UNBLANK;
 	dev->props.brightness = pdata->default_intensity;
 	omapbl_update_status(dev);
 
diff --git a/drivers/video/fbdev/atmel_lcdfb.c b/drivers/video/fbdev/atmel_lcdfb.c
index 9e391e5eaf9d..5574fb0361ee 100644
--- a/drivers/video/fbdev/atmel_lcdfb.c
+++ b/drivers/video/fbdev/atmel_lcdfb.c
@@ -153,7 +153,6 @@ static void init_backlight(struct atmel_lcdfb_info *sinfo)
 	sinfo->backlight = bl;
 
 	bl->props.power = FB_BLANK_UNBLANK;
-	bl->props.fb_blank = FB_BLANK_UNBLANK;
 	bl->props.brightness = atmel_bl_get_brightness(bl);
 }
 
diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
index 47683a6c0076..274bdf7b3b45 100644
--- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
+++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
@@ -1215,7 +1215,6 @@ static int dsicm_probe(struct platform_device *pdev)
 
 		ddata->bldev = bldev;
 
-		bldev->props.fb_blank = FB_BLANK_UNBLANK;
 		bldev->props.power = FB_BLANK_UNBLANK;
 		bldev->props.brightness = 255;
 
diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c
index 9d3ce234a787..71d2e015960c 100644
--- a/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c
+++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-sony-acx565akm.c
@@ -753,7 +753,6 @@ static int acx565akm_probe(struct spi_device *spi)
 	}
 
 	memset(&props, 0, sizeof(props));
-	props.fb_blank = FB_BLANK_UNBLANK;
 	props.power = FB_BLANK_UNBLANK;
 	props.type = BACKLIGHT_RAW;
 
diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index 614653e07e3a..31600b144d79 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -218,25 +218,6 @@ struct backlight_properties {
 	 */
 	int power;
 
-	/**
-	 * @fb_blank: The power state from the FBIOBLANK ioctl.
-	 *
-	 * When the FBIOBLANK ioctl is called @fb_blank is set to the
-	 * blank parameter and the update_status() operation is called.
-	 *
-	 * When the backlight device is enabled @fb_blank is set
-	 * to FB_BLANK_UNBLANK. When the backlight device is disabled
-	 * @fb_blank is set to FB_BLANK_POWERDOWN.
-	 *
-	 * Backlight drivers should avoid using this property. It has been
-	 * replaced by state & BL_CORE_FBLANK (although most drivers should
-	 * use backlight_is_blank() as the preferred means to get the blank
-	 * state).
-	 *
-	 * fb_blank is deprecated and will be removed.
-	 */
-	int fb_blank;
-
 	/**
 	 * @type: The type of backlight supported.
 	 *
@@ -366,7 +347,6 @@ static inline int backlight_enable(struct backlight_device *bd)
 		return 0;
 
 	bd->props.power = FB_BLANK_UNBLANK;
-	bd->props.fb_blank = FB_BLANK_UNBLANK;
 	bd->props.state &= ~BL_CORE_FBBLANK;
 
 	return backlight_update_status(bd);
@@ -382,7 +362,6 @@ static inline int backlight_disable(struct backlight_device *bd)
 		return 0;
 
 	bd->props.power = FB_BLANK_POWERDOWN;
-	bd->props.fb_blank = FB_BLANK_POWERDOWN;
 	bd->props.state |= BL_CORE_FBBLANK;
 
 	return backlight_update_status(bd);
@@ -395,15 +374,13 @@ static inline int backlight_disable(struct backlight_device *bd)
  * Display is expected to be blank if any of these is true::
  *
  *   1) if power in not UNBLANK
- *   2) if fb_blank is not UNBLANK
- *   3) if state indicate BLANK or SUSPENDED
+ *   2) if state indicate BLANK or SUSPENDED
  *
  * Returns true if display is expected to be blank, false otherwise.
  */
 static inline bool backlight_is_blank(const struct backlight_device *bd)
 {
 	return bd->props.power != FB_BLANK_UNBLANK ||
-	       bd->props.fb_blank != FB_BLANK_UNBLANK ||
 	       bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK);
 }
 
-- 
cgit v1.2.3


From 847ad2b5afe6b01252abe1c4ba4a7ac4edfb98dc Mon Sep 17 00:00:00 2001
From: Maxime Ripard <mripard@kernel.org>
Date: Wed, 27 Mar 2024 11:56:59 +0100
Subject: drm/display: Make DisplayPort CEC-over-AUX Kconfig name consistent

While most display helpers Kconfig symbols have the DRM_DISPLAY prefix,
the DisplayPort CEC tunnelling implementation uses CONFIG_DRM_DISPLAY_DP_AUX_CEC.

Since the number of users is limited, we can easily rename it to make it
consistent.

Reviewed-by: Jani Nikula <jani.nikula@intel.com>
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://lore.kernel.org/r/20240327-kms-kconfig-helpers-v3-4-eafee11b84b3@kernel.org
Signed-off-by: Maxime Ripard <mripard@kernel.org>
---
 arch/parisc/configs/generic-32bit_defconfig | 2 +-
 drivers/gpu/drm/display/Kconfig             | 2 +-
 drivers/gpu/drm/display/Makefile            | 2 +-
 include/drm/display/drm_dp_helper.h         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig
index ee4febb30386..5ce258f3fffa 100644
--- a/arch/parisc/configs/generic-32bit_defconfig
+++ b/arch/parisc/configs/generic-32bit_defconfig
@@ -131,7 +131,7 @@ CONFIG_PPDEV=m
 CONFIG_I2C=y
 CONFIG_HWMON=m
 CONFIG_DRM=m
-CONFIG_DRM_DP_CEC=y
+CONFIG_DRM_DISPLAY_DP_AUX_CEC=y
 # CONFIG_DRM_I2C_CH7006 is not set
 # CONFIG_DRM_I2C_SIL164 is not set
 CONFIG_DRM_RADEON=m
diff --git a/drivers/gpu/drm/display/Kconfig b/drivers/gpu/drm/display/Kconfig
index 038be9795581..35cb68db0bfd 100644
--- a/drivers/gpu/drm/display/Kconfig
+++ b/drivers/gpu/drm/display/Kconfig
@@ -59,7 +59,7 @@ config DRM_DISPLAY_DP_AUX_CHARDEV
 	  read and write values to arbitrary DPCD registers on the DP aux
 	  channel.
 
-config DRM_DP_CEC
+config DRM_DISPLAY_DP_AUX_CEC
 	bool "Enable DisplayPort CEC-Tunneling-over-AUX HDMI support"
 	depends on DRM && DRM_DISPLAY_HELPER
 	select DRM_DISPLAY_DP_HELPER
diff --git a/drivers/gpu/drm/display/Makefile b/drivers/gpu/drm/display/Makefile
index 44015aef1e8c..17d2cc73ff56 100644
--- a/drivers/gpu/drm/display/Makefile
+++ b/drivers/gpu/drm/display/Makefile
@@ -15,6 +15,6 @@ drm_display_helper-$(CONFIG_DRM_DISPLAY_HDMI_HELPER) += \
 	drm_hdmi_helper.o \
 	drm_scdc_helper.o
 drm_display_helper-$(CONFIG_DRM_DISPLAY_DP_AUX_CHARDEV) += drm_dp_aux_dev.o
-drm_display_helper-$(CONFIG_DRM_DP_CEC) += drm_dp_cec.o
+drm_display_helper-$(CONFIG_DRM_DISPLAY_DP_AUX_CEC) += drm_dp_cec.o
 
 obj-$(CONFIG_DRM_DISPLAY_HELPER) += drm_display_helper.o
diff --git a/include/drm/display/drm_dp_helper.h b/include/drm/display/drm_dp_helper.h
index 6c6a96496df6..f3a28af7c1b6 100644
--- a/include/drm/display/drm_dp_helper.h
+++ b/include/drm/display/drm_dp_helper.h
@@ -733,7 +733,7 @@ static inline int drm_panel_dp_aux_backlight(struct drm_panel *panel,
 
 #endif
 
-#ifdef CONFIG_DRM_DP_CEC
+#ifdef CONFIG_DRM_DISPLAY_DP_AUX_CEC
 void drm_dp_cec_irq(struct drm_dp_aux *aux);
 void drm_dp_cec_register_connector(struct drm_dp_aux *aux,
 				   struct drm_connector *connector);
-- 
cgit v1.2.3


From 822c91e72eac568ed8d83765634f00decb45666c Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 4 Mar 2024 21:57:30 +0100
Subject: leds: trigger: Store brightness set by led_trigger_event()

If a simple trigger is assigned to a LED, then the LED may be off until
the next led_trigger_event() call. This may be an issue for simple
triggers with rare led_trigger_event() calls, e.g. power supply
charging indicators (drivers/power/supply/power_supply_leds.c).
Therefore persist the brightness value of the last led_trigger_event()
call and use this value if the trigger is assigned to a LED.
In addition add a getter for the trigger brightness value.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/b1358b25-3f30-458d-8240-5705ae007a8a@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/leds/led-triggers.c |  6 ++++--
 include/linux/leds.h        | 15 +++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/leds/led-triggers.c b/drivers/leds/led-triggers.c
index 0f5ac30053ad..b1b323b19301 100644
--- a/drivers/leds/led-triggers.c
+++ b/drivers/leds/led-triggers.c
@@ -194,11 +194,11 @@ int led_trigger_set(struct led_classdev *led_cdev, struct led_trigger *trig)
 		spin_unlock(&trig->leddev_list_lock);
 		led_cdev->trigger = trig;
 
+		ret = 0;
 		if (trig->activate)
 			ret = trig->activate(led_cdev);
 		else
-			ret = 0;
-
+			led_set_brightness(led_cdev, trig->brightness);
 		if (ret)
 			goto err_activate;
 
@@ -387,6 +387,8 @@ void led_trigger_event(struct led_trigger *trig,
 	if (!trig)
 		return;
 
+	trig->brightness = brightness;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(led_cdev, &trig->led_cdevs, trig_list)
 		led_set_brightness(led_cdev, brightness);
diff --git a/include/linux/leds.h b/include/linux/leds.h
index db6b114bb3d9..7964ee38b2a3 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -455,6 +455,9 @@ struct led_trigger {
 	int		(*activate)(struct led_classdev *led_cdev);
 	void		(*deactivate)(struct led_classdev *led_cdev);
 
+	/* Brightness set by led_trigger_event */
+	enum led_brightness brightness;
+
 	/* LED-private triggers have this set */
 	struct led_hw_trigger_type *trigger_type;
 
@@ -508,6 +511,12 @@ static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 	return led_cdev->trigger_data;
 }
 
+static inline enum led_brightness
+led_trigger_get_brightness(const struct led_trigger *trigger)
+{
+	return trigger ? trigger->brightness : LED_OFF;
+}
+
 #define module_led_trigger(__led_trigger) \
 	module_driver(__led_trigger, led_trigger_register, \
 		      led_trigger_unregister)
@@ -544,6 +553,12 @@ static inline void *led_get_trigger_data(struct led_classdev *led_cdev)
 	return NULL;
 }
 
+static inline enum led_brightness
+led_trigger_get_brightness(const struct led_trigger *trigger)
+{
+	return LED_OFF;
+}
+
 #endif /* CONFIG_LEDS_TRIGGERS */
 
 /* Trigger specific enum */
-- 
cgit v1.2.3


From ab2ab9e69ef9734b875ce6d43fe3f9f90135daae Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 4 Mar 2024 21:59:47 +0100
Subject: leds: trigger: audio: Remove this trigger

Now that the audio trigger is fully integrated in
sound/core/control_led.c, we can remove it here.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/1e339779-6d04-4392-8ea2-5592c0fd1aa2@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 arch/mips/configs/ci20_defconfig     |  1 -
 drivers/leds/trigger/ledtrig-audio.c | 67 ------------------------------------
 include/linux/leds.h                 | 14 --------
 3 files changed, 82 deletions(-)
 delete mode 100644 drivers/leds/trigger/ledtrig-audio.c

(limited to 'include')

diff --git a/arch/mips/configs/ci20_defconfig b/arch/mips/configs/ci20_defconfig
index cdf2a782dee1..7827b2b392f6 100644
--- a/arch/mips/configs/ci20_defconfig
+++ b/arch/mips/configs/ci20_defconfig
@@ -152,7 +152,6 @@ CONFIG_LEDS_TRIGGER_CAMERA=m
 CONFIG_LEDS_TRIGGER_PANIC=y
 CONFIG_LEDS_TRIGGER_NETDEV=y
 CONFIG_LEDS_TRIGGER_PATTERN=y
-CONFIG_LEDS_TRIGGER_AUDIO=y
 CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_JZ4740=y
 CONFIG_DMADEVICES=y
diff --git a/drivers/leds/trigger/ledtrig-audio.c b/drivers/leds/trigger/ledtrig-audio.c
deleted file mode 100644
index 2ecd4b760fc3..000000000000
--- a/drivers/leds/trigger/ledtrig-audio.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-//
-// Audio Mute LED trigger
-//
-
-#include <linux/kernel.h>
-#include <linux/leds.h>
-#include <linux/module.h>
-#include "../leds.h"
-
-static enum led_brightness audio_state[NUM_AUDIO_LEDS];
-
-static int ledtrig_audio_mute_activate(struct led_classdev *led_cdev)
-{
-	led_set_brightness_nosleep(led_cdev, audio_state[LED_AUDIO_MUTE]);
-	return 0;
-}
-
-static int ledtrig_audio_micmute_activate(struct led_classdev *led_cdev)
-{
-	led_set_brightness_nosleep(led_cdev, audio_state[LED_AUDIO_MICMUTE]);
-	return 0;
-}
-
-static struct led_trigger ledtrig_audio[NUM_AUDIO_LEDS] = {
-	[LED_AUDIO_MUTE] = {
-		.name     = "audio-mute",
-		.activate = ledtrig_audio_mute_activate,
-	},
-	[LED_AUDIO_MICMUTE] = {
-		.name     = "audio-micmute",
-		.activate = ledtrig_audio_micmute_activate,
-	},
-};
-
-enum led_brightness ledtrig_audio_get(enum led_audio type)
-{
-	return audio_state[type];
-}
-EXPORT_SYMBOL_GPL(ledtrig_audio_get);
-
-void ledtrig_audio_set(enum led_audio type, enum led_brightness state)
-{
-	audio_state[type] = state;
-	led_trigger_event(&ledtrig_audio[type], state);
-}
-EXPORT_SYMBOL_GPL(ledtrig_audio_set);
-
-static int __init ledtrig_audio_init(void)
-{
-	led_trigger_register(&ledtrig_audio[LED_AUDIO_MUTE]);
-	led_trigger_register(&ledtrig_audio[LED_AUDIO_MICMUTE]);
-	return 0;
-}
-module_init(ledtrig_audio_init);
-
-static void __exit ledtrig_audio_exit(void)
-{
-	led_trigger_unregister(&ledtrig_audio[LED_AUDIO_MUTE]);
-	led_trigger_unregister(&ledtrig_audio[LED_AUDIO_MICMUTE]);
-}
-module_exit(ledtrig_audio_exit);
-
-MODULE_DESCRIPTION("LED trigger for audio mute control");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("ledtrig:audio-mute");
-MODULE_ALIAS("ledtrig:audio-micmute");
diff --git a/include/linux/leds.h b/include/linux/leds.h
index 7964ee38b2a3..6300313c46b7 100644
--- a/include/linux/leds.h
+++ b/include/linux/leds.h
@@ -705,18 +705,4 @@ enum led_audio {
 	NUM_AUDIO_LEDS
 };
 
-#if IS_ENABLED(CONFIG_LEDS_TRIGGER_AUDIO)
-enum led_brightness ledtrig_audio_get(enum led_audio type);
-void ledtrig_audio_set(enum led_audio type, enum led_brightness state);
-#else
-static inline enum led_brightness ledtrig_audio_get(enum led_audio type)
-{
-	return LED_OFF;
-}
-static inline void ledtrig_audio_set(enum led_audio type,
-				     enum led_brightness state)
-{
-}
-#endif
-
 #endif		/* __LINUX_LEDS_H_INCLUDED */
-- 
cgit v1.2.3


From bd1222ad1746ab4325b982c720c7099c78c7b731 Mon Sep 17 00:00:00 2001
From: Brent Lu <brent.lu@intel.com>
Date: Wed, 27 Mar 2024 11:23:52 -0500
Subject: ASoC: Intel: ssp-common: relocate header file

Moving ssp-common header file from sound/soc/intel/common directory to
include/sound directory and rename the file. Keep file content
unchanged for tracking purpose.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Brent Lu <brent.lu@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://msgid.link/r/20240327162408.63953-3-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi-intel-ssp-common.h          | 73 ++++++++++++++++++++++
 sound/soc/intel/boards/sof_board_helpers.h         |  2 +-
 sound/soc/intel/boards/sof_cirrus_common.h         |  2 +-
 sound/soc/intel/boards/sof_maxim_common.h          |  2 +-
 sound/soc/intel/boards/sof_nuvoton_common.h        |  2 +-
 sound/soc/intel/boards/sof_realtek_common.h        |  2 +-
 sound/soc/intel/common/soc-acpi-intel-ssp-common.c |  2 +-
 sound/soc/intel/common/sof_ssp_common.h            | 73 ----------------------
 8 files changed, 79 insertions(+), 79 deletions(-)
 create mode 100644 include/sound/soc-acpi-intel-ssp-common.h
 delete mode 100644 sound/soc/intel/common/sof_ssp_common.h

(limited to 'include')

diff --git a/include/sound/soc-acpi-intel-ssp-common.h b/include/sound/soc-acpi-intel-ssp-common.h
new file mode 100644
index 000000000000..ff84154ce60a
--- /dev/null
+++ b/include/sound/soc-acpi-intel-ssp-common.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright(c) 2023 Intel Corporation.
+ */
+
+#ifndef __SOF_SSP_COMMON_H
+#define __SOF_SSP_COMMON_H
+
+/* Cirrus Logic */
+#define CS35L41_ACPI_HID	"CSC3541"
+#define CS42L42_ACPI_HID	"10134242"
+
+/* Dialog */
+#define DA7219_ACPI_HID		"DLGS7219"
+
+/* Everest */
+#define ES8316_ACPI_HID		"ESSX8316"
+#define ES8326_ACPI_HID		"ESSX8326"
+#define ES8336_ACPI_HID		"ESSX8336"
+
+#define MAX_98357A_ACPI_HID	"MX98357A"
+#define MAX_98360A_ACPI_HID	"MX98360A"
+#define MAX_98373_ACPI_HID	"MX98373"
+#define MAX_98390_ACPI_HID	"MX98390"
+
+/* Nuvoton */
+#define NAU8318_ACPI_HID	"NVTN2012"
+#define NAU8825_ACPI_HID	"10508825"
+
+/* Realtek */
+#define RT1011_ACPI_HID		"10EC1011"
+#define RT1015_ACPI_HID		"10EC1015"
+#define RT1015P_ACPI_HID	"RTL1015"
+#define RT1019P_ACPI_HID	"RTL1019"
+#define RT1308_ACPI_HID		"10EC1308"
+#define RT5650_ACPI_HID		"10EC5650"
+#define RT5682_ACPI_HID		"10EC5682"
+#define RT5682S_ACPI_HID	"RTL5682"
+
+enum sof_ssp_codec {
+	CODEC_NONE,
+
+	/* headphone codec */
+	CODEC_CS42L42,
+	CODEC_DA7219,
+	CODEC_ES8316,
+	CODEC_ES8326,
+	CODEC_ES8336,
+	CODEC_NAU8825,
+	CODEC_RT5650,
+	CODEC_RT5682,
+	CODEC_RT5682S,
+
+	/* speaker amplifier */
+	CODEC_CS35L41,
+	CODEC_MAX98357A,
+	CODEC_MAX98360A,
+	CODEC_MAX98373,
+	CODEC_MAX98390,
+	CODEC_NAU8318,
+	CODEC_RT1011,
+	CODEC_RT1015,
+	CODEC_RT1015P,
+	CODEC_RT1019P,
+	CODEC_RT1308,
+};
+
+enum sof_ssp_codec sof_ssp_detect_codec_type(struct device *dev);
+enum sof_ssp_codec sof_ssp_detect_amp_type(struct device *dev);
+
+const char *sof_ssp_get_codec_name(enum sof_ssp_codec codec_type);
+
+#endif /* __SOF_SSP_COMMON_H */
diff --git a/sound/soc/intel/boards/sof_board_helpers.h b/sound/soc/intel/boards/sof_board_helpers.h
index bd5c48cd0649..34ea974c8493 100644
--- a/sound/soc/intel/boards/sof_board_helpers.h
+++ b/sound/soc/intel/boards/sof_board_helpers.h
@@ -7,8 +7,8 @@
 #define __SOF_INTEL_BOARD_HELPERS_H
 
 #include <sound/soc.h>
+#include <sound/soc-acpi-intel-ssp-common.h>
 #include "sof_hdmi_common.h"
-#include "../common/sof_ssp_common.h"
 
 /*
  * Common board quirks: from bit 8 to 31, LSB 8 bits reserved for machine
diff --git a/sound/soc/intel/boards/sof_cirrus_common.h b/sound/soc/intel/boards/sof_cirrus_common.h
index 445621466f05..1c87637b9ef7 100644
--- a/sound/soc/intel/boards/sof_cirrus_common.h
+++ b/sound/soc/intel/boards/sof_cirrus_common.h
@@ -9,7 +9,7 @@
 #define __SOF_CIRRUS_COMMON_H
 
 #include <sound/soc.h>
-#include "../common/sof_ssp_common.h"
+#include <sound/soc-acpi-intel-ssp-common.h>
 
 /*
  * Cirrus Logic CS35L41/CS35L53
diff --git a/sound/soc/intel/boards/sof_maxim_common.h b/sound/soc/intel/boards/sof_maxim_common.h
index 63054bcd287b..8886f985de68 100644
--- a/sound/soc/intel/boards/sof_maxim_common.h
+++ b/sound/soc/intel/boards/sof_maxim_common.h
@@ -11,7 +11,7 @@
 #define __SOF_MAXIM_COMMON_H
 
 #include <sound/soc.h>
-#include "../common/sof_ssp_common.h"
+#include <sound/soc-acpi-intel-ssp-common.h>
 
 /*
  * Maxim MAX98373
diff --git a/sound/soc/intel/boards/sof_nuvoton_common.h b/sound/soc/intel/boards/sof_nuvoton_common.h
index af2e58b1d393..8a0f283260e7 100644
--- a/sound/soc/intel/boards/sof_nuvoton_common.h
+++ b/sound/soc/intel/boards/sof_nuvoton_common.h
@@ -9,7 +9,7 @@
 #define __SOF_NUVOTON_COMMON_H
 
 #include <sound/soc.h>
-#include "../common/sof_ssp_common.h"
+#include <sound/soc-acpi-intel-ssp-common.h>
 
 /*
  * Nuvoton NAU8318
diff --git a/sound/soc/intel/boards/sof_realtek_common.h b/sound/soc/intel/boards/sof_realtek_common.h
index 99b23d96521a..289e85b61660 100644
--- a/sound/soc/intel/boards/sof_realtek_common.h
+++ b/sound/soc/intel/boards/sof_realtek_common.h
@@ -11,7 +11,7 @@
 #define __SOF_REALTEK_COMMON_H
 
 #include <sound/soc.h>
-#include "../common/sof_ssp_common.h"
+#include <sound/soc-acpi-intel-ssp-common.h>
 
 /*
  * Realtek ALC1011
diff --git a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
index 96072790e9c0..37758f4e3f32 100644
--- a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
+++ b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
@@ -4,7 +4,7 @@
 
 #include <linux/device.h>
 #include <sound/soc-acpi.h>
-#include "sof_ssp_common.h"
+#include <sound/soc-acpi-intel-ssp-common.h>
 
 /*
  * Codec probe function
diff --git a/sound/soc/intel/common/sof_ssp_common.h b/sound/soc/intel/common/sof_ssp_common.h
deleted file mode 100644
index ff84154ce60a..000000000000
--- a/sound/soc/intel/common/sof_ssp_common.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright(c) 2023 Intel Corporation.
- */
-
-#ifndef __SOF_SSP_COMMON_H
-#define __SOF_SSP_COMMON_H
-
-/* Cirrus Logic */
-#define CS35L41_ACPI_HID	"CSC3541"
-#define CS42L42_ACPI_HID	"10134242"
-
-/* Dialog */
-#define DA7219_ACPI_HID		"DLGS7219"
-
-/* Everest */
-#define ES8316_ACPI_HID		"ESSX8316"
-#define ES8326_ACPI_HID		"ESSX8326"
-#define ES8336_ACPI_HID		"ESSX8336"
-
-#define MAX_98357A_ACPI_HID	"MX98357A"
-#define MAX_98360A_ACPI_HID	"MX98360A"
-#define MAX_98373_ACPI_HID	"MX98373"
-#define MAX_98390_ACPI_HID	"MX98390"
-
-/* Nuvoton */
-#define NAU8318_ACPI_HID	"NVTN2012"
-#define NAU8825_ACPI_HID	"10508825"
-
-/* Realtek */
-#define RT1011_ACPI_HID		"10EC1011"
-#define RT1015_ACPI_HID		"10EC1015"
-#define RT1015P_ACPI_HID	"RTL1015"
-#define RT1019P_ACPI_HID	"RTL1019"
-#define RT1308_ACPI_HID		"10EC1308"
-#define RT5650_ACPI_HID		"10EC5650"
-#define RT5682_ACPI_HID		"10EC5682"
-#define RT5682S_ACPI_HID	"RTL5682"
-
-enum sof_ssp_codec {
-	CODEC_NONE,
-
-	/* headphone codec */
-	CODEC_CS42L42,
-	CODEC_DA7219,
-	CODEC_ES8316,
-	CODEC_ES8326,
-	CODEC_ES8336,
-	CODEC_NAU8825,
-	CODEC_RT5650,
-	CODEC_RT5682,
-	CODEC_RT5682S,
-
-	/* speaker amplifier */
-	CODEC_CS35L41,
-	CODEC_MAX98357A,
-	CODEC_MAX98360A,
-	CODEC_MAX98373,
-	CODEC_MAX98390,
-	CODEC_NAU8318,
-	CODEC_RT1011,
-	CODEC_RT1015,
-	CODEC_RT1015P,
-	CODEC_RT1019P,
-	CODEC_RT1308,
-};
-
-enum sof_ssp_codec sof_ssp_detect_codec_type(struct device *dev);
-enum sof_ssp_codec sof_ssp_detect_amp_type(struct device *dev);
-
-const char *sof_ssp_get_codec_name(enum sof_ssp_codec codec_type);
-
-#endif /* __SOF_SSP_COMMON_H */
-- 
cgit v1.2.3


From a17fea3880aea23fd8821ff0660268a680e0326b Mon Sep 17 00:00:00 2001
From: Brent Lu <brent.lu@intel.com>
Date: Wed, 27 Mar 2024 11:23:53 -0500
Subject: ASoC: Intel: ssp-common: naming convention change

As we moved ssp-common files to new locations with new names, changing
the naming convention from sof_ssp_ to snd_soc_acpi_intel_.

Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Brent Lu <brent.lu@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://msgid.link/r/20240327162408.63953-4-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi-intel-ssp-common.h          | 17 ++++++++++-------
 sound/soc/intel/boards/sof_board_helpers.c         | 13 +++++++------
 sound/soc/intel/boards/sof_board_helpers.h         |  4 ++--
 sound/soc/intel/common/soc-acpi-intel-ssp-common.c | 17 ++++++++++-------
 4 files changed, 29 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-acpi-intel-ssp-common.h b/include/sound/soc-acpi-intel-ssp-common.h
index ff84154ce60a..b568cda11051 100644
--- a/include/sound/soc-acpi-intel-ssp-common.h
+++ b/include/sound/soc-acpi-intel-ssp-common.h
@@ -3,8 +3,8 @@
  * Copyright(c) 2023 Intel Corporation.
  */
 
-#ifndef __SOF_SSP_COMMON_H
-#define __SOF_SSP_COMMON_H
+#ifndef __LINUX_SND_SOC_ACPI_INTEL_SSP_COMMON_H
+#define __LINUX_SND_SOC_ACPI_INTEL_SSP_COMMON_H
 
 /* Cirrus Logic */
 #define CS35L41_ACPI_HID	"CSC3541"
@@ -37,7 +37,7 @@
 #define RT5682_ACPI_HID		"10EC5682"
 #define RT5682S_ACPI_HID	"RTL5682"
 
-enum sof_ssp_codec {
+enum snd_soc_acpi_intel_codec {
 	CODEC_NONE,
 
 	/* headphone codec */
@@ -65,9 +65,12 @@ enum sof_ssp_codec {
 	CODEC_RT1308,
 };
 
-enum sof_ssp_codec sof_ssp_detect_codec_type(struct device *dev);
-enum sof_ssp_codec sof_ssp_detect_amp_type(struct device *dev);
+enum snd_soc_acpi_intel_codec
+snd_soc_acpi_intel_detect_codec_type(struct device *dev);
+enum snd_soc_acpi_intel_codec
+snd_soc_acpi_intel_detect_amp_type(struct device *dev);
 
-const char *sof_ssp_get_codec_name(enum sof_ssp_codec codec_type);
+const char *
+snd_soc_acpi_intel_get_codec_name(enum snd_soc_acpi_intel_codec codec_type);
 
-#endif /* __SOF_SSP_COMMON_H */
+#endif /* __LINUX_SND_SOC_ACPI_INTEL_SSP_COMMON_H */
diff --git a/sound/soc/intel/boards/sof_board_helpers.c b/sound/soc/intel/boards/sof_board_helpers.c
index 586600680e84..8819ab5694bf 100644
--- a/sound/soc/intel/boards/sof_board_helpers.c
+++ b/sound/soc/intel/boards/sof_board_helpers.c
@@ -103,13 +103,13 @@ static struct snd_soc_dai_link_component platform_component[] = {
 };
 
 static int set_ssp_codec_link(struct device *dev, struct snd_soc_dai_link *link,
-			      int be_id, enum sof_ssp_codec codec_type,
+			      int be_id, enum snd_soc_acpi_intel_codec codec_type,
 			      int ssp_codec)
 {
 	struct snd_soc_dai_link_component *cpus;
 
 	dev_dbg(dev, "link %d: ssp codec %s, ssp %d\n", be_id,
-		sof_ssp_get_codec_name(codec_type), ssp_codec);
+		snd_soc_acpi_intel_get_codec_name(codec_type), ssp_codec);
 
 	/* link name */
 	link->name = devm_kasprintf(dev, GFP_KERNEL, "SSP%d-Codec", ssp_codec);
@@ -259,12 +259,13 @@ static int set_idisp_hdmi_link(struct device *dev, struct snd_soc_dai_link *link
 }
 
 static int set_ssp_amp_link(struct device *dev, struct snd_soc_dai_link *link,
-			    int be_id, enum sof_ssp_codec amp_type, int ssp_amp)
+			    int be_id, enum snd_soc_acpi_intel_codec amp_type,
+			    int ssp_amp)
 {
 	struct snd_soc_dai_link_component *cpus;
 
 	dev_dbg(dev, "link %d: ssp amp %s, ssp %d\n", be_id,
-		sof_ssp_get_codec_name(amp_type), ssp_amp);
+		snd_soc_acpi_intel_get_codec_name(amp_type), ssp_amp);
 
 	/* link name */
 	link->name = devm_kasprintf(dev, GFP_KERNEL, "SSP%d-Codec", ssp_amp);
@@ -598,8 +599,8 @@ sof_intel_board_get_ctx(struct device *dev, unsigned long board_quirk)
 	if (!ctx)
 		return NULL;
 
-	ctx->codec_type = sof_ssp_detect_codec_type(dev);
-	ctx->amp_type = sof_ssp_detect_amp_type(dev);
+	ctx->codec_type = snd_soc_acpi_intel_detect_codec_type(dev);
+	ctx->amp_type = snd_soc_acpi_intel_detect_amp_type(dev);
 
 	ctx->dmic_be_num = 2;
 	ctx->hdmi_num = (board_quirk & SOF_NUM_IDISP_HDMI_MASK) >>
diff --git a/sound/soc/intel/boards/sof_board_helpers.h b/sound/soc/intel/boards/sof_board_helpers.h
index 34ea974c8493..644f6b4a1b7b 100644
--- a/sound/soc/intel/boards/sof_board_helpers.h
+++ b/sound/soc/intel/boards/sof_board_helpers.h
@@ -132,8 +132,8 @@ struct sof_card_private {
 	struct snd_soc_jack headset_jack;
 	struct sof_hdmi_private hdmi;
 
-	enum sof_ssp_codec codec_type;
-	enum sof_ssp_codec amp_type;
+	enum snd_soc_acpi_intel_codec codec_type;
+	enum snd_soc_acpi_intel_codec amp_type;
 
 	int dmic_be_num;
 	int hdmi_num;
diff --git a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
index 37758f4e3f32..716243a4fc9a 100644
--- a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
+++ b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
@@ -19,7 +19,7 @@
 struct codec_map {
 	const char *name;
 	const char *acpi_hid;
-	enum sof_ssp_codec codec_type;
+	enum snd_soc_acpi_intel_codec codec_type;
 };
 
 static const struct codec_map codecs[] = {
@@ -64,7 +64,8 @@ static const struct codec_map amps[] = {
 	CODEC_MAP_ENTRY("RT1308", RT1308_ACPI_HID, CODEC_RT1308),
 };
 
-enum sof_ssp_codec sof_ssp_detect_codec_type(struct device *dev)
+enum snd_soc_acpi_intel_codec
+snd_soc_acpi_intel_detect_codec_type(struct device *dev)
 {
 	int i;
 
@@ -78,9 +79,10 @@ enum sof_ssp_codec sof_ssp_detect_codec_type(struct device *dev)
 
 	return CODEC_NONE;
 }
-EXPORT_SYMBOL_NS(sof_ssp_detect_codec_type, SND_SOC_INTEL_SOF_SSP_COMMON);
+EXPORT_SYMBOL_NS(snd_soc_acpi_intel_detect_codec_type, SND_SOC_INTEL_SOF_SSP_COMMON);
 
-enum sof_ssp_codec sof_ssp_detect_amp_type(struct device *dev)
+enum snd_soc_acpi_intel_codec
+snd_soc_acpi_intel_detect_amp_type(struct device *dev)
 {
 	int i;
 
@@ -94,9 +96,10 @@ enum sof_ssp_codec sof_ssp_detect_amp_type(struct device *dev)
 
 	return CODEC_NONE;
 }
-EXPORT_SYMBOL_NS(sof_ssp_detect_amp_type, SND_SOC_INTEL_SOF_SSP_COMMON);
+EXPORT_SYMBOL_NS(snd_soc_acpi_intel_detect_amp_type, SND_SOC_INTEL_SOF_SSP_COMMON);
 
-const char *sof_ssp_get_codec_name(enum sof_ssp_codec codec_type)
+const char *
+snd_soc_acpi_intel_get_codec_name(enum snd_soc_acpi_intel_codec codec_type)
 {
 	int i;
 
@@ -115,7 +118,7 @@ const char *sof_ssp_get_codec_name(enum sof_ssp_codec codec_type)
 
 	return NULL;
 }
-EXPORT_SYMBOL_NS(sof_ssp_get_codec_name, SND_SOC_INTEL_SOF_SSP_COMMON);
+EXPORT_SYMBOL_NS(snd_soc_acpi_intel_get_codec_name, SND_SOC_INTEL_SOF_SSP_COMMON);
 
 MODULE_DESCRIPTION("ASoC Intel SOF Common Machine Driver Helpers");
 MODULE_AUTHOR("Brent Lu <brent.lu@intel.com>");
-- 
cgit v1.2.3


From 2e723a79ec609871116d216309c1b89d2f61b713 Mon Sep 17 00:00:00 2001
From: Brent Lu <brent.lu@intel.com>
Date: Wed, 27 Mar 2024 11:23:56 -0500
Subject: ASoC: Intel: ssp-common: get codec tplg suffix function

Add helper functions to get tplg suffix string for specific headphone
codec or speaker amplifier. The string could be used to compose the
default topology file name for specific headphone codec and speaker
amplifier combination.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Brent Lu <brent.lu@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://msgid.link/r/20240327162408.63953-7-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi-intel-ssp-common.h          |  5 ++
 sound/soc/intel/common/soc-acpi-intel-ssp-common.c | 76 ++++++++++++++++------
 2 files changed, 60 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc-acpi-intel-ssp-common.h b/include/sound/soc-acpi-intel-ssp-common.h
index b568cda11051..b4597c8dac78 100644
--- a/include/sound/soc-acpi-intel-ssp-common.h
+++ b/include/sound/soc-acpi-intel-ssp-common.h
@@ -73,4 +73,9 @@ snd_soc_acpi_intel_detect_amp_type(struct device *dev);
 const char *
 snd_soc_acpi_intel_get_codec_name(enum snd_soc_acpi_intel_codec codec_type);
 
+const char *
+snd_soc_acpi_intel_get_codec_tplg_suffix(enum snd_soc_acpi_intel_codec codec_type);
+const char *
+snd_soc_acpi_intel_get_amp_tplg_suffix(enum snd_soc_acpi_intel_codec codec_type);
+
 #endif /* __LINUX_SND_SOC_ACPI_INTEL_SSP_COMMON_H */
diff --git a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
index 68da7260fb1d..a887f019afe2 100644
--- a/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
+++ b/sound/soc/intel/common/soc-acpi-intel-ssp-common.c
@@ -9,59 +9,61 @@
 /*
  * Codec probe function
  */
-#define CODEC_MAP_ENTRY(n, h, t)	\
+#define CODEC_MAP_ENTRY(n, s, h, t)	\
 	{				\
 		.name = n,		\
+		.tplg_suffix = s,	\
 		.acpi_hid = h,		\
 		.codec_type = t,	\
 	}
 
 struct codec_map {
 	const char *name;
+	const char *tplg_suffix;
 	const char *acpi_hid;
 	enum snd_soc_acpi_intel_codec codec_type;
 };
 
 static const struct codec_map codecs[] = {
 	/* Cirrus Logic */
-	CODEC_MAP_ENTRY("CS42L42", CS42L42_ACPI_HID, CODEC_CS42L42),
+	CODEC_MAP_ENTRY("CS42L42", "cs42l42", CS42L42_ACPI_HID, CODEC_CS42L42),
 
 	/* Dialog */
-	CODEC_MAP_ENTRY("DA7219", DA7219_ACPI_HID, CODEC_DA7219),
+	CODEC_MAP_ENTRY("DA7219", "da7219", DA7219_ACPI_HID, CODEC_DA7219),
 
 	/* Everest */
-	CODEC_MAP_ENTRY("ES8316", ES8316_ACPI_HID, CODEC_ES8316),
-	CODEC_MAP_ENTRY("ES8326", ES8326_ACPI_HID, CODEC_ES8326),
-	CODEC_MAP_ENTRY("ES8336", ES8336_ACPI_HID, CODEC_ES8336),
+	CODEC_MAP_ENTRY("ES8316", "es8336", ES8316_ACPI_HID, CODEC_ES8316),
+	CODEC_MAP_ENTRY("ES8326", "es8336", ES8326_ACPI_HID, CODEC_ES8326),
+	CODEC_MAP_ENTRY("ES8336", "es8336", ES8336_ACPI_HID, CODEC_ES8336),
 
 	/* Nuvoton */
-	CODEC_MAP_ENTRY("NAU8825", NAU8825_ACPI_HID, CODEC_NAU8825),
+	CODEC_MAP_ENTRY("NAU8825", "nau8825", NAU8825_ACPI_HID, CODEC_NAU8825),
 
 	/* Realtek */
-	CODEC_MAP_ENTRY("RT5650", RT5650_ACPI_HID, CODEC_RT5650),
-	CODEC_MAP_ENTRY("RT5682", RT5682_ACPI_HID, CODEC_RT5682),
-	CODEC_MAP_ENTRY("RT5682S", RT5682S_ACPI_HID, CODEC_RT5682S),
+	CODEC_MAP_ENTRY("RT5650", "rt5650", RT5650_ACPI_HID, CODEC_RT5650),
+	CODEC_MAP_ENTRY("RT5682", "rt5682", RT5682_ACPI_HID, CODEC_RT5682),
+	CODEC_MAP_ENTRY("RT5682S", "rt5682", RT5682S_ACPI_HID, CODEC_RT5682S),
 };
 
 static const struct codec_map amps[] = {
 	/* Cirrus Logic */
-	CODEC_MAP_ENTRY("CS35L41", CS35L41_ACPI_HID, CODEC_CS35L41),
+	CODEC_MAP_ENTRY("CS35L41", "cs35l41", CS35L41_ACPI_HID, CODEC_CS35L41),
 
 	/* Maxim */
-	CODEC_MAP_ENTRY("MAX98357A", MAX_98357A_ACPI_HID, CODEC_MAX98357A),
-	CODEC_MAP_ENTRY("MAX98360A", MAX_98360A_ACPI_HID, CODEC_MAX98360A),
-	CODEC_MAP_ENTRY("MAX98373", MAX_98373_ACPI_HID, CODEC_MAX98373),
-	CODEC_MAP_ENTRY("MAX98390", MAX_98390_ACPI_HID, CODEC_MAX98390),
+	CODEC_MAP_ENTRY("MAX98357A", "max98357a", MAX_98357A_ACPI_HID, CODEC_MAX98357A),
+	CODEC_MAP_ENTRY("MAX98360A", "max98360a", MAX_98360A_ACPI_HID, CODEC_MAX98360A),
+	CODEC_MAP_ENTRY("MAX98373", "max98373", MAX_98373_ACPI_HID, CODEC_MAX98373),
+	CODEC_MAP_ENTRY("MAX98390", "max98390", MAX_98390_ACPI_HID, CODEC_MAX98390),
 
 	/* Nuvoton */
-	CODEC_MAP_ENTRY("NAU8318", NAU8318_ACPI_HID, CODEC_NAU8318),
+	CODEC_MAP_ENTRY("NAU8318", "nau8318", NAU8318_ACPI_HID, CODEC_NAU8318),
 
 	/* Realtek */
-	CODEC_MAP_ENTRY("RT1011", RT1011_ACPI_HID, CODEC_RT1011),
-	CODEC_MAP_ENTRY("RT1015", RT1015_ACPI_HID, CODEC_RT1015),
-	CODEC_MAP_ENTRY("RT1015P", RT1015P_ACPI_HID, CODEC_RT1015P),
-	CODEC_MAP_ENTRY("RT1019P", RT1019P_ACPI_HID, CODEC_RT1019P),
-	CODEC_MAP_ENTRY("RT1308", RT1308_ACPI_HID, CODEC_RT1308),
+	CODEC_MAP_ENTRY("RT1011", "rt1011", RT1011_ACPI_HID, CODEC_RT1011),
+	CODEC_MAP_ENTRY("RT1015", "rt1015", RT1015_ACPI_HID, CODEC_RT1015),
+	CODEC_MAP_ENTRY("RT1015P", "rt1015", RT1015P_ACPI_HID, CODEC_RT1015P),
+	CODEC_MAP_ENTRY("RT1019P", "rt1019", RT1019P_ACPI_HID, CODEC_RT1019P),
+	CODEC_MAP_ENTRY("RT1308", "rt1308", RT1308_ACPI_HID, CODEC_RT1308),
 };
 
 enum snd_soc_acpi_intel_codec
@@ -120,6 +122,38 @@ snd_soc_acpi_intel_get_codec_name(enum snd_soc_acpi_intel_codec codec_type)
 }
 EXPORT_SYMBOL_NS(snd_soc_acpi_intel_get_codec_name, SND_SOC_ACPI_INTEL_MATCH);
 
+const char *
+snd_soc_acpi_intel_get_codec_tplg_suffix(enum snd_soc_acpi_intel_codec codec_type)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(codecs); i++) {
+		if (codecs[i].codec_type != codec_type)
+			continue;
+
+		return codecs[i].tplg_suffix;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_NS(snd_soc_acpi_intel_get_codec_tplg_suffix, SND_SOC_ACPI_INTEL_MATCH);
+
+const char *
+snd_soc_acpi_intel_get_amp_tplg_suffix(enum snd_soc_acpi_intel_codec codec_type)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(amps); i++) {
+		if (amps[i].codec_type != codec_type)
+			continue;
+
+		return amps[i].tplg_suffix;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_NS(snd_soc_acpi_intel_get_amp_tplg_suffix, SND_SOC_ACPI_INTEL_MATCH);
+
 MODULE_DESCRIPTION("ASoC Intel SOF Common Machine Driver Helpers");
 MODULE_AUTHOR("Brent Lu <brent.lu@intel.com>");
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 1504a768f6045157437693fbfb50ae63ca86ec61 Mon Sep 17 00:00:00 2001
From: Brent Lu <brent.lu@intel.com>
Date: Wed, 27 Mar 2024 11:23:57 -0500
Subject: ASoC: SOF: Intel: support tplg suffix detection

Add new flags to tplg_quirk_mask to detect and append codec/amplifier
tplg suffix to topology file name at runtime. With this feature we
could implement an enumeration entry for all boards which implement
same headphone codec regardless the speaker amplifier type.

Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Signed-off-by: Brent Lu <brent.lu@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Link: https://msgid.link/r/20240327162408.63953-8-pierre-louis.bossart@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi.h  | 12 +++++++
 sound/soc/sof/intel/hda.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+)

(limited to 'include')

diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index 23d6d6bfb073..1d8f35ca1d6f 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -151,6 +151,18 @@ struct snd_soc_acpi_link_adr {
  */
 #define SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER BIT(2)
 
+/*
+ * when set the speaker amplifier name suffix (i.e. "-max98360a") will be
+ * appended to topology file name
+ */
+#define SND_SOC_ACPI_TPLG_INTEL_AMP_NAME BIT(3)
+
+/*
+ * when set the headphone codec name suffix (i.e. "-rt5682") will be appended to
+ * topology file name
+ */
+#define SND_SOC_ACPI_TPLG_INTEL_CODEC_NAME BIT(4)
+
 /**
  * snd_soc_acpi_mach: ACPI-based machine descriptor. Most of the fields are
  * related to the hardware, except for the firmware and topology file names.
diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c
index 7fe72b065451..e26b8fd682e5 100644
--- a/sound/soc/sof/intel/hda.c
+++ b/sound/soc/sof/intel/hda.c
@@ -24,6 +24,7 @@
 #include <linux/soundwire/sdw_intel.h>
 #include <sound/intel-dsp-config.h>
 #include <sound/intel-nhlt.h>
+#include <sound/soc-acpi-intel-ssp-common.h>
 #include <sound/sof.h>
 #include <sound/sof/xtensa.h>
 #include <sound/hda-mlink.h>
@@ -1676,13 +1677,36 @@ void hda_set_mach_params(struct snd_soc_acpi_mach *mach,
 	mach_params->dai_drivers = desc->ops->drv;
 }
 
+static int check_tplg_quirk_mask(struct snd_soc_acpi_mach *mach)
+{
+	u32 dmic_ssp_quirk;
+	u32 codec_amp_name_quirk;
+
+	/*
+	 * In current implementation dmic and ssp quirks are designed for es8336
+	 * machine driver and could not be mixed with codec name and amp name
+	 * quirks.
+	 */
+	dmic_ssp_quirk = mach->tplg_quirk_mask &
+			 (SND_SOC_ACPI_TPLG_INTEL_DMIC_NUMBER | SND_SOC_ACPI_TPLG_INTEL_SSP_NUMBER);
+	codec_amp_name_quirk = mach->tplg_quirk_mask &
+			 (SND_SOC_ACPI_TPLG_INTEL_AMP_NAME | SND_SOC_ACPI_TPLG_INTEL_CODEC_NAME);
+
+	if (dmic_ssp_quirk && codec_amp_name_quirk)
+		return -EINVAL;
+
+	return 0;
+}
+
 struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev)
 {
 	u32 interface_mask = hda_get_interface_mask(sdev);
 	struct snd_sof_pdata *sof_pdata = sdev->pdata;
 	const struct sof_dev_desc *desc = sof_pdata->desc;
 	struct snd_soc_acpi_mach *mach = NULL;
+	enum snd_soc_acpi_intel_codec codec_type;
 	const char *tplg_filename;
+	const char *tplg_suffix;
 
 	/* Try I2S or DMIC if it is supported */
 	if (interface_mask & (BIT(SOF_DAI_INTEL_SSP) | BIT(SOF_DAI_INTEL_DMIC)))
@@ -1701,6 +1725,17 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev)
 			tplg_fixup = true;
 		}
 
+		/*
+		 * Checking quirk mask integrity; some quirk flags could not be
+		 * set concurrently.
+		 */
+		if (tplg_fixup &&
+		    check_tplg_quirk_mask(mach)) {
+			dev_err(sdev->dev, "Invalid tplg quirk mask 0x%x\n",
+				mach->tplg_quirk_mask);
+			return NULL;
+		}
+
 		/* report to machine driver if any DMICs are found */
 		mach->mach_params.dmic_num = check_dmic_num(sdev);
 
@@ -1775,6 +1810,52 @@ struct snd_soc_acpi_mach *hda_machine_select(struct snd_sof_dev *sdev)
 			}
 		}
 
+		codec_type = snd_soc_acpi_intel_detect_amp_type(sdev->dev);
+
+		if (tplg_fixup &&
+		    mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_AMP_NAME &&
+		    codec_type != CODEC_NONE) {
+			tplg_suffix = snd_soc_acpi_intel_get_amp_tplg_suffix(codec_type);
+			if (!tplg_suffix) {
+				dev_err(sdev->dev, "no tplg suffix found, amp %d\n",
+					codec_type);
+				return NULL;
+			}
+
+			tplg_filename = devm_kasprintf(sdev->dev, GFP_KERNEL,
+						       "%s-%s",
+						       sof_pdata->tplg_filename,
+						       tplg_suffix);
+			if (!tplg_filename)
+				return NULL;
+
+			sof_pdata->tplg_filename = tplg_filename;
+			add_extension = true;
+		}
+
+		codec_type = snd_soc_acpi_intel_detect_codec_type(sdev->dev);
+
+		if (tplg_fixup &&
+		    mach->tplg_quirk_mask & SND_SOC_ACPI_TPLG_INTEL_CODEC_NAME &&
+		    codec_type != CODEC_NONE) {
+			tplg_suffix = snd_soc_acpi_intel_get_codec_tplg_suffix(codec_type);
+			if (!tplg_suffix) {
+				dev_err(sdev->dev, "no tplg suffix found, codec %d\n",
+					codec_type);
+				return NULL;
+			}
+
+			tplg_filename = devm_kasprintf(sdev->dev, GFP_KERNEL,
+						       "%s-%s",
+						       sof_pdata->tplg_filename,
+						       tplg_suffix);
+			if (!tplg_filename)
+				return NULL;
+
+			sof_pdata->tplg_filename = tplg_filename;
+			add_extension = true;
+		}
+
 		if (tplg_fixup && add_extension) {
 			tplg_filename = devm_kasprintf(sdev->dev, GFP_KERNEL,
 						       "%s%s",
@@ -1842,3 +1923,4 @@ MODULE_IMPORT_NS(SND_INTEL_SOUNDWIRE_ACPI);
 MODULE_IMPORT_NS(SOUNDWIRE_INTEL_INIT);
 MODULE_IMPORT_NS(SOUNDWIRE_INTEL);
 MODULE_IMPORT_NS(SND_SOC_SOF_HDA_MLINK);
+MODULE_IMPORT_NS(SND_SOC_ACPI_INTEL_MATCH);
-- 
cgit v1.2.3


From 67bb37c05a6b56e0e1f804706145a52f655af3f1 Mon Sep 17 00:00:00 2001
From: Louis Chauvet <louis.chauvet@bootlin.com>
Date: Wed, 27 Mar 2024 09:43:36 +0100
Subject: spi: spi-omap2-mcspi.c: revert "Toggle CS after each word"

Commit 5cbc7ca987fb ("spi: spi-omap2-mcspi.c: Toggle CS after each
word") introduced the toggling of CS after each word for the omap2-mcspi
controller.

The implementation is not respectful of the actual spi_message
content, so the CS can be raised after each word even if the
transfer structure asks to keep the CS active for the whole operation.

As it is not used anyway in the current Linux tree, it can be safely
removed.

Signed-off-by: Louis Chauvet <louis.chauvet@bootlin.com>
Link: https://msgid.link/r/20240327-spi-omap2-mcspi-multi-mode-v3-1-c4ac329dd5a2@bootlin.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-omap2-mcspi.c                 | 15 ---------------
 include/linux/platform_data/spi-omap2-mcspi.h |  3 ---
 2 files changed, 18 deletions(-)

(limited to 'include')

diff --git a/drivers/spi/spi-omap2-mcspi.c b/drivers/spi/spi-omap2-mcspi.c
index ddf1c684bcc7..601acec37eca 100644
--- a/drivers/spi/spi-omap2-mcspi.c
+++ b/drivers/spi/spi-omap2-mcspi.c
@@ -1175,13 +1175,6 @@ static int omap2_mcspi_transfer_one(struct spi_controller *ctlr,
 		    t->bits_per_word == spi->bits_per_word)
 			par_override = 0;
 	}
-	if (cd && cd->cs_per_word) {
-		chconf = mcspi->ctx.modulctrl;
-		chconf &= ~OMAP2_MCSPI_MODULCTRL_SINGLE;
-		mcspi_write_reg(ctlr, OMAP2_MCSPI_MODULCTRL, chconf);
-		mcspi->ctx.modulctrl =
-			mcspi_read_cs_reg(spi, OMAP2_MCSPI_MODULCTRL);
-	}
 
 	chconf = mcspi_cached_chconf0(spi);
 	chconf &= ~OMAP2_MCSPI_CHCONF_TRM_MASK;
@@ -1240,14 +1233,6 @@ out:
 		status = omap2_mcspi_setup_transfer(spi, NULL);
 	}
 
-	if (cd && cd->cs_per_word) {
-		chconf = mcspi->ctx.modulctrl;
-		chconf |= OMAP2_MCSPI_MODULCTRL_SINGLE;
-		mcspi_write_reg(ctlr, OMAP2_MCSPI_MODULCTRL, chconf);
-		mcspi->ctx.modulctrl =
-			mcspi_read_cs_reg(spi, OMAP2_MCSPI_MODULCTRL);
-	}
-
 	omap2_mcspi_set_enable(spi, 0);
 
 	if (spi_get_csgpiod(spi, 0))
diff --git a/include/linux/platform_data/spi-omap2-mcspi.h b/include/linux/platform_data/spi-omap2-mcspi.h
index 3b400b1919a9..9e3c15b4ac91 100644
--- a/include/linux/platform_data/spi-omap2-mcspi.h
+++ b/include/linux/platform_data/spi-omap2-mcspi.h
@@ -16,9 +16,6 @@ struct omap2_mcspi_platform_config {
 
 struct omap2_mcspi_device_config {
 	unsigned turbo_mode:1;
-
-	/* toggle chip select after every word */
-	unsigned cs_per_word:1;
 };
 
 #endif
-- 
cgit v1.2.3


From 368d67dab4cc4a3ffd39fbd062b2f5796cdbb37b Mon Sep 17 00:00:00 2001
From: Abel Vesa <abel.vesa@linaro.org>
Date: Sun, 24 Mar 2024 20:50:17 +0200
Subject: phy: Add Embedded DisplayPort and DisplayPort submodes

In some cases, a DP PHY needs to be configured to work in eDP mode.
So add submodes for both DP and eDP so they can be used by the
controllers for specifying the mode the PHY should be configured in.

Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://lore.kernel.org/r/20240324-x1e80100-phy-edp-compatible-refactor-v5-1-a0db5f3150bc@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/phy/phy-dp.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/phy/phy-dp.h b/include/linux/phy/phy-dp.h
index 18cad23642cd..9cce5766bc0b 100644
--- a/include/linux/phy/phy-dp.h
+++ b/include/linux/phy/phy-dp.h
@@ -8,6 +8,9 @@
 
 #include <linux/types.h>
 
+#define PHY_SUBMODE_DP	0
+#define PHY_SUBMODE_EDP	1
+
 /**
  * struct phy_configure_opts_dp - DisplayPort PHY configuration set
  *
-- 
cgit v1.2.3


From ca83f9d20171ce1ba5f82fb28b77eddabd636443 Mon Sep 17 00:00:00 2001
From: Francois Dugast <francois.dugast@intel.com>
Date: Thu, 28 Mar 2024 14:02:43 +0000
Subject: drm/xe/uapi: Define topology types as indexes rather than masks

The topology type is an index (not a mask) so define the values
like other indexes instead of using powers of 2. This is also
to make clear that the next type can use value 3. This commit
does not change the existing values so it does not break
compatibility.

Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Suggested-by: Matt Roper <matthew.d.roper@intel.com>
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Link: https://lore.kernel.org/intel-xe/20240327232317.GI718896@mdroper-desk1.amr.corp.intel.com/
Reviewed-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240328140243.7-1-francois.dugast@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 include/uapi/drm/xe_drm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 808ad1c308ec..95a8ecca21f4 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -518,9 +518,9 @@ struct drm_xe_query_topology_mask {
 	/** @gt_id: GT ID the mask is associated with */
 	__u16 gt_id;
 
-#define DRM_XE_TOPO_DSS_GEOMETRY	(1 << 0)
-#define DRM_XE_TOPO_DSS_COMPUTE		(1 << 1)
-#define DRM_XE_TOPO_EU_PER_DSS		(1 << 2)
+#define DRM_XE_TOPO_DSS_GEOMETRY	1
+#define DRM_XE_TOPO_DSS_COMPUTE		2
+#define DRM_XE_TOPO_EU_PER_DSS		4
 	/** @type: type of mask */
 	__u16 type;
 
-- 
cgit v1.2.3


From 6e9b01909a811555ff3326cf80a5847169c57806 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 26 Mar 2024 21:02:12 -0700
Subject: net: remove gfp_mask from napi_alloc_skb()

__napi_alloc_skb() is napi_alloc_skb() with the added flexibility
of choosing gfp_mask. This is a NAPI function, so GFP_ATOMIC is
implied. The only practical choice the caller has is whether to
set __GFP_NOWARN. But that's a false choice, too, allocation failures
in atomic context will happen, and printing warnings in logs,
effectively for a packet drop, is both too much and very likely
non-actionable.

This leads me to a conclusion that most uses of napi_alloc_skb()
are simply misguided, and should use __GFP_NOWARN in the first
place. We also have a "standard" way of reporting allocation
failures via the queue stat API (qstats::rx-alloc-fail).

The direct motivation for this patch is that one of the drivers
used at Meta calls napi_alloc_skb() (so prior to this patch without
__GFP_NOWARN), and the resulting OOM warning is the top networking
warning in our fleet.

Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240327040213.3153864-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/mm/page_frags.rst                    | 2 +-
 Documentation/translations/zh_CN/mm/page_frags.rst | 2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        | 4 +---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c         | 3 +--
 drivers/net/ethernet/intel/iavf/iavf_txrx.c        | 4 +---
 drivers/net/ethernet/intel/ice/ice_txrx.c          | 3 +--
 drivers/net/ethernet/intel/ice/ice_xsk.c           | 3 +--
 drivers/net/ethernet/intel/idpf/idpf_txrx.c        | 5 ++---
 drivers/net/ethernet/intel/igc/igc_main.c          | 3 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c       | 3 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 5 ++---
 include/linux/skbuff.h                             | 8 +-------
 net/core/skbuff.c                                  | 9 ++++-----
 13 files changed, 18 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/page_frags.rst b/Documentation/mm/page_frags.rst
index a81617e688a8..503ca6cdb804 100644
--- a/Documentation/mm/page_frags.rst
+++ b/Documentation/mm/page_frags.rst
@@ -25,7 +25,7 @@ to be disabled when executing the fragment allocation.
 The network stack uses two separate caches per CPU to handle fragment
 allocation.  The netdev_alloc_cache is used by callers making use of the
 netdev_alloc_frag and __netdev_alloc_skb calls.  The napi_alloc_cache is
-used by callers of the __napi_alloc_frag and __napi_alloc_skb calls.  The
+used by callers of the __napi_alloc_frag and napi_alloc_skb calls.  The
 main difference between these two calls is the context in which they may be
 called.  The "netdev" prefixed functions are usable in any context as these
 functions will disable interrupts, while the "napi" prefixed functions are
diff --git a/Documentation/translations/zh_CN/mm/page_frags.rst b/Documentation/translations/zh_CN/mm/page_frags.rst
index 20bd3fafdc8c..a5b22486a913 100644
--- a/Documentation/translations/zh_CN/mm/page_frags.rst
+++ b/Documentation/translations/zh_CN/mm/page_frags.rst
@@ -25,7 +25,7 @@ sk_buff->head使用，或者用于skb_shared_info的 “frags” 部分。
 
 网络堆栈在每个CPU使用两个独立的缓存来处理碎片分配。netdev_alloc_cache被使用
 netdev_alloc_frag和__netdev_alloc_skb调用的调用者使用。napi_alloc_cache
-被调用__napi_alloc_frag和__napi_alloc_skb的调用者使用。这两个调用的主要区别是
+被调用__napi_alloc_frag和napi_alloc_skb的调用者使用。这两个调用的主要区别是
 它们可能被调用的环境。“netdev” 前缀的函数可以在任何上下文中使用，因为这些函数
 将禁用中断，而 ”napi“ 前缀的函数只可以在softirq上下文中使用。
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 0d7177083708..ac2fcc5ac595 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2144,9 +2144,7 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
 	 */
 
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-			       I40E_RX_HDR_SIZE,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, I40E_RX_HDR_SIZE);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 11500003af0d..a85b425794df 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -301,8 +301,7 @@ static struct sk_buff *i40e_construct_skb_zc(struct i40e_ring *rx_ring,
 	net_prefetch(xdp->data_meta);
 
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
 	if (unlikely(!skb))
 		goto out;
 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index b71484c87a84..32bb604a1382 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -1334,9 +1334,7 @@ static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring,
 	net_prefetch(va);
 
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-			       IAVF_RX_HDR_SIZE,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, IAVF_RX_HDR_SIZE);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
index 97d41d6ebf1f..8bb743f78fcb 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -1051,8 +1051,7 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
 	}
 
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 1857220d27fe..aa81d1162b81 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -555,8 +555,7 @@ ice_construct_skb_zc(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
 	}
 	net_prefetch(xdp->data_meta);
 
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 6dd7a66bb897..f940f650cd78 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -3005,8 +3005,7 @@ struct sk_buff *idpf_rx_construct_skb(struct idpf_queue *rxq,
 	/* prefetch first cache line of first page */
 	net_prefetch(va);
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE,
-			       GFP_ATOMIC);
+	skb = napi_alloc_skb(&rxq->q_vector->napi, IDPF_RX_HDR_SIZE);
 	if (unlikely(!skb)) {
 		idpf_rx_put_page(rx_buf);
 
@@ -3060,7 +3059,7 @@ static struct sk_buff *idpf_rx_hdr_construct_skb(struct idpf_queue *rxq,
 	struct sk_buff *skb;
 
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rxq->q_vector->napi, size, GFP_ATOMIC);
+	skb = napi_alloc_skb(&rxq->q_vector->napi, size);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 35ad40a803cb..0cd923c8ac9b 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2712,8 +2712,7 @@ static struct sk_buff *igc_construct_skb_zc(struct igc_ring *ring,
 
 	net_prefetch(xdp->data_meta);
 
-	skb = __napi_alloc_skb(&ring->q_vector->napi, totalsize,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&ring->q_vector->napi, totalsize);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index d34d715c59eb..397cb773fabb 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -220,8 +220,7 @@ static struct sk_buff *ixgbe_construct_skb_zc(struct ixgbe_ring *rx_ring,
 	net_prefetch(xdp->data_meta);
 
 	/* allocate a skb to store the frags */
-	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&rx_ring->q_vector->napi, totalsize);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 24cd80490d19..bcdde68a099a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5109,9 +5109,8 @@ static struct sk_buff *stmmac_construct_skb_zc(struct stmmac_channel *ch,
 	unsigned int datasize = xdp->data_end - xdp->data;
 	struct sk_buff *skb;
 
-	skb = __napi_alloc_skb(&ch->rxtx_napi,
-			       xdp->data_end - xdp->data_hard_start,
-			       GFP_ATOMIC | __GFP_NOWARN);
+	skb = napi_alloc_skb(&ch->rxtx_napi,
+			     xdp->data_end - xdp->data_hard_start);
 	if (unlikely(!skb))
 		return NULL;
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 517e546a120a..b7f1ecdaec38 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3350,13 +3350,7 @@ static inline void *napi_alloc_frag_align(unsigned int fragsz,
 	return __napi_alloc_frag_align(fragsz, -align);
 }
 
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
-				 unsigned int length, gfp_t gfp_mask);
-static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi,
-					     unsigned int length)
-{
-	return __napi_alloc_skb(napi, length, GFP_ATOMIC);
-}
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length);
 void napi_consume_skb(struct sk_buff *skb, int budget);
 
 void napi_skb_free_stolen_head(struct sk_buff *skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 17617c29be2d..a1be84be5d35 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -775,10 +775,9 @@ skb_fail:
 EXPORT_SYMBOL(__netdev_alloc_skb);
 
 /**
- *	__napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ *	napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
  *	@napi: napi instance this buffer was allocated for
  *	@len: length to allocate
- *	@gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
  *
  *	Allocate a new sk_buff for use in NAPI receive.  This buffer will
  *	attempt to allocate the head from a special reserved region used
@@ -787,9 +786,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
-				 gfp_t gfp_mask)
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
 {
+	gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
 	struct napi_alloc_cache *nc;
 	struct sk_buff *skb;
 	bool pfmemalloc;
@@ -860,7 +859,7 @@ skb_success:
 skb_fail:
 	return skb;
 }
-EXPORT_SYMBOL(__napi_alloc_skb);
+EXPORT_SYMBOL(napi_alloc_skb);
 
 void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
 			    int off, int size, unsigned int truesize)
-- 
cgit v1.2.3


From 5311591fbb349fe9f5c555dcba3b13a5831aa72d Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Tue, 26 Mar 2024 10:17:40 +0000
Subject: bpf: Add support for passing mark with bpf_fib_lookup

Extend the bpf_fib_lookup() helper by making it to utilize mark if
the BPF_FIB_LOOKUP_MARK flag is set. In order to pass the mark the
four bytes of struct bpf_fib_lookup are used, shared with the
output-only smac/dmac fields.

Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: David Ahern <dsahern@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240326101742.17421-2-aspsk@isovalent.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h       | 20 ++++++++++++++++++--
 net/core/filter.c              | 12 +++++++++---
 tools/include/uapi/linux/bpf.h | 20 ++++++++++++++++++--
 3 files changed, 45 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9585f5345353..96d57e483133 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3394,6 +3394,10 @@ union bpf_attr {
  *			for the nexthop. If the src addr cannot be derived,
  *			**BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
  *			case, *params*->dmac and *params*->smac are not set either.
+ *		**BPF_FIB_LOOKUP_MARK**
+ *			Use the mark present in *params*->mark for the fib lookup.
+ *			This option should not be used with BPF_FIB_LOOKUP_DIRECT,
+ *			as it only has meaning for full lookups.
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -7120,6 +7124,7 @@ enum {
 	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 	BPF_FIB_LOOKUP_TBID    = (1U << 3),
 	BPF_FIB_LOOKUP_SRC     = (1U << 4),
+	BPF_FIB_LOOKUP_MARK    = (1U << 5),
 };
 
 enum {
@@ -7197,8 +7202,19 @@ struct bpf_fib_lookup {
 		__u32	tbid;
 	};
 
-	__u8	smac[6];     /* ETH_ALEN */
-	__u8	dmac[6];     /* ETH_ALEN */
+	union {
+		/* input */
+		struct {
+			__u32	mark;   /* policy routing */
+			/* 2 4-byte holes for input */
+		};
+
+		/* output: source and dest mac */
+		struct {
+			__u8	smac[6];	/* ETH_ALEN */
+			__u8	dmac[6];	/* ETH_ALEN */
+		};
+	};
 };
 
 struct bpf_redir_neigh {
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c66e4a3fc5b..1205dd777dc2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5884,7 +5884,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 
 		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
 	} else {
-		fl4.flowi4_mark = 0;
+		if (flags & BPF_FIB_LOOKUP_MARK)
+			fl4.flowi4_mark = params->mark;
+		else
+			fl4.flowi4_mark = 0;
 		fl4.flowi4_secid = 0;
 		fl4.flowi4_tun_key.tun_id = 0;
 		fl4.flowi4_uid = sock_net_uid(net, NULL);
@@ -6027,7 +6030,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
 		err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
 						   strict);
 	} else {
-		fl6.flowi6_mark = 0;
+		if (flags & BPF_FIB_LOOKUP_MARK)
+			fl6.flowi6_mark = params->mark;
+		else
+			fl6.flowi6_mark = 0;
 		fl6.flowi6_secid = 0;
 		fl6.flowi6_tun_key.tun_id = 0;
 		fl6.flowi6_uid = sock_net_uid(net, NULL);
@@ -6105,7 +6111,7 @@ set_fwd_params:
 
 #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
 			     BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
-			     BPF_FIB_LOOKUP_SRC)
+			     BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)
 
 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
 	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 9585f5345353..96d57e483133 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3394,6 +3394,10 @@ union bpf_attr {
  *			for the nexthop. If the src addr cannot be derived,
  *			**BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this
  *			case, *params*->dmac and *params*->smac are not set either.
+ *		**BPF_FIB_LOOKUP_MARK**
+ *			Use the mark present in *params*->mark for the fib lookup.
+ *			This option should not be used with BPF_FIB_LOOKUP_DIRECT,
+ *			as it only has meaning for full lookups.
  *
  *		*ctx* is either **struct xdp_md** for XDP programs or
  *		**struct sk_buff** tc cls_act programs.
@@ -7120,6 +7124,7 @@ enum {
 	BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2),
 	BPF_FIB_LOOKUP_TBID    = (1U << 3),
 	BPF_FIB_LOOKUP_SRC     = (1U << 4),
+	BPF_FIB_LOOKUP_MARK    = (1U << 5),
 };
 
 enum {
@@ -7197,8 +7202,19 @@ struct bpf_fib_lookup {
 		__u32	tbid;
 	};
 
-	__u8	smac[6];     /* ETH_ALEN */
-	__u8	dmac[6];     /* ETH_ALEN */
+	union {
+		/* input */
+		struct {
+			__u32	mark;   /* policy routing */
+			/* 2 4-byte holes for input */
+		};
+
+		/* output: source and dest mac */
+		struct {
+			__u8	smac[6];	/* ETH_ALEN */
+			__u8	dmac[6];	/* ETH_ALEN */
+		};
+	};
 };
 
 struct bpf_redir_neigh {
-- 
cgit v1.2.3


From 3124591f686115aca25d772c2ccb7b1e202c3197 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Tue, 26 Mar 2024 09:21:50 -0700
Subject: bpf: add bpf_modify_return_test_tp() kfunc triggering tracepoint

Add a simple bpf_modify_return_test_tp() kfunc, available to all program
types, that is useful for various testing and benchmarking scenarios, as
it allows to trigger most tracing BPF program types from BPF side,
allowing to do complex testing and benchmarking scenarios.

It is also attachable to for fmod_ret programs, making it a good and
simple way to trigger fmod_ret program under test/benchmark.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/r/20240326162151.3981687-6-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/trace/events/bpf_test_run.h | 17 +++++++++++++++++
 kernel/bpf/helpers.c                |  1 +
 net/bpf/test_run.c                  |  8 ++++++++
 3 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h
index 265447e3f71a..0c924d39b7cb 100644
--- a/include/trace/events/bpf_test_run.h
+++ b/include/trace/events/bpf_test_run.h
@@ -7,6 +7,23 @@
 
 #include <linux/tracepoint.h>
 
+TRACE_EVENT(bpf_trigger_tp,
+
+	TP_PROTO(int nonce),
+
+	TP_ARGS(nonce),
+
+	TP_STRUCT__entry(
+		__field(int, nonce)
+	),
+
+	TP_fast_assign(
+		__entry->nonce = nonce;
+	),
+
+	TP_printk("nonce %d", __entry->nonce)
+);
+
 DECLARE_EVENT_CLASS(bpf_test_finish,
 
 	TP_PROTO(int *err),
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f860adab3eb9..d9e7aca8ae9e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2625,6 +2625,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
 BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
 BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 61efeadaff8d..f6aad4ed2ab2 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -575,6 +575,13 @@ __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d,
 	return a + *b + c + d + (long)e + f + g;
 }
 
+__bpf_kfunc int bpf_modify_return_test_tp(int nonce)
+{
+	trace_bpf_trigger_tp(nonce);
+
+	return nonce;
+}
+
 int noinline bpf_fentry_shadow_test(int a)
 {
 	return a + 1;
@@ -622,6 +629,7 @@ __bpf_kfunc_end_defs();
 BTF_KFUNCS_START(bpf_test_modify_return_ids)
 BTF_ID_FLAGS(func, bpf_modify_return_test)
 BTF_ID_FLAGS(func, bpf_modify_return_test2)
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
 BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE)
 BTF_KFUNCS_END(bpf_test_modify_return_ids)
 
-- 
cgit v1.2.3


From ca7e324e8ad385a2da15049953c04ea7310687f7 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 15:22:39 +0100
Subject: compiler_types: add Endianness-dependent __counted_by_{le,be}

Some structures contain flexible arrays at the end and the counter for
them, but the counter has explicit Endianness and thus __counted_by()
can't be used directly.

To increase test coverage for potential problems without breaking
anything, introduce __counted_by_{le,be}() defined depending on
platform's Endianness to either __counted_by() when applicable or noop
otherwise.
Maybe it would be a good idea to introduce such attributes on compiler
level if possible, but for now let's stop on what we have.

Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Link: https://lore.kernel.org/r/20240327142241.1745989-2-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/conf.py          |  2 ++
 include/linux/compiler_types.h | 11 +++++++++++
 scripts/kernel-doc             |  1 +
 3 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/Documentation/conf.py b/Documentation/conf.py
index d148f3e8dd57..0c2205d536b3 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -75,6 +75,8 @@ if major >= 3:
             "__rcu",
             "__user",
             "__force",
+            "__counted_by_le",
+            "__counted_by_be",
 
             # include/linux/compiler_attributes.h:
             "__alias",
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 2abaa3a825a9..a29ba6ef1e27 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -282,6 +282,17 @@ struct ftrace_likely_data {
 #define __no_sanitize_or_inline __always_inline
 #endif
 
+/*
+ * Apply __counted_by() when the Endianness matches to increase test coverage.
+ */
+#ifdef __LITTLE_ENDIAN
+#define __counted_by_le(member)	__counted_by(member)
+#define __counted_by_be(member)
+#else
+#define __counted_by_le(member)
+#define __counted_by_be(member)	__counted_by(member)
+#endif
+
 /* Do not trap wrapping arithmetic within an annotated function. */
 #ifdef CONFIG_UBSAN_SIGNED_WRAP
 # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow")))
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 967f1abb0edb..1474e95dbe4f 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1143,6 +1143,7 @@ sub dump_struct($$) {
         $members =~ s/\s*$attribute/ /gi;
         $members =~ s/\s*__aligned\s*\([^;]*\)/ /gos;
         $members =~ s/\s*__counted_by\s*\([^;]*\)/ /gos;
+        $members =~ s/\s*__counted_by_(le|be)\s*\([^;]*\)/ /gos;
         $members =~ s/\s*__packed\s*/ /gos;
         $members =~ s/\s*CRYPTO_MINALIGN_ATTR/ /gos;
         $members =~ s/\s*____cacheline_aligned_in_smp/ /gos;
-- 
cgit v1.2.3


From 21d9ba5bc5516e1d8a90c4a26de90855a1e4fb59 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Tue, 26 Mar 2024 14:32:07 +0100
Subject: net: phylink: add PHY_F_RXC_ALWAYS_ON to PHY dev flags

Some MAC controllers (e.g. stmmac) require their connected PHY to
continuously provide a receive clock signal. This can cause issues in two
cases:

  1. The clock signal hasn't been started yet by the time the MAC driver
     initializes its hardware. This can make the initialization fail, as in
      the case of the rzn1 GMAC1 driver.
  2. The clock signal is cut during a power saving event. By the time the
     MAC is brought back up, the clock signal is still not active since
     phylink_start hasn't been called yet. This brings us back to case 1.

If a PHY driver reads this flag, it should ensure that the receive clock
signal is started as soon as possible, and that it isn't brought down when
the PHY goes into suspend.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
[rgantois: commit log]
Signed-off-by: Romain Gantois <romain.gantois@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-1-24a74e5c761f@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 10 +++++++++-
 include/linux/phy.h       |  1 +
 include/linux/phylink.h   |  4 ++++
 3 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 503fd7c40523..2bb583543dea 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1923,6 +1923,8 @@ static int phylink_bringup_phy(struct phylink *pl, struct phy_device *phy,
 static int phylink_attach_phy(struct phylink *pl, struct phy_device *phy,
 			      phy_interface_t interface)
 {
+	u32 flags = 0;
+
 	if (WARN_ON(pl->cfg_link_an_mode == MLO_AN_FIXED ||
 		    (pl->cfg_link_an_mode == MLO_AN_INBAND &&
 		     phy_interface_mode_is_8023z(interface) && !pl->sfp_bus)))
@@ -1931,7 +1933,10 @@ static int phylink_attach_phy(struct phylink *pl, struct phy_device *phy,
 	if (pl->phydev)
 		return -EBUSY;
 
-	return phy_attach_direct(pl->netdev, phy, 0, interface);
+	if (pl->config->mac_requires_rxc)
+		flags |= PHY_F_RXC_ALWAYS_ON;
+
+	return phy_attach_direct(pl->netdev, phy, flags, interface);
 }
 
 /**
@@ -2034,6 +2039,9 @@ int phylink_fwnode_phy_connect(struct phylink *pl,
 		pl->link_config.interface = pl->link_interface;
 	}
 
+	if (pl->config->mac_requires_rxc)
+		flags |= PHY_F_RXC_ALWAYS_ON;
+
 	ret = phy_attach_direct(pl->netdev, phy_dev, flags,
 				pl->link_interface);
 	phy_device_free(phy_dev);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3f68b8239bb1..e6e83304558e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -778,6 +778,7 @@ struct phy_device {
 
 /* Generic phy_device::dev_flags */
 #define PHY_F_NO_IRQ		0x80000000
+#define PHY_F_RXC_ALWAYS_ON	0x40000000
 
 static inline struct phy_device *to_phy_device(const struct device *dev)
 {
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index 9a57deefcb07..fef8ae66b988 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -138,6 +138,9 @@ enum phylink_op_type {
  * @poll_fixed_state: if true, starts link_poll,
  *		      if MAC link is at %MLO_AN_FIXED mode.
  * @mac_managed_pm: if true, indicate the MAC driver is responsible for PHY PM.
+ * @mac_requires_rxc: if true, the MAC always requires a receive clock from PHY.
+ *                    The PHY driver should start the clock signal as soon as
+ *                    possible and avoid stopping it during suspend events.
  * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND
  * @get_fixed_state: callback to execute to determine the fixed link state,
  *		     if MAC link is at %MLO_AN_FIXED mode.
@@ -150,6 +153,7 @@ struct phylink_config {
 	enum phylink_op_type type;
 	bool poll_fixed_state;
 	bool mac_managed_pm;
+	bool mac_requires_rxc;
 	bool ovr_an_inband;
 	void (*get_fixed_state)(struct phylink_config *config,
 				struct phylink_link_state *state);
-- 
cgit v1.2.3


From dceb393a0a8e82ff05b7c1f6c911d0779d856a7a Mon Sep 17 00:00:00 2001
From: Romain Gantois <romain.gantois@bootlin.com>
Date: Tue, 26 Mar 2024 14:32:08 +0100
Subject: net: phylink: add rxc_always_on flag to phylink_pcs

Some MAC drivers (e.g. stmmac) require a continuous receive clock signal to
be generated by a PCS that is handled by a standalone PCS driver.

Such a PCS driver does not have access to a PHY device, thus cannot check
the PHY_F_RXC_ALWAYS_ON flag. They cannot check max_requires_rxc in the
phylink config either, since it is a private member. Therefore, a new flag
is needed to signal to the PCS that it should keep the RX clock signal up
at all times.

Co-developed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Romain Gantois <romain.gantois@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240326-rxc_bugfix-v6-2-24a74e5c761f@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/phylink.c | 15 +++++++++++++++
 include/linux/phylink.h   | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 2bb583543dea..84a97088dfc6 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -1042,6 +1042,21 @@ static void phylink_pcs_poll_start(struct phylink *pl)
 		mod_timer(&pl->link_poll, jiffies + HZ);
 }
 
+int phylink_pcs_pre_init(struct phylink *pl, struct phylink_pcs *pcs)
+{
+	int ret = 0;
+
+	/* Signal to PCS driver that MAC requires RX clock for init */
+	if (pl->config->mac_requires_rxc)
+		pcs->rxc_always_on = true;
+
+	if (pcs->ops->pcs_pre_init)
+		ret = pcs->ops->pcs_pre_init(pcs);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phylink_pcs_pre_init);
+
 static void phylink_mac_config(struct phylink *pl,
 			       const struct phylink_link_state *state)
 {
diff --git a/include/linux/phylink.h b/include/linux/phylink.h
index fef8ae66b988..5ea6b2ad2396 100644
--- a/include/linux/phylink.h
+++ b/include/linux/phylink.h
@@ -396,6 +396,10 @@ struct phylink_pcs_ops;
  * @phylink: pointer to &struct phylink_config
  * @neg_mode: provide PCS neg mode via "mode" argument
  * @poll: poll the PCS for link changes
+ * @rxc_always_on: The MAC driver requires the reference clock
+ *                 to always be on. Standalone PCS drivers which
+ *                 do not have access to a PHY device can check
+ *                 this instead of PHY_F_RXC_ALWAYS_ON.
  *
  * This structure is designed to be embedded within the PCS private data,
  * and will be passed between phylink and the PCS.
@@ -408,6 +412,7 @@ struct phylink_pcs {
 	struct phylink *phylink;
 	bool neg_mode;
 	bool poll;
+	bool rxc_always_on;
 };
 
 /**
@@ -422,6 +427,8 @@ struct phylink_pcs {
  * @pcs_an_restart: restart 802.3z BaseX autonegotiation.
  * @pcs_link_up: program the PCS for the resolved link configuration
  *               (where necessary).
+ * @pcs_pre_init: configure PCS components necessary for MAC hardware
+ *                initialization e.g. RX clock for stmmac.
  */
 struct phylink_pcs_ops {
 	int (*pcs_validate)(struct phylink_pcs *pcs, unsigned long *supported,
@@ -441,6 +448,7 @@ struct phylink_pcs_ops {
 	void (*pcs_an_restart)(struct phylink_pcs *pcs);
 	void (*pcs_link_up)(struct phylink_pcs *pcs, unsigned int neg_mode,
 			    phy_interface_t interface, int speed, int duplex);
+	int (*pcs_pre_init)(struct phylink_pcs *pcs);
 };
 
 #if 0 /* For kernel-doc purposes only. */
@@ -546,6 +554,34 @@ void pcs_an_restart(struct phylink_pcs *pcs);
  */
 void pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
 		 phy_interface_t interface, int speed, int duplex);
+
+/**
+ * pcs_pre_init() - Configure PCS components necessary for MAC initialization
+ * @pcs: a pointer to a &struct phylink_pcs.
+ *
+ * This function can be called by MAC drivers through the
+ * phylink_pcs_pre_init() wrapper, before their hardware is initialized. It
+ * should not be called after the link is brought up, as reconfiguring the PCS
+ * at this point could break the link.
+ *
+ * Some MAC devices require specific hardware initialization to be performed by
+ * their associated PCS device before they can properly initialize their own
+ * hardware. An example of this is the initialization of stmmac controllers,
+ * which requires an active REF_CLK signal to be provided by the PHY/PCS.
+ *
+ * By calling phylink_pcs_pre_init(), MAC drivers can ensure that the PCS is
+ * setup in a way that allows for successful hardware initialization.
+ *
+ * The specific configuration performed by pcs_pre_init() is dependent on the
+ * model of PCS and the requirements of the MAC device attached to it. PCS
+ * driver authors should consider whether their target device is to be used in
+ * conjunction with a MAC device whose driver calls phylink_pcs_pre_init(). MAC
+ * driver authors should document their requirements for the PCS
+ * pre-initialization.
+ *
+ */
+int pcs_pre_init(struct phylink_pcs *pcs);
+
 #endif
 
 struct phylink *phylink_create(struct phylink_config *,
@@ -565,6 +601,8 @@ void phylink_disconnect_phy(struct phylink *);
 void phylink_mac_change(struct phylink *, bool up);
 void phylink_pcs_change(struct phylink_pcs *, bool up);
 
+int phylink_pcs_pre_init(struct phylink *pl, struct phylink_pcs *pcs);
+
 void phylink_start(struct phylink *);
 void phylink_stop(struct phylink *);
 
-- 
cgit v1.2.3


From f631ef39d81956a2ee69d25039781ceae1162f62 Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Mon, 25 Mar 2024 20:47:34 +0000
Subject: net: sched: cls_api: add skip_sw counter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Maintain a count of skip_sw filters.

This counter is protected by the cb_lock, and is updated
at the same time as offloadcnt.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h | 1 +
 net/sched/cls_api.c       | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cefe0c4bdae3..120a4ca6ec9b 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -471,6 +471,7 @@ struct tcf_block {
 	struct flow_block flow_block;
 	struct list_head owner_list;
 	bool keep_dst;
+	atomic_t skipswcnt; /* Number of skip_sw filters */
 	atomic_t offloadcnt; /* Number of oddloaded filters */
 	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
 	unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index ca5676b2668e..397c3d29659c 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -3483,6 +3483,8 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags)
 	if (*flags & TCA_CLS_FLAGS_IN_HW)
 		return;
 	*flags |= TCA_CLS_FLAGS_IN_HW;
+	if (tc_skip_sw(*flags))
+		atomic_inc(&block->skipswcnt);
 	atomic_inc(&block->offloadcnt);
 }
 
@@ -3491,6 +3493,8 @@ static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags)
 	if (!(*flags & TCA_CLS_FLAGS_IN_HW))
 		return;
 	*flags &= ~TCA_CLS_FLAGS_IN_HW;
+	if (tc_skip_sw(*flags))
+		atomic_dec(&block->skipswcnt);
 	atomic_dec(&block->offloadcnt);
 }
 
-- 
cgit v1.2.3


From 2081fd3445fec6b9813c20e8b910c2abd6de31cb Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Mon, 25 Mar 2024 20:47:35 +0000
Subject: net: sched: cls_api: add filter counter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Maintain a count of filters per block.

Counter updates are protected by cb_lock, which is
also used to protect the offload counters.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sch_generic.h |  2 ++
 net/sched/cls_api.c       | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 120a4ca6ec9b..eb3872c22fcd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -422,6 +422,7 @@ struct tcf_proto {
 	 */
 	spinlock_t		lock;
 	bool			deleting;
+	bool			counted;
 	refcount_t		refcnt;
 	struct rcu_head		rcu;
 	struct hlist_node	destroy_ht_node;
@@ -471,6 +472,7 @@ struct tcf_block {
 	struct flow_block flow_block;
 	struct list_head owner_list;
 	bool keep_dst;
+	atomic_t filtercnt; /* Number of filters */
 	atomic_t skipswcnt; /* Number of skip_sw filters */
 	atomic_t offloadcnt; /* Number of oddloaded filters */
 	unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 397c3d29659c..304a46ab0e0b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -410,12 +410,30 @@ static void tcf_proto_get(struct tcf_proto *tp)
 	refcount_inc(&tp->refcnt);
 }
 
+static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add)
+{
+	lockdep_assert_not_held(&block->cb_lock);
+
+	down_write(&block->cb_lock);
+	if (*counted != add) {
+		if (add) {
+			atomic_inc(&block->filtercnt);
+			*counted = true;
+		} else {
+			atomic_dec(&block->filtercnt);
+			*counted = false;
+		}
+	}
+	up_write(&block->cb_lock);
+}
+
 static void tcf_chain_put(struct tcf_chain *chain);
 
 static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held,
 			      bool sig_destroy, struct netlink_ext_ack *extack)
 {
 	tp->ops->destroy(tp, rtnl_held, extack);
+	tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false);
 	if (sig_destroy)
 		tcf_proto_signal_destroyed(tp->chain, tp);
 	tcf_chain_put(tp->chain);
@@ -2364,6 +2382,7 @@ replay:
 	err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
 			      flags, extack);
 	if (err == 0) {
+		tcf_block_filter_cnt_update(block, &tp->counted, true);
 		tfilter_notify(net, skb, n, tp, block, q, parent, fh,
 			       RTM_NEWTFILTER, false, rtnl_held, extack);
 		tfilter_put(tp, fh);
-- 
cgit v1.2.3


From 047f340b36fc550c0fc6a8947fc0a1f8e429e9ab Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Mon, 25 Mar 2024 20:47:36 +0000
Subject: net: sched: make skip_sw actually skip software
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TC filters come in 3 variants:
- no flag (try to process in hardware, but fallback to software))
- skip_hw (do not process filter by hardware)
- skip_sw (do not process filter by software)

However skip_sw is implemented so that the skip_sw
flag can first be checked, after it has been matched.

IMHO it's common when using skip_sw, to use it on all rules.

So if all filters in a block is skip_sw filters, then
we can bail early, we can thus avoid having to match
the filters, just to check for the skip_sw flag.

This patch adds a bypass, for when only TC skip_sw rules
are used. The bypass is guarded by a static key, to avoid
harming other workloads.

There are 3 ways that a packet from a skip_sw ruleset, can
end up in the kernel path. Although the send packets to a
non-existent chain way is only improved a few percents, then
I believe it's worth optimizing the trap and fall-though
use-cases.

 +----------------------------+--------+--------+--------+
 | Test description           | Pre-   | Post-  | Rel.   |
 |                            | kpps   | kpps   | chg.   |
 +----------------------------+--------+--------+--------+
 | basic forwarding + notrack | 3589.3 | 3587.9 |  1.00x |
 | switch to eswitch mode     | 3081.8 | 3094.7 |  1.00x |
 | add ingress qdisc          | 3042.9 | 3063.6 |  1.01x |
 | tc forward in hw / skip_sw |37024.7 |37028.4 |  1.00x |
 | tc forward in sw / skip_hw | 3245.0 | 3245.3 |  1.00x |
 +----------------------------+--------+--------+--------+
 | tests with only skip_sw rules below:                  |
 +----------------------------+--------+--------+--------+
 | 1 non-matching rule        | 2694.7 | 3058.7 |  1.14x |
 | 1 n-m rule, match trap     | 2611.2 | 3323.1 |  1.27x |
 | 1 n-m rule, goto non-chain | 2886.8 | 2945.9 |  1.02x |
 | 5 non-matching rules       | 1958.2 | 3061.3 |  1.56x |
 | 5 n-m rules, match trap    | 1911.9 | 3327.0 |  1.74x |
 | 5 n-m rules, goto non-chain| 2883.1 | 2947.5 |  1.02x |
 | 10 non-matching rules      | 1466.3 | 3062.8 |  2.09x |
 | 10 n-m rules, match trap   | 1444.3 | 3317.9 |  2.30x |
 | 10 n-m rules,goto non-chain| 2883.1 | 2939.5 |  1.02x |
 | 25 non-matching rules      |  838.5 | 3058.9 |  3.65x |
 | 25 n-m rules, match trap   |  824.5 | 3323.0 |  4.03x |
 | 25 n-m rules,goto non-chain| 2875.8 | 2944.7 |  1.02x |
 | 50 non-matching rules      |  488.1 | 3054.7 |  6.26x |
 | 50 n-m rules, match trap   |  484.9 | 3318.5 |  6.84x |
 | 50 n-m rules,goto non-chain| 2884.1 | 2939.7 |  1.02x |
 +----------------------------+--------+--------+--------+

perf top (25 n-m skip_sw rules - pre patch):
  20.39%  [kernel]  [k] __skb_flow_dissect
  16.43%  [kernel]  [k] rhashtable_jhash2
  10.58%  [kernel]  [k] fl_classify
  10.23%  [kernel]  [k] fl_mask_lookup
   4.79%  [kernel]  [k] memset_orig
   2.58%  [kernel]  [k] tcf_classify
   1.47%  [kernel]  [k] __x86_indirect_thunk_rax
   1.42%  [kernel]  [k] __dev_queue_xmit
   1.36%  [kernel]  [k] nft_do_chain
   1.21%  [kernel]  [k] __rcu_read_lock

perf top (25 n-m skip_sw rules - post patch):
   5.12%  [kernel]  [k] __dev_queue_xmit
   4.77%  [kernel]  [k] nft_do_chain
   3.65%  [kernel]  [k] dev_gro_receive
   3.41%  [kernel]  [k] check_preemption_disabled
   3.14%  [kernel]  [k] mlx5e_skb_from_cqe_mpwrq_nonlinear
   2.88%  [kernel]  [k] __netif_receive_skb_core.constprop.0
   2.49%  [kernel]  [k] mlx5e_xmit
   2.15%  [kernel]  [k] ip_forward
   1.95%  [kernel]  [k] mlx5e_tc_restore_tunnel
   1.92%  [kernel]  [k] vlan_gro_receive

Test setup:
 DUT: Intel Xeon D-1518 (2.20GHz) w/ Nvidia/Mellanox ConnectX-6 Dx 2x100G
 Data rate measured on switch (Extreme X690), and DUT connected as
 a router on a stick, with pktgen and pktsink as VLANs.
 Pktgen-dpdk was in range 36.6-37.7 Mpps 64B packets across all tests.
 Full test data at https://files.fiberby.net/ast/2024/tc_skip_sw/v2_tests/

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h     |  9 +++++++++
 include/net/sch_generic.h |  1 +
 net/core/dev.c            | 10 ++++++++++
 net/sched/cls_api.c       | 18 ++++++++++++++++++
 4 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index a4ee43f493bb..41297bd38dff 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -74,6 +74,15 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block)
 	return block && block->index;
 }
 
+#ifdef CONFIG_NET_CLS_ACT
+DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
+
+static inline bool tcf_block_bypass_sw(struct tcf_block *block)
+{
+	return block && block->bypass_wanted;
+}
+#endif
+
 static inline struct Qdisc *tcf_block_q(struct tcf_block *block)
 {
 	WARN_ON(tcf_block_shared(block));
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index eb3872c22fcd..76db6be16083 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -472,6 +472,7 @@ struct tcf_block {
 	struct flow_block flow_block;
 	struct list_head owner_list;
 	bool keep_dst;
+	bool bypass_wanted;
 	atomic_t filtercnt; /* Number of filters */
 	atomic_t skipswcnt; /* Number of skip_sw filters */
 	atomic_t offloadcnt; /* Number of oddloaded filters */
diff --git a/net/core/dev.c b/net/core/dev.c
index 5d36a634f468..c136e80dea61 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2083,6 +2083,11 @@ void net_dec_egress_queue(void)
 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
 
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
+EXPORT_SYMBOL(tcf_bypass_check_needed_key);
+#endif
+
 DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
 EXPORT_SYMBOL(netstamp_needed_key);
 #ifdef CONFIG_JUMP_LABEL
@@ -3937,6 +3942,11 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
 	if (!miniq)
 		return ret;
 
+	if (static_branch_unlikely(&tcf_bypass_check_needed_key)) {
+		if (tcf_block_bypass_sw(miniq->block))
+			return ret;
+	}
+
 	tc_skb_cb(skb)->mru = 0;
 	tc_skb_cb(skb)->post_ct = false;
 	tcf_set_drop_reason(skb, *drop_reason);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 304a46ab0e0b..db0653993632 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -410,6 +410,23 @@ static void tcf_proto_get(struct tcf_proto *tp)
 	refcount_inc(&tp->refcnt);
 }
 
+static void tcf_maintain_bypass(struct tcf_block *block)
+{
+	int filtercnt = atomic_read(&block->filtercnt);
+	int skipswcnt = atomic_read(&block->skipswcnt);
+	bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt;
+
+	if (bypass_wanted != block->bypass_wanted) {
+#ifdef CONFIG_NET_CLS_ACT
+		if (bypass_wanted)
+			static_branch_inc(&tcf_bypass_check_needed_key);
+		else
+			static_branch_dec(&tcf_bypass_check_needed_key);
+#endif
+		block->bypass_wanted = bypass_wanted;
+	}
+}
+
 static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add)
 {
 	lockdep_assert_not_held(&block->cb_lock);
@@ -424,6 +441,7 @@ static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted,
 			*counted = false;
 		}
 	}
+	tcf_maintain_bypass(block);
 	up_write(&block->cb_lock);
 }
 
-- 
cgit v1.2.3


From af398bd0cb2151a581027a19bd995f72af2310ac Mon Sep 17 00:00:00 2001
From: Zhengchao Shao <shaozhengchao@huawei.com>
Date: Tue, 26 Mar 2024 15:29:52 +0800
Subject: net/smc: make smc_hash_sk/smc_unhash_sk static

smc_hash_sk and smc_unhash_sk are only used in af_smc.c, so make them
static and remove the output symbol. They can be called under the path
.prot->hash()/unhash().

Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
Reviewed-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/smc.h | 3 ---
 net/smc/af_smc.c  | 6 ++----
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index c9dcb30e3fd9..10684d0a33df 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -26,9 +26,6 @@ struct smc_hashinfo {
 	struct hlist_head ht;
 };
 
-int smc_hash_sk(struct sock *sk);
-void smc_unhash_sk(struct sock *sk);
-
 /* SMCD/ISM device driver interface */
 struct smcd_dmb {
 	u64 dmb_tok;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 4b52b3b159c0..e8dcd28a554c 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -177,7 +177,7 @@ static struct smc_hashinfo smc_v6_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
 };
 
-int smc_hash_sk(struct sock *sk)
+static int smc_hash_sk(struct sock *sk)
 {
 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
 	struct hlist_head *head;
@@ -191,9 +191,8 @@ int smc_hash_sk(struct sock *sk)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(smc_hash_sk);
 
-void smc_unhash_sk(struct sock *sk)
+static void smc_unhash_sk(struct sock *sk)
 {
 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
 
@@ -202,7 +201,6 @@ void smc_unhash_sk(struct sock *sk)
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	write_unlock_bh(&h->lock);
 }
-EXPORT_SYMBOL_GPL(smc_unhash_sk);
 
 /* This will be called before user really release sock_lock. So do the
  * work which we didn't do because of user hold the sock_lock in the
-- 
cgit v1.2.3


From ba5206881843e16b74a07c37970dcc44d22f8f6f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Thu, 28 Mar 2024 15:33:40 -0700
Subject: spi: spi.h: add missing kernel-doc for @last_cs_index_mask

kernel-doc complains about last_cs_index_mask not described, so add its
description.

spi.h:778: warning: Function parameter or struct member 'last_cs_index_mask' not described in 'spi_controller'

Fixes: 4d8ff6b0991d ("spi: Add multi-cs memories support in SPI core")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Link: https://lore.kernel.org/r/20240328223340.17159-1-rdunlap@infradead.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/spi/spi.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index b589e2547439..e8e1e798924f 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -453,6 +453,7 @@ extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 ch
  * @last_cs_mode_high: was (mode & SPI_CS_HIGH) true on the last call to set_cs.
  * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip
  *           selected
+ * @last_cs_index_mask: bit mask the last chip selects that were used
  * @xfer_completion: used by core transfer_one_message()
  * @busy: message pump is busy
  * @running: message pump is running
-- 
cgit v1.2.3


From 1fbfdfaa590248c1d86407f578e40e5c65136330 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:11 -0700
Subject: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd.

We will replace the garbage collection algorithm for AF_UNIX, where
we will consider each inflight AF_UNIX socket as a vertex and its file
descriptor as an edge in a directed graph.

This patch introduces a new struct unix_vertex representing a vertex
in the graph and adds its pointer to struct unix_sock.

When we send a fd using the SCM_RIGHTS message, we allocate struct
scm_fp_list to struct scm_cookie in scm_fp_copy().  Then, we bump
each refcount of the inflight fds' struct file and save them in
scm_fp_list.fp.

After that, unix_attach_fds() inexplicably clones scm_fp_list of
scm_cookie and sets it to skb.  (We will remove this part after
replacing GC.)

Here, we add a new function call in unix_attach_fds() to preallocate
struct unix_vertex per inflight AF_UNIX fd and link each vertex to
skb's scm_fp_list.vertices.

When sendmsg() succeeds later, if the socket of the inflight fd is
still not inflight yet, we will set the preallocated vertex to struct
unix_sock.vertex and link it to a global list unix_unvisited_vertices
under spin_lock(&unix_gc_lock).

If the socket is already inflight, we free the preallocated vertex.
This is to avoid taking the lock unnecessarily when sendmsg() could
fail later.

In the following patch, we will similarly allocate another struct
per edge, which will finally be linked to the inflight socket's
unix_vertex.edges.

And then, we will count the number of edges as unix_vertex.out_degree.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  9 +++++++++
 include/net/scm.h     |  3 +++
 net/core/scm.c        |  7 +++++++
 net/unix/af_unix.c    |  6 ++++++
 net/unix/garbage.c    | 38 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 627ea8e2d915..c270877a5256 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -22,9 +22,17 @@ extern unsigned int unix_tot_inflight;
 
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
+int unix_prepare_fpl(struct scm_fp_list *fpl);
+void unix_destroy_fpl(struct scm_fp_list *fpl);
 void unix_gc(void);
 void wait_for_unix_gc(struct scm_fp_list *fpl);
 
+struct unix_vertex {
+	struct list_head edges;
+	struct list_head entry;
+	unsigned long out_degree;
+};
+
 struct sock *unix_peer_get(struct sock *sk);
 
 #define UNIX_HASH_MOD	(256 - 1)
@@ -62,6 +70,7 @@ struct unix_sock {
 	struct path		path;
 	struct mutex		iolock, bindlock;
 	struct sock		*peer;
+	struct unix_vertex	*vertex;
 	struct list_head	link;
 	unsigned long		inflight;
 	spinlock_t		lock;
diff --git a/include/net/scm.h b/include/net/scm.h
index 92276a2c5543..e34321b6e204 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -27,6 +27,9 @@ struct scm_fp_list {
 	short			count;
 	short			count_unix;
 	short			max;
+#ifdef CONFIG_UNIX
+	struct list_head	vertices;
+#endif
 	struct user_struct	*user;
 	struct file		*fp[SCM_MAX_FD];
 };
diff --git a/net/core/scm.c b/net/core/scm.c
index 9cd4b0a01cd6..87dfec1c3378 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -89,6 +89,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		fpl->count_unix = 0;
 		fpl->max = SCM_MAX_FD;
 		fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+		INIT_LIST_HEAD(&fpl->vertices);
+#endif
 	}
 	fpp = &fpl->fp[fpl->count];
 
@@ -376,8 +379,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 	if (new_fpl) {
 		for (i = 0; i < fpl->count; i++)
 			get_file(fpl->fp[i]);
+
 		new_fpl->max = new_fpl->count;
 		new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+		INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
 	}
 	return new_fpl;
 }
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 5b41e2321209..a3b25d311560 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -980,6 +980,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_destruct		= unix_sock_destructor;
 	u = unix_sk(sk);
 	u->inflight = 0;
+	u->vertex = NULL;
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
@@ -1805,6 +1806,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	for (i = scm->fp->count - 1; i >= 0; i--)
 		unix_inflight(scm->fp->user, scm->fp->fp[i]);
 
+	if (unix_prepare_fpl(UNIXCB(skb).fp))
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -1815,6 +1819,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	scm->fp = UNIXCB(skb).fp;
 	UNIXCB(skb).fp = NULL;
 
+	unix_destroy_fpl(scm->fp);
+
 	for (i = scm->fp->count - 1; i >= 0; i--)
 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
 }
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index fa39b6265238..75bdf66b81df 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -101,6 +101,44 @@ struct unix_sock *unix_get_socket(struct file *filp)
 	return NULL;
 }
 
+static void unix_free_vertices(struct scm_fp_list *fpl)
+{
+	struct unix_vertex *vertex, *next_vertex;
+
+	list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
+		list_del(&vertex->entry);
+		kfree(vertex);
+	}
+}
+
+int unix_prepare_fpl(struct scm_fp_list *fpl)
+{
+	struct unix_vertex *vertex;
+	int i;
+
+	if (!fpl->count_unix)
+		return 0;
+
+	for (i = 0; i < fpl->count_unix; i++) {
+		vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
+		if (!vertex)
+			goto err;
+
+		list_add(&vertex->entry, &fpl->vertices);
+	}
+
+	return 0;
+
+err:
+	unix_free_vertices(fpl);
+	return -ENOMEM;
+}
+
+void unix_destroy_fpl(struct scm_fp_list *fpl)
+{
+	unix_free_vertices(fpl);
+}
+
 DEFINE_SPINLOCK(unix_gc_lock);
 unsigned int unix_tot_inflight;
 static LIST_HEAD(gc_candidates);
-- 
cgit v1.2.3


From 29b64e354029cfcf1eea4d91b146c7b769305930 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:12 -0700
Subject: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.

As with the previous patch, we preallocate to skb's scm_fp_list an
array of struct unix_edge in the number of inflight AF_UNIX fds.

There we just preallocate memory and do not use immediately because
sendmsg() could fail after this point.  The actual use will be in
the next patch.

When we queue skb with inflight edges, we will set the inflight
socket's unix_sock as unix_edge->predecessor and the receiver's
unix_sock as successor, and then we will link the edge to the
inflight socket's unix_vertex.edges.

Note that we set NULL to cloned scm_fp_list.edges in scm_fp_dup()
so that MSG_PEEK does not change the shape of the directed graph.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-3-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h | 6 ++++++
 include/net/scm.h     | 5 +++++
 net/core/scm.c        | 2 ++
 net/unix/garbage.c    | 6 ++++++
 4 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index c270877a5256..55c4abc26a71 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -33,6 +33,12 @@ struct unix_vertex {
 	unsigned long out_degree;
 };
 
+struct unix_edge {
+	struct unix_sock *predecessor;
+	struct unix_sock *successor;
+	struct list_head vertex_entry;
+};
+
 struct sock *unix_peer_get(struct sock *sk);
 
 #define UNIX_HASH_MOD	(256 - 1)
diff --git a/include/net/scm.h b/include/net/scm.h
index e34321b6e204..5f5154e5096d 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -23,12 +23,17 @@ struct scm_creds {
 	kgid_t	gid;
 };
 
+#ifdef CONFIG_UNIX
+struct unix_edge;
+#endif
+
 struct scm_fp_list {
 	short			count;
 	short			count_unix;
 	short			max;
 #ifdef CONFIG_UNIX
 	struct list_head	vertices;
+	struct unix_edge	*edges;
 #endif
 	struct user_struct	*user;
 	struct file		*fp[SCM_MAX_FD];
diff --git a/net/core/scm.c b/net/core/scm.c
index 87dfec1c3378..1bcc8a2d65e3 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		fpl->max = SCM_MAX_FD;
 		fpl->user = NULL;
 #if IS_ENABLED(CONFIG_UNIX)
+		fpl->edges = NULL;
 		INIT_LIST_HEAD(&fpl->vertices);
 #endif
 	}
@@ -383,6 +384,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 		new_fpl->max = new_fpl->count;
 		new_fpl->user = get_uid(fpl->user);
 #if IS_ENABLED(CONFIG_UNIX)
+		new_fpl->edges = NULL;
 		INIT_LIST_HEAD(&new_fpl->vertices);
 #endif
 	}
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 75bdf66b81df..f31917683288 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -127,6 +127,11 @@ int unix_prepare_fpl(struct scm_fp_list *fpl)
 		list_add(&vertex->entry, &fpl->vertices);
 	}
 
+	fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
+				    GFP_KERNEL_ACCOUNT);
+	if (!fpl->edges)
+		goto err;
+
 	return 0;
 
 err:
@@ -136,6 +141,7 @@ err:
 
 void unix_destroy_fpl(struct scm_fp_list *fpl)
 {
+	kvfree(fpl->edges);
 	unix_free_vertices(fpl);
 }
 
-- 
cgit v1.2.3


From 42f298c06b30bfe0a8cbee5d38644e618699e26e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:13 -0700
Subject: af_unix: Link struct unix_edge when queuing skb.

Just before queuing skb with inflight fds, we call scm_stat_add(),
which is a good place to set up the preallocated struct unix_vertex
and struct unix_edge in UNIXCB(skb).fp.

Then, we call unix_add_edges() and construct the directed graph
as follows:

  1. Set the inflight socket's unix_sock to unix_edge.predecessor.
  2. Set the receiver's unix_sock to unix_edge.successor.
  3. Set the preallocated vertex to inflight socket's unix_sock.vertex.
  4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices.
  5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges.

Let's say we pass the fd of AF_UNIX socket A to B and the fd of B
to C.  The graph looks like this:

  +-------------------------+
  | unix_unvisited_vertices | <-------------------------.
  +-------------------------+                           |
  +                                                     |
  |     +--------------+             +--------------+   |         +--------------+
  |     |  unix_sock A | <---. .---> |  unix_sock B | <-|-. .---> |  unix_sock C |
  |     +--------------+     | |     +--------------+   | | |     +--------------+
  | .-+ |    vertex    |     | | .-+ |    vertex    |   | | |     |    vertex    |
  | |   +--------------+     | | |   +--------------+   | | |     +--------------+
  | |                        | | |                      | | |
  | |   +--------------+     | | |   +--------------+   | | |
  | '-> |  unix_vertex |     | | '-> |  unix_vertex |   | | |
  |     +--------------+     | |     +--------------+   | | |
  `---> |    entry     | +---------> |    entry     | +-' | |
        |--------------|     | |     |--------------|     | |
        |    edges     | <-. | |     |    edges     | <-. | |
        +--------------+   | | |     +--------------+   | | |
                           | | |                        | | |
    .----------------------' | | .----------------------' | |
    |                        | | |                        | |
    |   +--------------+     | | |   +--------------+     | |
    |   |   unix_edge  |     | | |   |   unix_edge  |     | |
    |   +--------------+     | | |   +--------------+     | |
    `-> | vertex_entry |     | | `-> | vertex_entry |     | |
        |--------------|     | |     |--------------|     | |
        |  predecessor | +---' |     |  predecessor | +---' |
        |--------------|       |     |--------------|       |
        |   successor  | +-----'     |   successor  | +-----'
        +--------------+             +--------------+

Henceforth, we denote such a graph as A -> B (-> C).

Now, we can express all inflight fd graphs that do not contain
embryo sockets.  We will support the particular case later.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  2 ++
 include/net/scm.h     |  1 +
 net/core/scm.c        |  2 ++
 net/unix/af_unix.c    |  8 +++--
 net/unix/garbage.c    | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 100 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 55c4abc26a71..f31ad1166346 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight;
 
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+void unix_del_edges(struct scm_fp_list *fpl);
 int unix_prepare_fpl(struct scm_fp_list *fpl);
 void unix_destroy_fpl(struct scm_fp_list *fpl);
 void unix_gc(void);
diff --git a/include/net/scm.h b/include/net/scm.h
index 5f5154e5096d..bbc5527809d1 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -32,6 +32,7 @@ struct scm_fp_list {
 	short			count_unix;
 	short			max;
 #ifdef CONFIG_UNIX
+	bool			inflight;
 	struct list_head	vertices;
 	struct unix_edge	*edges;
 #endif
diff --git a/net/core/scm.c b/net/core/scm.c
index 1bcc8a2d65e3..5763f3320358 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		fpl->max = SCM_MAX_FD;
 		fpl->user = NULL;
 #if IS_ENABLED(CONFIG_UNIX)
+		fpl->inflight = false;
 		fpl->edges = NULL;
 		INIT_LIST_HEAD(&fpl->vertices);
 #endif
@@ -384,6 +385,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 		new_fpl->max = new_fpl->count;
 		new_fpl->user = get_uid(fpl->user);
 #if IS_ENABLED(CONFIG_UNIX)
+		new_fpl->inflight = false;
 		new_fpl->edges = NULL;
 		INIT_LIST_HEAD(&new_fpl->vertices);
 #endif
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index a3b25d311560..24adbc4d5188 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1943,8 +1943,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
 	struct scm_fp_list *fp = UNIXCB(skb).fp;
 	struct unix_sock *u = unix_sk(sk);
 
-	if (unlikely(fp && fp->count))
+	if (unlikely(fp && fp->count)) {
 		atomic_add(fp->count, &u->scm_stat.nr_fds);
+		unix_add_edges(fp, u);
+	}
 }
 
 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
@@ -1952,8 +1954,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
 	struct scm_fp_list *fp = UNIXCB(skb).fp;
 	struct unix_sock *u = unix_sk(sk);
 
-	if (unlikely(fp && fp->count))
+	if (unlikely(fp && fp->count)) {
 		atomic_sub(fp->count, &u->scm_stat.nr_fds);
+		unix_del_edges(fp);
+	}
 }
 
 /*
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f31917683288..36d665936096 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct file *filp)
 	return NULL;
 }
 
+static LIST_HEAD(unix_unvisited_vertices);
+
+static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+{
+	struct unix_vertex *vertex = edge->predecessor->vertex;
+
+	if (!vertex) {
+		vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
+		vertex->out_degree = 0;
+		INIT_LIST_HEAD(&vertex->edges);
+
+		list_move_tail(&vertex->entry, &unix_unvisited_vertices);
+		edge->predecessor->vertex = vertex;
+	}
+
+	vertex->out_degree++;
+	list_add_tail(&edge->vertex_entry, &vertex->edges);
+}
+
+static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+{
+	struct unix_vertex *vertex = edge->predecessor->vertex;
+
+	list_del(&edge->vertex_entry);
+	vertex->out_degree--;
+
+	if (!vertex->out_degree) {
+		edge->predecessor->vertex = NULL;
+		list_move_tail(&vertex->entry, &fpl->vertices);
+	}
+}
+
 static void unix_free_vertices(struct scm_fp_list *fpl)
 {
 	struct unix_vertex *vertex, *next_vertex;
@@ -111,6 +143,60 @@ static void unix_free_vertices(struct scm_fp_list *fpl)
 	}
 }
 
+DEFINE_SPINLOCK(unix_gc_lock);
+
+void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
+{
+	int i = 0, j = 0;
+
+	spin_lock(&unix_gc_lock);
+
+	if (!fpl->count_unix)
+		goto out;
+
+	do {
+		struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
+		struct unix_edge *edge;
+
+		if (!inflight)
+			continue;
+
+		edge = fpl->edges + i++;
+		edge->predecessor = inflight;
+		edge->successor = receiver;
+
+		unix_add_edge(fpl, edge);
+	} while (i < fpl->count_unix);
+
+out:
+	spin_unlock(&unix_gc_lock);
+
+	fpl->inflight = true;
+
+	unix_free_vertices(fpl);
+}
+
+void unix_del_edges(struct scm_fp_list *fpl)
+{
+	int i = 0;
+
+	spin_lock(&unix_gc_lock);
+
+	if (!fpl->count_unix)
+		goto out;
+
+	do {
+		struct unix_edge *edge = fpl->edges + i++;
+
+		unix_del_edge(fpl, edge);
+	} while (i < fpl->count_unix);
+
+out:
+	spin_unlock(&unix_gc_lock);
+
+	fpl->inflight = false;
+}
+
 int unix_prepare_fpl(struct scm_fp_list *fpl)
 {
 	struct unix_vertex *vertex;
@@ -141,11 +227,13 @@ err:
 
 void unix_destroy_fpl(struct scm_fp_list *fpl)
 {
+	if (fpl->inflight)
+		unix_del_edges(fpl);
+
 	kvfree(fpl->edges);
 	unix_free_vertices(fpl);
 }
 
-DEFINE_SPINLOCK(unix_gc_lock);
 unsigned int unix_tot_inflight;
 static LIST_HEAD(gc_candidates);
 static LIST_HEAD(gc_inflight_list);
-- 
cgit v1.2.3


From 6ba76fd2848e107594ea4f03b737230f74bc23ea Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:15 -0700
Subject: af_unix: Iterate all vertices by DFS.

The new GC will use a depth first search graph algorithm to find
cyclic references.  The algorithm visits every vertex exactly once.

Here, we implement the DFS part without recursion so that no one
can abuse it.

unix_walk_scc() marks every vertex unvisited by initialising index
as UNIX_VERTEX_INDEX_UNVISITED and iterates inflight vertices in
unix_unvisited_vertices and call __unix_walk_scc() to start DFS from
an arbitrary vertex.

__unix_walk_scc() iterates all edges starting from the vertex and
explores the neighbour vertices with DFS using edge_stack.

After visiting all neighbours, __unix_walk_scc() moves the visited
vertex to unix_visited_vertices so that unix_walk_scc() will not
restart DFS from the visited vertex.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-6-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  2 ++
 net/unix/garbage.c    | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index f31ad1166346..970a91da2239 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -33,12 +33,14 @@ struct unix_vertex {
 	struct list_head edges;
 	struct list_head entry;
 	unsigned long out_degree;
+	unsigned long index;
 };
 
 struct unix_edge {
 	struct unix_sock *predecessor;
 	struct unix_sock *successor;
 	struct list_head vertex_entry;
+	struct list_head stack_entry;
 };
 
 struct sock *unix_peer_get(struct sock *sk);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 84c8ea524a98..8b16ab9e240e 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -103,6 +103,11 @@ struct unix_sock *unix_get_socket(struct file *filp)
 
 static LIST_HEAD(unix_unvisited_vertices);
 
+enum unix_vertex_index {
+	UNIX_VERTEX_INDEX_UNVISITED,
+	UNIX_VERTEX_INDEX_START,
+};
+
 static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
 {
 	struct unix_vertex *vertex = edge->predecessor->vertex;
@@ -241,6 +246,73 @@ void unix_destroy_fpl(struct scm_fp_list *fpl)
 	unix_free_vertices(fpl);
 }
 
+static LIST_HEAD(unix_visited_vertices);
+
+static void __unix_walk_scc(struct unix_vertex *vertex)
+{
+	unsigned long index = UNIX_VERTEX_INDEX_START;
+	struct unix_edge *edge;
+	LIST_HEAD(edge_stack);
+
+next_vertex:
+	vertex->index = index;
+	index++;
+
+	/* Explore neighbour vertices (receivers of the current vertex's fd). */
+	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+		struct unix_vertex *next_vertex = edge->successor->vertex;
+
+		if (!next_vertex)
+			continue;
+
+		if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) {
+			/* Iterative deepening depth first search
+			 *
+			 *   1. Push a forward edge to edge_stack and set
+			 *      the successor to vertex for the next iteration.
+			 */
+			list_add(&edge->stack_entry, &edge_stack);
+
+			vertex = next_vertex;
+			goto next_vertex;
+
+			/*   2. Pop the edge directed to the current vertex
+			 *      and restore the ancestor for backtracking.
+			 */
+prev_vertex:
+			edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
+			list_del_init(&edge->stack_entry);
+
+			vertex = edge->predecessor->vertex;
+		}
+	}
+
+	/* Don't restart DFS from this vertex in unix_walk_scc(). */
+	list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+	/* Need backtracking ? */
+	if (!list_empty(&edge_stack))
+		goto prev_vertex;
+}
+
+static void unix_walk_scc(void)
+{
+	struct unix_vertex *vertex;
+
+	list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
+		vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
+
+	/* Visit every vertex exactly once.
+	 * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
+	 */
+	while (!list_empty(&unix_unvisited_vertices)) {
+		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+		__unix_walk_scc(vertex);
+	}
+
+	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+}
+
 static LIST_HEAD(gc_candidates);
 static LIST_HEAD(gc_inflight_list);
 
@@ -388,6 +460,8 @@ static void __unix_gc(struct work_struct *work)
 
 	spin_lock(&unix_gc_lock);
 
+	unix_walk_scc();
+
 	/* First, select candidates for garbage collection.  Only
 	 * in-flight sockets are considered, and from those only ones
 	 * which don't have any external reference.
-- 
cgit v1.2.3


From 3484f063172dd88776b062046d721d7c2ae1af7c Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:16 -0700
Subject: af_unix: Detect Strongly Connected Components.

In the new GC, we use a simple graph algorithm, Tarjan's Strongly
Connected Components (SCC) algorithm, to find cyclic references.

The algorithm visits every vertex exactly once using depth-first
search (DFS).

DFS starts by pushing an input vertex to a stack and assigning it
a unique number.  Two fields, index and lowlink, are initialised
with the number, but lowlink could be updated later during DFS.

If a vertex has an edge to an unvisited inflight vertex, we visit
it and do the same processing.  So, we will have vertices in the
stack in the order they appear and number them consecutively in
the same order.

If a vertex has a back-edge to a visited vertex in the stack,
we update the predecessor's lowlink with the successor's index.

After iterating edges from the vertex, we check if its index
equals its lowlink.

If the lowlink is different from the index, it shows there was a
back-edge.  Then, we go backtracking and propagate the lowlink to
its predecessor and resume the previous edge iteration from the
next edge.

If the lowlink is the same as the index, we pop vertices before
and including the vertex from the stack.  Then, the set of vertices
is SCC, possibly forming a cycle.  At the same time, we move the
vertices to unix_visited_vertices.

When we finish the algorithm, all vertices in each SCC will be
linked via unix_vertex.scc_entry.

Let's take an example.  We have a graph including five inflight
vertices (F is not inflight):

  A -> B -> C -> D -> E (-> F)
       ^         |
       `---------'

Suppose that we start DFS from C.  We will visit C, D, and B first
and initialise their index and lowlink.  Then, the stack looks like
this:

  > B = (3, 3)  (index, lowlink)
    D = (2, 2)
    C = (1, 1)

When checking B's edge to C, we update B's lowlink with C's index
and propagate it to D.

    B = (3, 1)  (index, lowlink)
  > D = (2, 1)
    C = (1, 1)

Next, we visit E, which has no edge to an inflight vertex.

  > E = (4, 4)  (index, lowlink)
    B = (3, 1)
    D = (2, 1)
    C = (1, 1)

When we leave from E, its index and lowlink are the same, so we
pop E from the stack as single-vertex SCC.  Next, we leave from
B and D but do nothing because their lowlink are different from
their index.

    B = (3, 1)  (index, lowlink)
    D = (2, 1)
  > C = (1, 1)

Then, we leave from C, whose index and lowlink are the same, so
we pop B, D and C as SCC.

Last, we do DFS for the rest of vertices, A, which is also a
single-vertex SCC.

Finally, each unix_vertex.scc_entry is linked as follows:

  A -.  B -> C -> D  E -.
  ^  |  ^         |  ^  |
  `--'  `---------'  `--'

We use SCC later to decide whether we can garbage-collect the
sockets.

Note that we still cannot detect SCC properly if an edge points
to an embryo socket.  The following two patches will sort it out.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-7-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  3 +++
 net/unix/garbage.c    | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 970a91da2239..67736767b616 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -32,8 +32,11 @@ void wait_for_unix_gc(struct scm_fp_list *fpl);
 struct unix_vertex {
 	struct list_head edges;
 	struct list_head entry;
+	struct list_head scc_entry;
 	unsigned long out_degree;
 	unsigned long index;
+	unsigned long lowlink;
+	bool on_stack;
 };
 
 struct unix_edge {
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 8b16ab9e240e..33aadaa35346 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -251,11 +251,19 @@ static LIST_HEAD(unix_visited_vertices);
 static void __unix_walk_scc(struct unix_vertex *vertex)
 {
 	unsigned long index = UNIX_VERTEX_INDEX_START;
+	LIST_HEAD(vertex_stack);
 	struct unix_edge *edge;
 	LIST_HEAD(edge_stack);
 
 next_vertex:
+	/* Push vertex to vertex_stack.
+	 * The vertex will be popped when finalising SCC later.
+	 */
+	vertex->on_stack = true;
+	list_add(&vertex->scc_entry, &vertex_stack);
+
 	vertex->index = index;
+	vertex->lowlink = index;
 	index++;
 
 	/* Explore neighbour vertices (receivers of the current vertex's fd). */
@@ -283,12 +291,46 @@ prev_vertex:
 			edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
 			list_del_init(&edge->stack_entry);
 
+			next_vertex = vertex;
 			vertex = edge->predecessor->vertex;
+
+			/* If the successor has a smaller lowlink, two vertices
+			 * are in the same SCC, so propagate the smaller lowlink
+			 * to skip SCC finalisation.
+			 */
+			vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
+		} else if (next_vertex->on_stack) {
+			/* Loop detected by a back/cross edge.
+			 *
+			 * The successor is on vertex_stack, so two vertices are
+			 * in the same SCC.  If the successor has a smaller index,
+			 * propagate it to skip SCC finalisation.
+			 */
+			vertex->lowlink = min(vertex->lowlink, next_vertex->index);
+		} else {
+			/* The successor was already grouped as another SCC */
 		}
 	}
 
-	/* Don't restart DFS from this vertex in unix_walk_scc(). */
-	list_move_tail(&vertex->entry, &unix_visited_vertices);
+	if (vertex->index == vertex->lowlink) {
+		struct list_head scc;
+
+		/* SCC finalised.
+		 *
+		 * If the lowlink was not updated, all the vertices above on
+		 * vertex_stack are in the same SCC.  Group them using scc_entry.
+		 */
+		__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+
+		list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+			/* Don't restart DFS from this vertex in unix_walk_scc(). */
+			list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+			vertex->on_stack = false;
+		}
+
+		list_del(&scc);
+	}
 
 	/* Need backtracking ? */
 	if (!list_empty(&edge_stack))
-- 
cgit v1.2.3


From aed6ecef55d70de3762ce41c561b7f547dbaf107 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:17 -0700
Subject: af_unix: Save listener for embryo socket.

This is a prep patch for the following change, where we need to
fetch the listening socket from the successor embryo socket
during GC.

We add a new field to struct unix_sock to save a pointer to a
listening socket.

We set it when connect() creates a new socket, and clear it when
accept() is called.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h | 1 +
 net/unix/af_unix.c    | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 67736767b616..dc7469191195 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -83,6 +83,7 @@ struct unix_sock {
 	struct path		path;
 	struct mutex		iolock, bindlock;
 	struct sock		*peer;
+	struct sock		*listener;
 	struct unix_vertex	*vertex;
 	struct list_head	link;
 	unsigned long		inflight;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 24adbc4d5188..af74e7ebc35a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -979,6 +979,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
 	u = unix_sk(sk);
+	u->listener = NULL;
 	u->inflight = 0;
 	u->vertex = NULL;
 	u->path.dentry = NULL;
@@ -1598,6 +1599,7 @@ restart:
 	newsk->sk_type		= sk->sk_type;
 	init_peercred(newsk);
 	newu = unix_sk(newsk);
+	newu->listener = other;
 	RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
 	otheru = unix_sk(other);
 
@@ -1693,8 +1695,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 		       bool kern)
 {
 	struct sock *sk = sock->sk;
-	struct sock *tsk;
 	struct sk_buff *skb;
+	struct sock *tsk;
 	int err;
 
 	err = -EOPNOTSUPP;
@@ -1719,6 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	}
 
 	tsk = skb->sk;
+	unix_sk(tsk)->listener = NULL;
 	skb_free_datagram(sk, skb);
 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
 
-- 
cgit v1.2.3


From dcf70df2048d27c5d186f013f101a4aefd63aa41 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:18 -0700
Subject: af_unix: Fix up unix_edge.successor for embryo socket.

To garbage collect inflight AF_UNIX sockets, we must define the
cyclic reference appropriately.  This is a bit tricky if the loop
consists of embryo sockets.

Suppose that the fd of AF_UNIX socket A is passed to D and the fd B
to C and that C and D are embryo sockets of A and B, respectively.
It may appear that there are two separate graphs, A (-> D) and
B (-> C), but this is not correct.

     A --. .-- B
          X
     C <-' `-> D

Now, D holds A's refcount, and C has B's refcount, so unix_release()
will never be called for A and B when we close() them.  However, no
one can call close() for D and C to free skbs holding refcounts of A
and B because C/D is in A/B's receive queue, which should have been
purged by unix_release() for A and B.

So, here's another type of cyclic reference.  When a fd of an AF_UNIX
socket is passed to an embryo socket, the reference is indirectly held
by its parent listening socket.

  .-> A                            .-> B
  |   `- sk_receive_queue          |   `- sk_receive_queue
  |      `- skb                    |      `- skb
  |         `- sk == C             |         `- sk == D
  |            `- sk_receive_queue |           `- sk_receive_queue
  |               `- skb +---------'               `- skb +-.
  |                                                         |
  `---------------------------------------------------------'

Technically, the graph must be denoted as A <-> B instead of A (-> D)
and B (-> C) to find such a cyclic reference without touching each
socket's receive queue.

  .-> A --. .-- B <-.
  |        X        |  ==  A <-> B
  `-- C <-' `-> D --'

We apply this fixup during GC by fetching the real successor by
unix_edge_successor().

When we call accept(), we clear unix_sock.listener under unix_gc_lock
not to confuse GC.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    |  2 +-
 net/unix/garbage.c    | 20 +++++++++++++++++++-
 3 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index dc7469191195..414463803b7e 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
 void unix_del_edges(struct scm_fp_list *fpl);
+void unix_update_edges(struct unix_sock *receiver);
 int unix_prepare_fpl(struct scm_fp_list *fpl);
 void unix_destroy_fpl(struct scm_fp_list *fpl);
 void unix_gc(void);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index af74e7ebc35a..ae77e2dc0dae 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1721,7 +1721,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	}
 
 	tsk = skb->sk;
-	unix_sk(tsk)->listener = NULL;
+	unix_update_edges(unix_sk(tsk));
 	skb_free_datagram(sk, skb);
 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
 
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 33aadaa35346..8d0912c1d01a 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct file *filp)
 	return NULL;
 }
 
+static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
+{
+	/* If an embryo socket has a fd,
+	 * the listener indirectly holds the fd's refcnt.
+	 */
+	if (edge->successor->listener)
+		return unix_sk(edge->successor->listener)->vertex;
+
+	return edge->successor->vertex;
+}
+
 static LIST_HEAD(unix_unvisited_vertices);
 
 enum unix_vertex_index {
@@ -209,6 +220,13 @@ out:
 	fpl->inflight = false;
 }
 
+void unix_update_edges(struct unix_sock *receiver)
+{
+	spin_lock(&unix_gc_lock);
+	receiver->listener = NULL;
+	spin_unlock(&unix_gc_lock);
+}
+
 int unix_prepare_fpl(struct scm_fp_list *fpl)
 {
 	struct unix_vertex *vertex;
@@ -268,7 +286,7 @@ next_vertex:
 
 	/* Explore neighbour vertices (receivers of the current vertex's fd). */
 	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
-		struct unix_vertex *next_vertex = edge->successor->vertex;
+		struct unix_vertex *next_vertex = unix_edge_successor(edge);
 
 		if (!next_vertex)
 			continue;
-- 
cgit v1.2.3


From ba31b4a4e1018f5844c6eb31734976e2184f2f9a Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:19 -0700
Subject: af_unix: Save O(n) setup of Tarjan's algo.

Before starting Tarjan's algorithm, we need to mark all vertices
as unvisited.  We can save this O(n) setup by reserving two special
indices (0, 1) and using two variables.

The first time we link a vertex to unix_unvisited_vertices, we set
unix_vertex_unvisited_index to index.

During DFS, we can see that the index of unvisited vertices is the
same as unix_vertex_unvisited_index.

When we finalise SCC later, we set unix_vertex_grouped_index to each
vertex's index.

Then, we can know (i) that the vertex is on the stack if the index
of a visited vertex is >= 2 and (ii) that it is not on the stack and
belongs to a different SCC if the index is unix_vertex_grouped_index.

After the whole algorithm, all indices of vertices are set as
unix_vertex_grouped_index.

Next time we start DFS, we know that all unvisited vertices have
unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index
as the not-on-stack marker.

To use the same variable in __unix_walk_scc(), we can swap
unix_vertex_(grouped|unvisited)_index at the end of Tarjan's
algorithm.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-10-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  1 -
 net/unix/garbage.c    | 26 +++++++++++++++-----------
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 414463803b7e..ec040caaa4b5 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -37,7 +37,6 @@ struct unix_vertex {
 	unsigned long out_degree;
 	unsigned long index;
 	unsigned long lowlink;
-	bool on_stack;
 };
 
 struct unix_edge {
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 8d0912c1d01a..cec6189126ba 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -115,16 +115,20 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
 static LIST_HEAD(unix_unvisited_vertices);
 
 enum unix_vertex_index {
-	UNIX_VERTEX_INDEX_UNVISITED,
+	UNIX_VERTEX_INDEX_MARK1,
+	UNIX_VERTEX_INDEX_MARK2,
 	UNIX_VERTEX_INDEX_START,
 };
 
+static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
+
 static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
 {
 	struct unix_vertex *vertex = edge->predecessor->vertex;
 
 	if (!vertex) {
 		vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
+		vertex->index = unix_vertex_unvisited_index;
 		vertex->out_degree = 0;
 		INIT_LIST_HEAD(&vertex->edges);
 
@@ -265,6 +269,7 @@ void unix_destroy_fpl(struct scm_fp_list *fpl)
 }
 
 static LIST_HEAD(unix_visited_vertices);
+static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
 
 static void __unix_walk_scc(struct unix_vertex *vertex)
 {
@@ -274,10 +279,10 @@ static void __unix_walk_scc(struct unix_vertex *vertex)
 	LIST_HEAD(edge_stack);
 
 next_vertex:
-	/* Push vertex to vertex_stack.
+	/* Push vertex to vertex_stack and mark it as on-stack
+	 * (index >= UNIX_VERTEX_INDEX_START).
 	 * The vertex will be popped when finalising SCC later.
 	 */
-	vertex->on_stack = true;
 	list_add(&vertex->scc_entry, &vertex_stack);
 
 	vertex->index = index;
@@ -291,7 +296,7 @@ next_vertex:
 		if (!next_vertex)
 			continue;
 
-		if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) {
+		if (next_vertex->index == unix_vertex_unvisited_index) {
 			/* Iterative deepening depth first search
 			 *
 			 *   1. Push a forward edge to edge_stack and set
@@ -317,7 +322,7 @@ prev_vertex:
 			 * to skip SCC finalisation.
 			 */
 			vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
-		} else if (next_vertex->on_stack) {
+		} else if (next_vertex->index != unix_vertex_grouped_index) {
 			/* Loop detected by a back/cross edge.
 			 *
 			 * The successor is on vertex_stack, so two vertices are
@@ -344,7 +349,8 @@ prev_vertex:
 			/* Don't restart DFS from this vertex in unix_walk_scc(). */
 			list_move_tail(&vertex->entry, &unix_visited_vertices);
 
-			vertex->on_stack = false;
+			/* Mark vertex as off-stack. */
+			vertex->index = unix_vertex_grouped_index;
 		}
 
 		list_del(&scc);
@@ -357,20 +363,18 @@ prev_vertex:
 
 static void unix_walk_scc(void)
 {
-	struct unix_vertex *vertex;
-
-	list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
-		vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
-
 	/* Visit every vertex exactly once.
 	 * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
 	 */
 	while (!list_empty(&unix_unvisited_vertices)) {
+		struct unix_vertex *vertex;
+
 		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
 		__unix_walk_scc(vertex);
 	}
 
 	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+	swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
 }
 
 static LIST_HEAD(gc_candidates);
-- 
cgit v1.2.3


From bfdb01283ee8f2f3089656c3ff8f62bb072dabb2 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:22 -0700
Subject: af_unix: Assign a unique index to SCC.

The definition of the lowlink in Tarjan's algorithm is the
smallest index of a vertex that is reachable with at most one
back-edge in SCC.  This is not useful for a cross-edge.

If we start traversing from A in the following graph, the final
lowlink of D is 3.  The cross-edge here is one between D and C.

  A -> B -> D   D = (4, 3)  (index, lowlink)
  ^    |    |   C = (3, 1)
  |    V    |   B = (2, 1)
  `--- C <--'   A = (1, 1)

This is because the lowlink of D is updated with the index of C.

In the following patch, we detect a dead SCC by checking two
conditions for each vertex.

  1) vertex has no edge directed to another SCC (no bridge)
  2) vertex's out_degree is the same as the refcount of its file

If 1) is false, there is a receiver of all fds of the SCC and
its ancestor SCC.

To evaluate 1), we need to assign a unique index to each SCC and
assign it to all vertices in the SCC.

This patch changes the lowlink update logic for cross-edge so
that in the example above, the lowlink of D is updated with the
lowlink of C.

  A -> B -> D   D = (4, 1)  (index, lowlink)
  ^    |    |   C = (3, 1)
  |    V    |   B = (2, 1)
  `--- C <--'   A = (1, 1)

Then, all vertices in the same SCC have the same lowlink, and we
can quickly find the bridge connecting to different SCC if exists.

However, it is no longer called lowlink, so we rename it to
scc_index.  (It's sometimes called lowpoint.)

Also, we add a global variable to hold the last index used in DFS
so that we do not reset the initial index in each DFS.

This patch can be squashed to the SCC detection patch but is
split deliberately for anyone wondering why lowlink is not used
as used in the original Tarjan's algorithm and many reference
implementations.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-13-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  2 +-
 net/unix/garbage.c    | 29 +++++++++++++++--------------
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index ec040caaa4b5..696d997a5ac9 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -36,7 +36,7 @@ struct unix_vertex {
 	struct list_head scc_entry;
 	unsigned long out_degree;
 	unsigned long index;
-	unsigned long lowlink;
+	unsigned long scc_index;
 };
 
 struct unix_edge {
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 654aa8e30a8b..3f59cee3ccbc 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -312,9 +312,8 @@ static bool unix_scc_cyclic(struct list_head *scc)
 static LIST_HEAD(unix_visited_vertices);
 static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
 
-static void __unix_walk_scc(struct unix_vertex *vertex)
+static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
 {
-	unsigned long index = UNIX_VERTEX_INDEX_START;
 	LIST_HEAD(vertex_stack);
 	struct unix_edge *edge;
 	LIST_HEAD(edge_stack);
@@ -326,9 +325,9 @@ next_vertex:
 	 */
 	list_add(&vertex->scc_entry, &vertex_stack);
 
-	vertex->index = index;
-	vertex->lowlink = index;
-	index++;
+	vertex->index = *last_index;
+	vertex->scc_index = *last_index;
+	(*last_index)++;
 
 	/* Explore neighbour vertices (receivers of the current vertex's fd). */
 	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
@@ -358,30 +357,30 @@ prev_vertex:
 			next_vertex = vertex;
 			vertex = edge->predecessor->vertex;
 
-			/* If the successor has a smaller lowlink, two vertices
-			 * are in the same SCC, so propagate the smaller lowlink
+			/* If the successor has a smaller scc_index, two vertices
+			 * are in the same SCC, so propagate the smaller scc_index
 			 * to skip SCC finalisation.
 			 */
-			vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
+			vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
 		} else if (next_vertex->index != unix_vertex_grouped_index) {
 			/* Loop detected by a back/cross edge.
 			 *
-			 * The successor is on vertex_stack, so two vertices are
-			 * in the same SCC.  If the successor has a smaller index,
+			 * The successor is on vertex_stack, so two vertices are in
+			 * the same SCC.  If the successor has a smaller *scc_index*,
 			 * propagate it to skip SCC finalisation.
 			 */
-			vertex->lowlink = min(vertex->lowlink, next_vertex->index);
+			vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
 		} else {
 			/* The successor was already grouped as another SCC */
 		}
 	}
 
-	if (vertex->index == vertex->lowlink) {
+	if (vertex->index == vertex->scc_index) {
 		struct list_head scc;
 
 		/* SCC finalised.
 		 *
-		 * If the lowlink was not updated, all the vertices above on
+		 * If the scc_index was not updated, all the vertices above on
 		 * vertex_stack are in the same SCC.  Group them using scc_entry.
 		 */
 		__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
@@ -407,6 +406,8 @@ prev_vertex:
 
 static void unix_walk_scc(void)
 {
+	unsigned long last_index = UNIX_VERTEX_INDEX_START;
+
 	unix_graph_maybe_cyclic = false;
 
 	/* Visit every vertex exactly once.
@@ -416,7 +417,7 @@ static void unix_walk_scc(void)
 		struct unix_vertex *vertex;
 
 		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
-		__unix_walk_scc(vertex);
+		__unix_walk_scc(vertex, &last_index);
 	}
 
 	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
-- 
cgit v1.2.3


From 4090fa373f0e763c43610853d2774b5979915959 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 25 Mar 2024 13:24:24 -0700
Subject: af_unix: Replace garbage collection algorithm.

If we find a dead SCC during iteration, we call unix_collect_skb()
to splice all skb in the SCC to the global sk_buff_head, hitlist.

After iterating all SCC, we unlock unix_gc_lock and purge the queue.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |   8 --
 net/unix/af_unix.c    |  12 --
 net/unix/garbage.c    | 302 +++++++++++---------------------------------------
 3 files changed, 64 insertions(+), 258 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 696d997a5ac9..226a8da2cbe3 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
 
 extern spinlock_t unix_gc_lock;
 extern unsigned int unix_tot_inflight;
-
-void unix_inflight(struct user_struct *user, struct file *fp);
-void unix_notinflight(struct user_struct *user, struct file *fp);
 void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
 void unix_del_edges(struct scm_fp_list *fpl);
 void unix_update_edges(struct unix_sock *receiver);
@@ -85,12 +82,7 @@ struct unix_sock {
 	struct sock		*peer;
 	struct sock		*listener;
 	struct unix_vertex	*vertex;
-	struct list_head	link;
-	unsigned long		inflight;
 	spinlock_t		lock;
-	unsigned long		gc_flags;
-#define UNIX_GC_CANDIDATE	0
-#define UNIX_GC_MAYBE_CYCLE	1
 	struct socket_wq	peer_wq;
 	wait_queue_entry_t	peer_wake;
 	struct scm_stat		scm_stat;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index ae77e2dc0dae..27ca50ab1cd1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -980,12 +980,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
 	sk->sk_destruct		= unix_sock_destructor;
 	u = unix_sk(sk);
 	u->listener = NULL;
-	u->inflight = 0;
 	u->vertex = NULL;
 	u->path.dentry = NULL;
 	u->path.mnt = NULL;
 	spin_lock_init(&u->lock);
-	INIT_LIST_HEAD(&u->link);
 	mutex_init(&u->iolock); /* single task reading lock */
 	mutex_init(&u->bindlock); /* single task binding lock */
 	init_waitqueue_head(&u->peer_wait);
@@ -1793,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p)
 
 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
-	int i;
-
 	if (too_many_unix_fds(current))
 		return -ETOOMANYREFS;
 
@@ -1806,9 +1802,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	if (!UNIXCB(skb).fp)
 		return -ENOMEM;
 
-	for (i = scm->fp->count - 1; i >= 0; i--)
-		unix_inflight(scm->fp->user, scm->fp->fp[i]);
-
 	if (unix_prepare_fpl(UNIXCB(skb).fp))
 		return -ENOMEM;
 
@@ -1817,15 +1810,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 
 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
-	int i;
-
 	scm->fp = UNIXCB(skb).fp;
 	UNIXCB(skb).fp = NULL;
 
 	unix_destroy_fpl(scm->fp);
-
-	for (i = scm->fp->count - 1; i >= 0; i--)
-		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
 }
 
 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 0a6b38da578c..89ea71d9297b 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix_vertex *vertex)
 	return true;
 }
 
+enum unix_recv_queue_lock_class {
+	U_RECVQ_LOCK_NORMAL,
+	U_RECVQ_LOCK_EMBRYO,
+};
+
+static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
+{
+	struct unix_vertex *vertex;
+
+	list_for_each_entry_reverse(vertex, scc, scc_entry) {
+		struct sk_buff_head *queue;
+		struct unix_edge *edge;
+		struct unix_sock *u;
+
+		edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
+		u = edge->predecessor;
+		queue = &u->sk.sk_receive_queue;
+
+		spin_lock(&queue->lock);
+
+		if (u->sk.sk_state == TCP_LISTEN) {
+			struct sk_buff *skb;
+
+			skb_queue_walk(queue, skb) {
+				struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
+
+				/* listener -> embryo order, the inversion never happens. */
+				spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
+				skb_queue_splice_init(embryo_queue, hitlist);
+				spin_unlock(&embryo_queue->lock);
+			}
+		} else {
+			skb_queue_splice_init(queue, hitlist);
+
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+			if (u->oob_skb) {
+				kfree_skb(u->oob_skb);
+				u->oob_skb = NULL;
+			}
+#endif
+		}
+
+		spin_unlock(&queue->lock);
+	}
+}
+
 static bool unix_scc_cyclic(struct list_head *scc)
 {
 	struct unix_vertex *vertex;
@@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_head *scc)
 static LIST_HEAD(unix_visited_vertices);
 static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
 
-static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
+static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
+			    struct sk_buff_head *hitlist)
 {
 	LIST_HEAD(vertex_stack);
 	struct unix_edge *edge;
@@ -430,7 +477,9 @@ prev_vertex:
 				scc_dead = unix_vertex_dead(vertex);
 		}
 
-		if (!unix_graph_maybe_cyclic)
+		if (scc_dead)
+			unix_collect_skb(&scc, hitlist);
+		else if (!unix_graph_maybe_cyclic)
 			unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
 
 		list_del(&scc);
@@ -441,7 +490,7 @@ prev_vertex:
 		goto prev_vertex;
 }
 
-static void unix_walk_scc(void)
+static void unix_walk_scc(struct sk_buff_head *hitlist)
 {
 	unsigned long last_index = UNIX_VERTEX_INDEX_START;
 
@@ -454,7 +503,7 @@ static void unix_walk_scc(void)
 		struct unix_vertex *vertex;
 
 		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
-		__unix_walk_scc(vertex, &last_index);
+		__unix_walk_scc(vertex, &last_index, hitlist);
 	}
 
 	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
@@ -463,7 +512,7 @@ static void unix_walk_scc(void)
 	unix_graph_grouped = true;
 }
 
-static void unix_walk_scc_fast(void)
+static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
 {
 	while (!list_empty(&unix_unvisited_vertices)) {
 		struct unix_vertex *vertex;
@@ -480,263 +529,40 @@ static void unix_walk_scc_fast(void)
 				scc_dead = unix_vertex_dead(vertex);
 		}
 
+		if (scc_dead)
+			unix_collect_skb(&scc, hitlist);
+
 		list_del(&scc);
 	}
 
 	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
 }
 
-static LIST_HEAD(gc_candidates);
-static LIST_HEAD(gc_inflight_list);
-
-/* Keep the number of times in flight count for the file
- * descriptor if it is for an AF_UNIX socket.
- */
-void unix_inflight(struct user_struct *user, struct file *filp)
-{
-	struct unix_sock *u = unix_get_socket(filp);
-
-	spin_lock(&unix_gc_lock);
-
-	if (u) {
-		if (!u->inflight) {
-			WARN_ON_ONCE(!list_empty(&u->link));
-			list_add_tail(&u->link, &gc_inflight_list);
-		} else {
-			WARN_ON_ONCE(list_empty(&u->link));
-		}
-		u->inflight++;
-	}
-
-	spin_unlock(&unix_gc_lock);
-}
-
-void unix_notinflight(struct user_struct *user, struct file *filp)
-{
-	struct unix_sock *u = unix_get_socket(filp);
-
-	spin_lock(&unix_gc_lock);
-
-	if (u) {
-		WARN_ON_ONCE(!u->inflight);
-		WARN_ON_ONCE(list_empty(&u->link));
-
-		u->inflight--;
-		if (!u->inflight)
-			list_del_init(&u->link);
-	}
-
-	spin_unlock(&unix_gc_lock);
-}
-
-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
-			  struct sk_buff_head *hitlist)
-{
-	struct sk_buff *skb;
-	struct sk_buff *next;
-
-	spin_lock(&x->sk_receive_queue.lock);
-	skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
-		/* Do we have file descriptors ? */
-		if (UNIXCB(skb).fp) {
-			bool hit = false;
-			/* Process the descriptors of this socket */
-			int nfd = UNIXCB(skb).fp->count;
-			struct file **fp = UNIXCB(skb).fp->fp;
-
-			while (nfd--) {
-				/* Get the socket the fd matches if it indeed does so */
-				struct unix_sock *u = unix_get_socket(*fp++);
-
-				/* Ignore non-candidates, they could have been added
-				 * to the queues after starting the garbage collection
-				 */
-				if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
-					hit = true;
-
-					func(u);
-				}
-			}
-			if (hit && hitlist != NULL) {
-				__skb_unlink(skb, &x->sk_receive_queue);
-				__skb_queue_tail(hitlist, skb);
-			}
-		}
-	}
-	spin_unlock(&x->sk_receive_queue.lock);
-}
-
-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
-			  struct sk_buff_head *hitlist)
-{
-	if (x->sk_state != TCP_LISTEN) {
-		scan_inflight(x, func, hitlist);
-	} else {
-		struct sk_buff *skb;
-		struct sk_buff *next;
-		struct unix_sock *u;
-		LIST_HEAD(embryos);
-
-		/* For a listening socket collect the queued embryos
-		 * and perform a scan on them as well.
-		 */
-		spin_lock(&x->sk_receive_queue.lock);
-		skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
-			u = unix_sk(skb->sk);
-
-			/* An embryo cannot be in-flight, so it's safe
-			 * to use the list link.
-			 */
-			WARN_ON_ONCE(!list_empty(&u->link));
-			list_add_tail(&u->link, &embryos);
-		}
-		spin_unlock(&x->sk_receive_queue.lock);
-
-		while (!list_empty(&embryos)) {
-			u = list_entry(embryos.next, struct unix_sock, link);
-			scan_inflight(&u->sk, func, hitlist);
-			list_del_init(&u->link);
-		}
-	}
-}
-
-static void dec_inflight(struct unix_sock *usk)
-{
-	usk->inflight--;
-}
-
-static void inc_inflight(struct unix_sock *usk)
-{
-	usk->inflight++;
-}
-
-static void inc_inflight_move_tail(struct unix_sock *u)
-{
-	u->inflight++;
-
-	/* If this still might be part of a cycle, move it to the end
-	 * of the list, so that it's checked even if it was already
-	 * passed over
-	 */
-	if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
-		list_move_tail(&u->link, &gc_candidates);
-}
-
 static bool gc_in_progress;
 
 static void __unix_gc(struct work_struct *work)
 {
 	struct sk_buff_head hitlist;
-	struct unix_sock *u, *next;
-	LIST_HEAD(not_cycle_list);
-	struct list_head cursor;
 
 	spin_lock(&unix_gc_lock);
 
-	if (!unix_graph_maybe_cyclic)
+	if (!unix_graph_maybe_cyclic) {
+		spin_unlock(&unix_gc_lock);
 		goto skip_gc;
-
-	if (unix_graph_grouped)
-		unix_walk_scc_fast();
-	else
-		unix_walk_scc();
-
-	/* First, select candidates for garbage collection.  Only
-	 * in-flight sockets are considered, and from those only ones
-	 * which don't have any external reference.
-	 *
-	 * Holding unix_gc_lock will protect these candidates from
-	 * being detached, and hence from gaining an external
-	 * reference.  Since there are no possible receivers, all
-	 * buffers currently on the candidates' queues stay there
-	 * during the garbage collection.
-	 *
-	 * We also know that no new candidate can be added onto the
-	 * receive queues.  Other, non candidate sockets _can_ be
-	 * added to queue, so we must make sure only to touch
-	 * candidates.
-	 */
-	list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
-		long total_refs;
-
-		total_refs = file_count(u->sk.sk_socket->file);
-
-		WARN_ON_ONCE(!u->inflight);
-		WARN_ON_ONCE(total_refs < u->inflight);
-		if (total_refs == u->inflight) {
-			list_move_tail(&u->link, &gc_candidates);
-			__set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
-			__set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
-		}
-	}
-
-	/* Now remove all internal in-flight reference to children of
-	 * the candidates.
-	 */
-	list_for_each_entry(u, &gc_candidates, link)
-		scan_children(&u->sk, dec_inflight, NULL);
-
-	/* Restore the references for children of all candidates,
-	 * which have remaining references.  Do this recursively, so
-	 * only those remain, which form cyclic references.
-	 *
-	 * Use a "cursor" link, to make the list traversal safe, even
-	 * though elements might be moved about.
-	 */
-	list_add(&cursor, &gc_candidates);
-	while (cursor.next != &gc_candidates) {
-		u = list_entry(cursor.next, struct unix_sock, link);
-
-		/* Move cursor to after the current position. */
-		list_move(&cursor, &u->link);
-
-		if (u->inflight) {
-			list_move_tail(&u->link, &not_cycle_list);
-			__clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
-			scan_children(&u->sk, inc_inflight_move_tail, NULL);
-		}
 	}
-	list_del(&cursor);
 
-	/* Now gc_candidates contains only garbage.  Restore original
-	 * inflight counters for these as well, and remove the skbuffs
-	 * which are creating the cycle(s).
-	 */
-	skb_queue_head_init(&hitlist);
-	list_for_each_entry(u, &gc_candidates, link) {
-		scan_children(&u->sk, inc_inflight, &hitlist);
+	__skb_queue_head_init(&hitlist);
 
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-		if (u->oob_skb) {
-			kfree_skb(u->oob_skb);
-			u->oob_skb = NULL;
-		}
-#endif
-	}
-
-	/* not_cycle_list contains those sockets which do not make up a
-	 * cycle.  Restore these to the inflight list.
-	 */
-	while (!list_empty(&not_cycle_list)) {
-		u = list_entry(not_cycle_list.next, struct unix_sock, link);
-		__clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
-		list_move_tail(&u->link, &gc_inflight_list);
-	}
+	if (unix_graph_grouped)
+		unix_walk_scc_fast(&hitlist);
+	else
+		unix_walk_scc(&hitlist);
 
 	spin_unlock(&unix_gc_lock);
 
-	/* Here we are. Hitlist is filled. Die. */
 	__skb_queue_purge(&hitlist);
-
-	spin_lock(&unix_gc_lock);
-
-	/* All candidates should have been detached by now. */
-	WARN_ON_ONCE(!list_empty(&gc_candidates));
 skip_gc:
-	/* Paired with READ_ONCE() in wait_for_unix_gc(). */
 	WRITE_ONCE(gc_in_progress, false);
-
-	spin_unlock(&unix_gc_lock);
 }
 
 static DECLARE_WORK(unix_gc_work, __unix_gc);
-- 
cgit v1.2.3


From a0ad11fc2632903ec0fe9cd16b07d7dd2ade8341 Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi77@gmail.com>
Date: Tue, 26 Mar 2024 19:05:46 +0100
Subject: net: port TP_STORE_ADDR_PORTS_SKB macro to be tcp/udp independent

This patch moves TP_STORE_ADDR_PORTS_SKB() to a common header and removes
the TCP specific implementation details.

Previously the macro assumed the skb passed as an argument is a
TCP packet, the implementation now uses an argument to the L4 header and
uses that to extract the source/destination ports, which happen
to be named the same in "struct tcphdr" and "struct udphdr"

Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Signed-off-by: Balazs Scheidler <balazs.scheidler@axoflow.com>
Link: https://lore.kernel.org/r/9e306f78260dfbbdc7353ba5f864cc027a409540.1711475011.git.balazs.scheidler@axoflow.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/net_probe_common.h | 40 +++++++++++++++++++++++++++++
 include/trace/events/tcp.h              | 45 ++-------------------------------
 2 files changed, 42 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h
index b1f9a4d3ee13..5e33f91bdea3 100644
--- a/include/trace/events/net_probe_common.h
+++ b/include/trace/events/net_probe_common.h
@@ -70,4 +70,44 @@
 	TP_STORE_V4MAPPED(__entry, saddr, daddr)
 #endif
 
+#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh)		\
+	do {								\
+		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
+									\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = protoh->source;				\
+		v4->sin_addr.s_addr = ip_hdr(skb)->saddr;		\
+		v4 = (void *)__entry->daddr;				\
+		v4->sin_family = AF_INET;				\
+		v4->sin_port = protoh->dest;				\
+		v4->sin_addr.s_addr = ip_hdr(skb)->daddr;		\
+	} while (0)
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh)			\
+	do {								\
+		const struct iphdr *iph = ip_hdr(skb);			\
+									\
+		if (iph->version == 6) {				\
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+									\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = protoh->source;			\
+			v6->sin6_addr = ipv6_hdr(skb)->saddr;		\
+			v6 = (void *)__entry->daddr;			\
+			v6->sin6_family = AF_INET6;			\
+			v6->sin6_port = protoh->dest;			\
+			v6->sin6_addr = ipv6_hdr(skb)->daddr;		\
+		} else							\
+			TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh); \
+	} while (0)
+
+#else
+
+#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh)		\
+	TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh)
+
+#endif
+
 #endif
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 3c08a0846c47..1db95175c1e5 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -273,48 +273,6 @@ TRACE_EVENT(tcp_probe,
 		  __entry->skbaddr, __entry->skaddr)
 );
 
-#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb)			\
-	do {								\
-		const struct tcphdr *th = (const struct tcphdr *)skb->data; \
-		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
-									\
-		v4->sin_family = AF_INET;				\
-		v4->sin_port = th->source;				\
-		v4->sin_addr.s_addr = ip_hdr(skb)->saddr;		\
-		v4 = (void *)__entry->daddr;				\
-		v4->sin_family = AF_INET;				\
-		v4->sin_port = th->dest;				\
-		v4->sin_addr.s_addr = ip_hdr(skb)->daddr;		\
-	} while (0)
-
-#if IS_ENABLED(CONFIG_IPV6)
-
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb)				\
-	do {								\
-		const struct iphdr *iph = ip_hdr(skb);			\
-									\
-		if (iph->version == 6) {				\
-			const struct tcphdr *th = (const struct tcphdr *)skb->data; \
-			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
-									\
-			v6->sin6_family = AF_INET6;			\
-			v6->sin6_port = th->source;			\
-			v6->sin6_addr = ipv6_hdr(skb)->saddr;		\
-			v6 = (void *)__entry->daddr;			\
-			v6->sin6_family = AF_INET6;			\
-			v6->sin6_port = th->dest;			\
-			v6->sin6_addr = ipv6_hdr(skb)->daddr;		\
-		} else							\
-			TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb);	\
-	} while (0)
-
-#else
-
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb)		\
-	TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb)
-
-#endif
-
 /*
  * tcp event with only skb
  */
@@ -331,12 +289,13 @@ DECLARE_EVENT_CLASS(tcp_event_skb,
 	),
 
 	TP_fast_assign(
+		const struct tcphdr *th = (const struct tcphdr *)skb->data;
 		__entry->skbaddr = skb;
 
 		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
 		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
 
-		TP_STORE_ADDR_PORTS_SKB(__entry, skb);
+		TP_STORE_ADDR_PORTS_SKB(__entry, skb, th);
 	),
 
 	TP_printk("skbaddr=%p src=%pISpc dest=%pISpc",
-- 
cgit v1.2.3


From e9669a00bba79442dd4862c57761333d6a020c24 Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi77@gmail.com>
Date: Tue, 26 Mar 2024 19:05:47 +0100
Subject: net: udp: add IP/port data to the tracepoint
 udp/udp_fail_queue_rcv_skb

The udp_fail_queue_rcv_skb() tracepoint lacks any details on the source
and destination IP/port whereas this information can be critical in case
of UDP/syslog.

Signed-off-by: Balazs Scheidler <balazs.scheidler@axoflow.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/r/0c8b3e33dbf679e190be6f4c6736603a76988a20.1711475011.git.balazs.scheidler@axoflow.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/udp.h | 29 ++++++++++++++++++++++++-----
 net/ipv4/udp.c             |  2 +-
 net/ipv6/udp.c             |  3 ++-
 3 files changed, 27 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/udp.h b/include/trace/events/udp.h
index 336fe272889f..62bebe2a6ece 100644
--- a/include/trace/events/udp.h
+++ b/include/trace/events/udp.h
@@ -7,24 +7,43 @@
 
 #include <linux/udp.h>
 #include <linux/tracepoint.h>
+#include <trace/events/net_probe_common.h>
 
 TRACE_EVENT(udp_fail_queue_rcv_skb,
 
-	TP_PROTO(int rc, struct sock *sk),
+	TP_PROTO(int rc, struct sock *sk, struct sk_buff *skb),
 
-	TP_ARGS(rc, sk),
+	TP_ARGS(rc, sk, skb),
 
 	TP_STRUCT__entry(
 		__field(int, rc)
-		__field(__u16, lport)
+
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, family)
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
 	),
 
 	TP_fast_assign(
+		const struct udphdr *uh = (const struct udphdr *)udp_hdr(skb);
+
 		__entry->rc = rc;
-		__entry->lport = inet_sk(sk)->inet_num;
+
+		/* for filtering use */
+		__entry->sport = ntohs(uh->source);
+		__entry->dport = ntohs(uh->dest);
+		__entry->family = sk->sk_family;
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		TP_STORE_ADDR_PORTS_SKB(__entry, skb, uh);
 	),
 
-	TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport)
+	TP_printk("rc=%d family=%s src=%pISpc dest=%pISpc", __entry->rc,
+		  show_family_name(__entry->family),
+		  __entry->saddr, __entry->daddr)
 );
 
 #endif /* _TRACE_UDP_H */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 661d0e0d273f..531882f321f2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2049,8 +2049,8 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			drop_reason = SKB_DROP_REASON_PROTO_MEM;
 		}
 		UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+		trace_udp_fail_queue_rcv_skb(rc, sk, skb);
 		kfree_skb_reason(skb, drop_reason);
-		trace_udp_fail_queue_rcv_skb(rc, sk);
 		return -1;
 	}
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 7c1e6469d091..2e4dc5e6137b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/indirect_call_wrapper.h>
+#include <trace/events/udp.h>
 
 #include <net/addrconf.h>
 #include <net/ndisc.h>
@@ -658,8 +659,8 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 			drop_reason = SKB_DROP_REASON_PROTO_MEM;
 		}
 		UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+		trace_udp_fail_queue_rcv_skb(rc, sk, skb);
 		kfree_skb_reason(skb, drop_reason);
-		trace_udp_fail_queue_rcv_skb(rc, sk);
 		return -1;
 	}
 
-- 
cgit v1.2.3


From 1abe267f173eae7ae76cf56232292e9641eb652f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 28 Mar 2024 14:40:32 +0000
Subject: net: add sk_wake_async_rcu() helper

While looking at UDP receive performance, I saw sk_wake_async()
was no longer inlined.

This matters at least on AMD Zen1-4 platforms (see SRSO)

This might be because rcu_read_lock() and rcu_read_unlock()
are no longer nops in recent kernels ?

Add sk_wake_async_rcu() variant, which must be called from
contexts already holding rcu lock.

As SOCK_FASYNC is deprecated in modern days, use unlikely()
to give a hint to the compiler.

sk_wake_async_rcu() is properly inlined from
__udp_enqueue_schedule_skb() and sock_def_readable().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240328144032.1864988-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 crypto/af_alg.c      | 4 ++--
 include/net/sock.h   | 6 ++++++
 net/atm/common.c     | 2 +-
 net/core/sock.c      | 8 ++++----
 net/dccp/output.c    | 2 +-
 net/ipv4/udp.c       | 2 +-
 net/iucv/af_iucv.c   | 2 +-
 net/rxrpc/af_rxrpc.c | 2 +-
 net/sctp/socket.c    | 2 +-
 net/smc/smc_rx.c     | 4 ++--
 net/unix/af_unix.c   | 2 +-
 11 files changed, 21 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 68cc9290cabe..5bc6d0fa7498 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -847,7 +847,7 @@ void af_alg_wmem_wakeup(struct sock *sk)
 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
 							   EPOLLRDNORM |
 							   EPOLLRDBAND);
-	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup);
@@ -914,7 +914,7 @@ static void af_alg_data_wakeup(struct sock *sk)
 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
 							   EPOLLRDNORM |
 							   EPOLLRDBAND);
-	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	rcu_read_unlock();
 }
 
diff --git a/include/net/sock.h b/include/net/sock.h
index f57bfd8a2ad2..2253eefe2848 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2513,6 +2513,12 @@ static inline void sk_wake_async(const struct sock *sk, int how, int band)
 	}
 }
 
+static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
+{
+	if (unlikely(sock_flag(sk, SOCK_FASYNC)))
+		sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
+}
+
 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
  * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak.
  * Note: for send buffers, TCP works better if we can build two skbs at
diff --git a/net/atm/common.c b/net/atm/common.c
index 2a1ec014e901..9b75699992ff 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -116,7 +116,7 @@ static void vcc_write_space(struct sock *sk)
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible(&wq->wait);
 
-		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
 	rcu_read_unlock();
diff --git a/net/core/sock.c b/net/core/sock.c
index 0963689a5950..5ed411231fc7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3338,7 +3338,7 @@ static void sock_def_error_report(struct sock *sk)
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
-	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
 	rcu_read_unlock();
 }
 
@@ -3353,7 +3353,7 @@ void sock_def_readable(struct sock *sk)
 	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
 						EPOLLRDNORM | EPOLLRDBAND);
-	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
 	rcu_read_unlock();
 }
 
@@ -3373,7 +3373,7 @@ static void sock_def_write_space(struct sock *sk)
 						EPOLLWRNORM | EPOLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
-		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
 	rcu_read_unlock();
@@ -3398,7 +3398,7 @@ static void sock_def_write_space_wfree(struct sock *sk)
 						EPOLLWRNORM | EPOLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
-		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 }
 
diff --git a/net/dccp/output.c b/net/dccp/output.c
index fd2eb148d24d..5c2e24f3c39b 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -204,7 +204,7 @@ void dccp_write_space(struct sock *sk)
 		wake_up_interruptible(&wq->wait);
 	/* Should agree with poll, otherwise some programs break */
 	if (sock_writeable(sk))
-		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 
 	rcu_read_unlock();
 }
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 143043cd2dcb..11460d751e73 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1544,7 +1544,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
 			INDIRECT_CALL_1(sk->sk_data_ready,
 					sock_def_readable, sk);
 		else
-			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+			sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
 	}
 	busylock_release(busy);
 	return 0;
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 7c8c3adcac6e..c951bb9cc2e0 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -184,7 +184,7 @@ static void iucv_sock_wake_msglim(struct sock *sk)
 	wq = rcu_dereference(sk->sk_wq);
 	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_all(&wq->wait);
-	sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+	sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	rcu_read_unlock();
 }
 
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 5222bc97d192..f4844683e120 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -65,7 +65,7 @@ static void rxrpc_write_space(struct sock *sk)
 
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible(&wq->wait);
-		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
 }
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index c67679a41044..e416b6d3d270 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -9276,7 +9276,7 @@ void sctp_data_ready(struct sock *sk)
 	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN |
 						EPOLLRDNORM | EPOLLRDBAND);
-	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
 	rcu_read_unlock();
 }
 
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 9a2f3638d161..f0cbe77a80b4 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk)
 	if (skwq_has_sleeper(wq))
 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
 						EPOLLRDNORM | EPOLLRDBAND);
-	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
 	if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
 	    (sk->sk_state == SMC_CLOSED))
-		sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+		sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP);
 	rcu_read_unlock();
 }
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 27ca50ab1cd1..533fb682c954 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -546,7 +546,7 @@ static void unix_write_space(struct sock *sk)
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait,
 				EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
-		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 	rcu_read_unlock();
 }
-- 
cgit v1.2.3


From e8058a49e67fe7bc7e4a0308851a3ca3a6d2e45d Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 28 Mar 2024 20:31:45 +0100
Subject: netlink: introduce type-checking attribute iteration

There are, especially with multi-attr arrays, many cases
of needing to iterate all attributes of a specific type
in a netlink message or a nested attribute. Add specific
macros to support that case.

Also convert many instances using this spatch:

    @@
    iterator nla_for_each_attr;
    iterator name nla_for_each_attr_type;
    identifier nla;
    expression head, len, rem;
    expression ATTR;
    type T;
    identifier x;
    @@
    -nla_for_each_attr(nla, head, len, rem)
    +nla_for_each_attr_type(nla, ATTR, head, len, rem)
     {
    <... T x; ...>
    -if (nla_type(nla) == ATTR) {
     ...
    -}
     }

    @@
    identifier nla;
    iterator nla_for_each_nested;
    iterator name nla_for_each_nested_type;
    expression attr, rem;
    expression ATTR;
    type T;
    identifier x;
    @@
    -nla_for_each_nested(nla, attr, rem)
    +nla_for_each_nested_type(nla, ATTR, attr, rem)
     {
    <... T x; ...>
    -if (nla_type(nla) == ATTR) {
     ...
    -}
     }

    @@
    iterator nla_for_each_attr;
    iterator name nla_for_each_attr_type;
    identifier nla;
    expression head, len, rem;
    expression ATTR;
    type T;
    identifier x;
    @@
    -nla_for_each_attr(nla, head, len, rem)
    +nla_for_each_attr_type(nla, ATTR, head, len, rem)
     {
    <... T x; ...>
    -if (nla_type(nla) != ATTR) continue;
     ...
     }

    @@
    identifier nla;
    iterator nla_for_each_nested;
    iterator name nla_for_each_nested_type;
    expression attr, rem;
    expression ATTR;
    type T;
    identifier x;
    @@
    -nla_for_each_nested(nla, attr, rem)
    +nla_for_each_nested_type(nla, ATTR, attr, rem)
     {
    <... T x; ...>
    -if (nla_type(nla) != ATTR) continue;
     ...
     }

Although I had to undo one bad change this made, and
I also adjusted some other code for whitespace and to
use direct variable initialization now.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Link: https://lore.kernel.org/r/20240328203144.b5a6c895fb80.I1869b44767379f204998ff44dd239803f39c23e0@changeid
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  5 +---
 drivers/net/ethernet/emulex/benet/be_main.c        |  5 +---
 drivers/net/ethernet/intel/i40e/i40e_main.c        |  8 ++-----
 drivers/net/ethernet/intel/ice/ice_main.c          |  7 ++----
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      | 11 +++------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 +---
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |  5 +---
 include/net/netlink.h                              | 27 ++++++++++++++++++++++
 net/8021q/vlan_netlink.c                           | 10 ++++----
 net/core/bpf_sk_storage.c                          | 23 ++++++++----------
 net/core/rtnetlink.c                               | 15 ++++++------
 net/devlink/dev.c                                  | 12 ++++------
 net/sched/sch_mqprio.c                             |  6 ++---
 net/sched/sch_taprio.c                             |  5 +---
 14 files changed, 65 insertions(+), 79 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 388e80bf91f5..b4db4b1aaffb 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -14581,12 +14581,9 @@ static int bnxt_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
 		u16 mode;
 
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-
 		mode = nla_get_u16(attr);
 		if (mode == bp->br_mode)
 			break;
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index ad862ed7888a..a8596ebcdfd6 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -4982,10 +4982,7 @@ static int be_ndo_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
 		mode = nla_get_u16(attr);
 		if (BE3_chip(adapter) && mode == BRIDGE_MODE_VEPA)
 			return -EOPNOTSUPP;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f86578857e8a..e427e65af205 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -13108,13 +13108,9 @@ static int i40e_ndo_bridge_setlink(struct net_device *dev,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
-		__u16 mode;
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
+		__u16 mode = nla_get_u16(attr);
 
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-
-		mode = nla_get_u16(attr);
 		if ((mode != BRIDGE_MODE_VEPA) &&
 		    (mode != BRIDGE_MODE_VEB))
 			return -EINVAL;
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index f2b7d6ca8805..618570f23580 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -7993,12 +7993,9 @@ ice_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
-		__u16 mode;
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
+		__u16 mode = nla_get_u16(attr);
 
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-		mode = nla_get_u16(attr);
 		if (mode != BRIDGE_MODE_VEPA && mode != BRIDGE_MODE_VEB)
 			return -EINVAL;
 		/* Continue  if bridge mode is not being flipped */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f985252c8c8d..ed05af665466 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10061,15 +10061,10 @@ static int ixgbe_ndo_bridge_setlink(struct net_device *dev,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
-		int status;
-		__u16 mode;
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
+		__u16 mode = nla_get_u16(attr);
+		int status = ixgbe_configure_bridge_mode(adapter, mode);
 
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-
-		mode = nla_get_u16(attr);
-		status = ixgbe_configure_bridge_mode(adapter, mode);
 		if (status)
 			return status;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 91848eae4565..81e1c1e401f9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -4950,10 +4950,7 @@ static int mlx5e_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
 		mode = nla_get_u16(attr);
 		if (mode > BRIDGE_MODE_VEPA)
 			return -EINVAL;
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index f28e769e6fda..997cc4fcffdb 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -2289,10 +2289,7 @@ static int nfp_net_bridge_setlink(struct net_device *dev, struct nlmsghdr *nlh,
 	if (!br_spec)
 		return -EINVAL;
 
-	nla_for_each_nested(attr, br_spec, rem) {
-		if (nla_type(attr) != IFLA_BRIDGE_MODE)
-			continue;
-
+	nla_for_each_nested_type(attr, IFLA_BRIDGE_MODE, br_spec, rem) {
 		new_ctrl = nn->dp.ctrl;
 		mode = nla_get_u16(attr);
 		if (mode == BRIDGE_MODE_VEPA)
diff --git a/include/net/netlink.h b/include/net/netlink.h
index c19ff921b661..1d2bbcc50212 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -157,7 +157,11 @@
  *   nla_parse()			parse and validate stream of attrs
  *   nla_parse_nested()			parse nested attributes
  *   nla_for_each_attr()		loop over all attributes
+ *   nla_for_each_attr_type()		loop over all attributes with the
+ *					given type
  *   nla_for_each_nested()		loop over the nested attributes
+ *   nla_for_each_nested_type()		loop over the nested attributes with
+ *					the given type
  *=========================================================================
  */
 
@@ -2070,6 +2074,18 @@ static inline int nla_total_size_64bit(int payload)
 	     nla_ok(pos, rem); \
 	     pos = nla_next(pos, &(rem)))
 
+/**
+ * nla_for_each_attr_type - iterate over a stream of attributes
+ * @pos: loop counter, set to current attribute
+ * @type: required attribute type for @pos
+ * @head: head of attribute stream
+ * @len: length of attribute stream
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nla_for_each_attr_type(pos, type, head, len, rem) \
+	nla_for_each_attr(pos, head, len, rem) \
+		if (nla_type(pos) == type)
+
 /**
  * nla_for_each_nested - iterate over nested attributes
  * @pos: loop counter, set to current attribute
@@ -2079,6 +2095,17 @@ static inline int nla_total_size_64bit(int payload)
 #define nla_for_each_nested(pos, nla, rem) \
 	nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)
 
+/**
+ * nla_for_each_nested_type - iterate over nested attributes
+ * @pos: loop counter, set to current attribute
+ * @type: required attribute type for @pos
+ * @nla: attribute containing the nested attributes
+ * @rem: initialized to len, holds bytes currently remaining in stream
+ */
+#define nla_for_each_nested_type(pos, type, nla, rem) \
+	nla_for_each_nested(pos, nla, rem) \
+		if (nla_type(pos) == type)
+
 /**
  * nla_is_last - Test if attribute is last in stream
  * @nla: attribute to test
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index a3b68243fd4b..cf5219df7903 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -117,17 +117,15 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
 			return err;
 	}
 	if (data[IFLA_VLAN_INGRESS_QOS]) {
-		nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
-			if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
-				continue;
+		nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING,
+					 data[IFLA_VLAN_INGRESS_QOS], rem) {
 			m = nla_data(attr);
 			vlan_dev_set_ingress_priority(dev, m->to, m->from);
 		}
 	}
 	if (data[IFLA_VLAN_EGRESS_QOS]) {
-		nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
-			if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
-				continue;
+		nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING,
+					 data[IFLA_VLAN_EGRESS_QOS], rem) {
 			m = nla_data(attr);
 			err = vlan_dev_set_egress_priority(dev, m->from, m->to);
 			if (err)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 6c4d90b24d46..bc01b3aa6b0f 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -496,27 +496,22 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
 	if (!bpf_capable())
 		return ERR_PTR(-EPERM);
 
-	nla_for_each_nested(nla, nla_stgs, rem) {
-		if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD) {
-			if (nla_len(nla) != sizeof(u32))
-				return ERR_PTR(-EINVAL);
-			nr_maps++;
-		}
+	nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+				 nla_stgs, rem) {
+		if (nla_len(nla) != sizeof(u32))
+			return ERR_PTR(-EINVAL);
+		nr_maps++;
 	}
 
 	diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
 	if (!diag)
 		return ERR_PTR(-ENOMEM);
 
-	nla_for_each_nested(nla, nla_stgs, rem) {
-		struct bpf_map *map;
-		int map_fd;
-
-		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
-			continue;
+	nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+				 nla_stgs, rem) {
+		int map_fd = nla_get_u32(nla);
+		struct bpf_map *map = bpf_map_get(map_fd);
 
-		map_fd = nla_get_u32(nla);
-		map = bpf_map_get(map_fd);
 		if (IS_ERR(map)) {
 			err = PTR_ERR(map);
 			goto err_free;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a3d7847ce69d..283e42f48af6 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -5245,15 +5245,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
 	if (br_spec) {
-		nla_for_each_nested(attr, br_spec, rem) {
-			if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
-				if (nla_len(attr) < sizeof(flags))
-					return -EINVAL;
+		nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec,
+					 rem) {
+			if (nla_len(attr) < sizeof(flags))
+				return -EINVAL;
 
-				have_flags = true;
-				flags = nla_get_u16(attr);
-				break;
-			}
+			have_flags = true;
+			flags = nla_get_u16(attr);
+			break;
 		}
 	}
 
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index 19dbf540748a..c609deb42e88 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -1202,17 +1202,13 @@ static void __devlink_compat_running_version(struct devlink *devlink,
 	if (err)
 		goto free_msg;
 
-	nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) {
+	nla_for_each_attr_type(nlattr, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+			       (void *)msg->data, msg->len, rem) {
 		const struct nlattr *kv;
 		int rem_kv;
 
-		if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING)
-			continue;
-
-		nla_for_each_nested(kv, nlattr, rem_kv) {
-			if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE)
-				continue;
-
+		nla_for_each_nested_type(kv, DEVLINK_ATTR_INFO_VERSION_VALUE,
+					 nlattr, rem_kv) {
 			strlcat(buf, nla_data(kv), len);
 			strlcat(buf, " ", len);
 		}
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 225353fbb3f1..51d4013b6121 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -215,10 +215,8 @@ static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
 	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
 		fp[tc] = priv->fp[tc];
 
-	nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
-		if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
-			continue;
-
+	nla_for_each_attr_type(n, TCA_MQPRIO_TC_ENTRY, nlattr_opt,
+			       nlattr_opt_len, rem) {
 		err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
 		if (err)
 			goto out;
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index a0d54b422186..1ab17e8a7260 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -1752,10 +1752,7 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
 		fp[tc] = q->fp[tc];
 	}
 
-	nla_for_each_nested(n, opt, rem) {
-		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
-			continue;
-
+	nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) {
 		err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
 					    extack);
 		if (err)
-- 
cgit v1.2.3


From 4d4d2d4346857bf778fafaa97d6f76bb1663e3c9 Mon Sep 17 00:00:00 2001
From: Marco Pagani <marpagan@redhat.com>
Date: Tue, 5 Mar 2024 20:29:26 +0100
Subject: fpga: manager: add owner module and take its refcount

The current implementation of the fpga manager assumes that the low-level
module registers a driver for the parent device and uses its owner pointer
to take the module's refcount. This approach is problematic since it can
lead to a null pointer dereference while attempting to get the manager if
the parent device does not have a driver.

To address this problem, add a module owner pointer to the fpga_manager
struct and use it to take the module's refcount. Modify the functions for
registering the manager to take an additional owner module parameter and
rename them to avoid conflicts. Use the old function names for helper
macros that automatically set the module that registers the manager as the
owner. This ensures compatibility with existing low-level control modules
and reduces the chances of registering a manager without setting the owner.

Also, update the documentation to keep it consistent with the new interface
for registering an fpga manager.

Other changes: opportunistically move put_device() from __fpga_mgr_get() to
fpga_mgr_get() and of_fpga_mgr_get() to improve code clarity since the
manager device is taken in these functions.

Fixes: 654ba4cc0f3e ("fpga manager: ensure lifetime with of_fpga_mgr_get")
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Xu Yilun <yilun.xu@intel.com>
Signed-off-by: Marco Pagani <marpagan@redhat.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Link: https://lore.kernel.org/r/20240305192926.84886-1-marpagan@redhat.com
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 Documentation/driver-api/fpga/fpga-mgr.rst | 34 ++++++++-----
 drivers/fpga/fpga-mgr.c                    | 82 ++++++++++++++++++------------
 include/linux/fpga/fpga-mgr.h              | 26 +++++++---
 3 files changed, 89 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/fpga/fpga-mgr.rst b/Documentation/driver-api/fpga/fpga-mgr.rst
index 49c0a9512653..8d2b79f696c1 100644
--- a/Documentation/driver-api/fpga/fpga-mgr.rst
+++ b/Documentation/driver-api/fpga/fpga-mgr.rst
@@ -24,7 +24,8 @@ How to support a new FPGA device
 --------------------------------
 
 To add another FPGA manager, write a driver that implements a set of ops.  The
-probe function calls fpga_mgr_register() or fpga_mgr_register_full(), such as::
+probe function calls ``fpga_mgr_register()`` or ``fpga_mgr_register_full()``,
+such as::
 
 	static const struct fpga_manager_ops socfpga_fpga_ops = {
 		.write_init = socfpga_fpga_ops_configure_init,
@@ -69,10 +70,11 @@ probe function calls fpga_mgr_register() or fpga_mgr_register_full(), such as::
 	}
 
 Alternatively, the probe function could call one of the resource managed
-register functions, devm_fpga_mgr_register() or devm_fpga_mgr_register_full().
-When these functions are used, the parameter syntax is the same, but the call
-to fpga_mgr_unregister() should be removed. In the above example, the
-socfpga_fpga_remove() function would not be required.
+register functions, ``devm_fpga_mgr_register()`` or
+``devm_fpga_mgr_register_full()``.  When these functions are used, the
+parameter syntax is the same, but the call to ``fpga_mgr_unregister()`` should be
+removed. In the above example, the ``socfpga_fpga_remove()`` function would not be
+required.
 
 The ops will implement whatever device specific register writes are needed to
 do the programming sequence for this particular FPGA.  These ops return 0 for
@@ -125,15 +127,19 @@ API for implementing a new FPGA Manager driver
 * struct fpga_manager -  the FPGA manager struct
 * struct fpga_manager_ops -  Low level FPGA manager driver ops
 * struct fpga_manager_info -  Parameter structure for fpga_mgr_register_full()
-* fpga_mgr_register_full() -  Create and register an FPGA manager using the
+* __fpga_mgr_register_full() -  Create and register an FPGA manager using the
   fpga_mgr_info structure to provide the full flexibility of options
-* fpga_mgr_register() -  Create and register an FPGA manager using standard
+* __fpga_mgr_register() -  Create and register an FPGA manager using standard
   arguments
-* devm_fpga_mgr_register_full() -  Resource managed version of
-  fpga_mgr_register_full()
-* devm_fpga_mgr_register() -  Resource managed version of fpga_mgr_register()
+* __devm_fpga_mgr_register_full() -  Resource managed version of
+  __fpga_mgr_register_full()
+* __devm_fpga_mgr_register() -  Resource managed version of __fpga_mgr_register()
 * fpga_mgr_unregister() -  Unregister an FPGA manager
 
+Helper macros ``fpga_mgr_register_full()``, ``fpga_mgr_register()``,
+``devm_fpga_mgr_register_full()``, and ``devm_fpga_mgr_register()`` are available
+to ease the registration.
+
 .. kernel-doc:: include/linux/fpga/fpga-mgr.h
    :functions: fpga_mgr_states
 
@@ -147,16 +153,16 @@ API for implementing a new FPGA Manager driver
    :functions: fpga_manager_info
 
 .. kernel-doc:: drivers/fpga/fpga-mgr.c
-   :functions: fpga_mgr_register_full
+   :functions: __fpga_mgr_register_full
 
 .. kernel-doc:: drivers/fpga/fpga-mgr.c
-   :functions: fpga_mgr_register
+   :functions: __fpga_mgr_register
 
 .. kernel-doc:: drivers/fpga/fpga-mgr.c
-   :functions: devm_fpga_mgr_register_full
+   :functions: __devm_fpga_mgr_register_full
 
 .. kernel-doc:: drivers/fpga/fpga-mgr.c
-   :functions: devm_fpga_mgr_register
+   :functions: __devm_fpga_mgr_register
 
 .. kernel-doc:: drivers/fpga/fpga-mgr.c
    :functions: fpga_mgr_unregister
diff --git a/drivers/fpga/fpga-mgr.c b/drivers/fpga/fpga-mgr.c
index 06651389c592..0f4035b089a2 100644
--- a/drivers/fpga/fpga-mgr.c
+++ b/drivers/fpga/fpga-mgr.c
@@ -664,20 +664,16 @@ static struct attribute *fpga_mgr_attrs[] = {
 };
 ATTRIBUTE_GROUPS(fpga_mgr);
 
-static struct fpga_manager *__fpga_mgr_get(struct device *dev)
+static struct fpga_manager *__fpga_mgr_get(struct device *mgr_dev)
 {
 	struct fpga_manager *mgr;
 
-	mgr = to_fpga_manager(dev);
+	mgr = to_fpga_manager(mgr_dev);
 
-	if (!try_module_get(dev->parent->driver->owner))
-		goto err_dev;
+	if (!try_module_get(mgr->mops_owner))
+		mgr = ERR_PTR(-ENODEV);
 
 	return mgr;
-
-err_dev:
-	put_device(dev);
-	return ERR_PTR(-ENODEV);
 }
 
 static int fpga_mgr_dev_match(struct device *dev, const void *data)
@@ -693,12 +689,18 @@ static int fpga_mgr_dev_match(struct device *dev, const void *data)
  */
 struct fpga_manager *fpga_mgr_get(struct device *dev)
 {
-	struct device *mgr_dev = class_find_device(&fpga_mgr_class, NULL, dev,
-						   fpga_mgr_dev_match);
+	struct fpga_manager *mgr;
+	struct device *mgr_dev;
+
+	mgr_dev = class_find_device(&fpga_mgr_class, NULL, dev, fpga_mgr_dev_match);
 	if (!mgr_dev)
 		return ERR_PTR(-ENODEV);
 
-	return __fpga_mgr_get(mgr_dev);
+	mgr = __fpga_mgr_get(mgr_dev);
+	if (IS_ERR(mgr))
+		put_device(mgr_dev);
+
+	return mgr;
 }
 EXPORT_SYMBOL_GPL(fpga_mgr_get);
 
@@ -711,13 +713,18 @@ EXPORT_SYMBOL_GPL(fpga_mgr_get);
  */
 struct fpga_manager *of_fpga_mgr_get(struct device_node *node)
 {
-	struct device *dev;
+	struct fpga_manager *mgr;
+	struct device *mgr_dev;
 
-	dev = class_find_device_by_of_node(&fpga_mgr_class, node);
-	if (!dev)
+	mgr_dev = class_find_device_by_of_node(&fpga_mgr_class, node);
+	if (!mgr_dev)
 		return ERR_PTR(-ENODEV);
 
-	return __fpga_mgr_get(dev);
+	mgr = __fpga_mgr_get(mgr_dev);
+	if (IS_ERR(mgr))
+		put_device(mgr_dev);
+
+	return mgr;
 }
 EXPORT_SYMBOL_GPL(of_fpga_mgr_get);
 
@@ -727,7 +734,7 @@ EXPORT_SYMBOL_GPL(of_fpga_mgr_get);
  */
 void fpga_mgr_put(struct fpga_manager *mgr)
 {
-	module_put(mgr->dev.parent->driver->owner);
+	module_put(mgr->mops_owner);
 	put_device(&mgr->dev);
 }
 EXPORT_SYMBOL_GPL(fpga_mgr_put);
@@ -766,9 +773,10 @@ void fpga_mgr_unlock(struct fpga_manager *mgr)
 EXPORT_SYMBOL_GPL(fpga_mgr_unlock);
 
 /**
- * fpga_mgr_register_full - create and register an FPGA Manager device
+ * __fpga_mgr_register_full - create and register an FPGA Manager device
  * @parent:	fpga manager device from pdev
  * @info:	parameters for fpga manager
+ * @owner:	owner module containing the ops
  *
  * The caller of this function is responsible for calling fpga_mgr_unregister().
  * Using devm_fpga_mgr_register_full() instead is recommended.
@@ -776,7 +784,8 @@ EXPORT_SYMBOL_GPL(fpga_mgr_unlock);
  * Return: pointer to struct fpga_manager pointer or ERR_PTR()
  */
 struct fpga_manager *
-fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info)
+__fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info,
+			 struct module *owner)
 {
 	const struct fpga_manager_ops *mops = info->mops;
 	struct fpga_manager *mgr;
@@ -804,6 +813,8 @@ fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *in
 
 	mutex_init(&mgr->ref_mutex);
 
+	mgr->mops_owner = owner;
+
 	mgr->name = info->name;
 	mgr->mops = info->mops;
 	mgr->priv = info->priv;
@@ -841,14 +852,15 @@ error_kfree:
 
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(fpga_mgr_register_full);
+EXPORT_SYMBOL_GPL(__fpga_mgr_register_full);
 
 /**
- * fpga_mgr_register - create and register an FPGA Manager device
+ * __fpga_mgr_register - create and register an FPGA Manager device
  * @parent:	fpga manager device from pdev
  * @name:	fpga manager name
  * @mops:	pointer to structure of fpga manager ops
  * @priv:	fpga manager private data
+ * @owner:	owner module containing the ops
  *
  * The caller of this function is responsible for calling fpga_mgr_unregister().
  * Using devm_fpga_mgr_register() instead is recommended. This simple
@@ -859,8 +871,8 @@ EXPORT_SYMBOL_GPL(fpga_mgr_register_full);
  * Return: pointer to struct fpga_manager pointer or ERR_PTR()
  */
 struct fpga_manager *
-fpga_mgr_register(struct device *parent, const char *name,
-		  const struct fpga_manager_ops *mops, void *priv)
+__fpga_mgr_register(struct device *parent, const char *name,
+		    const struct fpga_manager_ops *mops, void *priv, struct module *owner)
 {
 	struct fpga_manager_info info = { 0 };
 
@@ -868,9 +880,9 @@ fpga_mgr_register(struct device *parent, const char *name,
 	info.mops = mops;
 	info.priv = priv;
 
-	return fpga_mgr_register_full(parent, &info);
+	return __fpga_mgr_register_full(parent, &info, owner);
 }
-EXPORT_SYMBOL_GPL(fpga_mgr_register);
+EXPORT_SYMBOL_GPL(__fpga_mgr_register);
 
 /**
  * fpga_mgr_unregister - unregister an FPGA manager
@@ -900,9 +912,10 @@ static void devm_fpga_mgr_unregister(struct device *dev, void *res)
 }
 
 /**
- * devm_fpga_mgr_register_full - resource managed variant of fpga_mgr_register()
+ * __devm_fpga_mgr_register_full - resource managed variant of fpga_mgr_register()
  * @parent:	fpga manager device from pdev
  * @info:	parameters for fpga manager
+ * @owner:	owner module containing the ops
  *
  * Return:  fpga manager pointer on success, negative error code otherwise.
  *
@@ -910,7 +923,8 @@ static void devm_fpga_mgr_unregister(struct device *dev, void *res)
  * function will be called automatically when the managing device is detached.
  */
 struct fpga_manager *
-devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info)
+__devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info,
+			      struct module *owner)
 {
 	struct fpga_mgr_devres *dr;
 	struct fpga_manager *mgr;
@@ -919,7 +933,7 @@ devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_inf
 	if (!dr)
 		return ERR_PTR(-ENOMEM);
 
-	mgr = fpga_mgr_register_full(parent, info);
+	mgr = __fpga_mgr_register_full(parent, info, owner);
 	if (IS_ERR(mgr)) {
 		devres_free(dr);
 		return mgr;
@@ -930,14 +944,15 @@ devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_inf
 
 	return mgr;
 }
-EXPORT_SYMBOL_GPL(devm_fpga_mgr_register_full);
+EXPORT_SYMBOL_GPL(__devm_fpga_mgr_register_full);
 
 /**
- * devm_fpga_mgr_register - resource managed variant of fpga_mgr_register()
+ * __devm_fpga_mgr_register - resource managed variant of fpga_mgr_register()
  * @parent:	fpga manager device from pdev
  * @name:	fpga manager name
  * @mops:	pointer to structure of fpga manager ops
  * @priv:	fpga manager private data
+ * @owner:	owner module containing the ops
  *
  * Return:  fpga manager pointer on success, negative error code otherwise.
  *
@@ -946,8 +961,9 @@ EXPORT_SYMBOL_GPL(devm_fpga_mgr_register_full);
  * device is detached.
  */
 struct fpga_manager *
-devm_fpga_mgr_register(struct device *parent, const char *name,
-		       const struct fpga_manager_ops *mops, void *priv)
+__devm_fpga_mgr_register(struct device *parent, const char *name,
+			 const struct fpga_manager_ops *mops, void *priv,
+			 struct module *owner)
 {
 	struct fpga_manager_info info = { 0 };
 
@@ -955,9 +971,9 @@ devm_fpga_mgr_register(struct device *parent, const char *name,
 	info.mops = mops;
 	info.priv = priv;
 
-	return devm_fpga_mgr_register_full(parent, &info);
+	return __devm_fpga_mgr_register_full(parent, &info, owner);
 }
-EXPORT_SYMBOL_GPL(devm_fpga_mgr_register);
+EXPORT_SYMBOL_GPL(__devm_fpga_mgr_register);
 
 static void fpga_mgr_dev_release(struct device *dev)
 {
diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h
index 54f63459efd6..0d4fe068f3d8 100644
--- a/include/linux/fpga/fpga-mgr.h
+++ b/include/linux/fpga/fpga-mgr.h
@@ -201,6 +201,7 @@ struct fpga_manager_ops {
  * @state: state of fpga manager
  * @compat_id: FPGA manager id for compatibility check.
  * @mops: pointer to struct of fpga manager ops
+ * @mops_owner: module containing the mops
  * @priv: low level driver private date
  */
 struct fpga_manager {
@@ -210,6 +211,7 @@ struct fpga_manager {
 	enum fpga_mgr_states state;
 	struct fpga_compat_id *compat_id;
 	const struct fpga_manager_ops *mops;
+	struct module *mops_owner;
 	void *priv;
 };
 
@@ -230,18 +232,30 @@ struct fpga_manager *fpga_mgr_get(struct device *dev);
 
 void fpga_mgr_put(struct fpga_manager *mgr);
 
+#define fpga_mgr_register_full(parent, info) \
+	__fpga_mgr_register_full(parent, info, THIS_MODULE)
 struct fpga_manager *
-fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info);
+__fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info,
+			 struct module *owner);
 
+#define fpga_mgr_register(parent, name, mops, priv) \
+	__fpga_mgr_register(parent, name, mops, priv, THIS_MODULE)
 struct fpga_manager *
-fpga_mgr_register(struct device *parent, const char *name,
-		  const struct fpga_manager_ops *mops, void *priv);
+__fpga_mgr_register(struct device *parent, const char *name,
+		    const struct fpga_manager_ops *mops, void *priv, struct module *owner);
+
 void fpga_mgr_unregister(struct fpga_manager *mgr);
 
+#define devm_fpga_mgr_register_full(parent, info) \
+	__devm_fpga_mgr_register_full(parent, info, THIS_MODULE)
 struct fpga_manager *
-devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info);
+__devm_fpga_mgr_register_full(struct device *parent, const struct fpga_manager_info *info,
+			      struct module *owner);
+#define devm_fpga_mgr_register(parent, name, mops, priv) \
+	__devm_fpga_mgr_register(parent, name, mops, priv, THIS_MODULE)
 struct fpga_manager *
-devm_fpga_mgr_register(struct device *parent, const char *name,
-		       const struct fpga_manager_ops *mops, void *priv);
+__devm_fpga_mgr_register(struct device *parent, const char *name,
+			 const struct fpga_manager_ops *mops, void *priv,
+			 struct module *owner);
 
 #endif /*_LINUX_FPGA_MGR_H */
-- 
cgit v1.2.3


From 1da11f822042eb6ef4b6064dc048f157a7852529 Mon Sep 17 00:00:00 2001
From: Marco Pagani <marpagan@redhat.com>
Date: Fri, 22 Mar 2024 18:18:37 +0100
Subject: fpga: bridge: add owner module and take its refcount

The current implementation of the fpga bridge assumes that the low-level
module registers a driver for the parent device and uses its owner pointer
to take the module's refcount. This approach is problematic since it can
lead to a null pointer dereference while attempting to get the bridge if
the parent device does not have a driver.

To address this problem, add a module owner pointer to the fpga_bridge
struct and use it to take the module's refcount. Modify the function for
registering a bridge to take an additional owner module parameter and
rename it to avoid conflicts. Use the old function name for a helper macro
that automatically sets the module that registers the bridge as the owner.
This ensures compatibility with existing low-level control modules and
reduces the chances of registering a bridge without setting the owner.

Also, update the documentation to keep it consistent with the new interface
for registering an fpga bridge.

Other changes: opportunistically move put_device() from __fpga_bridge_get()
to fpga_bridge_get() and of_fpga_bridge_get() to improve code clarity since
the bridge device is taken in these functions.

Fixes: 21aeda950c5f ("fpga: add fpga bridge framework")
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Xu Yilun <yilun.xu@intel.com>
Reviewed-by: Russ Weight <russ.weight@linux.dev>
Signed-off-by: Marco Pagani <marpagan@redhat.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Link: https://lore.kernel.org/r/20240322171839.233864-1-marpagan@redhat.com
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 Documentation/driver-api/fpga/fpga-bridge.rst |  7 +++-
 drivers/fpga/fpga-bridge.c                    | 57 +++++++++++++++------------
 include/linux/fpga/fpga-bridge.h              | 10 +++--
 3 files changed, 43 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/fpga/fpga-bridge.rst b/Documentation/driver-api/fpga/fpga-bridge.rst
index 604208534095..833f68fb0700 100644
--- a/Documentation/driver-api/fpga/fpga-bridge.rst
+++ b/Documentation/driver-api/fpga/fpga-bridge.rst
@@ -6,9 +6,12 @@ API to implement a new FPGA bridge
 
 * struct fpga_bridge - The FPGA Bridge structure
 * struct fpga_bridge_ops - Low level Bridge driver ops
-* fpga_bridge_register() - Create and register a bridge
+* __fpga_bridge_register() - Create and register a bridge
 * fpga_bridge_unregister() - Unregister a bridge
 
+The helper macro ``fpga_bridge_register()`` automatically sets
+the module that registers the FPGA bridge as the owner.
+
 .. kernel-doc:: include/linux/fpga/fpga-bridge.h
    :functions: fpga_bridge
 
@@ -16,7 +19,7 @@ API to implement a new FPGA bridge
    :functions: fpga_bridge_ops
 
 .. kernel-doc:: drivers/fpga/fpga-bridge.c
-   :functions: fpga_bridge_register
+   :functions: __fpga_bridge_register
 
 .. kernel-doc:: drivers/fpga/fpga-bridge.c
    :functions: fpga_bridge_unregister
diff --git a/drivers/fpga/fpga-bridge.c b/drivers/fpga/fpga-bridge.c
index 79c473b3c7c3..8ef395b49bf8 100644
--- a/drivers/fpga/fpga-bridge.c
+++ b/drivers/fpga/fpga-bridge.c
@@ -55,33 +55,26 @@ int fpga_bridge_disable(struct fpga_bridge *bridge)
 }
 EXPORT_SYMBOL_GPL(fpga_bridge_disable);
 
-static struct fpga_bridge *__fpga_bridge_get(struct device *dev,
+static struct fpga_bridge *__fpga_bridge_get(struct device *bridge_dev,
 					     struct fpga_image_info *info)
 {
 	struct fpga_bridge *bridge;
-	int ret = -ENODEV;
 
-	bridge = to_fpga_bridge(dev);
+	bridge = to_fpga_bridge(bridge_dev);
 
 	bridge->info = info;
 
-	if (!mutex_trylock(&bridge->mutex)) {
-		ret = -EBUSY;
-		goto err_dev;
-	}
+	if (!mutex_trylock(&bridge->mutex))
+		return ERR_PTR(-EBUSY);
 
-	if (!try_module_get(dev->parent->driver->owner))
-		goto err_ll_mod;
+	if (!try_module_get(bridge->br_ops_owner)) {
+		mutex_unlock(&bridge->mutex);
+		return ERR_PTR(-ENODEV);
+	}
 
 	dev_dbg(&bridge->dev, "get\n");
 
 	return bridge;
-
-err_ll_mod:
-	mutex_unlock(&bridge->mutex);
-err_dev:
-	put_device(dev);
-	return ERR_PTR(ret);
 }
 
 /**
@@ -98,13 +91,18 @@ err_dev:
 struct fpga_bridge *of_fpga_bridge_get(struct device_node *np,
 				       struct fpga_image_info *info)
 {
-	struct device *dev;
+	struct fpga_bridge *bridge;
+	struct device *bridge_dev;
 
-	dev = class_find_device_by_of_node(&fpga_bridge_class, np);
-	if (!dev)
+	bridge_dev = class_find_device_by_of_node(&fpga_bridge_class, np);
+	if (!bridge_dev)
 		return ERR_PTR(-ENODEV);
 
-	return __fpga_bridge_get(dev, info);
+	bridge = __fpga_bridge_get(bridge_dev, info);
+	if (IS_ERR(bridge))
+		put_device(bridge_dev);
+
+	return bridge;
 }
 EXPORT_SYMBOL_GPL(of_fpga_bridge_get);
 
@@ -125,6 +123,7 @@ static int fpga_bridge_dev_match(struct device *dev, const void *data)
 struct fpga_bridge *fpga_bridge_get(struct device *dev,
 				    struct fpga_image_info *info)
 {
+	struct fpga_bridge *bridge;
 	struct device *bridge_dev;
 
 	bridge_dev = class_find_device(&fpga_bridge_class, NULL, dev,
@@ -132,7 +131,11 @@ struct fpga_bridge *fpga_bridge_get(struct device *dev,
 	if (!bridge_dev)
 		return ERR_PTR(-ENODEV);
 
-	return __fpga_bridge_get(bridge_dev, info);
+	bridge = __fpga_bridge_get(bridge_dev, info);
+	if (IS_ERR(bridge))
+		put_device(bridge_dev);
+
+	return bridge;
 }
 EXPORT_SYMBOL_GPL(fpga_bridge_get);
 
@@ -146,7 +149,7 @@ void fpga_bridge_put(struct fpga_bridge *bridge)
 	dev_dbg(&bridge->dev, "put\n");
 
 	bridge->info = NULL;
-	module_put(bridge->dev.parent->driver->owner);
+	module_put(bridge->br_ops_owner);
 	mutex_unlock(&bridge->mutex);
 	put_device(&bridge->dev);
 }
@@ -316,18 +319,19 @@ static struct attribute *fpga_bridge_attrs[] = {
 ATTRIBUTE_GROUPS(fpga_bridge);
 
 /**
- * fpga_bridge_register - create and register an FPGA Bridge device
+ * __fpga_bridge_register - create and register an FPGA Bridge device
  * @parent:	FPGA bridge device from pdev
  * @name:	FPGA bridge name
  * @br_ops:	pointer to structure of fpga bridge ops
  * @priv:	FPGA bridge private data
+ * @owner:	owner module containing the br_ops
  *
  * Return: struct fpga_bridge pointer or ERR_PTR()
  */
 struct fpga_bridge *
-fpga_bridge_register(struct device *parent, const char *name,
-		     const struct fpga_bridge_ops *br_ops,
-		     void *priv)
+__fpga_bridge_register(struct device *parent, const char *name,
+		       const struct fpga_bridge_ops *br_ops,
+		       void *priv, struct module *owner)
 {
 	struct fpga_bridge *bridge;
 	int id, ret;
@@ -357,6 +361,7 @@ fpga_bridge_register(struct device *parent, const char *name,
 
 	bridge->name = name;
 	bridge->br_ops = br_ops;
+	bridge->br_ops_owner = owner;
 	bridge->priv = priv;
 
 	bridge->dev.groups = br_ops->groups;
@@ -386,7 +391,7 @@ error_kfree:
 
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(fpga_bridge_register);
+EXPORT_SYMBOL_GPL(__fpga_bridge_register);
 
 /**
  * fpga_bridge_unregister - unregister an FPGA bridge
diff --git a/include/linux/fpga/fpga-bridge.h b/include/linux/fpga/fpga-bridge.h
index 223da48a6d18..94c4edd047e5 100644
--- a/include/linux/fpga/fpga-bridge.h
+++ b/include/linux/fpga/fpga-bridge.h
@@ -45,6 +45,7 @@ struct fpga_bridge_info {
  * @dev: FPGA bridge device
  * @mutex: enforces exclusive reference to bridge
  * @br_ops: pointer to struct of FPGA bridge ops
+ * @br_ops_owner: module containing the br_ops
  * @info: fpga image specific information
  * @node: FPGA bridge list node
  * @priv: low level driver private date
@@ -54,6 +55,7 @@ struct fpga_bridge {
 	struct device dev;
 	struct mutex mutex; /* for exclusive reference to bridge */
 	const struct fpga_bridge_ops *br_ops;
+	struct module *br_ops_owner;
 	struct fpga_image_info *info;
 	struct list_head node;
 	void *priv;
@@ -79,10 +81,12 @@ int of_fpga_bridge_get_to_list(struct device_node *np,
 			       struct fpga_image_info *info,
 			       struct list_head *bridge_list);
 
+#define fpga_bridge_register(parent, name, br_ops, priv) \
+	__fpga_bridge_register(parent, name, br_ops, priv, THIS_MODULE)
 struct fpga_bridge *
-fpga_bridge_register(struct device *parent, const char *name,
-		     const struct fpga_bridge_ops *br_ops,
-		     void *priv);
+__fpga_bridge_register(struct device *parent, const char *name,
+		       const struct fpga_bridge_ops *br_ops, void *priv,
+		       struct module *owner);
 void fpga_bridge_unregister(struct fpga_bridge *br);
 
 #endif /* _LINUX_FPGA_BRIDGE_H */
-- 
cgit v1.2.3


From 63c15822b8dd02a2423cfd92232245ace3f7a11b Mon Sep 17 00:00:00 2001
From: Syed Nayyar Waris <syednwaris@gmail.com>
Date: Wed, 27 Mar 2024 16:23:38 +0100
Subject: lib/bitmap: add bitmap_{read,write}()

The two new functions allow reading/writing values of length up to
BITS_PER_LONG bits at arbitrary position in the bitmap.

The code was taken from "bitops: Introduce the for_each_set_clump macro"
by Syed Nayyar Waris with a number of changes and simplifications:
 - instead of using roundup(), which adds an unnecessary dependency
   on <linux/math.h>, we calculate space as BITS_PER_LONG-offset;
 - indentation is reduced by not using else-clauses (suggested by
   checkpatch for bitmap_get_value());
 - bitmap_get_value()/bitmap_set_value() are renamed to bitmap_read()
   and bitmap_write();
 - some redundant computations are omitted.

Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Syed Nayyar Waris <syednwaris@gmail.com>
Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
Link: https://lore.kernel.org/lkml/fe12eedf3666f4af5138de0e70b67a07c7f40338.1592224129.git.syednwaris@gmail.com/
Suggested-by: Yury Norov <yury.norov@gmail.com>
Co-developed-by: Alexander Potapenko <glider@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitmap.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index aa4096126553..914c23e96f26 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -83,6 +83,10 @@ struct device;
  *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from buf to u64[] dst
  *  bitmap_get_value8(map, start)               Get 8bit value from map at start
  *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
+ *  bitmap_read(map, start, nbits)              Read an nbits-sized value from
+ *                                              map at start
+ *  bitmap_write(map, value, start, nbits)      Write an nbits-sized value to
+ *                                              map at start
  *
  * Note, bitmap_zero() and bitmap_fill() operate over the region of
  * unsigned longs, that is, bits behind bitmap till the unsigned long
@@ -754,6 +758,79 @@ static inline void bitmap_set_value8(unsigned long *map, unsigned long value,
 	map[index] |= value << offset;
 }
 
+/**
+ * bitmap_read - read a value of n-bits from the memory region
+ * @map: address to the bitmap memory region
+ * @start: bit offset of the n-bit value
+ * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG
+ *
+ * Returns: value of @nbits bits located at the @start bit offset within the
+ * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return
+ * value is undefined.
+ */
+static inline unsigned long bitmap_read(const unsigned long *map,
+					unsigned long start,
+					unsigned long nbits)
+{
+	size_t index = BIT_WORD(start);
+	unsigned long offset = start % BITS_PER_LONG;
+	unsigned long space = BITS_PER_LONG - offset;
+	unsigned long value_low, value_high;
+
+	if (unlikely(!nbits || nbits > BITS_PER_LONG))
+		return 0;
+
+	if (space >= nbits)
+		return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits);
+
+	value_low = map[index] & BITMAP_FIRST_WORD_MASK(start);
+	value_high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits);
+	return (value_low >> offset) | (value_high << space);
+}
+
+/**
+ * bitmap_write - write n-bit value within a memory region
+ * @map: address to the bitmap memory region
+ * @value: value to write, clamped to nbits
+ * @start: bit offset of the n-bit value
+ * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG.
+ *
+ * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(),
+ * i.e. bits beyond @nbits are ignored:
+ *
+ *   for (bit = 0; bit < nbits; bit++)
+ *           __assign_bit(start + bit, bitmap, val & BIT(bit));
+ *
+ * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed.
+ */
+static inline void bitmap_write(unsigned long *map, unsigned long value,
+				unsigned long start, unsigned long nbits)
+{
+	size_t index;
+	unsigned long offset;
+	unsigned long space;
+	unsigned long mask;
+	bool fit;
+
+	if (unlikely(!nbits || nbits > BITS_PER_LONG))
+		return;
+
+	mask = BITMAP_LAST_WORD_MASK(nbits);
+	value &= mask;
+	offset = start % BITS_PER_LONG;
+	space = BITS_PER_LONG - offset;
+	fit = space >= nbits;
+	index = BIT_WORD(start);
+
+	map[index] &= (fit ? (~(mask << offset)) : ~BITMAP_FIRST_WORD_MASK(start));
+	map[index] |= value << offset;
+	if (fit)
+		return;
+
+	map[index + 1] &= BITMAP_FIRST_WORD_MASK(start + nbits);
+	map[index + 1] |= (value >> space);
+}
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __LINUX_BITMAP_H */
-- 
cgit v1.2.3


From 72cc1980a0ef3ccad0d539e7dace63d0d7d432a4 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:41 +0100
Subject: bitops: add missing prototype check

Commit 8238b4579866 ("wait_on_bit: add an acquire memory barrier") added
a new bitop, test_bit_acquire(), with proper wrapping in order to try to
optimize it at compile-time, but missed the list of bitops used for
checking their prototypes a bit below.
The functions added have consistent prototypes, so that no more changes
are required and no functional changes take place.

Fixes: 8238b4579866 ("wait_on_bit: add an acquire memory barrier")
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitops.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe..f7f5a783da2a 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -80,6 +80,7 @@ __check_bitop_pr(__test_and_set_bit);
 __check_bitop_pr(__test_and_clear_bit);
 __check_bitop_pr(__test_and_change_bit);
 __check_bitop_pr(test_bit);
+__check_bitop_pr(test_bit_acquire);
 
 #undef __check_bitop_pr
 
-- 
cgit v1.2.3


From 7d8296b250f2eed73f1758607926d4d258dea5d4 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:42 +0100
Subject: bitops: make BYTES_TO_BITS() treewide-available

Avoid open-coding that simple expression each time by moving
BYTES_TO_BITS() from the probes code to <linux/bitops.h> to export
it to the rest of the kernel.
Simplify the macro while at it. `BITS_PER_LONG / sizeof(long)` always
equals to %BITS_PER_BYTE, regardless of the target architecture.
Do the same for the tools ecosystem as well (incl. its version of
bitops.h). The previous implementation had its implicit type of long,
while the new one is int, so adjust the format literal accordingly in
the perf code.

Suggested-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitops.h         | 2 ++
 kernel/trace/trace_probe.c     | 2 --
 tools/include/linux/bitops.h   | 2 ++
 tools/perf/util/probe-finder.c | 4 +---
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index f7f5a783da2a..e0cd09eb91cd 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -21,6 +21,8 @@
 #define BITS_TO_U32(nr)		__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
 #define BITS_TO_BYTES(nr)	__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))
 
+#define BYTES_TO_BITS(nb)	((nb) * BITS_PER_BYTE)
+
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index dfe3ee6035ec..87337a0c8e03 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1180,8 +1180,6 @@ parse_probe_arg(char *arg, const struct fetch_type *type,
 	return ret;
 }
 
-#define BYTES_TO_BITS(nb)	((BITS_PER_LONG * (nb)) / sizeof(long))
-
 /* Bitfield type needs to be parsed into a fetch function */
 static int __parse_bitfield_probe_arg(const char *bf,
 				      const struct fetch_type *t,
diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h
index 7319f6ced108..272f15d0e434 100644
--- a/tools/include/linux/bitops.h
+++ b/tools/include/linux/bitops.h
@@ -20,6 +20,8 @@
 #define BITS_TO_U32(nr)		DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
 #define BITS_TO_BYTES(nr)	DIV_ROUND_UP(nr, BITS_PER_TYPE(char))
 
+#define BYTES_TO_BITS(nb)	((nb) * BITS_PER_BYTE)
+
 extern unsigned int __sw_hweight8(unsigned int w);
 extern unsigned int __sw_hweight16(unsigned int w);
 extern unsigned int __sw_hweight32(unsigned int w);
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index c8923375e30d..630e16c54ed5 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -186,8 +186,6 @@ static_var:
 	return ret2;
 }
 
-#define BYTES_TO_BITS(nb)	((nb) * BITS_PER_LONG / sizeof(long))
-
 static int convert_variable_type(Dwarf_Die *vr_die,
 				 struct probe_trace_arg *tvar,
 				 const char *cast, bool user_access)
@@ -217,7 +215,7 @@ static int convert_variable_type(Dwarf_Die *vr_die,
 		total = dwarf_bytesize(vr_die);
 		if (boffs < 0 || total < 0)
 			return -ENOENT;
-		ret = snprintf(buf, 16, "b%d@%d/%zd", bsize, boffs,
+		ret = snprintf(buf, 16, "b%d@%d/%d", bsize, boffs,
 				BYTES_TO_BITS(total));
 		goto formatted;
 	}
-- 
cgit v1.2.3


From 5259401ef8f4b010bc0f9740868e9147ccc45899 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:43 +0100
Subject: bitops: let the compiler optimize {__,}assign_bit()

Since commit b03fc1173c0c ("bitops: let optimize out non-atomic bitops
on compile-time constants"), the compilers are able to expand inline
bitmap operations to compile-time initializers when possible.
However, during the round of replacement if-__set-else-__clear with
__assign_bit() as per Andy's advice, bloat-o-meter showed +1024 bytes
difference in object code size for one module (even one function),
where the pattern:

	DECLARE_BITMAP(foo) = { }; // on the stack, zeroed

	if (a)
		__set_bit(const_bit_num, foo);
	if (b)
		__set_bit(another_const_bit_num, foo);
	...

is heavily used, although there should be no difference: the bitmap is
zeroed, so the second half of __assign_bit() should be compiled-out as
a no-op.
I either missed the fact that __assign_bit() has bitmap pointer marked
as `volatile` (as we usually do for bitops) or was hoping that the
compilers would at least try to look past the `volatile` for
__always_inline functions. Anyhow, due to that attribute, the compilers
were always compiling the whole expression and no mentioned compile-time
optimizations were working.

Convert __assign_bit() to a macro since it's a very simple if-else and
all of the checks are performed inside __set_bit() and __clear_bit(),
thus that wrapper has to be as transparent as possible. After that
change, despite it showing only -20 bytes change for vmlinux (due to
that it's still relatively unpopular), no drastic code size changes
happen when replacing if-set-else-clear for onstack bitmaps with
__assign_bit(), meaning the compiler now expands them to the actual
operations will all the expected optimizations.

Atomic assign_bit() is less affected due to its nature, but let's
convert it to a macro as well to keep the code consistent and not
leave a place for possible suboptimal codegen. Moreover, with certain
kernel configuration it actually gives some saves (x86):

do_ip_setsockopt    4154    4099     -55

Suggested-by: Yury Norov <yury.norov@gmail.com> # assign_bit(), too
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitops.h | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index e0cd09eb91cd..b25dc8742124 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -275,23 +275,11 @@ static inline unsigned long fns(unsigned long word, unsigned int n)
  * @addr: the address to start counting from
  * @value: the value to assign
  */
-static __always_inline void assign_bit(long nr, volatile unsigned long *addr,
-				       bool value)
-{
-	if (value)
-		set_bit(nr, addr);
-	else
-		clear_bit(nr, addr);
-}
+#define assign_bit(nr, addr, value)					\
+	((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr)))
 
-static __always_inline void __assign_bit(long nr, volatile unsigned long *addr,
-					 bool value)
-{
-	if (value)
-		__set_bit(nr, addr);
-	else
-		__clear_bit(nr, addr);
-}
+#define __assign_bit(nr, addr, value)					\
+	((value) ? __set_bit((nr), (addr)) : __clear_bit((nr), (addr)))
 
 /**
  * __ptr_set_bit - Set bit in a pointer's value
-- 
cgit v1.2.3


From 8fab6a9d72e4fa0b0607b3a8011a909cabae79f7 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:44 +0100
Subject: linkmode: convert linkmode_{test,set,clear,mod}_bit() to macros

Since commit b03fc1173c0c ("bitops: let optimize out non-atomic bitops
on compile-time constants"), the non-atomic bitops are macros which can
be expanded by the compilers into compile-time expressions, which will
result in better optimized object code. Unfortunately, turned out that
passing `volatile` to those macros discards any possibility of
optimization, as the compilers then don't even try to look whether
the passed bitmap is known at compilation time. In addition to that,
the mentioned linkmode helpers are marked with `inline`, not
`__always_inline`, meaning that it's not guaranteed some compiler won't
uninline them for no reason, which will also effectively prevent them
from being optimized (it's a well-known thing the compilers sometimes
uninline `2 + 2`).
Convert linkmode_*_bit() from inlines to macros. Their calling
convention are 1:1 with the corresponding bitops, so that it's not even
needed to enumerate and map the arguments, only the names. No changes in
vmlinux' object code (compiled by LLVM for x86_64) whatsoever, but that
doesn't necessarily means the change is meaningless.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/linkmode.h | 27 ++++-----------------------
 1 file changed, 4 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h
index 287f590ed56b..d94bfd9ac8cc 100644
--- a/include/linux/linkmode.h
+++ b/include/linux/linkmode.h
@@ -43,29 +43,10 @@ static inline int linkmode_andnot(unsigned long *dst, const unsigned long *src1,
 	return bitmap_andnot(dst, src1, src2,  __ETHTOOL_LINK_MODE_MASK_NBITS);
 }
 
-static inline void linkmode_set_bit(int nr, volatile unsigned long *addr)
-{
-	__set_bit(nr, addr);
-}
-
-static inline void linkmode_clear_bit(int nr, volatile unsigned long *addr)
-{
-	__clear_bit(nr, addr);
-}
-
-static inline void linkmode_mod_bit(int nr, volatile unsigned long *addr,
-				    int set)
-{
-	if (set)
-		linkmode_set_bit(nr, addr);
-	else
-		linkmode_clear_bit(nr, addr);
-}
-
-static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr)
-{
-	return test_bit(nr, addr);
-}
+#define linkmode_test_bit	test_bit
+#define linkmode_set_bit	__set_bit
+#define linkmode_clear_bit	__clear_bit
+#define linkmode_mod_bit	__assign_bit
 
 static inline void linkmode_set_bit_array(const int *array, int array_size,
 					  unsigned long *addr)
-- 
cgit v1.2.3


From a37fbe666c016fd89e4460d0ebfcea05baba46dc Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:49 +0100
Subject: bitmap: introduce generic optimized bitmap_size()

The number of times yet another open coded
`BITS_TO_LONGS(nbits) * sizeof(long)` can be spotted is huge.
Some generic helper is long overdue.

Add one, bitmap_size(), but with one detail.
BITS_TO_LONGS() uses DIV_ROUND_UP(). The latter works well when both
divident and divisor are compile-time constants or when the divisor
is not a pow-of-2. When it is however, the compilers sometimes tend
to generate suboptimal code (GCC 13):

48 83 c0 3f          	add    $0x3f,%rax
48 c1 e8 06          	shr    $0x6,%rax
48 8d 14 c5 00 00 00 00	lea    0x0(,%rax,8),%rdx

%BITS_PER_LONG is always a pow-2 (either 32 or 64), but GCC still does
full division of `nbits + 63` by it and then multiplication by 8.
Instead of BITS_TO_LONGS(), use ALIGN() and then divide by 8. GCC:

8d 50 3f             	lea    0x3f(%rax),%edx
c1 ea 03             	shr    $0x3,%edx
81 e2 f8 ff ff 1f    	and    $0x1ffffff8,%edx

Now it shifts `nbits + 63` by 3 positions (IOW performs fast division
by 8) and then masks bits[2:0]. bloat-o-meter:

add/remove: 0/0 grow/shrink: 20/133 up/down: 156/-773 (-617)

Clang does it better and generates the same code before/after starting
from -O1, except that with the ALIGN() approach it uses %edx and thus
still saves some bytes:

add/remove: 0/0 grow/shrink: 9/133 up/down: 18/-538 (-520)

Note that we can't expand DIV_ROUND_UP() by adding a check and using
this approach there, as it's used in array declarations where
expressions are not allowed.
Add this helper to tools/ as well.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/md/dm-clone-metadata.c | 5 -----
 drivers/s390/cio/idset.c       | 2 +-
 include/linux/bitmap.h         | 8 +++++---
 include/linux/cpumask.h        | 2 +-
 lib/math/prime_numbers.c       | 2 --
 tools/include/linux/bitmap.h   | 7 ++++---
 6 files changed, 11 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/md/dm-clone-metadata.c b/drivers/md/dm-clone-metadata.c
index c43d55672bce..47c1fa7aad8b 100644
--- a/drivers/md/dm-clone-metadata.c
+++ b/drivers/md/dm-clone-metadata.c
@@ -465,11 +465,6 @@ static void __destroy_persistent_data_structures(struct dm_clone_metadata *cmd)
 
 /*---------------------------------------------------------------------------*/
 
-static size_t bitmap_size(unsigned long nr_bits)
-{
-	return BITS_TO_LONGS(nr_bits) * sizeof(long);
-}
-
 static int __dirty_map_init(struct dirty_map *dmap, unsigned long nr_words,
 			    unsigned long nr_regions)
 {
diff --git a/drivers/s390/cio/idset.c b/drivers/s390/cio/idset.c
index 0a1105a483bf..e5f28370a903 100644
--- a/drivers/s390/cio/idset.c
+++ b/drivers/s390/cio/idset.c
@@ -18,7 +18,7 @@ struct idset {
 
 static inline unsigned long idset_bitmap_size(int num_ssid, int num_id)
 {
-	return BITS_TO_LONGS(num_ssid * num_id) * sizeof(unsigned long);
+	return bitmap_size(size_mul(num_ssid, num_id));
 }
 
 static struct idset *idset_new(int num_ssid, int num_id)
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 914c23e96f26..363e0b184a45 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -226,9 +226,11 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
+#define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
+
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
-	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+	unsigned int len = bitmap_size(nbits);
 
 	if (small_const_nbits(nbits))
 		*dst = 0;
@@ -238,7 +240,7 @@ static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 
 static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 {
-	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+	unsigned int len = bitmap_size(nbits);
 
 	if (small_const_nbits(nbits))
 		*dst = ~0UL;
@@ -249,7 +251,7 @@ static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
 static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
 			unsigned int nbits)
 {
-	unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
+	unsigned int len = bitmap_size(nbits);
 
 	if (small_const_nbits(nbits))
 		*dst = *src;
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1c29947db848..6519f9c77709 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -853,7 +853,7 @@ static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
  */
 static inline unsigned int cpumask_size(void)
 {
-	return BITS_TO_LONGS(large_cpumask_bits) * sizeof(long);
+	return bitmap_size(large_cpumask_bits);
 }
 
 /*
diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c
index d42cebf7407f..d3b64b10da1c 100644
--- a/lib/math/prime_numbers.c
+++ b/lib/math/prime_numbers.c
@@ -6,8 +6,6 @@
 #include <linux/prime_numbers.h>
 #include <linux/slab.h>
 
-#define bitmap_size(nbits) (BITS_TO_LONGS(nbits) * sizeof(unsigned long))
-
 struct primes {
 	struct rcu_head rcu;
 	unsigned long last, sz;
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 8c6852dba04f..210c13b1b857 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -26,13 +26,14 @@ bool __bitmap_intersects(const unsigned long *bitmap1,
 #define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
 #define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))
 
+#define bitmap_size(nbits)	(ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)
+
 static inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
 {
 	if (small_const_nbits(nbits))
 		*dst = 0UL;
 	else {
-		int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
-		memset(dst, 0, len);
+		memset(dst, 0, bitmap_size(nbits));
 	}
 }
 
@@ -84,7 +85,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
  */
 static inline unsigned long *bitmap_zalloc(int nbits)
 {
-	return calloc(1, BITS_TO_LONGS(nbits) * sizeof(unsigned long));
+	return calloc(1, bitmap_size(nbits));
 }
 
 /*
-- 
cgit v1.2.3


From b44759705f7dc95d539d145b4c2edcaf079e7c33 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:50 +0100
Subject: bitmap: make bitmap_{get,set}_value8() use bitmap_{read,write}()

Now that we have generic bitmap_read() and bitmap_write(), which are
inline and try to take care of non-bound-crossing and aligned cases
to keep them optimized, collapse bitmap_{get,set}_value8() into
simple wrappers around the former ones.
bloat-o-meter shows no difference in vmlinux and -2 bytes for
gpio-pca953x.ko, which says the optimization didn't suffer due to
that change. The converted helpers have the value width embedded
and always compile-time constant and that helps a lot.

Suggested-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bitmap.h | 38 +++++---------------------------------
 1 file changed, 5 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 363e0b184a45..8c4768c44a01 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -727,39 +727,6 @@ static inline void bitmap_from_u64(unsigned long *dst, u64 mask)
 	bitmap_from_arr64(dst, &mask, 64);
 }
 
-/**
- * bitmap_get_value8 - get an 8-bit value within a memory region
- * @map: address to the bitmap memory region
- * @start: bit offset of the 8-bit value; must be a multiple of 8
- *
- * Returns the 8-bit value located at the @start bit offset within the @src
- * memory region.
- */
-static inline unsigned long bitmap_get_value8(const unsigned long *map,
-					      unsigned long start)
-{
-	const size_t index = BIT_WORD(start);
-	const unsigned long offset = start % BITS_PER_LONG;
-
-	return (map[index] >> offset) & 0xFF;
-}
-
-/**
- * bitmap_set_value8 - set an 8-bit value within a memory region
- * @map: address to the bitmap memory region
- * @value: the 8-bit value; values wider than 8 bits may clobber bitmap
- * @start: bit offset of the 8-bit value; must be a multiple of 8
- */
-static inline void bitmap_set_value8(unsigned long *map, unsigned long value,
-				     unsigned long start)
-{
-	const size_t index = BIT_WORD(start);
-	const unsigned long offset = start % BITS_PER_LONG;
-
-	map[index] &= ~(0xFFUL << offset);
-	map[index] |= value << offset;
-}
-
 /**
  * bitmap_read - read a value of n-bits from the memory region
  * @map: address to the bitmap memory region
@@ -833,6 +800,11 @@ static inline void bitmap_write(unsigned long *map, unsigned long value,
 	map[index + 1] |= (value >> space);
 }
 
+#define bitmap_get_value8(map, start)			\
+	bitmap_read(map, start, BITS_PER_BYTE)
+#define bitmap_set_value8(map, value, start)		\
+	bitmap_write(map, value, start, BITS_PER_BYTE)
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __LINUX_BITMAP_H */
-- 
cgit v1.2.3


From 117aef12a7b1b797bce9f66b156c65eab850b5b5 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:52 +0100
Subject: ip_tunnel: use a separate struct to store tunnel params in the kernel

Unlike IPv6 tunnels which use purely-kernel __ip6_tnl_parm structure
to store params inside the kernel, IPv4 tunnel code uses the same
ip_tunnel_parm which is being used to talk with the userspace.
This makes it difficult to alter or add any fields or use a
different format for whatever data.
Define struct ip_tunnel_parm_kern, a 1:1 copy of ip_tunnel_parm for
now, and use it throughout the code. Define the pieces, where the copy
user <-> kernel happens, as standalone functions, and copy the data
there field-by-field, so that the kernel-side structure could be easily
modified later on and the users wouldn't have to care about this.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_ipip.c    | 30 ++++++-----
 .../net/ethernet/mellanox/mlxsw/spectrum_ipip.h    |  2 +-
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    |  4 +-
 include/linux/netdevice.h                          |  7 +--
 include/net/ip_tunnels.h                           | 25 +++++++--
 net/ipv4/ip_gre.c                                  | 17 +++---
 net/ipv4/ip_tunnel.c                               | 63 +++++++++++++++++-----
 net/ipv4/ip_tunnel_core.c                          |  2 +-
 net/ipv4/ip_vti.c                                  | 12 ++---
 net/ipv4/ipip.c                                    | 12 ++---
 net/ipv4/ipmr.c                                    |  2 +-
 net/ipv6/addrconf.c                                |  3 +-
 net/ipv6/sit.c                                     | 33 ++++++------
 13 files changed, 137 insertions(+), 75 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
index 3340b4a694c3..d67df358a52f 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
@@ -8,7 +8,7 @@
 #include "spectrum_ipip.h"
 #include "reg.h"
 
-struct ip_tunnel_parm
+struct ip_tunnel_parm_kern
 mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev)
 {
 	struct ip_tunnel *tun = netdev_priv(ol_dev);
@@ -24,7 +24,8 @@ mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev)
 	return tun->parms;
 }
 
-static bool mlxsw_sp_ipip_parms4_has_ikey(const struct ip_tunnel_parm *parms)
+static bool
+mlxsw_sp_ipip_parms4_has_ikey(const struct ip_tunnel_parm_kern *parms)
 {
 	return !!(parms->i_flags & TUNNEL_KEY);
 }
@@ -34,7 +35,8 @@ static bool mlxsw_sp_ipip_parms6_has_ikey(const struct __ip6_tnl_parm *parms)
 	return !!(parms->i_flags & TUNNEL_KEY);
 }
 
-static bool mlxsw_sp_ipip_parms4_has_okey(const struct ip_tunnel_parm *parms)
+static bool
+mlxsw_sp_ipip_parms4_has_okey(const struct ip_tunnel_parm_kern *parms)
 {
 	return !!(parms->o_flags & TUNNEL_KEY);
 }
@@ -44,7 +46,7 @@ static bool mlxsw_sp_ipip_parms6_has_okey(const struct __ip6_tnl_parm *parms)
 	return !!(parms->o_flags & TUNNEL_KEY);
 }
 
-static u32 mlxsw_sp_ipip_parms4_ikey(const struct ip_tunnel_parm *parms)
+static u32 mlxsw_sp_ipip_parms4_ikey(const struct ip_tunnel_parm_kern *parms)
 {
 	return mlxsw_sp_ipip_parms4_has_ikey(parms) ?
 		be32_to_cpu(parms->i_key) : 0;
@@ -56,7 +58,7 @@ static u32 mlxsw_sp_ipip_parms6_ikey(const struct __ip6_tnl_parm *parms)
 		be32_to_cpu(parms->i_key) : 0;
 }
 
-static u32 mlxsw_sp_ipip_parms4_okey(const struct ip_tunnel_parm *parms)
+static u32 mlxsw_sp_ipip_parms4_okey(const struct ip_tunnel_parm_kern *parms)
 {
 	return mlxsw_sp_ipip_parms4_has_okey(parms) ?
 		be32_to_cpu(parms->o_key) : 0;
@@ -69,7 +71,7 @@ static u32 mlxsw_sp_ipip_parms6_okey(const struct __ip6_tnl_parm *parms)
 }
 
 static union mlxsw_sp_l3addr
-mlxsw_sp_ipip_parms4_saddr(const struct ip_tunnel_parm *parms)
+mlxsw_sp_ipip_parms4_saddr(const struct ip_tunnel_parm_kern *parms)
 {
 	return (union mlxsw_sp_l3addr) { .addr4 = parms->iph.saddr };
 }
@@ -81,7 +83,7 @@ mlxsw_sp_ipip_parms6_saddr(const struct __ip6_tnl_parm *parms)
 }
 
 static union mlxsw_sp_l3addr
-mlxsw_sp_ipip_parms4_daddr(const struct ip_tunnel_parm *parms)
+mlxsw_sp_ipip_parms4_daddr(const struct ip_tunnel_parm_kern *parms)
 {
 	return (union mlxsw_sp_l3addr) { .addr4 = parms->iph.daddr };
 }
@@ -96,7 +98,7 @@ union mlxsw_sp_l3addr
 mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto,
 			   const struct net_device *ol_dev)
 {
-	struct ip_tunnel_parm parms4;
+	struct ip_tunnel_parm_kern parms4;
 	struct __ip6_tnl_parm parms6;
 
 	switch (proto) {
@@ -115,7 +117,9 @@ mlxsw_sp_ipip_netdev_saddr(enum mlxsw_sp_l3proto proto,
 static __be32 mlxsw_sp_ipip_netdev_daddr4(const struct net_device *ol_dev)
 {
 
-	struct ip_tunnel_parm parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+	struct ip_tunnel_parm_kern parms4;
+
+	parms4 = mlxsw_sp_ipip_netdev_parms4(ol_dev);
 
 	return mlxsw_sp_ipip_parms4_daddr(&parms4).addr4;
 }
@@ -124,7 +128,7 @@ static union mlxsw_sp_l3addr
 mlxsw_sp_ipip_netdev_daddr(enum mlxsw_sp_l3proto proto,
 			   const struct net_device *ol_dev)
 {
-	struct ip_tunnel_parm parms4;
+	struct ip_tunnel_parm_kern parms4;
 	struct __ip6_tnl_parm parms6;
 
 	switch (proto) {
@@ -150,7 +154,7 @@ bool mlxsw_sp_l3addr_is_zero(union mlxsw_sp_l3addr addr)
 static struct mlxsw_sp_ipip_parms
 mlxsw_sp_ipip_netdev_parms_init_gre4(const struct net_device *ol_dev)
 {
-	struct ip_tunnel_parm parms = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+	struct ip_tunnel_parm_kern parms = mlxsw_sp_ipip_netdev_parms4(ol_dev);
 
 	return (struct mlxsw_sp_ipip_parms) {
 		.proto = MLXSW_SP_L3_PROTO_IPV4,
@@ -187,8 +191,8 @@ mlxsw_sp_ipip_decap_config_gre4(struct mlxsw_sp *mlxsw_sp,
 {
 	u16 rif_index = mlxsw_sp_ipip_lb_rif_index(ipip_entry->ol_lb);
 	u16 ul_rif_id = mlxsw_sp_ipip_lb_ul_rif_id(ipip_entry->ol_lb);
+	struct ip_tunnel_parm_kern parms;
 	char rtdp_pl[MLXSW_REG_RTDP_LEN];
-	struct ip_tunnel_parm parms;
 	unsigned int type_check;
 	bool has_ikey;
 	u32 daddr4;
@@ -252,7 +256,7 @@ static struct mlxsw_sp_rif_ipip_lb_config
 mlxsw_sp_ipip_ol_loopback_config_gre4(struct mlxsw_sp *mlxsw_sp,
 				      const struct net_device *ol_dev)
 {
-	struct ip_tunnel_parm parms = mlxsw_sp_ipip_netdev_parms4(ol_dev);
+	struct ip_tunnel_parm_kern parms = mlxsw_sp_ipip_netdev_parms4(ol_dev);
 	enum mlxsw_reg_ritr_loopback_ipip_type lb_ipipt;
 
 	lb_ipipt = mlxsw_sp_ipip_parms4_has_okey(&parms) ?
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h
index a35f009da561..a66173779641 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.h
@@ -9,7 +9,7 @@
 #include <linux/if_tunnel.h>
 #include <net/ip6_tunnel.h>
 
-struct ip_tunnel_parm
+struct ip_tunnel_parm_kern
 mlxsw_sp_ipip_netdev_parms4(const struct net_device *ol_dev);
 struct __ip6_tnl_parm
 mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index af50ff9e5f26..f0cbc6b9b041 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -413,8 +413,8 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev,
 			    __be32 *saddrp, __be32 *daddrp)
 {
 	struct ip_tunnel *tun = netdev_priv(to_dev);
+	struct ip_tunnel_parm_kern parms;
 	struct net_device *dev = NULL;
-	struct ip_tunnel_parm parms;
 	struct rtable *rt = NULL;
 	struct flowi4 fl4;
 
@@ -451,7 +451,7 @@ mlxsw_sp_span_entry_gretap4_parms(struct mlxsw_sp *mlxsw_sp,
 				  const struct net_device *to_dev,
 				  struct mlxsw_sp_span_parms *sparmsp)
 {
-	struct ip_tunnel_parm tparm = mlxsw_sp_ipip_netdev_parms4(to_dev);
+	struct ip_tunnel_parm_kern tparm = mlxsw_sp_ipip_netdev_parms4(to_dev);
 	union mlxsw_sp_l3addr saddr = { .addr4 = tparm.iph.saddr };
 	union mlxsw_sp_l3addr daddr = { .addr4 = tparm.iph.daddr };
 	bool inherit_tos = tparm.iph.tos & 0x1;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e41d30ebaca6..55c7cf9404a4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -59,7 +59,7 @@ struct ethtool_ops;
 struct kernel_hwtstamp_config;
 struct phy_device;
 struct dsa_port;
-struct ip_tunnel_parm;
+struct ip_tunnel_parm_kern;
 struct macsec_context;
 struct macsec_ops;
 struct netdev_name_node;
@@ -1327,7 +1327,7 @@ struct netdev_net_notifier {
  *	queue id bound to an AF_XDP socket. The flags field specifies if
  *	only RX, only Tx, or both should be woken up using the flags
  *	XDP_WAKEUP_RX and XDP_WAKEUP_TX.
- * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
+ * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p,
  *			 int cmd);
  *	Add, change, delete or get information on an IPv4 tunnel.
  * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
@@ -1583,7 +1583,8 @@ struct net_device_ops {
 	int			(*ndo_xsk_wakeup)(struct net_device *dev,
 						  u32 queue_id, u32 flags);
 	int			(*ndo_tunnel_ctl)(struct net_device *dev,
-						  struct ip_tunnel_parm *p, int cmd);
+						  struct ip_tunnel_parm_kern *p,
+						  int cmd);
 	struct net_device *	(*ndo_get_peer_dev)(struct net_device *dev);
 	int                     (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
                                                          struct net_device_path *path);
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 5cd64bb2104d..20f0319ab149 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -110,6 +110,17 @@ struct ip_tunnel_prl_entry {
 
 struct metadata_dst;
 
+/* Kernel-side copy of ip_tunnel_parm */
+struct ip_tunnel_parm_kern {
+	char			name[IFNAMSIZ];
+	int			link;
+	__be16			i_flags;
+	__be16			o_flags;
+	__be32			i_key;
+	__be32			o_key;
+	struct iphdr		iph;
+};
+
 struct ip_tunnel {
 	struct ip_tunnel __rcu	*next;
 	struct hlist_node hash_node;
@@ -136,7 +147,7 @@ struct ip_tunnel {
 
 	struct dst_cache dst_cache;
 
-	struct ip_tunnel_parm parms;
+	struct ip_tunnel_parm_kern parms;
 
 	int		mlink;
 	int		encap_hlen;	/* Encap header length (FOU,GUE) */
@@ -291,7 +302,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		    const struct iphdr *tnl_params, const u8 protocol);
 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		       const u8 proto, int tunnel_hlen);
-int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
+int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+		  int cmd);
+bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
+			      const void __user *data);
+bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp);
 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 			     void __user *data, int cmd);
 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
@@ -307,16 +322,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
 		  bool log_ecn_error);
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
-			 struct ip_tunnel_parm *p, __u32 fwmark);
+			 struct ip_tunnel_parm_kern *p, __u32 fwmark);
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
-		      struct ip_tunnel_parm *p, __u32 fwmark);
+		      struct ip_tunnel_parm_kern *p, __u32 fwmark);
 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id);
 
 bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
 				   struct ip_tunnel_encap *encap);
 
 void ip_tunnel_netlink_parms(struct nlattr *data[],
-			     struct ip_tunnel_parm *parms);
+			     struct ip_tunnel_parm_kern *parms);
 
 extern const struct header_ops ip_tunnel_header_ops;
 __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7b16c211b904..8853c065a985 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -785,7 +785,8 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 	}
 }
 
-static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p,
+static int ipgre_tunnel_ctl(struct net_device *dev,
+			    struct ip_tunnel_parm_kern *p,
 			    int cmd)
 {
 	int err;
@@ -1131,7 +1132,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
 static int ipgre_netlink_parms(struct net_device *dev,
 				struct nlattr *data[],
 				struct nlattr *tb[],
-				struct ip_tunnel_parm *parms,
+				struct ip_tunnel_parm_kern *parms,
 				__u32 *fwmark)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
@@ -1198,7 +1199,7 @@ static int ipgre_netlink_parms(struct net_device *dev,
 static int erspan_netlink_parms(struct net_device *dev,
 				struct nlattr *data[],
 				struct nlattr *tb[],
-				struct ip_tunnel_parm *parms,
+				struct ip_tunnel_parm_kern *parms,
 				__u32 *fwmark)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
@@ -1357,7 +1358,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev,
 			 struct nlattr *tb[], struct nlattr *data[],
 			 struct netlink_ext_ack *extack)
 {
-	struct ip_tunnel_parm p;
+	struct ip_tunnel_parm_kern p;
 	__u32 fwmark = 0;
 	int err;
 
@@ -1375,7 +1376,7 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev,
 			  struct nlattr *tb[], struct nlattr *data[],
 			  struct netlink_ext_ack *extack)
 {
-	struct ip_tunnel_parm p;
+	struct ip_tunnel_parm_kern p;
 	__u32 fwmark = 0;
 	int err;
 
@@ -1394,8 +1395,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 			    struct netlink_ext_ack *extack)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm_kern p;
 	__u32 fwmark = t->fwmark;
-	struct ip_tunnel_parm p;
 	int err;
 
 	err = ipgre_newlink_encap_setup(dev, data);
@@ -1423,8 +1424,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
 			     struct netlink_ext_ack *extack)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm_kern p;
 	__u32 fwmark = t->fwmark;
-	struct ip_tunnel_parm p;
 	int err;
 
 	err = ipgre_newlink_encap_setup(dev, data);
@@ -1496,7 +1497,7 @@ static size_t ipgre_get_size(const struct net_device *dev)
 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
-	struct ip_tunnel_parm *p = &t->parms;
+	struct ip_tunnel_parm_kern *p = &t->parms;
 	__be16 o_flags = p->o_flags;
 
 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 1b8d8ff9a237..d10f41f81463 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -56,7 +56,7 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
 			 IP_TNL_HASH_BITS);
 }
 
-static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
 				__be16 flags, __be32 key)
 {
 	if (p->i_flags & TUNNEL_KEY) {
@@ -171,7 +171,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
 
 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
-				    struct ip_tunnel_parm *parms)
+				    struct ip_tunnel_parm_kern *parms)
 {
 	unsigned int h;
 	__be32 remote;
@@ -206,7 +206,7 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
 }
 
 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
-					struct ip_tunnel_parm *parms,
+					struct ip_tunnel_parm_kern *parms,
 					int type)
 {
 	__be32 remote = parms->iph.daddr;
@@ -230,7 +230,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 
 static struct net_device *__ip_tunnel_create(struct net *net,
 					     const struct rtnl_link_ops *ops,
-					     struct ip_tunnel_parm *parms)
+					     struct ip_tunnel_parm_kern *parms)
 {
 	int err;
 	struct ip_tunnel *tunnel;
@@ -326,7 +326,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
 
 static struct ip_tunnel *ip_tunnel_create(struct net *net,
 					  struct ip_tunnel_net *itn,
-					  struct ip_tunnel_parm *parms)
+					  struct ip_tunnel_parm_kern *parms)
 {
 	struct ip_tunnel *nt;
 	struct net_device *dev;
@@ -871,7 +871,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
 static void ip_tunnel_update(struct ip_tunnel_net *itn,
 			     struct ip_tunnel *t,
 			     struct net_device *dev,
-			     struct ip_tunnel_parm *p,
+			     struct ip_tunnel_parm_kern *p,
 			     bool set_mtu,
 			     __u32 fwmark)
 {
@@ -903,7 +903,8 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
 	netdev_state_change(dev);
 }
 
-int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+		  int cmd)
 {
 	int err = 0;
 	struct ip_tunnel *t = netdev_priv(dev);
@@ -1005,16 +1006,52 @@ done:
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
 
+bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
+			      const void __user *data)
+{
+	struct ip_tunnel_parm p;
+
+	if (copy_from_user(&p, data, sizeof(p)))
+		return false;
+
+	strscpy(kp->name, p.name);
+	kp->link = p.link;
+	kp->i_flags = p.i_flags;
+	kp->o_flags = p.o_flags;
+	kp->i_key = p.i_key;
+	kp->o_key = p.o_key;
+	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
+
+bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
+{
+	struct ip_tunnel_parm p;
+
+	strscpy(p.name, kp->name);
+	p.link = kp->link;
+	p.i_flags = kp->i_flags;
+	p.o_flags = kp->o_flags;
+	p.i_key = kp->i_key;
+	p.o_key = kp->o_key;
+	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
+
+	return !copy_to_user(data, &p, sizeof(p));
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
+
 int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
 			     void __user *data, int cmd)
 {
-	struct ip_tunnel_parm p;
+	struct ip_tunnel_parm_kern p;
 	int err;
 
-	if (copy_from_user(&p, data, sizeof(p)))
+	if (!ip_tunnel_parm_from_user(&p, data))
 		return -EFAULT;
 	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
-	if (!err && copy_to_user(data, &p, sizeof(p)))
+	if (!err && !ip_tunnel_parm_to_user(data, &p))
 		return -EFAULT;
 	return err;
 }
@@ -1093,7 +1130,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
 				  struct rtnl_link_ops *ops, char *devname)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
-	struct ip_tunnel_parm parms;
+	struct ip_tunnel_parm_kern parms;
 	unsigned int i;
 
 	itn->rtnl_link_ops = ops;
@@ -1171,7 +1208,7 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
 
 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
-		      struct ip_tunnel_parm *p, __u32 fwmark)
+		      struct ip_tunnel_parm_kern *p, __u32 fwmark)
 {
 	struct ip_tunnel *nt;
 	struct net *net = dev_net(dev);
@@ -1225,7 +1262,7 @@ err_register_netdevice:
 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
-			 struct ip_tunnel_parm *p, __u32 fwmark)
+			 struct ip_tunnel_parm_kern *p, __u32 fwmark)
 {
 	struct ip_tunnel *t;
 	struct ip_tunnel *tunnel = netdev_priv(dev);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 80ccd6661aa3..98f136b07191 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -1116,7 +1116,7 @@ bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
 EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
 
 void ip_tunnel_netlink_parms(struct nlattr *data[],
-			     struct ip_tunnel_parm *parms)
+			     struct ip_tunnel_parm_kern *parms)
 {
 	if (data[IFLA_IPTUN_LINK])
 		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index ee587adb169f..7e595f619615 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -167,7 +167,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
 			    struct flowi *fl)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct ip_tunnel_parm *parms = &tunnel->parms;
+	struct ip_tunnel_parm_kern *parms = &tunnel->parms;
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *tdev;	/* Device to other host */
 	int pkt_len = skb->len;
@@ -373,7 +373,7 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 }
 
 static int
-vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
 {
 	int err = 0;
 
@@ -531,7 +531,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
 }
 
 static void vti_netlink_parms(struct nlattr *data[],
-			      struct ip_tunnel_parm *parms,
+			      struct ip_tunnel_parm_kern *parms,
 			      __u32 *fwmark)
 {
 	memset(parms, 0, sizeof(*parms));
@@ -566,7 +566,7 @@ static int vti_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[],
 		       struct netlink_ext_ack *extack)
 {
-	struct ip_tunnel_parm parms;
+	struct ip_tunnel_parm_kern parms;
 	__u32 fwmark = 0;
 
 	vti_netlink_parms(data, &parms, &fwmark);
@@ -578,8 +578,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
 			  struct netlink_ext_ack *extack)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm_kern p;
 	__u32 fwmark = t->fwmark;
-	struct ip_tunnel_parm p;
 
 	vti_netlink_parms(data, &p, &fwmark);
 	return ip_tunnel_changelink(dev, tb, &p, fwmark);
@@ -606,7 +606,7 @@ static size_t vti_get_size(const struct net_device *dev)
 static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
-	struct ip_tunnel_parm *p = &t->parms;
+	struct ip_tunnel_parm_kern *p = &t->parms;
 
 	if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
 	    nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index f2696eaadbe6..db960cd08a3b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -330,7 +330,7 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
 }
 
 static int
-ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
 {
 	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 		if (p->iph.version != 4 ||
@@ -405,8 +405,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
 }
 
 static void ipip_netlink_parms(struct nlattr *data[],
-			       struct ip_tunnel_parm *parms, bool *collect_md,
-			       __u32 *fwmark)
+			       struct ip_tunnel_parm_kern *parms,
+			       bool *collect_md, __u32 *fwmark)
 {
 	memset(parms, 0, sizeof(*parms));
 
@@ -432,8 +432,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
 			struct netlink_ext_ack *extack)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
-	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	struct ip_tunnel_parm_kern p;
 	__u32 fwmark = 0;
 
 	if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
@@ -452,8 +452,8 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
 			   struct netlink_ext_ack *extack)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
-	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	struct ip_tunnel_parm_kern p;
 	bool collect_md;
 	__u32 fwmark = t->fwmark;
 
@@ -510,7 +510,7 @@ static size_t ipip_get_size(const struct net_device *dev)
 static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct ip_tunnel_parm *parm = &tunnel->parms;
+	struct ip_tunnel_parm_kern *parm = &tunnel->parms;
 
 	if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
 	    nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index fd5c01c8489f..6c750bd13dd8 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -441,7 +441,7 @@ static bool ipmr_init_vif_indev(const struct net_device *dev)
 static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
 {
 	struct net_device *tunnel_dev, *new_dev;
-	struct ip_tunnel_parm p = { };
+	struct ip_tunnel_parm_kern p = { };
 	int err;
 
 	tunnel_dev = __dev_get_by_name(net, "tunl0");
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 92db9b474f2b..82ad6b8fa007 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -63,6 +63,7 @@
 #include <linux/string.h>
 #include <linux/hash.h>
 
+#include <net/ip_tunnels.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -2917,7 +2918,7 @@ put:
 static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev,
 		struct in6_ifreq *ireq)
 {
-	struct ip_tunnel_parm p = { };
+	struct ip_tunnel_parm_kern p = { };
 	int err;
 
 	if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4))
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 655c9b1a19b8..894e6a4c4be3 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -132,8 +132,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
 	return NULL;
 }
 
-static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
-		struct ip_tunnel_parm *parms)
+static struct ip_tunnel __rcu **
+__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
@@ -226,7 +226,8 @@ out:
 }
 
 static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
-		struct ip_tunnel_parm *parms, int create)
+					     struct ip_tunnel_parm_kern *parms,
+					     int create)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
@@ -1135,7 +1136,8 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
 	dev->needed_headroom = t_hlen + hlen;
 }
 
-static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p,
+static void ipip6_tunnel_update(struct ip_tunnel *t,
+				struct ip_tunnel_parm_kern *p,
 				__u32 fwmark)
 {
 	struct net *net = t->net;
@@ -1196,11 +1198,11 @@ static int
 ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
+	struct ip_tunnel_parm_kern p;
 	struct ip_tunnel_6rd ip6rd;
-	struct ip_tunnel_parm p;
 
 	if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
-		if (copy_from_user(&p, data, sizeof(p)))
+		if (!ip_tunnel_parm_from_user(&p, data))
 			return -EFAULT;
 		t = ipip6_tunnel_locate(t->net, &p, 0);
 	}
@@ -1251,7 +1253,7 @@ static bool ipip6_valid_ip_proto(u8 ipproto)
 }
 
 static int
-__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p)
+__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p)
 {
 	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
@@ -1268,7 +1270,7 @@ __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p)
 }
 
 static int
-ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p)
+ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 
@@ -1281,7 +1283,7 @@ ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p)
 }
 
 static int
-ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p)
+ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	int err;
@@ -1297,7 +1299,7 @@ ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p)
 }
 
 static int
-ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p)
+ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	int err;
@@ -1328,7 +1330,7 @@ ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p)
 }
 
 static int
-ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p)
+ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 
@@ -1348,7 +1350,8 @@ ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p)
 }
 
 static int
-ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+		 int cmd)
 {
 	switch (cmd) {
 	case SIOCGETTUNNEL:
@@ -1490,7 +1493,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[],
 }
 
 static void ipip6_netlink_parms(struct nlattr *data[],
-				struct ip_tunnel_parm *parms,
+				struct ip_tunnel_parm_kern *parms,
 				__u32 *fwmark)
 {
 	memset(parms, 0, sizeof(*parms));
@@ -1599,8 +1602,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
 			    struct netlink_ext_ack *extack)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
-	struct ip_tunnel_parm p;
 	struct ip_tunnel_encap ipencap;
+	struct ip_tunnel_parm_kern p;
 	struct net *net = t->net;
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 #ifdef CONFIG_IPV6_SIT_6RD
@@ -1687,7 +1690,7 @@ static size_t ipip6_get_size(const struct net_device *dev)
 static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct ip_tunnel_parm *parm = &tunnel->parms;
+	struct ip_tunnel_parm_kern *parm = &tunnel->parms;
 
 	if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
 	    nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
-- 
cgit v1.2.3


From 5832c4a77d6931cebf9ba737129ae8f14b66ee1d Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Wed, 27 Mar 2024 16:23:53 +0100
Subject: ip_tunnel: convert __be16 tunnel flags to bitmaps

Historically, tunnel flags like TUNNEL_CSUM or TUNNEL_ERSPAN_OPT
have been defined as __be16. Now all of those 16 bits are occupied
and there's no more free space for new flags.
It can't be simply switched to a bigger container with no
adjustments to the values, since it's an explicit Endian storage,
and on LE systems (__be16)0x0001 equals to
(__be64)0x0001000000000000.
We could probably define new 64-bit flags depending on the
Endianness, i.e. (__be64)0x0001 on BE and (__be64)0x00010000... on
LE, but that would introduce an Endianness dependency and spawn a
ton of Sparse warnings. To mitigate them, all of those places which
were adjusted with this change would be touched anyway, so why not
define stuff properly if there's no choice.

Define IP_TUNNEL_*_BIT counterparts as a bit number instead of the
value already coded and a fistful of <16 <-> bitmap> converters and
helpers. The two flags which have a different bit position are
SIT_ISATAP_BIT and VTI_ISVTI_BIT, as they were defined not as
__cpu_to_be16(), but as (__force __be16), i.e. had different
positions on LE and BE. Now they both have strongly defined places.
Change all __be16 fields which were used to store those flags, to
IP_TUNNEL_DECLARE_FLAGS() -> DECLARE_BITMAP(__IP_TUNNEL_FLAG_NUM) ->
unsigned long[1] for now, and replace all TUNNEL_* occurrences to
their bitmap counterparts. Use the converters in the places which talk
to the userspace, hardware (NFP) or other hosts (GRE header). The rest
must explicitly use the new flags only. This must be done at once,
otherwise there will be too many conversions throughout the code in
the intermediate commits.
Finally, disable the old __be16 flags for use in the kernel code
(except for the two 'irregular' flags mentioned above), to prevent
any accidental (mis)use of them. For the userspace, nothing is
changed, only additions were made.

Most noticeable bloat-o-meter difference (.text):

vmlinux:	307/-1 (306)
gre.ko:		62/0 (62)
ip_gre.ko:	941/-217 (724)	[*]
ip_tunnel.ko:	390/-900 (-510)	[**]
ip_vti.ko:	138/0 (138)
ip6_gre.ko:	534/-18 (516)	[*]
ip6_tunnel.ko:	118/-10 (108)

[*] gre_flags_to_tnl_flags() grew, but still is inlined
[**] ip_tunnel_find() got uninlined, hence such decrease

The average code size increase in non-extreme case is 100-200 bytes
per module, mostly due to sizeof(long) > sizeof(__be16), as
%__IP_TUNNEL_FLAG_NUM is less than %BITS_PER_LONG and the compilers
are able to expand the majority of bitmap_*() calls here into direct
operations on scalars.

Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bareudp.c                              |  19 ++-
 .../net/ethernet/mellanox/mlx5/core/en/tc_tun.h    |   2 +-
 .../ethernet/mellanox/mlx5/core/en/tc_tun_encap.c  |   6 +-
 .../ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c |  12 +-
 .../ethernet/mellanox/mlx5/core/en/tc_tun_gre.c    |   8 +-
 .../ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c  |   9 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  16 ++-
 .../net/ethernet/mellanox/mlxsw/spectrum_ipip.c    |  26 +++--
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    |   6 +-
 drivers/net/ethernet/netronome/nfp/flower/action.c |  27 ++++-
 drivers/net/geneve.c                               |  44 ++++---
 drivers/net/vxlan/vxlan_core.c                     |  14 ++-
 include/net/dst_metadata.h                         |  10 +-
 include/net/flow_dissector.h                       |   2 +-
 include/net/gre.h                                  |  70 ++++++------
 include/net/ip6_tunnel.h                           |   4 +-
 include/net/ip_tunnels.h                           | 119 ++++++++++++++++---
 include/net/udp_tunnel.h                           |   4 +-
 include/uapi/linux/if_tunnel.h                     |  33 ++++++
 net/bridge/br_vlan_tunnel.c                        |   9 +-
 net/core/filter.c                                  |  26 +++--
 net/core/flow_dissector.c                          |  20 +++-
 net/ipv4/fou_bpf.c                                 |   2 +-
 net/ipv4/gre_demux.c                               |   2 +-
 net/ipv4/ip_gre.c                                  | 127 +++++++++++++--------
 net/ipv4/ip_tunnel.c                               |  54 +++++----
 net/ipv4/ip_tunnel_core.c                          |  80 ++++++++-----
 net/ipv4/ip_vti.c                                  |  29 +++--
 net/ipv4/ipip.c                                    |  21 +++-
 net/ipv4/udp_tunnel_core.c                         |   5 +-
 net/ipv6/ip6_gre.c                                 |  85 ++++++++------
 net/ipv6/ip6_tunnel.c                              |  14 +--
 net/ipv6/sit.c                                     |   5 +-
 net/netfilter/ipvs/ip_vs_core.c                    |   6 +-
 net/netfilter/ipvs/ip_vs_xmit.c                    |  20 ++--
 net/netfilter/nft_tunnel.c                         |  44 ++++---
 net/openvswitch/flow_netlink.c                     |  61 ++++++----
 net/psample/psample.c                              |  26 +++--
 net/sched/act_tunnel_key.c                         |  36 +++---
 net/sched/cls_flower.c                             |  27 ++---
 40 files changed, 715 insertions(+), 415 deletions(-)

(limited to 'include')

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 339db6e4a1d5..d5c56ca91b77 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -61,6 +61,7 @@ struct bareudp_dev {
 static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
 	struct metadata_dst *tun_dst = NULL;
+	IP_TUNNEL_DECLARE_FLAGS(key) = { };
 	struct bareudp_dev *bareudp;
 	unsigned short family;
 	unsigned int len;
@@ -137,7 +138,10 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 		bareudp->dev->stats.rx_dropped++;
 		goto drop;
 	}
-	tun_dst = udp_tun_rx_dst(skb, family, TUNNEL_KEY, 0, 0);
+
+	__set_bit(IP_TUNNEL_KEY_BIT, key);
+
+	tun_dst = udp_tun_rx_dst(skb, family, key, 0, 0);
 	if (!tun_dst) {
 		bareudp->dev->stats.rx_dropped++;
 		goto drop;
@@ -285,10 +289,10 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			    struct bareudp_dev *bareudp,
 			    const struct ip_tunnel_info *info)
 {
+	bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev));
 	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct socket *sock = rcu_dereference(bareudp->sock);
-	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 	const struct ip_tunnel_key *key = &info->key;
 	struct rtable *rt;
 	__be16 sport, df;
@@ -316,7 +320,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 
 	tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
 	ttl = key->ttl;
-	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+	df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ?
+	     htons(IP_DF) : 0;
 	skb_scrub_packet(skb, xnet);
 
 	err = -ENOSPC;
@@ -338,7 +343,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst,
 			    tos, ttl, df, sport, bareudp->port,
 			    !net_eq(bareudp->net, dev_net(bareudp->dev)),
-			    !(info->key.tun_flags & TUNNEL_CSUM));
+			    !test_bit(IP_TUNNEL_CSUM_BIT,
+				      info->key.tun_flags));
 	return 0;
 
 free_dst:
@@ -350,10 +356,10 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 			     struct bareudp_dev *bareudp,
 			     const struct ip_tunnel_info *info)
 {
+	bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	bool xnet = !net_eq(bareudp->net, dev_net(bareudp->dev));
 	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
 	struct socket *sock  = rcu_dereference(bareudp->sock);
-	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 	const struct ip_tunnel_key *key = &info->key;
 	struct dst_entry *dst = NULL;
 	struct in6_addr saddr, daddr;
@@ -402,7 +408,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	udp_tunnel6_xmit_skb(dst, sock->sk, skb, dev,
 			     &saddr, &daddr, prio, ttl,
 			     info->key.label, sport, bareudp->port,
-			     !(info->key.tun_flags & TUNNEL_CSUM));
+			     !test_bit(IP_TUNNEL_CSUM_BIT,
+				       info->key.tun_flags));
 	return 0;
 
 free_dst:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
index 92065568bb19..6873c1201803 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.h
@@ -117,7 +117,7 @@ bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a,
 
 bool mlx5e_tc_tun_encap_info_equal_options(struct mlx5e_encap_key *a,
 					   struct mlx5e_encap_key *b,
-					   __be16 tun_flags);
+					   u32 tun_type);
 #endif /* CONFIG_MLX5_ESWITCH */
 
 #endif //__MLX5_EN_TC_TUNNEL_H__
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
index f1d1e1542e81..878cbdbf5ec8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
@@ -587,7 +587,7 @@ bool mlx5e_tc_tun_encap_info_equal_generic(struct mlx5e_encap_key *a,
 
 bool mlx5e_tc_tun_encap_info_equal_options(struct mlx5e_encap_key *a,
 					   struct mlx5e_encap_key *b,
-					   __be16 tun_flags)
+					   u32 tun_type)
 {
 	struct ip_tunnel_info *a_info;
 	struct ip_tunnel_info *b_info;
@@ -596,8 +596,8 @@ bool mlx5e_tc_tun_encap_info_equal_options(struct mlx5e_encap_key *a,
 	if (!mlx5e_tc_tun_encap_info_equal_generic(a, b))
 		return false;
 
-	a_has_opts = !!(a->ip_tun_key->tun_flags & tun_flags);
-	b_has_opts = !!(b->ip_tun_key->tun_flags & tun_flags);
+	a_has_opts = test_bit(tun_type, a->ip_tun_key->tun_flags);
+	b_has_opts = test_bit(tun_type, b->ip_tun_key->tun_flags);
 
 	/* keys are equal when both don't have any options attached */
 	if (!a_has_opts && !b_has_opts)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
index 2bcd10b6d653..bf969212cc77 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_geneve.c
@@ -106,12 +106,13 @@ static int mlx5e_gen_ip_tunnel_header_geneve(char buf[],
 	memset(geneveh, 0, sizeof(*geneveh));
 	geneveh->ver = MLX5E_GENEVE_VER;
 	geneveh->opt_len = tun_info->options_len / 4;
-	geneveh->oam = !!(tun_info->key.tun_flags & TUNNEL_OAM);
-	geneveh->critical = !!(tun_info->key.tun_flags & TUNNEL_CRIT_OPT);
+	geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, tun_info->key.tun_flags);
+	geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT,
+				     tun_info->key.tun_flags);
 	mlx5e_tunnel_id_to_vni(tun_info->key.tun_id, geneveh->vni);
 	geneveh->proto_type = htons(ETH_P_TEB);
 
-	if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) {
 		if (!geneveh->opt_len)
 			return -EOPNOTSUPP;
 		ip_tunnel_info_opts_get(geneveh->options, tun_info);
@@ -188,7 +189,7 @@ static int mlx5e_tc_tun_parse_geneve_options(struct mlx5e_priv *priv,
 
 	/* make sure that we're talking about GENEVE options */
 
-	if (enc_opts.key->dst_opt_type != TUNNEL_GENEVE_OPT) {
+	if (enc_opts.key->dst_opt_type != IP_TUNNEL_GENEVE_OPT_BIT) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "Matching on GENEVE options: option type is not GENEVE");
 		netdev_warn(priv->netdev,
@@ -337,7 +338,8 @@ static int mlx5e_tc_tun_parse_geneve(struct mlx5e_priv *priv,
 static bool mlx5e_tc_tun_encap_info_equal_geneve(struct mlx5e_encap_key *a,
 						 struct mlx5e_encap_key *b)
 {
-	return mlx5e_tc_tun_encap_info_equal_options(a, b, TUNNEL_GENEVE_OPT);
+	return mlx5e_tc_tun_encap_info_equal_options(a, b,
+						     IP_TUNNEL_GENEVE_OPT_BIT);
 }
 
 struct mlx5e_tc_tunnel geneve_tunnel = {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c
index ada14f0574dc..579eda89fc76 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_gre.c
@@ -31,12 +31,16 @@ static int mlx5e_gen_ip_tunnel_header_gretap(char buf[],
 	const struct ip_tunnel_key *tun_key  = &e->tun_info->key;
 	struct gre_base_hdr *greh = (struct gre_base_hdr *)(buf);
 	__be32 tun_id = tunnel_id_to_key32(tun_key->tun_id);
+	IP_TUNNEL_DECLARE_FLAGS(unsupp) = { };
 	int hdr_len;
 
 	*ip_proto = IPPROTO_GRE;
 
 	/* the HW does not calculate GRE csum or sequences */
-	if (tun_key->tun_flags & (TUNNEL_CSUM | TUNNEL_SEQ))
+	__set_bit(IP_TUNNEL_CSUM_BIT, unsupp);
+	__set_bit(IP_TUNNEL_SEQ_BIT, unsupp);
+
+	if (ip_tunnel_flags_intersect(tun_key->tun_flags, unsupp))
 		return -EOPNOTSUPP;
 
 	greh->protocol = htons(ETH_P_TEB);
@@ -44,7 +48,7 @@ static int mlx5e_gen_ip_tunnel_header_gretap(char buf[],
 	/* GRE key */
 	hdr_len	= mlx5e_tc_tun_calc_hlen_gretap(e);
 	greh->flags = gre_tnl_flags_to_gre_flags(tun_key->tun_flags);
-	if (tun_key->tun_flags & TUNNEL_KEY) {
+	if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags)) {
 		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
 		*ptr = tun_id;
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
index a184d739d5f8..e4e487c8431b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_vxlan.c
@@ -90,7 +90,7 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 	const struct vxlan_metadata *md;
 	struct vxlanhdr *vxh;
 
-	if ((tun_key->tun_flags & TUNNEL_VXLAN_OPT) &&
+	if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_key->tun_flags) &&
 	    e->tun_info->options_len != sizeof(*md))
 		return -EOPNOTSUPP;
 	vxh = (struct vxlanhdr *)((char *)udp + sizeof(struct udphdr));
@@ -99,7 +99,7 @@ static int mlx5e_gen_ip_tunnel_header_vxlan(char buf[],
 	udp->dest = tun_key->tp_dst;
 	vxh->vx_flags = VXLAN_HF_VNI;
 	vxh->vx_vni = vxlan_vni_field(tun_id);
-	if (tun_key->tun_flags & TUNNEL_VXLAN_OPT) {
+	if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_key->tun_flags)) {
 		md = ip_tunnel_info_opts(e->tun_info);
 		vxlan_build_gbp_hdr(vxh, md);
 	}
@@ -125,7 +125,7 @@ static int mlx5e_tc_tun_parse_vxlan_gbp_option(struct mlx5e_priv *priv,
 		return -EOPNOTSUPP;
 	}
 
-	if (enc_opts.key->dst_opt_type != TUNNEL_VXLAN_OPT) {
+	if (enc_opts.key->dst_opt_type != IP_TUNNEL_VXLAN_OPT_BIT) {
 		NL_SET_ERR_MSG_MOD(extack, "Wrong VxLAN option type: not GBP");
 		return -EOPNOTSUPP;
 	}
@@ -208,7 +208,8 @@ static int mlx5e_tc_tun_parse_vxlan(struct mlx5e_priv *priv,
 static bool mlx5e_tc_tun_encap_info_equal_vxlan(struct mlx5e_encap_key *a,
 						struct mlx5e_encap_key *b)
 {
-	return mlx5e_tc_tun_encap_info_equal_options(a, b, TUNNEL_VXLAN_OPT);
+	return mlx5e_tc_tun_encap_info_equal_options(a, b,
+						     IP_TUNNEL_VXLAN_OPT_BIT);
 }
 
 static int mlx5e_tc_tun_get_remote_ifindex(struct net_device *mirred_dev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 31ed26cac9bf..f2f19fa37d91 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -5464,6 +5464,7 @@ static bool mlx5e_tc_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb
 	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct tunnel_match_enc_opts enc_opts = {};
 	struct mlx5_rep_uplink_priv *uplink_priv;
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	struct mlx5e_rep_priv *uplink_rpriv;
 	struct metadata_dst *tun_dst;
 	struct tunnel_match_key key;
@@ -5471,6 +5472,8 @@ static bool mlx5e_tc_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb
 	struct net_device *dev;
 	int err;
 
+	__set_bit(IP_TUNNEL_KEY_BIT, flags);
+
 	enc_opts_id = tunnel_id & ENC_OPTS_BITS_MASK;
 	tun_id = tunnel_id >> ENC_OPTS_BITS;
 
@@ -5503,14 +5506,14 @@ static bool mlx5e_tc_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb
 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
 		tun_dst = __ip_tun_set_dst(key.enc_ipv4.src, key.enc_ipv4.dst,
 					   key.enc_ip.tos, key.enc_ip.ttl,
-					   key.enc_tp.dst, TUNNEL_KEY,
+					   key.enc_tp.dst, flags,
 					   key32_to_tunnel_id(key.enc_key_id.keyid),
 					   enc_opts.key.len);
 		break;
 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
 		tun_dst = __ipv6_tun_set_dst(&key.enc_ipv6.src, &key.enc_ipv6.dst,
 					     key.enc_ip.tos, key.enc_ip.ttl,
-					     key.enc_tp.dst, 0, TUNNEL_KEY,
+					     key.enc_tp.dst, 0, flags,
 					     key32_to_tunnel_id(key.enc_key_id.keyid),
 					     enc_opts.key.len);
 		break;
@@ -5528,11 +5531,16 @@ static bool mlx5e_tc_restore_tunnel(struct mlx5e_priv *priv, struct sk_buff *skb
 
 	tun_dst->u.tun_info.key.tp_src = key.enc_tp.src;
 
-	if (enc_opts.key.len)
+	if (enc_opts.key.len) {
+		ip_tunnel_flags_zero(flags);
+		if (enc_opts.key.dst_opt_type)
+			__set_bit(enc_opts.key.dst_opt_type, flags);
+
 		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
 					enc_opts.key.data,
 					enc_opts.key.len,
-					enc_opts.key.dst_opt_type);
+					flags);
+	}
 
 	skb_dst_set(skb, (struct dst_entry *)tun_dst);
 	dev = dev_get_by_index(&init_net, key.filter_ifindex);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
index d67df358a52f..d761a1235994 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_ipip.c
@@ -27,23 +27,23 @@ mlxsw_sp_ipip_netdev_parms6(const struct net_device *ol_dev)
 static bool
 mlxsw_sp_ipip_parms4_has_ikey(const struct ip_tunnel_parm_kern *parms)
 {
-	return !!(parms->i_flags & TUNNEL_KEY);
+	return test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags);
 }
 
 static bool mlxsw_sp_ipip_parms6_has_ikey(const struct __ip6_tnl_parm *parms)
 {
-	return !!(parms->i_flags & TUNNEL_KEY);
+	return test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags);
 }
 
 static bool
 mlxsw_sp_ipip_parms4_has_okey(const struct ip_tunnel_parm_kern *parms)
 {
-	return !!(parms->o_flags & TUNNEL_KEY);
+	return test_bit(IP_TUNNEL_KEY_BIT, parms->o_flags);
 }
 
 static bool mlxsw_sp_ipip_parms6_has_okey(const struct __ip6_tnl_parm *parms)
 {
-	return !!(parms->o_flags & TUNNEL_KEY);
+	return test_bit(IP_TUNNEL_KEY_BIT, parms->o_flags);
 }
 
 static u32 mlxsw_sp_ipip_parms4_ikey(const struct ip_tunnel_parm_kern *parms)
@@ -242,12 +242,15 @@ static bool mlxsw_sp_ipip_can_offload_gre4(const struct mlxsw_sp *mlxsw_sp,
 					   const struct net_device *ol_dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(ol_dev);
-	__be16 okflags = TUNNEL_KEY; /* We can't offload any other features. */
 	bool inherit_ttl = tunnel->parms.iph.ttl == 0;
 	bool inherit_tos = tunnel->parms.iph.tos & 0x1;
+	IP_TUNNEL_DECLARE_FLAGS(okflags) = { };
 
-	return (tunnel->parms.i_flags & ~okflags) == 0 &&
-	       (tunnel->parms.o_flags & ~okflags) == 0 &&
+	/* We can't offload any other features. */
+	__set_bit(IP_TUNNEL_KEY_BIT, okflags);
+
+	return ip_tunnel_flags_subset(tunnel->parms.i_flags, okflags) &&
+	       ip_tunnel_flags_subset(tunnel->parms.o_flags, okflags) &&
 	       inherit_ttl && inherit_tos &&
 	       mlxsw_sp_ipip_tunnel_complete(MLXSW_SP_L3_PROTO_IPV4, ol_dev);
 }
@@ -443,10 +446,13 @@ static bool mlxsw_sp_ipip_can_offload_gre6(const struct mlxsw_sp *mlxsw_sp,
 	struct __ip6_tnl_parm tparm = mlxsw_sp_ipip_netdev_parms6(ol_dev);
 	bool inherit_tos = tparm.flags & IP6_TNL_F_USE_ORIG_TCLASS;
 	bool inherit_ttl = tparm.hop_limit == 0;
-	__be16 okflags = TUNNEL_KEY; /* We can't offload any other features. */
+	IP_TUNNEL_DECLARE_FLAGS(okflags) = { };
+
+	/* We can't offload any other features. */
+	__set_bit(IP_TUNNEL_KEY_BIT, okflags);
 
-	return (tparm.i_flags & ~okflags) == 0 &&
-	       (tparm.o_flags & ~okflags) == 0 &&
+	return ip_tunnel_flags_subset(tparm.i_flags, okflags) &&
+	       ip_tunnel_flags_subset(tparm.o_flags, okflags) &&
 	       inherit_ttl && inherit_tos &&
 	       mlxsw_sp_ipip_tunnel_complete(MLXSW_SP_L3_PROTO_IPV6, ol_dev);
 }
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index f0cbc6b9b041..3de69b2eb5c4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -461,7 +461,8 @@ mlxsw_sp_span_entry_gretap4_parms(struct mlxsw_sp *mlxsw_sp,
 
 	if (!(to_dev->flags & IFF_UP) ||
 	    /* Reject tunnels with GRE keys, checksums, etc. */
-	    tparm.i_flags || tparm.o_flags ||
+	    !ip_tunnel_flags_empty(tparm.i_flags) ||
+	    !ip_tunnel_flags_empty(tparm.o_flags) ||
 	    /* Require a fixed TTL and a TOS copied from the mirrored packet. */
 	    inherit_ttl || !inherit_tos ||
 	    /* A destination address may not be "any". */
@@ -565,7 +566,8 @@ mlxsw_sp_span_entry_gretap6_parms(struct mlxsw_sp *mlxsw_sp,
 
 	if (!(to_dev->flags & IFF_UP) ||
 	    /* Reject tunnels with GRE keys, checksums, etc. */
-	    tparm.i_flags || tparm.o_flags ||
+	    !ip_tunnel_flags_empty(tparm.i_flags) ||
+	    !ip_tunnel_flags_empty(tparm.o_flags) ||
 	    /* Require a fixed TTL and a TOS copied from the mirrored packet. */
 	    inherit_ttl || !inherit_tos ||
 	    /* A destination address may not be "any". */
diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 2c3f62907958..aca2a7417af3 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -396,6 +396,17 @@ nfp_fl_push_geneve_options(struct nfp_fl_payload *nfp_fl, int *list_len,
 	return 0;
 }
 
+#define NFP_FL_CHECK(flag) ({				\
+	IP_TUNNEL_DECLARE_FLAGS(__check) = { };		\
+	__be16 __res;					\
+							\
+	__set_bit(IP_TUNNEL_##flag##_BIT, __check);	\
+	__res = ip_tunnel_flags_to_be16(__check);	\
+							\
+	BUILD_BUG_ON(__builtin_constant_p(__res) &&	\
+		     NFP_FL_TUNNEL_##flag != __res);	\
+})
+
 static int
 nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 	       const struct flow_action_entry *act,
@@ -410,6 +421,7 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 	u32 tmp_set_ip_tun_type_index = 0;
 	/* Currently support one pre-tunnel so index is always 0. */
 	int pretun_idx = 0;
+	__be16 tun_flags;
 
 	if (!IS_ENABLED(CONFIG_IPV6) && ipv6)
 		return -EOPNOTSUPP;
@@ -417,9 +429,10 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 	if (ipv6 && !(priv->flower_ext_feats & NFP_FL_FEATS_IPV6_TUN))
 		return -EOPNOTSUPP;
 
-	BUILD_BUG_ON(NFP_FL_TUNNEL_CSUM != TUNNEL_CSUM ||
-		     NFP_FL_TUNNEL_KEY	!= TUNNEL_KEY ||
-		     NFP_FL_TUNNEL_GENEVE_OPT != TUNNEL_GENEVE_OPT);
+	NFP_FL_CHECK(CSUM);
+	NFP_FL_CHECK(KEY);
+	NFP_FL_CHECK(GENEVE_OPT);
+
 	if (ip_tun->options_len &&
 	    (tun_type != NFP_FL_TUNNEL_GENEVE ||
 	    !(priv->flower_ext_feats & NFP_FL_FEATS_GENEVE_OPT))) {
@@ -427,7 +440,9 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 		return -EOPNOTSUPP;
 	}
 
-	if (ip_tun->key.tun_flags & ~NFP_FL_SUPPORTED_UDP_TUN_FLAGS) {
+	tun_flags = ip_tunnel_flags_to_be16(ip_tun->key.tun_flags);
+	if (!ip_tunnel_flags_is_be16_compat(ip_tun->key.tun_flags) ||
+	    (tun_flags & ~NFP_FL_SUPPORTED_UDP_TUN_FLAGS)) {
 		NL_SET_ERR_MSG_MOD(extack,
 				   "unsupported offload: loaded firmware does not support tunnel flag offload");
 		return -EOPNOTSUPP;
@@ -442,7 +457,7 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 		FIELD_PREP(NFP_FL_PRE_TUN_INDEX, pretun_idx);
 
 	set_tun->tun_type_index = cpu_to_be32(tmp_set_ip_tun_type_index);
-	if (ip_tun->key.tun_flags & NFP_FL_TUNNEL_KEY)
+	if (tun_flags & NFP_FL_TUNNEL_KEY)
 		set_tun->tun_id = ip_tun->key.tun_id;
 
 	if (ip_tun->key.ttl) {
@@ -486,7 +501,7 @@ nfp_fl_set_tun(struct nfp_app *app, struct nfp_fl_set_tun *set_tun,
 	}
 
 	set_tun->tos = ip_tun->key.tos;
-	set_tun->tun_flags = ip_tun->key.tun_flags;
+	set_tun->tun_flags = tun_flags;
 
 	if (tun_type == NFP_FL_TUNNEL_GENEVE) {
 		set_tun->tun_proto = htons(ETH_P_TEB);
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 2f6739fe78af..163f94a5a58f 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -225,10 +225,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
 	void *oiph;
 
 	if (ip_tunnel_collect_metadata() || gs->collect_md) {
-		__be16 flags;
+		IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 
-		flags = TUNNEL_KEY | (gnvh->oam ? TUNNEL_OAM : 0) |
-			(gnvh->critical ? TUNNEL_CRIT_OPT : 0);
+		__set_bit(IP_TUNNEL_KEY_BIT, flags);
+		__assign_bit(IP_TUNNEL_OAM_BIT, flags, gnvh->oam);
+		__assign_bit(IP_TUNNEL_CRIT_OPT_BIT, flags, gnvh->critical);
 
 		tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
 					 vni_to_tunnel_id(gnvh->vni),
@@ -238,9 +239,11 @@ static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
 			goto drop;
 		}
 		/* Update tunnel dst according to Geneve options. */
+		ip_tunnel_flags_zero(flags);
+		__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, flags);
 		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
 					gnvh->options, gnvh->opt_len * 4,
-					TUNNEL_GENEVE_OPT);
+					flags);
 	} else {
 		/* Drop packets w/ critical options,
 		 * since we don't support any...
@@ -745,14 +748,15 @@ static void geneve_build_header(struct genevehdr *geneveh,
 {
 	geneveh->ver = GENEVE_VER;
 	geneveh->opt_len = info->options_len / 4;
-	geneveh->oam = !!(info->key.tun_flags & TUNNEL_OAM);
-	geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
+	geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, info->key.tun_flags);
+	geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT,
+				     info->key.tun_flags);
 	geneveh->rsvd1 = 0;
 	tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
 	geneveh->proto_type = inner_proto;
 	geneveh->rsvd2 = 0;
 
-	if (info->key.tun_flags & TUNNEL_GENEVE_OPT)
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags))
 		ip_tunnel_info_opts_get(geneveh->options, info);
 }
 
@@ -761,7 +765,7 @@ static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
 			    bool xnet, int ip_hdr_len,
 			    bool inner_proto_inherit)
 {
-	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
+	bool udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	struct genevehdr *gnvh;
 	__be16 inner_proto;
 	int min_headroom;
@@ -878,7 +882,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	if (geneve->cfg.collect_md) {
 		ttl = key->ttl;
 
-		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+		df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ?
+		     htons(IP_DF) : 0;
 	} else {
 		if (geneve->cfg.ttl_inherit)
 			ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
@@ -910,7 +915,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst,
 			    tos, ttl, df, sport, geneve->cfg.info.key.tp_dst,
 			    !net_eq(geneve->net, dev_net(geneve->dev)),
-			    !(info->key.tun_flags & TUNNEL_CSUM));
+			    !test_bit(IP_TUNNEL_CSUM_BIT,
+				      info->key.tun_flags));
 	return 0;
 }
 
@@ -998,7 +1004,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
 			     &saddr, &key->u.ipv6.dst, prio, ttl,
 			     info->key.label, sport, geneve->cfg.info.key.tp_dst,
-			     !(info->key.tun_flags & TUNNEL_CSUM));
+			     !test_bit(IP_TUNNEL_CSUM_BIT,
+				       info->key.tun_flags));
 	return 0;
 }
 #endif
@@ -1297,7 +1304,8 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
 
 static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
 {
-	return !(info->key.tun_id || info->key.tun_flags || info->key.tos ||
+	return !(info->key.tun_id || info->key.tos ||
+		 !ip_tunnel_flags_empty(info->key.tun_flags) ||
 		 info->key.ttl || info->key.label || info->key.tp_src ||
 		 memchr_inv(&info->key.u, 0, sizeof(info->key.u)));
 }
@@ -1435,7 +1443,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 					    "Remote IPv6 address cannot be Multicast");
 			return -EINVAL;
 		}
-		info->key.tun_flags |= TUNNEL_CSUM;
+		__set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 		cfg->use_udp6_rx_checksums = true;
 #else
 		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
@@ -1510,7 +1518,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			goto change_notsup;
 		}
 		if (nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
-			info->key.tun_flags |= TUNNEL_CSUM;
+			__set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	}
 
 	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]) {
@@ -1520,7 +1528,7 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
 			goto change_notsup;
 		}
 		if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
-			info->key.tun_flags &= ~TUNNEL_CSUM;
+			__clear_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 #else
 		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX],
 				    "IPv6 support not enabled in the kernel");
@@ -1753,7 +1761,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 				    info->key.u.ipv4.dst))
 			goto nla_put_failure;
 		if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
-			       !!(info->key.tun_flags & TUNNEL_CSUM)))
+			       test_bit(IP_TUNNEL_CSUM_BIT,
+					info->key.tun_flags)))
 			goto nla_put_failure;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -1762,7 +1771,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 				     &info->key.u.ipv6.dst))
 			goto nla_put_failure;
 		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
-			       !(info->key.tun_flags & TUNNEL_CSUM)))
+			       !test_bit(IP_TUNNEL_CSUM_BIT,
+					 info->key.tun_flags)))
 			goto nla_put_failure;
 #endif
 	}
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 3495591a5c29..72ecf6cf809c 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -1584,7 +1584,8 @@ static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
 
 	tun_dst = (struct metadata_dst *)skb_dst(skb);
 	if (tun_dst) {
-		tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
+		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT,
+			  tun_dst->u.tun_info.key.tun_flags);
 		tun_dst->u.tun_info.options_len = sizeof(*md);
 	}
 	if (gbp->dont_learn)
@@ -1716,9 +1717,11 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
 			goto drop;
 
 	if (vxlan_collect_metadata(vs)) {
+		IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 		struct metadata_dst *tun_dst;
 
-		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
+		__set_bit(IP_TUNNEL_KEY_BIT, flags);
+		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags,
 					 key32_to_tunnel_id(vni), sizeof(*md));
 
 		if (!tun_dst)
@@ -2403,14 +2406,14 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		vni = tunnel_id_to_key32(info->key.tun_id);
 		ifindex = 0;
 		dst_cache = &info->dst_cache;
-		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+		if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
 			if (info->options_len < sizeof(*md))
 				goto drop;
 			md = ip_tunnel_info_opts(info);
 		}
 		ttl = info->key.ttl;
 		tos = info->key.tos;
-		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
+		udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	}
 	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
 				     vxlan->cfg.port_max, true);
@@ -2451,7 +2454,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 				     old_iph->frag_off & htons(IP_DF)))
 					df = htons(IP_DF);
 			}
-		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
+		} else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+				    info->key.tun_flags)) {
 			df = htons(IP_DF);
 		}
 
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 1b7fae4c6b24..4160731dcb6e 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -198,7 +198,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
 						    __be32 daddr,
 						    __u8 tos, __u8 ttl,
 						    __be16 tp_dst,
-						    __be16 flags,
+						    const unsigned long *flags,
 						    __be64 tunnel_id,
 						    int md_size)
 {
@@ -215,7 +215,7 @@ static inline struct metadata_dst *__ip_tun_set_dst(__be32 saddr,
 }
 
 static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
-						 __be16 flags,
+						 const unsigned long *flags,
 						 __be64 tunnel_id,
 						 int md_size)
 {
@@ -230,7 +230,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad
 						      __u8 tos, __u8 ttl,
 						      __be16 tp_dst,
 						      __be32 label,
-						      __be16 flags,
+						      const unsigned long *flags,
 						      __be64 tunnel_id,
 						      int md_size)
 {
@@ -243,7 +243,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad
 
 	info = &tun_dst->u.tun_info;
 	info->mode = IP_TUNNEL_INFO_IPV6;
-	info->key.tun_flags = flags;
+	ip_tunnel_flags_copy(info->key.tun_flags, flags);
 	info->key.tun_id = tunnel_id;
 	info->key.tp_src = 0;
 	info->key.tp_dst = tp_dst;
@@ -259,7 +259,7 @@ static inline struct metadata_dst *__ipv6_tun_set_dst(const struct in6_addr *sad
 }
 
 static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb,
-						   __be16 flags,
+						   const unsigned long *flags,
 						   __be64 tunnel_id,
 						   int md_size)
 {
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index 1a7131d6cb0e..9ab376d1a677 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -97,7 +97,7 @@ struct flow_dissector_key_enc_opts {
 					 * here but seems difficult to #include
 					 */
 	u8 len;
-	__be16 dst_opt_type;
+	u32 dst_opt_type;
 };
 
 struct flow_dissector_key_keyid {
diff --git a/include/net/gre.h b/include/net/gre.h
index 4e209708b754..ccd293203284 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -49,67 +49,61 @@ static inline bool netif_is_ip6gretap(const struct net_device *dev)
 	       !strcmp(dev->rtnl_link_ops->kind, "ip6gretap");
 }
 
-static inline int gre_calc_hlen(__be16 o_flags)
+static inline int gre_calc_hlen(const unsigned long *o_flags)
 {
 	int addend = 4;
 
-	if (o_flags & TUNNEL_CSUM)
+	if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags))
 		addend += 4;
-	if (o_flags & TUNNEL_KEY)
+	if (test_bit(IP_TUNNEL_KEY_BIT, o_flags))
 		addend += 4;
-	if (o_flags & TUNNEL_SEQ)
+	if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags))
 		addend += 4;
 	return addend;
 }
 
-static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
+static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags)
 {
-	__be16 tflags = 0;
-
-	if (flags & GRE_CSUM)
-		tflags |= TUNNEL_CSUM;
-	if (flags & GRE_ROUTING)
-		tflags |= TUNNEL_ROUTING;
-	if (flags & GRE_KEY)
-		tflags |= TUNNEL_KEY;
-	if (flags & GRE_SEQ)
-		tflags |= TUNNEL_SEQ;
-	if (flags & GRE_STRICT)
-		tflags |= TUNNEL_STRICT;
-	if (flags & GRE_REC)
-		tflags |= TUNNEL_REC;
-	if (flags & GRE_VERSION)
-		tflags |= TUNNEL_VERSION;
-
-	return tflags;
+	IP_TUNNEL_DECLARE_FLAGS(res) = { };
+
+	__assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM);
+	__assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING);
+	__assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY);
+	__assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ);
+	__assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT);
+	__assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC);
+	__assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION);
+
+	ip_tunnel_flags_copy(dst, res);
 }
 
-static inline __be16 gre_tnl_flags_to_gre_flags(__be16 tflags)
+static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags)
 {
 	__be16 flags = 0;
 
-	if (tflags & TUNNEL_CSUM)
+	if (test_bit(IP_TUNNEL_CSUM_BIT, tflags))
 		flags |= GRE_CSUM;
-	if (tflags & TUNNEL_ROUTING)
+	if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags))
 		flags |= GRE_ROUTING;
-	if (tflags & TUNNEL_KEY)
+	if (test_bit(IP_TUNNEL_KEY_BIT, tflags))
 		flags |= GRE_KEY;
-	if (tflags & TUNNEL_SEQ)
+	if (test_bit(IP_TUNNEL_SEQ_BIT, tflags))
 		flags |= GRE_SEQ;
-	if (tflags & TUNNEL_STRICT)
+	if (test_bit(IP_TUNNEL_STRICT_BIT, tflags))
 		flags |= GRE_STRICT;
-	if (tflags & TUNNEL_REC)
+	if (test_bit(IP_TUNNEL_REC_BIT, tflags))
 		flags |= GRE_REC;
-	if (tflags & TUNNEL_VERSION)
+	if (test_bit(IP_TUNNEL_VERSION_BIT, tflags))
 		flags |= GRE_VERSION;
 
 	return flags;
 }
 
 static inline void gre_build_header(struct sk_buff *skb, int hdr_len,
-				    __be16 flags, __be16 proto,
+				    const unsigned long *flags, __be16 proto,
 				    __be32 key, __be32 seq)
 {
+	IP_TUNNEL_DECLARE_FLAGS(cond) = { };
 	struct gre_base_hdr *greh;
 
 	skb_push(skb, hdr_len);
@@ -120,18 +114,22 @@ static inline void gre_build_header(struct sk_buff *skb, int hdr_len,
 	greh->flags = gre_tnl_flags_to_gre_flags(flags);
 	greh->protocol = proto;
 
-	if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
+	__set_bit(IP_TUNNEL_KEY_BIT, cond);
+	__set_bit(IP_TUNNEL_CSUM_BIT, cond);
+	__set_bit(IP_TUNNEL_SEQ_BIT, cond);
+
+	if (ip_tunnel_flags_intersect(flags, cond)) {
 		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
 
-		if (flags & TUNNEL_SEQ) {
+		if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) {
 			*ptr = seq;
 			ptr--;
 		}
-		if (flags & TUNNEL_KEY) {
+		if (test_bit(IP_TUNNEL_KEY_BIT, flags)) {
 			*ptr = key;
 			ptr--;
 		}
-		if (flags & TUNNEL_CSUM &&
+		if (test_bit(IP_TUNNEL_CSUM_BIT, flags) &&
 		    !(skb_shinfo(skb)->gso_type &
 		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
 			*ptr = 0;
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 74b369bddf49..399592405c72 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -30,8 +30,8 @@ struct __ip6_tnl_parm {
 	struct in6_addr laddr;	/* local tunnel end-point address */
 	struct in6_addr raddr;	/* remote tunnel end-point address */
 
-	__be16			i_flags;
-	__be16			o_flags;
+	IP_TUNNEL_DECLARE_FLAGS(i_flags);
+	IP_TUNNEL_DECLARE_FLAGS(o_flags);
 	__be32			i_key;
 	__be32			o_key;
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 20f0319ab149..ed8e48cc9054 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -36,6 +36,24 @@
 	(sizeof_field(struct ip_tunnel_key, u) -		\
 	 sizeof_field(struct ip_tunnel_key, u.ipv4))
 
+#define __ipt_flag_op(op, ...)					\
+	op(__VA_ARGS__, __IP_TUNNEL_FLAG_NUM)
+
+#define IP_TUNNEL_DECLARE_FLAGS(...)				\
+	__ipt_flag_op(DECLARE_BITMAP, __VA_ARGS__)
+
+#define ip_tunnel_flags_zero(...)	__ipt_flag_op(bitmap_zero, __VA_ARGS__)
+#define ip_tunnel_flags_copy(...)	__ipt_flag_op(bitmap_copy, __VA_ARGS__)
+#define ip_tunnel_flags_and(...)	__ipt_flag_op(bitmap_and, __VA_ARGS__)
+#define ip_tunnel_flags_or(...)		__ipt_flag_op(bitmap_or, __VA_ARGS__)
+
+#define ip_tunnel_flags_empty(...)				\
+	__ipt_flag_op(bitmap_empty, __VA_ARGS__)
+#define ip_tunnel_flags_intersect(...)				\
+	__ipt_flag_op(bitmap_intersects, __VA_ARGS__)
+#define ip_tunnel_flags_subset(...)				\
+	__ipt_flag_op(bitmap_subset, __VA_ARGS__)
+
 struct ip_tunnel_key {
 	__be64			tun_id;
 	union {
@@ -48,11 +66,11 @@ struct ip_tunnel_key {
 			struct in6_addr dst;
 		} ipv6;
 	} u;
-	__be16			tun_flags;
-	u8			tos;		/* TOS for IPv4, TC for IPv6 */
-	u8			ttl;		/* TTL for IPv4, HL for IPv6 */
+	IP_TUNNEL_DECLARE_FLAGS(tun_flags);
 	__be32			label;		/* Flow Label for IPv6 */
 	u32			nhid;
+	u8			tos;		/* TOS for IPv4, TC for IPv6 */
+	u8			ttl;		/* TTL for IPv4, HL for IPv6 */
 	__be16			tp_src;
 	__be16			tp_dst;
 	__u8			flow_flags;
@@ -110,14 +128,14 @@ struct ip_tunnel_prl_entry {
 
 struct metadata_dst;
 
-/* Kernel-side copy of ip_tunnel_parm */
+/* Kernel-side variant of ip_tunnel_parm */
 struct ip_tunnel_parm_kern {
 	char			name[IFNAMSIZ];
-	int			link;
-	__be16			i_flags;
-	__be16			o_flags;
+	IP_TUNNEL_DECLARE_FLAGS(i_flags);
+	IP_TUNNEL_DECLARE_FLAGS(o_flags);
 	__be32			i_key;
 	__be32			o_key;
+	int			link;
 	struct iphdr		iph;
 };
 
@@ -168,7 +186,7 @@ struct ip_tunnel {
 };
 
 struct tnl_ptk_info {
-	__be16 flags;
+	IP_TUNNEL_DECLARE_FLAGS(flags);
 	__be16 proto;
 	__be32 key;
 	__be32 seq;
@@ -190,11 +208,77 @@ struct ip_tunnel_net {
 	int type;
 };
 
+static inline void ip_tunnel_set_options_present(unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+
+	ip_tunnel_flags_or(flags, flags, present);
+}
+
+static inline void ip_tunnel_clear_options_present(unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+
+	__ipt_flag_op(bitmap_andnot, flags, flags, present);
+}
+
+static inline bool ip_tunnel_is_options_present(const unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
+
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+
+	return ip_tunnel_flags_intersect(flags, present);
+}
+
+static inline bool ip_tunnel_flags_is_be16_compat(const unsigned long *flags)
+{
+	IP_TUNNEL_DECLARE_FLAGS(supp) = { };
+
+	bitmap_set(supp, 0, BITS_PER_TYPE(__be16));
+	__set_bit(IP_TUNNEL_VTI_BIT, supp);
+
+	return ip_tunnel_flags_subset(flags, supp);
+}
+
+static inline void ip_tunnel_flags_from_be16(unsigned long *dst, __be16 flags)
+{
+	ip_tunnel_flags_zero(dst);
+
+	bitmap_write(dst, be16_to_cpu(flags), 0, BITS_PER_TYPE(__be16));
+	__assign_bit(IP_TUNNEL_VTI_BIT, dst, flags & VTI_ISVTI);
+}
+
+static inline __be16 ip_tunnel_flags_to_be16(const unsigned long *flags)
+{
+	__be16 ret;
+
+	ret = cpu_to_be16(bitmap_read(flags, 0, BITS_PER_TYPE(__be16)));
+	if (test_bit(IP_TUNNEL_VTI_BIT, flags))
+		ret |= VTI_ISVTI;
+
+	return ret;
+}
+
 static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 				      __be32 saddr, __be32 daddr,
 				      u8 tos, u8 ttl, __be32 label,
 				      __be16 tp_src, __be16 tp_dst,
-				      __be64 tun_id, __be16 tun_flags)
+				      __be64 tun_id,
+				      const unsigned long *tun_flags)
 {
 	key->tun_id = tun_id;
 	key->u.ipv4.src = saddr;
@@ -204,7 +288,7 @@ static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
 	key->tos = tos;
 	key->ttl = ttl;
 	key->label = label;
-	key->tun_flags = tun_flags;
+	ip_tunnel_flags_copy(key->tun_flags, tun_flags);
 
 	/* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
 	 * the upper tunnel are used.
@@ -225,12 +309,8 @@ ip_tunnel_dst_cache_usable(const struct sk_buff *skb,
 {
 	if (skb->mark)
 		return false;
-	if (!info)
-		return true;
-	if (info->key.tun_flags & TUNNEL_NOCACHE)
-		return false;
 
-	return true;
+	return !info || !test_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
 }
 
 static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
@@ -313,7 +393,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
 
 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
-				   int link, __be16 flags,
+				   int link, const unsigned long *flags,
 				   __be32 remote, __be32 local,
 				   __be32 key);
 
@@ -529,12 +609,13 @@ static inline void ip_tunnel_info_opts_get(void *to,
 
 static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
 					   const void *from, int len,
-					   __be16 flags)
+					   const unsigned long *flags)
 {
 	info->options_len = len;
 	if (len > 0) {
 		memcpy(ip_tunnel_info_opts(info), from, len);
-		info->key.tun_flags |= flags;
+		ip_tunnel_flags_or(info->key.tun_flags, info->key.tun_flags,
+				   flags);
 	}
 }
 
@@ -578,7 +659,7 @@ static inline void ip_tunnel_info_opts_get(void *to,
 
 static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
 					   const void *from, int len,
-					   __be16 flags)
+					   const unsigned long *flags)
 {
 	info->options_len = 0;
 }
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index d716214fe03d..a93dc51f6323 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -179,8 +179,8 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
 					 struct dst_cache *dst_cache);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
-				    __be16 flags, __be64 tunnel_id,
-				    int md_size);
+				    const unsigned long *flags,
+				    __be64 tunnel_id, int md_size);
 
 #ifdef CONFIG_INET
 static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 102119628ff5..838927dd73a4 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -161,6 +161,14 @@ enum {
 
 #define IFLA_VTI_MAX	(__IFLA_VTI_MAX - 1)
 
+#ifndef __KERNEL__
+/* Historically, tunnel flags have been defined as __be16 and now there are
+ * no free bits left. It is strongly advised to switch the already existing
+ * userspace code to the new *_BIT definitions from down below, as __be16
+ * can't be simply cast to a wider type on LE systems. All new flags and
+ * code must use *_BIT only.
+ */
+
 #define TUNNEL_CSUM		__cpu_to_be16(0x01)
 #define TUNNEL_ROUTING		__cpu_to_be16(0x02)
 #define TUNNEL_KEY		__cpu_to_be16(0x04)
@@ -181,5 +189,30 @@ enum {
 #define TUNNEL_OPTIONS_PRESENT \
 		(TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \
 		TUNNEL_GTP_OPT)
+#endif
+
+enum {
+	IP_TUNNEL_CSUM_BIT		= 0U,
+	IP_TUNNEL_ROUTING_BIT,
+	IP_TUNNEL_KEY_BIT,
+	IP_TUNNEL_SEQ_BIT,
+	IP_TUNNEL_STRICT_BIT,
+	IP_TUNNEL_REC_BIT,
+	IP_TUNNEL_VERSION_BIT,
+	IP_TUNNEL_NO_KEY_BIT,
+	IP_TUNNEL_DONT_FRAGMENT_BIT,
+	IP_TUNNEL_OAM_BIT,
+	IP_TUNNEL_CRIT_OPT_BIT,
+	IP_TUNNEL_GENEVE_OPT_BIT,	/* OPTIONS_PRESENT */
+	IP_TUNNEL_VXLAN_OPT_BIT,	/* OPTIONS_PRESENT */
+	IP_TUNNEL_NOCACHE_BIT,
+	IP_TUNNEL_ERSPAN_OPT_BIT,	/* OPTIONS_PRESENT */
+	IP_TUNNEL_GTP_OPT_BIT,		/* OPTIONS_PRESENT */
+
+	IP_TUNNEL_VTI_BIT,
+	IP_TUNNEL_SIT_ISATAP_BIT	= IP_TUNNEL_VTI_BIT,
+
+	__IP_TUNNEL_FLAG_NUM,
+};
 
 #endif /* _UAPI_IF_TUNNEL_H_ */
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index 81833ca7a2c7..a966a6ec8263 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -65,13 +65,14 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
 {
 	struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst);
 	__be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	int err;
 
 	if (metadata)
 		return -EEXIST;
 
-	metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
-				    key, 0);
+	__set_bit(IP_TUNNEL_KEY_BIT, flags);
+	metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, key, 0);
 	if (!metadata)
 		return -EINVAL;
 
@@ -185,6 +186,7 @@ void br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
 int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
 				 struct net_bridge_vlan *vlan)
 {
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	struct metadata_dst *tunnel_dst;
 	__be64 tunnel_id;
 	int err;
@@ -202,7 +204,8 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
 		return err;
 
 	if (BR_INPUT_SKB_CB(skb)->backup_nhid) {
-		tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
+		__set_bit(IP_TUNNEL_KEY_BIT, flags);
+		tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, flags,
 					      tunnel_id, 0);
 		if (!tunnel_dst)
 			return -ENOMEM;
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c66e4a3fc5b..294670d3850d 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4662,7 +4662,7 @@ set_compat:
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
 	if (flags & BPF_F_TUNINFO_FLAGS)
-		to->tunnel_flags = info->key.tun_flags;
+		to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
 	else
 		to->tunnel_ext = 0;
 
@@ -4705,7 +4705,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
 	int err;
 
 	if (unlikely(!info ||
-		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+		     !ip_tunnel_is_options_present(info->key.tun_flags))) {
 		err = -ENOENT;
 		goto err_clear;
 	}
@@ -4775,15 +4775,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
 	memset(info, 0, sizeof(*info));
 	info->mode = IP_TUNNEL_INFO_TX;
 
-	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
-	if (flags & BPF_F_DONT_FRAGMENT)
-		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
-	if (flags & BPF_F_ZERO_CSUM_TX)
-		info->key.tun_flags &= ~TUNNEL_CSUM;
-	if (flags & BPF_F_SEQ_NUMBER)
-		info->key.tun_flags |= TUNNEL_SEQ;
-	if (flags & BPF_F_NO_TUNNEL_KEY)
-		info->key.tun_flags &= ~TUNNEL_KEY;
+	__set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
+	__assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
+		     flags & BPF_F_DONT_FRAGMENT);
+	__assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
+		     !(flags & BPF_F_ZERO_CSUM_TX));
+	__assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
+		     flags & BPF_F_SEQ_NUMBER);
+	__assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
+		     !(flags & BPF_F_NO_TUNNEL_KEY));
 
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
@@ -4821,13 +4821,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
 {
 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 	const struct metadata_dst *md = this_cpu_ptr(md_dst);
+	IP_TUNNEL_DECLARE_FLAGS(present) = { };
 
 	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
 		return -EINVAL;
 	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
 		return -ENOMEM;
 
-	ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
+	ip_tunnel_set_options_present(present);
+	ip_tunnel_info_opts_set(info, from, size, present);
 
 	return 0;
 }
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 272f09251343..f82e9a7d3b37 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -455,17 +455,25 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
 
 	if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
 		struct flow_dissector_key_enc_opts *enc_opt;
+		IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+		u32 val;
 
 		enc_opt = skb_flow_dissector_target(flow_dissector,
 						    FLOW_DISSECTOR_KEY_ENC_OPTS,
 						    target_container);
 
-		if (info->options_len) {
-			enc_opt->len = info->options_len;
-			ip_tunnel_info_opts_get(enc_opt->data, info);
-			enc_opt->dst_opt_type = info->key.tun_flags &
-						TUNNEL_OPTIONS_PRESENT;
-		}
+		if (!info->options_len)
+			return;
+
+		enc_opt->len = info->options_len;
+		ip_tunnel_info_opts_get(enc_opt->data, info);
+
+		ip_tunnel_set_options_present(flags);
+		ip_tunnel_flags_and(flags, info->key.tun_flags, flags);
+
+		val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
+				    IP_TUNNEL_GENEVE_OPT_BIT);
+		enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
 	}
 }
 EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
index 06e5572f296f..54984f3170a8 100644
--- a/net/ipv4/fou_bpf.c
+++ b/net/ipv4/fou_bpf.c
@@ -64,7 +64,7 @@ __bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
 		info->encap.type = TUNNEL_ENCAP_NONE;
 	}
 
-	if (info->key.tun_flags & TUNNEL_CSUM)
+	if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
 		info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
 
 	info->encap.sport = encap->sport;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index 3757fd93523f..6701a98d9a9f 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -73,7 +73,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
 		return -EINVAL;
 
-	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+	gre_flags_to_tnl_flags(tpi->flags, greh->flags);
 	hdr_len = gre_calc_hlen(tpi->flags);
 
 	if (!pskb_may_pull(skb, nhs + hdr_len))
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8853c065a985..3c46a7fef8db 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -265,6 +265,7 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	struct net *net = dev_net(skb->dev);
 	struct metadata_dst *tun_dst = NULL;
 	struct erspan_base_hdr *ershdr;
+	IP_TUNNEL_DECLARE_FLAGS(flags);
 	struct ip_tunnel_net *itn;
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
@@ -272,18 +273,20 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 	int ver;
 	int len;
 
+	ip_tunnel_flags_copy(flags, tpi->flags);
+
 	itn = net_generic(net, erspan_net_id);
 	iph = ip_hdr(skb);
 	if (is_erspan_type1(gre_hdr_len)) {
 		ver = 0;
-		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
-					  tpi->flags | TUNNEL_NO_KEY,
+		__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
 					  iph->saddr, iph->daddr, 0);
 	} else {
 		ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
 		ver = ershdr->ver;
-		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
-					  tpi->flags | TUNNEL_KEY,
+		__set_bit(IP_TUNNEL_KEY_BIT, flags);
+		tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
 					  iph->saddr, iph->daddr, tpi->key);
 	}
 
@@ -307,10 +310,9 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 			struct ip_tunnel_info *info;
 			unsigned char *gh;
 			__be64 tun_id;
-			__be16 flags;
 
-			tpi->flags |= TUNNEL_KEY;
-			flags = tpi->flags;
+			__set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+			ip_tunnel_flags_copy(flags, tpi->flags);
 			tun_id = key32_to_tunnel_id(tpi->key);
 
 			tun_dst = ip_tun_rx_dst(skb, flags,
@@ -333,7 +335,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
 						       ERSPAN_V2_MDSIZE);
 
 			info = &tun_dst->u.tun_info;
-			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+			__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+				  info->key.tun_flags);
 			info->options_len = sizeof(*md);
 		}
 
@@ -376,10 +379,13 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
 
 		tnl_params = &tunnel->parms.iph;
 		if (tunnel->collect_md || tnl_params->daddr == 0) {
-			__be16 flags;
+			IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 			__be64 tun_id;
 
-			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+			__set_bit(IP_TUNNEL_CSUM_BIT, flags);
+			__set_bit(IP_TUNNEL_KEY_BIT, flags);
+			ip_tunnel_flags_and(flags, tpi->flags, flags);
+
 			tun_id = key32_to_tunnel_id(tpi->key);
 			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
 			if (!tun_dst)
@@ -459,12 +465,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
 		       __be16 proto)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	__be16 flags = tunnel->parms.o_flags;
+	IP_TUNNEL_DECLARE_FLAGS(flags);
+
+	ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
 
 	/* Push GRE header. */
 	gre_build_header(skb, tunnel->tun_hlen,
 			 flags, proto, tunnel->parms.o_key,
-			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
+			 test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+			 htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
 
 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 }
@@ -478,10 +487,10 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 			__be16 proto)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
 	int tunnel_hlen;
-	__be16 flags;
 
 	tun_info = skb_tunnel_info(skb);
 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -495,14 +504,19 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto err_free_skb;
 
 	/* Push Tunnel header. */
-	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
+	if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+					      tunnel->parms.o_flags)))
 		goto err_free_skb;
 
-	flags = tun_info->key.tun_flags &
-		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
+	__set_bit(IP_TUNNEL_CSUM_BIT, flags);
+	__set_bit(IP_TUNNEL_KEY_BIT, flags);
+	__set_bit(IP_TUNNEL_SEQ_BIT, flags);
+	ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags);
+
 	gre_build_header(skb, tunnel_hlen, flags, proto,
 			 tunnel_id_to_key32(tun_info->key.tun_id),
-			 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
+			 test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+			 htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
 
 	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 
@@ -516,6 +530,7 @@ err_free_skb:
 static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	struct ip_tunnel_info *tun_info;
 	const struct ip_tunnel_key *key;
 	struct erspan_metadata *md;
@@ -531,7 +546,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 
 	key = &tun_info->key;
-	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+	if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
 		goto err_free_skb;
 	if (tun_info->options_len < sizeof(*md))
 		goto err_free_skb;
@@ -584,8 +599,9 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 	}
 
-	gre_build_header(skb, 8, TUNNEL_SEQ,
-			 proto, 0, htonl(atomic_fetch_inc(&tunnel->o_seqno)));
+	__set_bit(IP_TUNNEL_SEQ_BIT, flags);
+	gre_build_header(skb, 8, flags, proto, 0,
+			 htonl(atomic_fetch_inc(&tunnel->o_seqno)));
 
 	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
 
@@ -659,7 +675,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 		tnl_params = &tunnel->parms.iph;
 	}
 
-	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+	if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+					      tunnel->parms.o_flags)))
 		goto free_skb;
 
 	__gre_xmit(skb, dev, tnl_params, skb->protocol);
@@ -701,7 +718,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 	/* Push ERSPAN header */
 	if (tunnel->erspan_ver == 0) {
 		proto = htons(ETH_P_ERSPAN);
-		tunnel->parms.o_flags &= ~TUNNEL_SEQ;
+		__clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags);
 	} else if (tunnel->erspan_ver == 1) {
 		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
 				    tunnel->index,
@@ -716,7 +733,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
 		goto free_skb;
 	}
 
-	tunnel->parms.o_flags &= ~TUNNEL_KEY;
+	__clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags);
 	__gre_xmit(skb, dev, &tunnel->parms.iph, proto);
 	return NETDEV_TX_OK;
 
@@ -739,7 +756,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
 		return NETDEV_TX_OK;
 	}
 
-	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+	if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+					      tunnel->parms.o_flags)))
 		goto free_skb;
 
 	if (skb_cow_head(skb, dev->needed_headroom))
@@ -757,7 +775,6 @@ free_skb:
 static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	__be16 flags;
 	int len;
 
 	len = tunnel->tun_hlen;
@@ -773,10 +790,9 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 	if (set_mtu)
 		dev->mtu = max_t(int, dev->mtu - len, 68);
 
-	flags = tunnel->parms.o_flags;
-
-	if (flags & TUNNEL_SEQ ||
-	    (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
+	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) ||
+	    (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+	     tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
 		dev->features &= ~NETIF_F_GSO_SOFTWARE;
 		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
 	} else {
@@ -789,17 +805,25 @@ static int ipgre_tunnel_ctl(struct net_device *dev,
 			    struct ip_tunnel_parm_kern *p,
 			    int cmd)
 {
+	__be16 i_flags, o_flags;
 	int err;
 
+	if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+	    !ip_tunnel_flags_is_be16_compat(p->o_flags))
+		return -EOVERFLOW;
+
+	i_flags = ip_tunnel_flags_to_be16(p->i_flags);
+	o_flags = ip_tunnel_flags_to_be16(p->o_flags);
+
 	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
 		if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
 		    p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
-		    ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING)))
+		    ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING)))
 			return -EINVAL;
 	}
 
-	p->i_flags = gre_flags_to_tnl_flags(p->i_flags);
-	p->o_flags = gre_flags_to_tnl_flags(p->o_flags);
+	gre_flags_to_tnl_flags(p->i_flags, i_flags);
+	gre_flags_to_tnl_flags(p->o_flags, o_flags);
 
 	err = ip_tunnel_ctl(dev, p, cmd);
 	if (err)
@@ -808,15 +832,18 @@ static int ipgre_tunnel_ctl(struct net_device *dev,
 	if (cmd == SIOCCHGTUNNEL) {
 		struct ip_tunnel *t = netdev_priv(dev);
 
-		t->parms.i_flags = p->i_flags;
-		t->parms.o_flags = p->o_flags;
+		ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+		ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
 
 		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
 			ipgre_link_update(dev, true);
 	}
 
-	p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
-	p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+	i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+	ip_tunnel_flags_from_be16(p->i_flags, i_flags);
+	o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+	ip_tunnel_flags_from_be16(p->o_flags, o_flags);
+
 	return 0;
 }
 
@@ -956,7 +983,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 static void __gre_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel;
-	__be16 flags;
 
 	tunnel = netdev_priv(dev);
 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -968,14 +994,13 @@ static void __gre_tunnel_init(struct net_device *dev)
 	dev->features		|= GRE_FEATURES | NETIF_F_LLTX;
 	dev->hw_features	|= GRE_FEATURES;
 
-	flags = tunnel->parms.o_flags;
-
 	/* TCP offload with GRE SEQ is not supported, nor can we support 2
 	 * levels of outer headers requiring an update.
 	 */
-	if (flags & TUNNEL_SEQ)
+	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags))
 		return;
-	if (flags & TUNNEL_CSUM && tunnel->encap.type != TUNNEL_ENCAP_NONE)
+	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+	    tunnel->encap.type != TUNNEL_ENCAP_NONE)
 		return;
 
 	dev->features |= NETIF_F_GSO_SOFTWARE;
@@ -1148,10 +1173,12 @@ static int ipgre_netlink_parms(struct net_device *dev,
 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
 
 	if (data[IFLA_GRE_IFLAGS])
-		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
+		gre_flags_to_tnl_flags(parms->i_flags,
+				       nla_get_be16(data[IFLA_GRE_IFLAGS]));
 
 	if (data[IFLA_GRE_OFLAGS])
-		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
+		gre_flags_to_tnl_flags(parms->o_flags,
+				       nla_get_be16(data[IFLA_GRE_OFLAGS]));
 
 	if (data[IFLA_GRE_IKEY])
 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1411,8 +1438,8 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (err < 0)
 		return err;
 
-	t->parms.i_flags = p.i_flags;
-	t->parms.o_flags = p.o_flags;
+	ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+	ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
 
 	ipgre_link_update(dev, !tb[IFLA_MTU]);
 
@@ -1440,8 +1467,8 @@ static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
 	if (err < 0)
 		return err;
 
-	t->parms.i_flags = p.i_flags;
-	t->parms.o_flags = p.o_flags;
+	ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+	ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
 
 	return 0;
 }
@@ -1498,7 +1525,9 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct ip_tunnel_parm_kern *p = &t->parms;
-	__be16 o_flags = p->o_flags;
+	IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+	ip_tunnel_flags_copy(o_flags, p->o_flags);
 
 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
 	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
@@ -1546,7 +1575,7 @@ static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (t->erspan_ver <= 2) {
 		if (t->erspan_ver != 0 && !t->collect_md)
-			t->parms.o_flags |= TUNNEL_KEY;
+			__set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
 
 		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
 			goto nla_put_failure;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d10f41f81463..f65170a28106 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -57,16 +57,12 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
 }
 
 static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
-				__be16 flags, __be32 key)
+				const unsigned long *flags, __be32 key)
 {
-	if (p->i_flags & TUNNEL_KEY) {
-		if (flags & TUNNEL_KEY)
-			return key == p->i_key;
-		else
-			/* key expected, none present */
-			return false;
-	} else
-		return !(flags & TUNNEL_KEY);
+	if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
+		return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);
+
+	return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
 }
 
 /* Fallback tunnel: no source, no destination, no key, no options
@@ -81,7 +77,7 @@ static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
    Given src, dst and key, find appropriate for input tunnel.
 */
 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
-				   int link, __be16 flags,
+				   int link, const unsigned long *flags,
 				   __be32 remote, __be32 local,
 				   __be32 key)
 {
@@ -143,7 +139,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
 	}
 
 	hlist_for_each_entry_rcu(t, head, hash_node) {
-		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
+		if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
+		     t->parms.i_key != key) ||
 		    t->parms.iph.saddr != 0 ||
 		    t->parms.iph.daddr != 0 ||
 		    !(t->dev->flags & IFF_UP))
@@ -182,7 +179,8 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
 	else
 		remote = 0;
 
-	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+	if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
+	    test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
 		i_key = 0;
 
 	h = ip_tunnel_hash(i_key, remote);
@@ -211,12 +209,14 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
+	IP_TUNNEL_DECLARE_FLAGS(flags);
 	__be32 key = parms->i_key;
-	__be16 flags = parms->i_flags;
 	int link = parms->link;
 	struct ip_tunnel *t = NULL;
 	struct hlist_head *head = ip_bucket(itn, parms);
 
+	ip_tunnel_flags_copy(flags, parms->i_flags);
+
 	hlist_for_each_entry_rcu(t, head, hash_node) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
@@ -386,15 +386,15 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
 	}
 #endif
 
-	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
-	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
+	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
 		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
 		DEV_STATS_INC(tunnel->dev, rx_errors);
 		goto drop;
 	}
 
-	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
-		if (!(tpi->flags&TUNNEL_SEQ) ||
+	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
 			DEV_STATS_INC(tunnel->dev, rx_errors);
@@ -638,7 +638,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
 		goto tx_error;
 	}
 
-	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
+	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
 		df = htons(IP_DF);
 	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
 			    key->u.ipv4.dst, true)) {
@@ -928,10 +928,10 @@ int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
 			goto done;
 		if (p->iph.ttl)
 			p->iph.frag_off |= htons(IP_DF);
-		if (!(p->i_flags & VTI_ISVTI)) {
-			if (!(p->i_flags & TUNNEL_KEY))
+		if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
+			if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
 				p->i_key = 0;
-			if (!(p->o_flags & TUNNEL_KEY))
+			if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
 				p->o_key = 0;
 		}
 
@@ -1016,8 +1016,8 @@ bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
 
 	strscpy(kp->name, p.name);
 	kp->link = p.link;
-	kp->i_flags = p.i_flags;
-	kp->o_flags = p.o_flags;
+	ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
+	ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
 	kp->i_key = p.i_key;
 	kp->o_key = p.o_key;
 	memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
@@ -1030,10 +1030,14 @@ bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
 {
 	struct ip_tunnel_parm p;
 
+	if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
+	    !ip_tunnel_flags_is_be16_compat(kp->o_flags))
+		return false;
+
 	strscpy(p.name, kp->name);
 	p.link = kp->link;
-	p.i_flags = kp->i_flags;
-	p.o_flags = kp->o_flags;
+	p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
+	p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
 	p.i_key = kp->i_key;
 	p.o_key = kp->o_key;
 	memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 98f136b07191..a3676155be78 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -125,6 +125,7 @@ EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
 struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 					     gfp_t flags)
 {
+	IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
 	struct metadata_dst *res;
 	struct ip_tunnel_info *dst, *src;
 
@@ -144,10 +145,10 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
 		       sizeof(struct in6_addr));
 	else
 		dst->key.u.ipv4.dst = src->key.u.ipv4.src;
-	dst->key.tun_flags = src->key.tun_flags;
+	ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
 	dst->mode = src->mode | IP_TUNNEL_INFO_TX;
 	ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
-				src->options_len, 0);
+				src->options_len, tun_flags);
 
 	return res;
 }
@@ -497,7 +498,7 @@ static int ip_tun_parse_opts_geneve(struct nlattr *attr,
 		opt->opt_class = nla_get_be16(attr);
 		attr = tb[LWTUNNEL_IP_OPT_GENEVE_TYPE];
 		opt->type = nla_get_u8(attr);
-		info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+		__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
 	}
 
 	return sizeof(struct geneve_opt) + data_len;
@@ -525,7 +526,7 @@ static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
 		attr = tb[LWTUNNEL_IP_OPT_VXLAN_GBP];
 		md->gbp = nla_get_u32(attr);
 		md->gbp &= VXLAN_GBP_MASK;
-		info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
 	}
 
 	return sizeof(struct vxlan_metadata);
@@ -574,7 +575,7 @@ static int ip_tun_parse_opts_erspan(struct nlattr *attr,
 			set_hwid(&md->u.md2, nla_get_u8(attr));
 		}
 
-		info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+		__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
 	}
 
 	return sizeof(struct erspan_metadata);
@@ -585,7 +586,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 {
 	int err, rem, opt_len, opts_len = 0;
 	struct nlattr *nla;
-	__be16 type = 0;
+	u32 type = 0;
 
 	if (!attr)
 		return 0;
@@ -598,7 +599,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
 		switch (nla_type(nla)) {
 		case LWTUNNEL_IP_OPTS_GENEVE:
-			if (type && type != TUNNEL_GENEVE_OPT)
+			if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
 				return -EINVAL;
 			opt_len = ip_tun_parse_opts_geneve(nla, info, opts_len,
 							   extack);
@@ -607,7 +608,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 			opts_len += opt_len;
 			if (opts_len > IP_TUNNEL_OPTS_MAX)
 				return -EINVAL;
-			type = TUNNEL_GENEVE_OPT;
+			type = IP_TUNNEL_GENEVE_OPT_BIT;
 			break;
 		case LWTUNNEL_IP_OPTS_VXLAN:
 			if (type)
@@ -617,7 +618,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 			if (opt_len < 0)
 				return opt_len;
 			opts_len += opt_len;
-			type = TUNNEL_VXLAN_OPT;
+			type = IP_TUNNEL_VXLAN_OPT_BIT;
 			break;
 		case LWTUNNEL_IP_OPTS_ERSPAN:
 			if (type)
@@ -627,7 +628,7 @@ static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
 			if (opt_len < 0)
 				return opt_len;
 			opts_len += opt_len;
-			type = TUNNEL_ERSPAN_OPT;
+			type = IP_TUNNEL_ERSPAN_OPT_BIT;
 			break;
 		default:
 			return -EINVAL;
@@ -705,10 +706,16 @@ static int ip_tun_build_state(struct net *net, struct nlattr *attr,
 	if (tb[LWTUNNEL_IP_TOS])
 		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
 
-	if (tb[LWTUNNEL_IP_FLAGS])
-		tun_info->key.tun_flags |=
-				(nla_get_be16(tb[LWTUNNEL_IP_FLAGS]) &
-				 ~TUNNEL_OPTIONS_PRESENT);
+	if (tb[LWTUNNEL_IP_FLAGS]) {
+		IP_TUNNEL_DECLARE_FLAGS(flags);
+
+		ip_tunnel_flags_from_be16(flags,
+					  nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
+		ip_tunnel_clear_options_present(flags);
+
+		ip_tunnel_flags_or(tun_info->key.tun_flags,
+				   tun_info->key.tun_flags, flags);
+	}
 
 	tun_info->mode = IP_TUNNEL_INFO_TX;
 	tun_info->options_len = opt_len;
@@ -812,18 +819,18 @@ static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
 	struct nlattr *nest;
 	int err = 0;
 
-	if (!(tun_info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+	if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
 		return 0;
 
 	nest = nla_nest_start_noflag(skb, type);
 	if (!nest)
 		return -ENOMEM;
 
-	if (tun_info->key.tun_flags & TUNNEL_GENEVE_OPT)
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags))
 		err = ip_tun_fill_encap_opts_geneve(skb, tun_info);
-	else if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT)
+	else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_info->key.tun_flags))
 		err = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
-	else if (tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)
+	else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
 		err = ip_tun_fill_encap_opts_erspan(skb, tun_info);
 
 	if (err) {
@@ -846,7 +853,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
 	    nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
 	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
 	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
-	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags) ||
+	    nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
+			 ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
 	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
 		return -ENOMEM;
 
@@ -857,11 +865,11 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 {
 	int opt_len;
 
-	if (!(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))
+	if (!ip_tunnel_is_options_present(info->key.tun_flags))
 		return 0;
 
 	opt_len = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
-	if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
 		struct geneve_opt *opt;
 		int offset = 0;
 
@@ -874,10 +882,10 @@ static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
 							/* OPT_GENEVE_DATA */
 			offset += sizeof(*opt) + opt->length * 4;
 		}
-	} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+	} else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
 		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_VXLAN */
 			   + nla_total_size(4);	/* OPT_VXLAN_GBP */
-	} else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+	} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
 		struct erspan_metadata *md = ip_tunnel_info_opts(info);
 
 		opt_len += nla_total_size(0)	/* LWTUNNEL_IP_OPTS_ERSPAN */
@@ -984,10 +992,17 @@ static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
 	if (tb[LWTUNNEL_IP6_TC])
 		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
 
-	if (tb[LWTUNNEL_IP6_FLAGS])
-		tun_info->key.tun_flags |=
-				(nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]) &
-				 ~TUNNEL_OPTIONS_PRESENT);
+	if (tb[LWTUNNEL_IP6_FLAGS]) {
+		IP_TUNNEL_DECLARE_FLAGS(flags);
+		__be16 data;
+
+		data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+		ip_tunnel_flags_from_be16(flags, data);
+		ip_tunnel_clear_options_present(flags);
+
+		ip_tunnel_flags_or(tun_info->key.tun_flags,
+				   tun_info->key.tun_flags, flags);
+	}
 
 	tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
 	tun_info->options_len = opt_len;
@@ -1008,7 +1023,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
 	    nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
 	    nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
 	    nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
-	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags) ||
+	    nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
+			 ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
 	    ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
 		return -ENOMEM;
 
@@ -1139,8 +1155,12 @@ void ip_tunnel_netlink_parms(struct nlattr *data[],
 	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
 		parms->iph.frag_off = htons(IP_DF);
 
-	if (data[IFLA_IPTUN_FLAGS])
-		parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+	if (data[IFLA_IPTUN_FLAGS]) {
+		__be16 flags;
+
+		flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
+		ip_tunnel_flags_from_be16(parms->i_flags, flags);
+	}
 
 	if (data[IFLA_IPTUN_PROTO])
 		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 7e595f619615..14536da9f5dc 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -51,8 +51,11 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
 	const struct iphdr *iph = ip_hdr(skb);
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 
-	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel) {
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -322,8 +325,11 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
 	int protocol = iph->protocol;
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
 
-	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
 				  iph->daddr, iph->saddr, 0);
 	if (!tunnel)
 		return -1;
@@ -375,6 +381,7 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 static int
 vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
 {
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	int err = 0;
 
 	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
@@ -383,20 +390,26 @@ vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
 			return -EINVAL;
 	}
 
-	if (!(p->i_flags & GRE_KEY))
+	if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+	    !ip_tunnel_flags_is_be16_compat(p->o_flags))
+		return -EOVERFLOW;
+
+	if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
 		p->i_key = 0;
-	if (!(p->o_flags & GRE_KEY))
+	if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
 		p->o_key = 0;
 
-	p->i_flags = VTI_ISVTI;
+	__set_bit(IP_TUNNEL_VTI_BIT, flags);
+	ip_tunnel_flags_copy(p->i_flags, flags);
 
 	err = ip_tunnel_ctl(dev, p, cmd);
 	if (err)
 		return err;
 
 	if (cmd != SIOCDELTUNNEL) {
-		p->i_flags |= GRE_KEY;
-		p->o_flags |= GRE_KEY;
+		ip_tunnel_flags_from_be16(flags, GRE_KEY);
+		ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
+		ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
 	}
 	return 0;
 }
@@ -541,7 +554,7 @@ static void vti_netlink_parms(struct nlattr *data[],
 	if (!data)
 		return;
 
-	parms->i_flags = VTI_ISVTI;
+	__set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);
 
 	if (data[IFLA_VTI_LINK])
 		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index db960cd08a3b..923a2ef68c2f 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -130,13 +130,16 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
 	const struct iphdr *iph = (const struct iphdr *)skb->data;
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	const int type = icmp_hdr(skb)->type;
 	const int code = icmp_hdr(skb)->code;
 	struct ip_tunnel *t;
 	int err = 0;
 
-	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
-			     iph->daddr, iph->saddr, 0);
+	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+	t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr,
+			     iph->saddr, 0);
 	if (!t) {
 		err = -ENOENT;
 		goto out;
@@ -213,13 +216,16 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 {
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	struct metadata_dst *tun_dst = NULL;
 	struct ip_tunnel *tunnel;
 	const struct iphdr *iph;
 
+	__set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
 	iph = ip_hdr(skb);
-	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
-			iph->saddr, iph->daddr, 0);
+	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr,
+				  iph->daddr, 0);
 	if (tunnel) {
 		const struct tnl_ptk_info *tpi;
 
@@ -238,7 +244,9 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
 		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 			goto drop;
 		if (tunnel->collect_md) {
-			tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+			ip_tunnel_flags_zero(flags);
+
+			tun_dst = ip_tun_rx_dst(skb, flags, 0, 0);
 			if (!tun_dst)
 				return 0;
 			ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
@@ -340,7 +348,8 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
 	}
 
 	p->i_key = p->o_key = 0;
-	p->i_flags = p->o_flags = 0;
+	ip_tunnel_flags_zero(p->i_flags);
+	ip_tunnel_flags_zero(p->o_flags);
 	return ip_tunnel_ctl(dev, p, cmd);
 }
 
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index 860aff5f8599..e4e0fa869fa4 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -183,7 +183,8 @@ void udp_tunnel_sock_release(struct socket *sock)
 EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
 
 struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb,  unsigned short family,
-				    __be16 flags, __be64 tunnel_id, int md_size)
+				    const unsigned long *flags,
+				    __be64 tunnel_id, int md_size)
 {
 	struct metadata_dst *tun_dst;
 	struct ip_tunnel_info *info;
@@ -199,7 +200,7 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb,  unsigned short family,
 	info->key.tp_src = udp_hdr(skb)->source;
 	info->key.tp_dst = udp_hdr(skb)->dest;
 	if (udp_hdr(skb)->check)
-		info->key.tun_flags |= TUNNEL_CSUM;
+		__set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
 	return tun_dst;
 }
 EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index ca7e77e84283..2146cf69d8e2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -496,11 +496,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
 				      tpi->proto);
 	if (tunnel) {
 		if (tunnel->parms.collect_md) {
+			IP_TUNNEL_DECLARE_FLAGS(flags);
 			struct metadata_dst *tun_dst;
 			__be64 tun_id;
-			__be16 flags;
 
-			flags = tpi->flags;
+			ip_tunnel_flags_copy(flags, tpi->flags);
 			tun_id = key32_to_tunnel_id(tpi->key);
 
 			tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0);
@@ -548,14 +548,14 @@ static int ip6erspan_rcv(struct sk_buff *skb,
 
 		if (tunnel->parms.collect_md) {
 			struct erspan_metadata *pkt_md, *md;
+			IP_TUNNEL_DECLARE_FLAGS(flags);
 			struct metadata_dst *tun_dst;
 			struct ip_tunnel_info *info;
 			unsigned char *gh;
 			__be64 tun_id;
-			__be16 flags;
 
-			tpi->flags |= TUNNEL_KEY;
-			flags = tpi->flags;
+			__set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+			ip_tunnel_flags_copy(flags, tpi->flags);
 			tun_id = key32_to_tunnel_id(tpi->key);
 
 			tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
@@ -577,7 +577,8 @@ static int ip6erspan_rcv(struct sk_buff *skb,
 			md2 = &md->u.md2;
 			memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
 						       ERSPAN_V2_MDSIZE);
-			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+			__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+				  info->key.tun_flags);
 			info->options_len = sizeof(*md);
 
 			ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
@@ -745,8 +746,8 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 			       __u32 *pmtu, __be16 proto)
 {
 	struct ip6_tnl *tunnel = netdev_priv(dev);
+	IP_TUNNEL_DECLARE_FLAGS(flags);
 	__be16 protocol;
-	__be16 flags;
 
 	if (dev->type == ARPHRD_ETHER)
 		IPCB(skb)->flags = 0;
@@ -778,8 +779,11 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id);
 
 		dsfield = key->tos;
-		flags = key->tun_flags &
-			(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
+		ip_tunnel_flags_zero(flags);
+		__set_bit(IP_TUNNEL_CSUM_BIT, flags);
+		__set_bit(IP_TUNNEL_KEY_BIT, flags);
+		__set_bit(IP_TUNNEL_SEQ_BIT, flags);
+		ip_tunnel_flags_and(flags, flags, key->tun_flags);
 		tun_hlen = gre_calc_hlen(flags);
 
 		if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen))
@@ -788,19 +792,21 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
 		gre_build_header(skb, tun_hlen,
 				 flags, protocol,
 				 tunnel_id_to_key32(tun_info->key.tun_id),
-				 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno))
-						      : 0);
+				 test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+				 htonl(atomic_fetch_inc(&tunnel->o_seqno)) :
+				 0);
 
 	} else {
 		if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
 			return -ENOMEM;
 
-		flags = tunnel->parms.o_flags;
+		ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
 
 		gre_build_header(skb, tunnel->tun_hlen, flags,
 				 protocol, tunnel->parms.o_key,
-				 (flags & TUNNEL_SEQ) ? htonl(atomic_fetch_inc(&tunnel->o_seqno))
-						      : 0);
+				 test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+				 htonl(atomic_fetch_inc(&tunnel->o_seqno)) :
+				 0);
 	}
 
 	return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
@@ -822,7 +828,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
 		prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
 					 &dsfield, &encap_limit);
 
-	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
+	err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+						t->parms.o_flags));
 	if (err)
 		return -1;
 
@@ -856,7 +863,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
 	    prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
 		return -1;
 
-	if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
+	if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+					      t->parms.o_flags)))
 		return -1;
 
 	err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit,
@@ -883,7 +891,8 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
 	    prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit))
 		return -1;
 
-	err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
+	err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+						t->parms.o_flags));
 	if (err)
 		return err;
 	err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol);
@@ -936,6 +945,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	struct ip_tunnel_info *tun_info = NULL;
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct dst_entry *dst = skb_dst(skb);
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	bool truncate = false;
 	int encap_limit = -1;
 	__u8 dsfield = false;
@@ -979,7 +989,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
 		goto tx_err;
 
-	t->parms.o_flags &= ~TUNNEL_KEY;
+	__clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
 	IPCB(skb)->flags = 0;
 
 	/* For collect_md mode, derive fl6 from the tunnel key,
@@ -1004,7 +1014,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 		fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id);
 
 		dsfield = key->tos;
-		if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+		if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+			      tun_info->key.tun_flags))
 			goto tx_err;
 		if (tun_info->options_len < sizeof(*md))
 			goto tx_err;
@@ -1065,7 +1076,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
 	}
 
 	/* Push GRE header. */
-	gre_build_header(skb, 8, TUNNEL_SEQ, proto, 0, htonl(atomic_fetch_inc(&t->o_seqno)));
+	__set_bit(IP_TUNNEL_SEQ_BIT, flags);
+	gre_build_header(skb, 8, flags, proto, 0,
+			 htonl(atomic_fetch_inc(&t->o_seqno)));
 
 	/* TooBig packet may have updated dst->dev's mtu */
 	if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
@@ -1208,8 +1221,8 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t,
 	t->parms.proto = p->proto;
 	t->parms.i_key = p->i_key;
 	t->parms.o_key = p->o_key;
-	t->parms.i_flags = p->i_flags;
-	t->parms.o_flags = p->o_flags;
+	ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+	ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
 	t->parms.fwmark = p->fwmark;
 	t->parms.erspan_ver = p->erspan_ver;
 	t->parms.index = p->index;
@@ -1238,8 +1251,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p,
 	p->link = u->link;
 	p->i_key = u->i_key;
 	p->o_key = u->o_key;
-	p->i_flags = gre_flags_to_tnl_flags(u->i_flags);
-	p->o_flags = gre_flags_to_tnl_flags(u->o_flags);
+	gre_flags_to_tnl_flags(p->i_flags, u->i_flags);
+	gre_flags_to_tnl_flags(p->o_flags, u->o_flags);
 	memcpy(p->name, u->name, sizeof(u->name));
 }
 
@@ -1391,7 +1404,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
 	ipv6h->daddr = t->parms.raddr;
 
 	p = (__be16 *)(ipv6h + 1);
-	p[0] = t->parms.o_flags;
+	p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags);
 	p[1] = htons(type);
 
 	/*
@@ -1455,19 +1468,17 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
 static void ip6gre_tnl_init_features(struct net_device *dev)
 {
 	struct ip6_tnl *nt = netdev_priv(dev);
-	__be16 flags;
 
 	dev->features		|= GRE6_FEATURES | NETIF_F_LLTX;
 	dev->hw_features	|= GRE6_FEATURES;
 
-	flags = nt->parms.o_flags;
-
 	/* TCP offload with GRE SEQ is not supported, nor can we support 2
 	 * levels of outer headers requiring an update.
 	 */
-	if (flags & TUNNEL_SEQ)
+	if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags))
 		return;
-	if (flags & TUNNEL_CSUM && nt->encap.type != TUNNEL_ENCAP_NONE)
+	if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) &&
+	    nt->encap.type != TUNNEL_ENCAP_NONE)
 		return;
 
 	dev->features |= NETIF_F_GSO_SOFTWARE;
@@ -1792,12 +1803,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
 
 	if (data[IFLA_GRE_IFLAGS])
-		parms->i_flags = gre_flags_to_tnl_flags(
-				nla_get_be16(data[IFLA_GRE_IFLAGS]));
+		gre_flags_to_tnl_flags(parms->i_flags,
+				       nla_get_be16(data[IFLA_GRE_IFLAGS]));
 
 	if (data[IFLA_GRE_OFLAGS])
-		parms->o_flags = gre_flags_to_tnl_flags(
-				nla_get_be16(data[IFLA_GRE_OFLAGS]));
+		gre_flags_to_tnl_flags(parms->o_flags,
+				       nla_get_be16(data[IFLA_GRE_OFLAGS]));
 
 	if (data[IFLA_GRE_IKEY])
 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -2144,11 +2155,13 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct __ip6_tnl_parm *p = &t->parms;
-	__be16 o_flags = p->o_flags;
+	IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+	ip_tunnel_flags_copy(o_flags, p->o_flags);
 
 	if (p->erspan_ver == 1 || p->erspan_ver == 2) {
 		if (!p->collect_md)
-			o_flags |= TUNNEL_KEY;
+			__set_bit(IP_TUNNEL_KEY_BIT, o_flags);
 
 		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
 			goto nla_put_failure;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e9cc315832cb..57bb3b3ea0c5 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -798,17 +798,15 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
 	const struct ipv6hdr *ipv6h;
 	int nh, err;
 
-	if ((!(tpi->flags & TUNNEL_CSUM) &&
-	     (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
-	    ((tpi->flags & TUNNEL_CSUM) &&
-	     !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
+	if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+	    test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
 		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
 		DEV_STATS_INC(tunnel->dev, rx_errors);
 		goto drop;
 	}
 
-	if (tunnel->parms.i_flags & TUNNEL_SEQ) {
-		if (!(tpi->flags & TUNNEL_SEQ) ||
+	if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+		if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
 		    (tunnel->i_seqno &&
 		     (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
 			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
@@ -946,7 +944,9 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
 		if (iptunnel_pull_header(skb, 0, tpi->proto, false))
 			goto drop;
 		if (t->parms.collect_md) {
-			tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+			IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+			tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0);
 			if (!tun_dst)
 				goto drop;
 		}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 894e6a4c4be3..83b195f09561 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -207,7 +207,7 @@ static int ipip6_tunnel_create(struct net_device *dev)
 	__dev_addr_set(dev, &t->parms.iph.saddr, 4);
 	memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
 
-	if ((__force u16)t->parms.i_flags & SIT_ISATAP)
+	if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags))
 		dev->priv_flags |= IFF_ISATAP;
 
 	dev->rtnl_link_ops = &sit_link_ops;
@@ -1700,7 +1700,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	    nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
 		       !!(parm->iph.frag_off & htons(IP_DF))) ||
 	    nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
-	    nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) ||
+	    nla_put_be16(skb, IFLA_IPTUN_FLAGS,
+			 ip_tunnel_flags_to_be16(parm->i_flags)) ||
 	    nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
 		goto nla_put_failure;
 
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index a2c16b501087..c7a8a08b7308 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1550,6 +1550,7 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
 	if (!dest)
 		goto unk;
 	if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		IP_TUNNEL_DECLARE_FLAGS(flags);
 		__be16 type;
 
 		/* Only support version 0 and C (csum) */
@@ -1560,7 +1561,10 @@ static int ipvs_gre_decap(struct netns_ipvs *ipvs, struct sk_buff *skb,
 		if (type != htons(ETH_P_IP))
 			goto unk;
 		*proto = IPPROTO_IPIP;
-		return gre_calc_hlen(gre_flags_to_tnl_flags(greh->flags));
+
+		gre_flags_to_tnl_flags(flags, greh->flags);
+
+		return gre_calc_hlen(flags);
 	}
 
 unk:
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 65e0259178da..39b5fd6bbf65 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -390,10 +390,10 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			    skb->ip_summed == CHECKSUM_PARTIAL)
 				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
 		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
-			__be16 tflags = 0;
+			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
 
 			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
-				tflags |= TUNNEL_CSUM;
+				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
 			mtu -= gre_calc_hlen(tflags);
 		}
 		if (mtu < 68) {
@@ -553,10 +553,10 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 			    skb->ip_summed == CHECKSUM_PARTIAL)
 				mtu -= GUE_PLEN_REMCSUM + GUE_LEN_PRIV;
 		} else if (dest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
-			__be16 tflags = 0;
+			IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
 
 			if (dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
-				tflags |= TUNNEL_CSUM;
+				__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
 			mtu -= gre_calc_hlen(tflags);
 		}
 		if (mtu < IPV6_MIN_MTU) {
@@ -1082,11 +1082,11 @@ ipvs_gre_encap(struct net *net, struct sk_buff *skb,
 {
 	__be16 proto = *next_protocol == IPPROTO_IPIP ?
 				htons(ETH_P_IP) : htons(ETH_P_IPV6);
-	__be16 tflags = 0;
+	IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
 	size_t hdrlen;
 
 	if (cp->dest->tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
-		tflags |= TUNNEL_CSUM;
+		__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
 
 	hdrlen = gre_calc_hlen(tflags);
 	gre_build_header(skb, hdrlen, tflags, proto, 0, 0);
@@ -1165,11 +1165,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
 		size_t gre_hdrlen;
-		__be16 tflags = 0;
 
 		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
-			tflags |= TUNNEL_CSUM;
+			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
 		gre_hdrlen = gre_calc_hlen(tflags);
 
 		max_headroom += gre_hdrlen;
@@ -1310,11 +1310,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 
 		max_headroom += sizeof(struct udphdr) + gue_hdrlen;
 	} else if (tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GRE) {
+		IP_TUNNEL_DECLARE_FLAGS(tflags) = { };
 		size_t gre_hdrlen;
-		__be16 tflags = 0;
 
 		if (tun_flags & IP_VS_TUNNEL_ENCAP_FLAG_CSUM)
-			tflags |= TUNNEL_CSUM;
+			__set_bit(IP_TUNNEL_CSUM_BIT, tflags);
 		gre_hdrlen = gre_calc_hlen(tflags);
 
 		max_headroom += gre_hdrlen;
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index f735d79d8be5..60a76e6e348e 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -174,8 +174,8 @@ struct nft_tunnel_opts {
 		struct erspan_metadata	erspan;
 		u8	data[IP_TUNNEL_OPTS_MAX];
 	} u;
+	IP_TUNNEL_DECLARE_FLAGS(flags);
 	u32	len;
-	__be16	flags;
 };
 
 struct nft_tunnel_obj {
@@ -271,7 +271,8 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
 	opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
 
 	opts->len	= sizeof(struct vxlan_metadata);
-	opts->flags	= TUNNEL_VXLAN_OPT;
+	ip_tunnel_flags_zero(opts->flags);
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags);
 
 	return 0;
 }
@@ -325,7 +326,8 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
 	opts->u.erspan.version = version;
 
 	opts->len	= sizeof(struct erspan_metadata);
-	opts->flags	= TUNNEL_ERSPAN_OPT;
+	ip_tunnel_flags_zero(opts->flags);
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags);
 
 	return 0;
 }
@@ -366,7 +368,8 @@ static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
 	opt->length = data_len / 4;
 	opt->opt_class = nla_get_be16(tb[NFTA_TUNNEL_KEY_GENEVE_CLASS]);
 	opt->type = nla_get_u8(tb[NFTA_TUNNEL_KEY_GENEVE_TYPE]);
-	opts->flags = TUNNEL_GENEVE_OPT;
+	ip_tunnel_flags_zero(opts->flags);
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags);
 
 	return 0;
 }
@@ -385,8 +388,8 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
 				    struct nft_tunnel_opts *opts)
 {
 	struct nlattr *nla;
-	__be16 type = 0;
 	int err, rem;
+	u32 type = 0;
 
 	err = nla_validate_nested_deprecated(attr, NFTA_TUNNEL_KEY_OPTS_MAX,
 					     nft_tunnel_opts_policy, NULL);
@@ -401,7 +404,7 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
 			err = nft_tunnel_obj_vxlan_init(nla, opts);
 			if (err)
 				return err;
-			type = TUNNEL_VXLAN_OPT;
+			type = IP_TUNNEL_VXLAN_OPT_BIT;
 			break;
 		case NFTA_TUNNEL_KEY_OPTS_ERSPAN:
 			if (type)
@@ -409,15 +412,15 @@ static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
 			err = nft_tunnel_obj_erspan_init(nla, opts);
 			if (err)
 				return err;
-			type = TUNNEL_ERSPAN_OPT;
+			type = IP_TUNNEL_ERSPAN_OPT_BIT;
 			break;
 		case NFTA_TUNNEL_KEY_OPTS_GENEVE:
-			if (type && type != TUNNEL_GENEVE_OPT)
+			if (type && type != IP_TUNNEL_GENEVE_OPT_BIT)
 				return -EINVAL;
 			err = nft_tunnel_obj_geneve_init(nla, opts);
 			if (err)
 				return err;
-			type = TUNNEL_GENEVE_OPT;
+			type = IP_TUNNEL_GENEVE_OPT_BIT;
 			break;
 		default:
 			return -EOPNOTSUPP;
@@ -454,7 +457,9 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
 	memset(&info, 0, sizeof(info));
 	info.mode		= IP_TUNNEL_INFO_TX;
 	info.key.tun_id		= key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
-	info.key.tun_flags	= TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+	__set_bit(IP_TUNNEL_KEY_BIT, info.key.tun_flags);
+	__set_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
+	__set_bit(IP_TUNNEL_NOCACHE_BIT, info.key.tun_flags);
 
 	if (tb[NFTA_TUNNEL_KEY_IP]) {
 		err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
@@ -483,11 +488,12 @@ static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
 			return -EOPNOTSUPP;
 
 		if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
-			info.key.tun_flags &= ~TUNNEL_CSUM;
+			__clear_bit(IP_TUNNEL_CSUM_BIT, info.key.tun_flags);
 		if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
-			info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+			__set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
+				  info.key.tun_flags);
 		if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
-			info.key.tun_flags |= TUNNEL_SEQ;
+			__set_bit(IP_TUNNEL_SEQ_BIT, info.key.tun_flags);
 	}
 	if (tb[NFTA_TUNNEL_KEY_TOS])
 		info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
@@ -583,7 +589,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
 	if (!nest)
 		return -1;
 
-	if (opts->flags & TUNNEL_VXLAN_OPT) {
+	if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, opts->flags)) {
 		inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN);
 		if (!inner)
 			goto failure;
@@ -591,7 +597,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
 				 htonl(opts->u.vxlan.gbp)))
 			goto inner_failure;
 		nla_nest_end(skb, inner);
-	} else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+	} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, opts->flags)) {
 		inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN);
 		if (!inner)
 			goto failure;
@@ -613,7 +619,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
 			break;
 		}
 		nla_nest_end(skb, inner);
-	} else if (opts->flags & TUNNEL_GENEVE_OPT) {
+	} else if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, opts->flags)) {
 		struct geneve_opt *opt;
 		int offset = 0;
 
@@ -658,11 +664,11 @@ static int nft_tunnel_flags_dump(struct sk_buff *skb,
 {
 	u32 flags = 0;
 
-	if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags))
 		flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
-	if (!(info->key.tun_flags & TUNNEL_CSUM))
+	if (!test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
 		flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
-	if (info->key.tun_flags & TUNNEL_SEQ)
+	if (test_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags))
 		flags |= NFT_TUNNEL_F_SEQ_NUMBER;
 
 	if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index ebc5728aab4e..f224d9bcea5e 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -152,6 +152,13 @@ static void update_range(struct sw_flow_match *match,
 			       sizeof((match)->key->field));                \
 	} while (0)
 
+#define SW_FLOW_KEY_BITMAP_COPY(match, field, value_p, nbits, is_mask) ({     \
+	update_range(match, offsetof(struct sw_flow_key, field),	      \
+		     bitmap_size(nbits), is_mask);			      \
+	bitmap_copy(is_mask ? (match)->mask->key.field : (match)->key->field, \
+		    value_p, nbits);					      \
+})
+
 static bool match_validate(const struct sw_flow_match *match,
 			   u64 key_attrs, u64 mask_attrs, bool log)
 {
@@ -670,8 +677,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			      bool log)
 {
 	bool ttl = false, ipv4 = false, ipv6 = false;
+	IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
 	bool info_bridge_mode = false;
-	__be16 tun_flags = 0;
 	int opts_type = 0;
 	struct nlattr *a;
 	int rem;
@@ -697,7 +704,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 		case OVS_TUNNEL_KEY_ATTR_ID:
 			SW_FLOW_KEY_PUT(match, tun_key.tun_id,
 					nla_get_be64(a), is_mask);
-			tun_flags |= TUNNEL_KEY;
+			__set_bit(IP_TUNNEL_KEY_BIT, tun_flags);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
 			SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src,
@@ -729,10 +736,10 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			ttl = true;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
-			tun_flags |= TUNNEL_DONT_FRAGMENT;
+			__set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_flags);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_CSUM:
-			tun_flags |= TUNNEL_CSUM;
+			__set_bit(IP_TUNNEL_CSUM_BIT, tun_flags);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_TP_SRC:
 			SW_FLOW_KEY_PUT(match, tun_key.tp_src,
@@ -743,7 +750,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 					nla_get_be16(a), is_mask);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_OAM:
-			tun_flags |= TUNNEL_OAM;
+			__set_bit(IP_TUNNEL_OAM_BIT, tun_flags);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS:
 			if (opts_type) {
@@ -755,7 +762,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			if (err)
 				return err;
 
-			tun_flags |= TUNNEL_GENEVE_OPT;
+			__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_flags);
 			opts_type = type;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
@@ -768,7 +775,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			if (err)
 				return err;
 
-			tun_flags |= TUNNEL_VXLAN_OPT;
+			__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_flags);
 			opts_type = type;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_PAD:
@@ -784,7 +791,7 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 			if (err)
 				return err;
 
-			tun_flags |= TUNNEL_ERSPAN_OPT;
+			__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_flags);
 			opts_type = type;
 			break;
 		case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE:
@@ -798,7 +805,8 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 		}
 	}
 
-	SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask);
+	SW_FLOW_KEY_BITMAP_COPY(match, tun_key.tun_flags, tun_flags,
+				__IP_TUNNEL_FLAG_NUM, is_mask);
 	if (is_mask)
 		SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true);
 	else
@@ -823,13 +831,15 @@ static int ip_tun_from_nlattr(const struct nlattr *attr,
 		}
 		if (ipv4) {
 			if (info_bridge_mode) {
+				__clear_bit(IP_TUNNEL_KEY_BIT, tun_flags);
+
 				if (match->key->tun_key.u.ipv4.src ||
 				    match->key->tun_key.u.ipv4.dst ||
 				    match->key->tun_key.tp_src ||
 				    match->key->tun_key.tp_dst ||
 				    match->key->tun_key.ttl ||
 				    match->key->tun_key.tos ||
-				    tun_flags & ~TUNNEL_KEY) {
+				    !ip_tunnel_flags_empty(tun_flags)) {
 					OVS_NLERR(log, "IPv4 tun info is not correct");
 					return -EINVAL;
 				}
@@ -874,7 +884,7 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 			      const void *tun_opts, int swkey_tun_opts_len,
 			      unsigned short tun_proto, u8 mode)
 {
-	if (output->tun_flags & TUNNEL_KEY &&
+	if (test_bit(IP_TUNNEL_KEY_BIT, output->tun_flags) &&
 	    nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id,
 			 OVS_TUNNEL_KEY_ATTR_PAD))
 		return -EMSGSIZE;
@@ -910,10 +920,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 		return -EMSGSIZE;
 	if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl))
 		return -EMSGSIZE;
-	if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, output->tun_flags) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
 		return -EMSGSIZE;
-	if ((output->tun_flags & TUNNEL_CSUM) &&
+	if (test_bit(IP_TUNNEL_CSUM_BIT, output->tun_flags) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
 		return -EMSGSIZE;
 	if (output->tp_src &&
@@ -922,18 +932,20 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb,
 	if (output->tp_dst &&
 	    nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst))
 		return -EMSGSIZE;
-	if ((output->tun_flags & TUNNEL_OAM) &&
+	if (test_bit(IP_TUNNEL_OAM_BIT, output->tun_flags) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
 	if (swkey_tun_opts_len) {
-		if (output->tun_flags & TUNNEL_GENEVE_OPT &&
+		if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, output->tun_flags) &&
 		    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
 			    swkey_tun_opts_len, tun_opts))
 			return -EMSGSIZE;
-		else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
+		else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT,
+				  output->tun_flags) &&
 			 vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
 			return -EMSGSIZE;
-		else if (output->tun_flags & TUNNEL_ERSPAN_OPT &&
+		else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+				  output->tun_flags) &&
 			 nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
 				 swkey_tun_opts_len, tun_opts))
 			return -EMSGSIZE;
@@ -2029,7 +2041,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
 	if ((swkey->tun_proto || is_mask)) {
 		const void *opts = NULL;
 
-		if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
+		if (ip_tunnel_is_options_present(output->tun_key.tun_flags))
 			opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len);
 
 		if (ip_tun_to_nlattr(skb, &output->tun_key, opts,
@@ -2752,7 +2764,8 @@ static int validate_geneve_opts(struct sw_flow_key *key)
 		opts_len -= len;
 	}
 
-	key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0;
+	if (crit_opt)
+		__set_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_key.tun_flags);
 
 	return 0;
 }
@@ -2760,6 +2773,7 @@ static int validate_geneve_opts(struct sw_flow_key *key)
 static int validate_and_copy_set_tun(const struct nlattr *attr,
 				     struct sw_flow_actions **sfa, bool log)
 {
+	IP_TUNNEL_DECLARE_FLAGS(dst_opt_type) = { };
 	struct sw_flow_match match;
 	struct sw_flow_key key;
 	struct metadata_dst *tun_dst;
@@ -2767,9 +2781,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 	struct ovs_tunnel_info *ovs_tun;
 	struct nlattr *a;
 	int err = 0, start, opts_type;
-	__be16 dst_opt_type;
 
-	dst_opt_type = 0;
 	ovs_match_init(&match, &key, true, NULL);
 	opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log);
 	if (opts_type < 0)
@@ -2781,13 +2793,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
 			err = validate_geneve_opts(&key);
 			if (err < 0)
 				return err;
-			dst_opt_type = TUNNEL_GENEVE_OPT;
+
+			__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, dst_opt_type);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS:
-			dst_opt_type = TUNNEL_VXLAN_OPT;
+			__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, dst_opt_type);
 			break;
 		case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS:
-			dst_opt_type = TUNNEL_ERSPAN_OPT;
+			__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, dst_opt_type);
 			break;
 		}
 	}
diff --git a/net/psample/psample.c b/net/psample/psample.c
index ddd211a151d0..a5d9b8446f77 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -221,7 +221,7 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb,
 	const struct ip_tunnel_key *tun_key = &tun_info->key;
 	int tun_opts_len = tun_info->options_len;
 
-	if (tun_key->tun_flags & TUNNEL_KEY &&
+	if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags) &&
 	    nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id,
 			 PSAMPLE_TUNNEL_KEY_ATTR_PAD))
 		return -EMSGSIZE;
@@ -257,10 +257,10 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb,
 		return -EMSGSIZE;
 	if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags) &&
 	    nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_CSUM) &&
+	if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags) &&
 	    nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM))
 		return -EMSGSIZE;
 	if (tun_key->tp_src &&
@@ -269,15 +269,16 @@ static int __psample_ip_tun_to_nlattr(struct sk_buff *skb,
 	if (tun_key->tp_dst &&
 	    nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst))
 		return -EMSGSIZE;
-	if ((tun_key->tun_flags & TUNNEL_OAM) &&
+	if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags) &&
 	    nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
 	if (tun_opts_len) {
-		if (tun_key->tun_flags & TUNNEL_GENEVE_OPT &&
+		if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags) &&
 		    nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS,
 			    tun_opts_len, tun_opts))
 			return -EMSGSIZE;
-		else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT &&
+		else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+				  tun_key->tun_flags) &&
 			 nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS,
 				 tun_opts_len, tun_opts))
 			return -EMSGSIZE;
@@ -314,7 +315,7 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
 	int tun_opts_len = tun_info->options_len;
 	int sum = nla_total_size(0);	/* PSAMPLE_ATTR_TUNNEL */
 
-	if (tun_key->tun_flags & TUNNEL_KEY)
+	if (test_bit(IP_TUNNEL_KEY_BIT, tun_key->tun_flags))
 		sum += nla_total_size_64bit(sizeof(u64));
 
 	if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE)
@@ -337,20 +338,21 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
 	if (tun_key->tos)
 		sum += nla_total_size(sizeof(u8));
 	sum += nla_total_size(sizeof(u8));	/* TTL */
-	if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT)
+	if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_key->tun_flags))
 		sum += nla_total_size(0);
-	if (tun_key->tun_flags & TUNNEL_CSUM)
+	if (test_bit(IP_TUNNEL_CSUM_BIT, tun_key->tun_flags))
 		sum += nla_total_size(0);
 	if (tun_key->tp_src)
 		sum += nla_total_size(sizeof(u16));
 	if (tun_key->tp_dst)
 		sum += nla_total_size(sizeof(u16));
-	if (tun_key->tun_flags & TUNNEL_OAM)
+	if (test_bit(IP_TUNNEL_OAM_BIT, tun_key->tun_flags))
 		sum += nla_total_size(0);
 	if (tun_opts_len) {
-		if (tun_key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_key->tun_flags))
 			sum += nla_total_size(tun_opts_len);
-		else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT)
+		else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+				  tun_key->tun_flags))
 			sum += nla_total_size(tun_opts_len);
 	}
 
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 1536f8b16f1b..af7c99845948 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -230,7 +230,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 	nla_for_each_attr(attr, head, len, rem) {
 		switch (nla_type(attr)) {
 		case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
-			if (type && type != TUNNEL_GENEVE_OPT) {
+			if (type && type != IP_TUNNEL_GENEVE_OPT_BIT) {
 				NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
 				return -EINVAL;
 			}
@@ -247,7 +247,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 				dst_len -= opt_len;
 				dst += opt_len;
 			}
-			type = TUNNEL_GENEVE_OPT;
+			type = IP_TUNNEL_GENEVE_OPT_BIT;
 			break;
 		case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
 			if (type) {
@@ -259,7 +259,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 			if (opt_len < 0)
 				return opt_len;
 			opts_len += opt_len;
-			type = TUNNEL_VXLAN_OPT;
+			type = IP_TUNNEL_VXLAN_OPT_BIT;
 			break;
 		case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
 			if (type) {
@@ -271,7 +271,7 @@ static int tunnel_key_copy_opts(const struct nlattr *nla, u8 *dst,
 			if (opt_len < 0)
 				return opt_len;
 			opts_len += opt_len;
-			type = TUNNEL_ERSPAN_OPT;
+			type = IP_TUNNEL_ERSPAN_OPT_BIT;
 			break;
 		}
 	}
@@ -302,7 +302,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
 	switch (nla_type(nla_data(nla))) {
 	case TCA_TUNNEL_KEY_ENC_OPTS_GENEVE:
 #if IS_ENABLED(CONFIG_INET)
-		info->key.tun_flags |= TUNNEL_GENEVE_OPT;
+		__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
 		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
 					    opts_len, extack);
 #else
@@ -310,7 +310,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
 #endif
 	case TCA_TUNNEL_KEY_ENC_OPTS_VXLAN:
 #if IS_ENABLED(CONFIG_INET)
-		info->key.tun_flags |= TUNNEL_VXLAN_OPT;
+		__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
 		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
 					    opts_len, extack);
 #else
@@ -318,7 +318,7 @@ static int tunnel_key_opts_set(struct nlattr *nla, struct ip_tunnel_info *info,
 #endif
 	case TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN:
 #if IS_ENABLED(CONFIG_INET)
-		info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+		__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
 		return tunnel_key_copy_opts(nla, ip_tunnel_info_opts(info),
 					    opts_len, extack);
 #else
@@ -363,6 +363,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	bool bind = act_flags & TCA_ACT_FLAGS_BIND;
 	struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
 	struct tcf_tunnel_key_params *params_new;
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
 	struct metadata_dst *metadata = NULL;
 	struct tcf_chain *goto_ch = NULL;
 	struct tc_tunnel_key *parm;
@@ -371,7 +372,6 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 	__be16 dst_port = 0;
 	__be64 key_id = 0;
 	int opts_len = 0;
-	__be16 flags = 0;
 	u8 tos, ttl;
 	int ret = 0;
 	u32 index;
@@ -412,16 +412,16 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
 
 			key32 = nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
 			key_id = key32_to_tunnel_id(key32);
-			flags = TUNNEL_KEY;
+			__set_bit(IP_TUNNEL_KEY_BIT, flags);
 		}
 
-		flags |= TUNNEL_CSUM;
+		__set_bit(IP_TUNNEL_CSUM_BIT, flags);
 		if (tb[TCA_TUNNEL_KEY_NO_CSUM] &&
 		    nla_get_u8(tb[TCA_TUNNEL_KEY_NO_CSUM]))
-			flags &= ~TUNNEL_CSUM;
+			__clear_bit(IP_TUNNEL_CSUM_BIT, flags);
 
 		if (nla_get_flag(tb[TCA_TUNNEL_KEY_NO_FRAG]))
-			flags |= TUNNEL_DONT_FRAGMENT;
+			__set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, flags);
 
 		if (tb[TCA_TUNNEL_KEY_ENC_DST_PORT])
 			dst_port = nla_get_be16(tb[TCA_TUNNEL_KEY_ENC_DST_PORT]);
@@ -663,15 +663,15 @@ static int tunnel_key_opts_dump(struct sk_buff *skb,
 	if (!start)
 		return -EMSGSIZE;
 
-	if (info->key.tun_flags & TUNNEL_GENEVE_OPT) {
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
 		err = tunnel_key_geneve_opts_dump(skb, info);
 		if (err)
 			goto err_out;
-	} else if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
+	} else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
 		err = tunnel_key_vxlan_opts_dump(skb, info);
 		if (err)
 			goto err_out;
-	} else if (info->key.tun_flags & TUNNEL_ERSPAN_OPT) {
+	} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
 		err = tunnel_key_erspan_opts_dump(skb, info);
 		if (err)
 			goto err_out;
@@ -741,7 +741,7 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 		struct ip_tunnel_key *key = &info->key;
 		__be32 key_id = tunnel_id_to_key32(key->tun_id);
 
-		if (((key->tun_flags & TUNNEL_KEY) &&
+		if ((test_bit(IP_TUNNEL_KEY_BIT, key->tun_flags) &&
 		     nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID, key_id)) ||
 		    tunnel_key_dump_addresses(skb,
 					      &params->tcft_enc_metadata->u.tun_info) ||
@@ -749,8 +749,8 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
 		      nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_DST_PORT,
 				   key->tp_dst)) ||
 		    nla_put_u8(skb, TCA_TUNNEL_KEY_NO_CSUM,
-			       !(key->tun_flags & TUNNEL_CSUM)) ||
-		    ((key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
+			       !test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags)) ||
+		    (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) &&
 		     nla_put_flag(skb, TCA_TUNNEL_KEY_NO_FRAG)) ||
 		    tunnel_key_opts_dump(skb, info))
 			goto nla_put_failure;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index e1314674b4a9..ae14d649140f 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -1454,12 +1454,13 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 		switch (nla_type(nla_opt_key)) {
 		case TCA_FLOWER_KEY_ENC_OPTS_GENEVE:
 			if (key->enc_opts.dst_opt_type &&
-			    key->enc_opts.dst_opt_type != TUNNEL_GENEVE_OPT) {
+			    key->enc_opts.dst_opt_type !=
+			    IP_TUNNEL_GENEVE_OPT_BIT) {
 				NL_SET_ERR_MSG(extack, "Duplicate type for geneve options");
 				return -EINVAL;
 			}
 			option_len = 0;
-			key->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+			key->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT;
 			option_len = fl_set_geneve_opt(nla_opt_key, key,
 						       key_depth, option_len,
 						       extack);
@@ -1470,7 +1471,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			/* At the same time we need to parse through the mask
 			 * in order to verify exact and mask attribute lengths.
 			 */
-			mask->enc_opts.dst_opt_type = TUNNEL_GENEVE_OPT;
+			mask->enc_opts.dst_opt_type = IP_TUNNEL_GENEVE_OPT_BIT;
 			option_len = fl_set_geneve_opt(nla_opt_msk, mask,
 						       msk_depth, option_len,
 						       extack);
@@ -1489,7 +1490,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 				return -EINVAL;
 			}
 			option_len = 0;
-			key->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+			key->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT;
 			option_len = fl_set_vxlan_opt(nla_opt_key, key,
 						      key_depth, option_len,
 						      extack);
@@ -1500,7 +1501,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			/* At the same time we need to parse through the mask
 			 * in order to verify exact and mask attribute lengths.
 			 */
-			mask->enc_opts.dst_opt_type = TUNNEL_VXLAN_OPT;
+			mask->enc_opts.dst_opt_type = IP_TUNNEL_VXLAN_OPT_BIT;
 			option_len = fl_set_vxlan_opt(nla_opt_msk, mask,
 						      msk_depth, option_len,
 						      extack);
@@ -1519,7 +1520,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 				return -EINVAL;
 			}
 			option_len = 0;
-			key->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+			key->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT;
 			option_len = fl_set_erspan_opt(nla_opt_key, key,
 						       key_depth, option_len,
 						       extack);
@@ -1530,7 +1531,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			/* At the same time we need to parse through the mask
 			 * in order to verify exact and mask attribute lengths.
 			 */
-			mask->enc_opts.dst_opt_type = TUNNEL_ERSPAN_OPT;
+			mask->enc_opts.dst_opt_type = IP_TUNNEL_ERSPAN_OPT_BIT;
 			option_len = fl_set_erspan_opt(nla_opt_msk, mask,
 						       msk_depth, option_len,
 						       extack);
@@ -1550,7 +1551,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 				return -EINVAL;
 			}
 			option_len = 0;
-			key->enc_opts.dst_opt_type = TUNNEL_GTP_OPT;
+			key->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT;
 			option_len = fl_set_gtp_opt(nla_opt_key, key,
 						    key_depth, option_len,
 						    extack);
@@ -1561,7 +1562,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			/* At the same time we need to parse through the mask
 			 * in order to verify exact and mask attribute lengths.
 			 */
-			mask->enc_opts.dst_opt_type = TUNNEL_GTP_OPT;
+			mask->enc_opts.dst_opt_type = IP_TUNNEL_GTP_OPT_BIT;
 			option_len = fl_set_gtp_opt(nla_opt_msk, mask,
 						    msk_depth, option_len,
 						    extack);
@@ -3202,22 +3203,22 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
 		goto nla_put_failure;
 
 	switch (enc_opts->dst_opt_type) {
-	case TUNNEL_GENEVE_OPT:
+	case IP_TUNNEL_GENEVE_OPT_BIT:
 		err = fl_dump_key_geneve_opt(skb, enc_opts);
 		if (err)
 			goto nla_put_failure;
 		break;
-	case TUNNEL_VXLAN_OPT:
+	case IP_TUNNEL_VXLAN_OPT_BIT:
 		err = fl_dump_key_vxlan_opt(skb, enc_opts);
 		if (err)
 			goto nla_put_failure;
 		break;
-	case TUNNEL_ERSPAN_OPT:
+	case IP_TUNNEL_ERSPAN_OPT_BIT:
 		err = fl_dump_key_erspan_opt(skb, enc_opts);
 		if (err)
 			goto nla_put_failure;
 		break;
-	case TUNNEL_GTP_OPT:
+	case IP_TUNNEL_GTP_OPT_BIT:
 		err = fl_dump_key_gtp_opt(skb, enc_opts);
 		if (err)
 			goto nla_put_failure;
-- 
cgit v1.2.3


From 76c8764ef36a5d37ea2e551bda28ac7f028383ba Mon Sep 17 00:00:00 2001
From: Wojciech Drewek <wojciech.drewek@intel.com>
Date: Wed, 27 Mar 2024 16:23:55 +0100
Subject: pfcp: add PFCP module

Packet Forwarding Control Protocol (PFCP) is a 3GPP Protocol
used between the control plane and the user plane function.
It is specified in TS 29.244[1].

Note that this module is not designed to support this Protocol
in the kernel space. There is no support for parsing any PFCP messages.
There is no API that could be used by any userspace daemon.
Basically it does not support PFCP. This protocol is sophisticated
and there is no need for implementing it in the kernel. The purpose
of this module is to allow users to setup software and hardware offload
of PFCP packets using tc tool.

When user requests to create a PFCP device, a new socket is created.
The socket is set up with port number 8805 which is specific for
PFCP [29.244 4.2.2]. This allow to receive PFCP request messages,
response messages use other ports.

Note that only one PFCP netdev can be created.

Only IPv4 is supported at this time.

[1] https://portal.3gpp.org/desktopmodules/Specifications/SpecificationDetails.aspx?specificationId=3111

Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
Signed-off-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/Kconfig  |  13 +++
 drivers/net/Makefile |   1 +
 drivers/net/pfcp.c   | 223 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/pfcp.h   |  17 ++++
 4 files changed, 254 insertions(+)
 create mode 100644 drivers/net/pfcp.c
 create mode 100644 include/net/pfcp.h

(limited to 'include')

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 8ca0bc223b30..172d84e39129 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -290,6 +290,19 @@ config GTP
 	  To compile this drivers as a module, choose M here: the module
 	  will be called gtp.
 
+config PFCP
+	tristate "Packet Forwarding Control Protocol (PFCP)"
+	depends on INET
+	select NET_UDP_TUNNEL
+	help
+	  This allows one to create PFCP virtual interfaces that allows to
+	  set up software and hardware offload of PFCP packets.
+	  Note that this module does not support PFCP protocol in the kernel space.
+	  There is no support for parsing any PFCP messages.
+
+	  To compile this drivers as a module, choose M here: the module
+	  will be called pfcp.
+
 config AMT
 	tristate "Automatic Multicast Tunneling (AMT)"
 	depends on INET && IP_MULTICAST
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 7cab36f94782..9c053673d6b2 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_GENEVE) += geneve.o
 obj-$(CONFIG_BAREUDP) += bareudp.o
 obj-$(CONFIG_GTP) += gtp.o
 obj-$(CONFIG_NLMON) += nlmon.o
+obj-$(CONFIG_PFCP) += pfcp.o
 obj-$(CONFIG_NET_VRF) += vrf.o
 obj-$(CONFIG_VSOCKMON) += vsockmon.o
 obj-$(CONFIG_MHI_NET) += mhi_net.o
diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c
new file mode 100644
index 000000000000..3f1ee0ae7111
--- /dev/null
+++ b/drivers/net/pfcp.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PFCP according to 3GPP TS 29.244
+ *
+ * Copyright (C) 2022, Intel Corporation.
+ */
+
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/pfcp.h>
+
+struct pfcp_dev {
+	struct list_head	list;
+
+	struct socket		*sock;
+	struct net_device	*dev;
+	struct net		*net;
+};
+
+static unsigned int pfcp_net_id __read_mostly;
+
+struct pfcp_net {
+	struct list_head	pfcp_dev_list;
+};
+
+static void pfcp_del_sock(struct pfcp_dev *pfcp)
+{
+	udp_tunnel_sock_release(pfcp->sock);
+	pfcp->sock = NULL;
+}
+
+static void pfcp_dev_uninit(struct net_device *dev)
+{
+	struct pfcp_dev *pfcp = netdev_priv(dev);
+
+	pfcp_del_sock(pfcp);
+}
+
+static int pfcp_dev_init(struct net_device *dev)
+{
+	struct pfcp_dev *pfcp = netdev_priv(dev);
+
+	pfcp->dev = dev;
+
+	return 0;
+}
+
+static const struct net_device_ops pfcp_netdev_ops = {
+	.ndo_init		= pfcp_dev_init,
+	.ndo_uninit		= pfcp_dev_uninit,
+	.ndo_get_stats64	= dev_get_tstats64,
+};
+
+static const struct device_type pfcp_type = {
+	.name = "pfcp",
+};
+
+static void pfcp_link_setup(struct net_device *dev)
+{
+	dev->netdev_ops = &pfcp_netdev_ops;
+	dev->needs_free_netdev = true;
+	SET_NETDEV_DEVTYPE(dev, &pfcp_type);
+
+	dev->hard_header_len = 0;
+	dev->addr_len = 0;
+
+	dev->type = ARPHRD_NONE;
+	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
+	dev->priv_flags |= IFF_NO_QUEUE;
+
+	netif_keep_dst(dev);
+}
+
+static struct socket *pfcp_create_sock(struct pfcp_dev *pfcp)
+{
+	struct udp_tunnel_sock_cfg tuncfg = {};
+	struct udp_port_cfg udp_conf = {
+		.local_ip.s_addr	= htonl(INADDR_ANY),
+		.family			= AF_INET,
+	};
+	struct net *net = pfcp->net;
+	struct socket *sock;
+	int err;
+
+	udp_conf.local_udp_port = htons(PFCP_PORT);
+
+	err = udp_sock_create(net, &udp_conf, &sock);
+	if (err)
+		return ERR_PTR(err);
+
+	setup_udp_tunnel_sock(net, sock, &tuncfg);
+
+	return sock;
+}
+
+static int pfcp_add_sock(struct pfcp_dev *pfcp)
+{
+	pfcp->sock = pfcp_create_sock(pfcp);
+
+	return PTR_ERR_OR_ZERO(pfcp->sock);
+}
+
+static int pfcp_newlink(struct net *net, struct net_device *dev,
+			struct nlattr *tb[], struct nlattr *data[],
+			struct netlink_ext_ack *extack)
+{
+	struct pfcp_dev *pfcp = netdev_priv(dev);
+	struct pfcp_net *pn;
+	int err;
+
+	pfcp->net = net;
+
+	err = pfcp_add_sock(pfcp);
+	if (err) {
+		netdev_dbg(dev, "failed to add pfcp socket %d\n", err);
+		goto exit_err;
+	}
+
+	err = register_netdevice(dev);
+	if (err) {
+		netdev_dbg(dev, "failed to register pfcp netdev %d\n", err);
+		goto exit_del_pfcp_sock;
+	}
+
+	pn = net_generic(dev_net(dev), pfcp_net_id);
+	list_add_rcu(&pfcp->list, &pn->pfcp_dev_list);
+
+	netdev_dbg(dev, "registered new PFCP interface\n");
+
+	return 0;
+
+exit_del_pfcp_sock:
+	pfcp_del_sock(pfcp);
+exit_err:
+	pfcp->net = NULL;
+	return err;
+}
+
+static void pfcp_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct pfcp_dev *pfcp = netdev_priv(dev);
+
+	list_del_rcu(&pfcp->list);
+	unregister_netdevice_queue(dev, head);
+}
+
+static struct rtnl_link_ops pfcp_link_ops __read_mostly = {
+	.kind		= "pfcp",
+	.priv_size	= sizeof(struct pfcp_dev),
+	.setup		= pfcp_link_setup,
+	.newlink	= pfcp_newlink,
+	.dellink	= pfcp_dellink,
+};
+
+static int __net_init pfcp_net_init(struct net *net)
+{
+	struct pfcp_net *pn = net_generic(net, pfcp_net_id);
+
+	INIT_LIST_HEAD(&pn->pfcp_dev_list);
+	return 0;
+}
+
+static void __net_exit pfcp_net_exit(struct net *net)
+{
+	struct pfcp_net *pn = net_generic(net, pfcp_net_id);
+	struct pfcp_dev *pfcp;
+	LIST_HEAD(list);
+
+	rtnl_lock();
+	list_for_each_entry(pfcp, &pn->pfcp_dev_list, list)
+		pfcp_dellink(pfcp->dev, &list);
+
+	unregister_netdevice_many(&list);
+	rtnl_unlock();
+}
+
+static struct pernet_operations pfcp_net_ops = {
+	.init	= pfcp_net_init,
+	.exit	= pfcp_net_exit,
+	.id	= &pfcp_net_id,
+	.size	= sizeof(struct pfcp_net),
+};
+
+static int __init pfcp_init(void)
+{
+	int err;
+
+	err = register_pernet_subsys(&pfcp_net_ops);
+	if (err)
+		goto exit_err;
+
+	err = rtnl_link_register(&pfcp_link_ops);
+	if (err)
+		goto exit_unregister_subsys;
+	return 0;
+
+exit_unregister_subsys:
+	unregister_pernet_subsys(&pfcp_net_ops);
+exit_err:
+	pr_err("loading PFCP module failed: err %d\n", err);
+	return err;
+}
+late_initcall(pfcp_init);
+
+static void __exit pfcp_exit(void)
+{
+	rtnl_link_unregister(&pfcp_link_ops);
+	unregister_pernet_subsys(&pfcp_net_ops);
+
+	pr_info("PFCP module unloaded\n");
+}
+module_exit(pfcp_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Wojciech Drewek <wojciech.drewek@intel.com>");
+MODULE_DESCRIPTION("Interface driver for PFCP encapsulated traffic");
+MODULE_ALIAS_RTNL_LINK("pfcp");
diff --git a/include/net/pfcp.h b/include/net/pfcp.h
new file mode 100644
index 000000000000..3f9ebf27a8ff
--- /dev/null
+++ b/include/net/pfcp.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PFCP_H_
+#define _PFCP_H_
+
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#define PFCP_PORT 8805
+
+static inline bool netif_is_pfcp(const struct net_device *dev)
+{
+	return dev->rtnl_link_ops &&
+	       !strcmp(dev->rtnl_link_ops->kind, "pfcp");
+}
+
+#endif
-- 
cgit v1.2.3


From 6dd514f48110ebb4bf36875b9e7e02d07b589caa Mon Sep 17 00:00:00 2001
From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Date: Wed, 27 Mar 2024 16:23:56 +0100
Subject: pfcp: always set pfcp metadata

In PFCP receive path set metadata needed by flower code to do correct
classification based on this metadata.

Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
Signed-off-by: Marcin Szycik <marcin.szycik@linux.intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/pfcp.c             |  81 ++++++++++++++++++++++++++++++-
 include/net/ip_tunnels.h       |   3 ++
 include/net/pfcp.h             |  73 ++++++++++++++++++++++++++++
 include/uapi/linux/if_tunnel.h |   3 ++
 include/uapi/linux/pkt_cls.h   |  14 ++++++
 net/core/net_test.c            |   7 +--
 net/sched/cls_flower.c         | 107 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 282 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/pfcp.c b/drivers/net/pfcp.c
index 3f1ee0ae7111..cc5b28c5f99f 100644
--- a/drivers/net/pfcp.c
+++ b/drivers/net/pfcp.c
@@ -21,6 +21,8 @@ struct pfcp_dev {
 	struct socket		*sock;
 	struct net_device	*dev;
 	struct net		*net;
+
+	struct gro_cells	gro_cells;
 };
 
 static unsigned int pfcp_net_id __read_mostly;
@@ -29,6 +31,78 @@ struct pfcp_net {
 	struct list_head	pfcp_dev_list;
 };
 
+static void
+pfcp_session_recv(struct pfcp_dev *pfcp, struct sk_buff *skb,
+		  struct pfcp_metadata *md)
+{
+	struct pfcphdr_session *unparsed = pfcp_hdr_session(skb);
+
+	md->seid = unparsed->seid;
+	md->type = PFCP_TYPE_SESSION;
+}
+
+static void
+pfcp_node_recv(struct pfcp_dev *pfcp, struct sk_buff *skb,
+	       struct pfcp_metadata *md)
+{
+	md->type = PFCP_TYPE_NODE;
+}
+
+static int pfcp_encap_recv(struct sock *sk, struct sk_buff *skb)
+{
+	IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+	struct metadata_dst *tun_dst;
+	struct pfcp_metadata *md;
+	struct pfcphdr *unparsed;
+	struct pfcp_dev *pfcp;
+
+	if (unlikely(!pskb_may_pull(skb, PFCP_HLEN)))
+		goto drop;
+
+	pfcp = rcu_dereference_sk_user_data(sk);
+	if (unlikely(!pfcp))
+		goto drop;
+
+	unparsed = pfcp_hdr(skb);
+
+	ip_tunnel_flags_zero(flags);
+	tun_dst = udp_tun_rx_dst(skb, sk->sk_family, flags, 0,
+				 sizeof(*md));
+	if (unlikely(!tun_dst))
+		goto drop;
+
+	md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
+	if (unlikely(!md))
+		goto drop;
+
+	if (unparsed->flags & PFCP_SEID_FLAG)
+		pfcp_session_recv(pfcp, skb, md);
+	else
+		pfcp_node_recv(pfcp, skb, md);
+
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, flags);
+	ip_tunnel_info_opts_set(&tun_dst->u.tun_info, md, sizeof(*md),
+				flags);
+
+	if (unlikely(iptunnel_pull_header(skb, PFCP_HLEN, skb->protocol,
+					  !net_eq(sock_net(sk),
+					  dev_net(pfcp->dev)))))
+		goto drop;
+
+	skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
+	skb_reset_network_header(skb);
+	skb_reset_mac_header(skb);
+	skb->dev = pfcp->dev;
+
+	gro_cells_receive(&pfcp->gro_cells, skb);
+
+	return 0;
+drop:
+	kfree_skb(skb);
+	return 0;
+}
+
 static void pfcp_del_sock(struct pfcp_dev *pfcp)
 {
 	udp_tunnel_sock_release(pfcp->sock);
@@ -39,6 +113,7 @@ static void pfcp_dev_uninit(struct net_device *dev)
 {
 	struct pfcp_dev *pfcp = netdev_priv(dev);
 
+	gro_cells_destroy(&pfcp->gro_cells);
 	pfcp_del_sock(pfcp);
 }
 
@@ -48,7 +123,7 @@ static int pfcp_dev_init(struct net_device *dev)
 
 	pfcp->dev = dev;
 
-	return 0;
+	return gro_cells_init(&pfcp->gro_cells, dev);
 }
 
 static const struct net_device_ops pfcp_netdev_ops = {
@@ -94,6 +169,10 @@ static struct socket *pfcp_create_sock(struct pfcp_dev *pfcp)
 	if (err)
 		return ERR_PTR(err);
 
+	tuncfg.sk_user_data = pfcp;
+	tuncfg.encap_rcv = pfcp_encap_recv;
+	tuncfg.encap_type = 1;
+
 	setup_udp_tunnel_sock(net, sock, &tuncfg);
 
 	return sock;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index ed8e48cc9054..d8f574fbb11e 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -216,6 +216,7 @@ static inline void ip_tunnel_set_options_present(unsigned long *flags)
 	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
 	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
 	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, present);
 
 	ip_tunnel_flags_or(flags, flags, present);
 }
@@ -228,6 +229,7 @@ static inline void ip_tunnel_clear_options_present(unsigned long *flags)
 	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
 	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
 	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, present);
 
 	__ipt_flag_op(bitmap_andnot, flags, flags, present);
 }
@@ -240,6 +242,7 @@ static inline bool ip_tunnel_is_options_present(const unsigned long *flags)
 	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, present);
 	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, present);
 	__set_bit(IP_TUNNEL_GTP_OPT_BIT, present);
+	__set_bit(IP_TUNNEL_PFCP_OPT_BIT, present);
 
 	return ip_tunnel_flags_intersect(flags, present);
 }
diff --git a/include/net/pfcp.h b/include/net/pfcp.h
index 3f9ebf27a8ff..af14f970b80e 100644
--- a/include/net/pfcp.h
+++ b/include/net/pfcp.h
@@ -2,12 +2,85 @@
 #ifndef _PFCP_H_
 #define _PFCP_H_
 
+#include <uapi/linux/if_ether.h>
+#include <net/dst_metadata.h>
 #include <linux/netdevice.h>
+#include <uapi/linux/ipv6.h>
+#include <net/udp_tunnel.h>
+#include <uapi/linux/udp.h>
+#include <uapi/linux/ip.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/bits.h>
 
 #define PFCP_PORT 8805
 
+/* PFCP protocol header */
+struct pfcphdr {
+	u8	flags;
+	u8	message_type;
+	__be16	message_length;
+};
+
+/* PFCP header flags */
+#define PFCP_SEID_FLAG		BIT(0)
+#define PFCP_MP_FLAG		BIT(1)
+
+#define PFCP_VERSION_MASK	GENMASK(4, 0)
+
+#define PFCP_HLEN (sizeof(struct udphdr) + sizeof(struct pfcphdr))
+
+/* PFCP node related messages */
+struct pfcphdr_node {
+	u8	seq_number[3];
+	u8	reserved;
+};
+
+/* PFCP session related messages */
+struct pfcphdr_session {
+	__be64	seid;
+	u8	seq_number[3];
+#ifdef __LITTLE_ENDIAN_BITFIELD
+	u8	message_priority:4,
+		reserved:4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	u8	reserved:4,
+		message_priprity:4;
+#else
+#error "Please fix <asm/byteorder>"
+#endif
+};
+
+struct pfcp_metadata {
+	u8 type;
+	__be64 seid;
+} __packed;
+
+enum {
+	PFCP_TYPE_NODE		= 0,
+	PFCP_TYPE_SESSION	= 1,
+};
+
+#define PFCP_HEADROOM (sizeof(struct iphdr) + sizeof(struct udphdr) + \
+		       sizeof(struct pfcphdr) + sizeof(struct ethhdr))
+#define PFCP6_HEADROOM (sizeof(struct ipv6hdr) + sizeof(struct udphdr) + \
+			sizeof(struct pfcphdr) + sizeof(struct ethhdr))
+
+static inline struct pfcphdr *pfcp_hdr(struct sk_buff *skb)
+{
+	return (struct pfcphdr *)(udp_hdr(skb) + 1);
+}
+
+static inline struct pfcphdr_node *pfcp_hdr_node(struct sk_buff *skb)
+{
+	return (struct pfcphdr_node *)(pfcp_hdr(skb) + 1);
+}
+
+static inline struct pfcphdr_session *pfcp_hdr_session(struct sk_buff *skb)
+{
+	return (struct pfcphdr_session *)(pfcp_hdr(skb) + 1);
+}
+
 static inline bool netif_is_pfcp(const struct net_device *dev)
 {
 	return dev->rtnl_link_ops &&
diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h
index 838927dd73a4..e1a246dd8c62 100644
--- a/include/uapi/linux/if_tunnel.h
+++ b/include/uapi/linux/if_tunnel.h
@@ -212,6 +212,9 @@ enum {
 	IP_TUNNEL_VTI_BIT,
 	IP_TUNNEL_SIT_ISATAP_BIT	= IP_TUNNEL_VTI_BIT,
 
+	/* Flags starting from here are not available via the old UAPI */
+	IP_TUNNEL_PFCP_OPT_BIT,		/* OPTIONS_PRESENT */
+
 	__IP_TUNNEL_FLAG_NUM,
 };
 
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index ea277039f89d..229fc925ec3a 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -587,6 +587,10 @@ enum {
 					 * TCA_FLOWER_KEY_ENC_OPT_GTP_
 					 * attributes
 					 */
+	TCA_FLOWER_KEY_ENC_OPTS_PFCP,	/* Nested
+					 * TCA_FLOWER_KEY_ENC_IPT_PFCP
+					 * attributes
+					 */
 	__TCA_FLOWER_KEY_ENC_OPTS_MAX,
 };
 
@@ -636,6 +640,16 @@ enum {
 #define TCA_FLOWER_KEY_ENC_OPT_GTP_MAX \
 		(__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX - 1)
 
+enum {
+	TCA_FLOWER_KEY_ENC_OPT_PFCP_UNSPEC,
+	TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE,		/* u8 */
+	TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID,		/* be64 */
+	__TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX,
+};
+
+#define TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX \
+		(__TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX - 1)
+
 enum {
 	TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC,
 	TCA_FLOWER_KEY_MPLS_OPTS_LSE,
diff --git a/net/core/net_test.c b/net/core/net_test.c
index 30062b9621e4..9c3a590865d2 100644
--- a/net/core/net_test.c
+++ b/net/core/net_test.c
@@ -335,11 +335,8 @@ static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = {
 			     ip_tunnel_flags_1),
 	IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true,
 			     VTI_ISVTI, ip_tunnel_flags_2_exp),
-	IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src,
-			     /* This must be set to ``false`` once
-			      * ``__IP_TUNNEL_FLAG_NUM`` goes above 17.
-			      */
-			     true, cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)),
+	IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false,
+			     cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)),
 			     ip_tunnel_flags_3_exp),
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index ae14d649140f..fd9a6f20b60b 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -28,6 +28,7 @@
 #include <net/vxlan.h>
 #include <net/erspan.h>
 #include <net/gtp.h>
+#include <net/pfcp.h>
 #include <net/tc_wrapper.h>
 
 #include <net/dst.h>
@@ -741,6 +742,7 @@ enc_opts_policy[TCA_FLOWER_KEY_ENC_OPTS_MAX + 1] = {
 	[TCA_FLOWER_KEY_ENC_OPTS_VXLAN]         = { .type = NLA_NESTED },
 	[TCA_FLOWER_KEY_ENC_OPTS_ERSPAN]        = { .type = NLA_NESTED },
 	[TCA_FLOWER_KEY_ENC_OPTS_GTP]		= { .type = NLA_NESTED },
+	[TCA_FLOWER_KEY_ENC_OPTS_PFCP]		= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy
@@ -770,6 +772,12 @@ gtp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GTP_MAX + 1] = {
 	[TCA_FLOWER_KEY_ENC_OPT_GTP_QFI]	   = { .type = NLA_U8 },
 };
 
+static const struct nla_policy
+pfcp_opt_policy[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1] = {
+	[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]	   = { .type = NLA_U8 },
+	[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]	   = { .type = NLA_U64 },
+};
+
 static const struct nla_policy
 mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = {
 	[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]    = { .type = NLA_U8 },
@@ -1419,6 +1427,44 @@ static int fl_set_gtp_opt(const struct nlattr *nla, struct fl_flow_key *key,
 	return sizeof(*sinfo);
 }
 
+static int fl_set_pfcp_opt(const struct nlattr *nla, struct fl_flow_key *key,
+			   int depth, int option_len,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX + 1];
+	struct pfcp_metadata *md;
+	int err;
+
+	md = (struct pfcp_metadata *)&key->enc_opts.data[key->enc_opts.len];
+	memset(md, 0xff, sizeof(*md));
+
+	if (!depth)
+		return sizeof(*md);
+
+	if (nla_type(nla) != TCA_FLOWER_KEY_ENC_OPTS_PFCP) {
+		NL_SET_ERR_MSG_MOD(extack, "Non-pfcp option type for mask");
+		return -EINVAL;
+	}
+
+	err = nla_parse_nested(tb, TCA_FLOWER_KEY_ENC_OPT_PFCP_MAX, nla,
+			       pfcp_opt_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (!option_len && !tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing tunnel key pfcp option type");
+		return -EINVAL;
+	}
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE])
+		md->type = nla_get_u8(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE]);
+
+	if (tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID])
+		md->seid = nla_get_be64(tb[TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID]);
+
+	return sizeof(*md);
+}
+
 static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 			  struct fl_flow_key *mask,
 			  struct netlink_ext_ack *extack)
@@ -1576,6 +1622,36 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key,
 				return -EINVAL;
 			}
 			break;
+		case TCA_FLOWER_KEY_ENC_OPTS_PFCP:
+			if (key->enc_opts.dst_opt_type) {
+				NL_SET_ERR_MSG_MOD(extack, "Duplicate type for pfcp options");
+				return -EINVAL;
+			}
+			option_len = 0;
+			key->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT;
+			option_len = fl_set_pfcp_opt(nla_opt_key, key,
+						     key_depth, option_len,
+						     extack);
+			if (option_len < 0)
+				return option_len;
+
+			key->enc_opts.len += option_len;
+			/* At the same time we need to parse through the mask
+			 * in order to verify exact and mask attribute lengths.
+			 */
+			mask->enc_opts.dst_opt_type = IP_TUNNEL_PFCP_OPT_BIT;
+			option_len = fl_set_pfcp_opt(nla_opt_msk, mask,
+						     msk_depth, option_len,
+						     extack);
+			if (option_len < 0)
+				return option_len;
+
+			mask->enc_opts.len += option_len;
+			if (key->enc_opts.len != mask->enc_opts.len) {
+				NL_SET_ERR_MSG_MOD(extack, "Key and mask miss aligned");
+				return -EINVAL;
+			}
+			break;
 		default:
 			NL_SET_ERR_MSG(extack, "Unknown tunnel option type");
 			return -EINVAL;
@@ -3118,6 +3194,32 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static int fl_dump_key_pfcp_opt(struct sk_buff *skb,
+				struct flow_dissector_key_enc_opts *enc_opts)
+{
+	struct pfcp_metadata *md;
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, TCA_FLOWER_KEY_ENC_OPTS_PFCP);
+	if (!nest)
+		goto nla_put_failure;
+
+	md = (struct pfcp_metadata *)&enc_opts->data[0];
+	if (nla_put_u8(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_TYPE, md->type))
+		goto nla_put_failure;
+
+	if (nla_put_be64(skb, TCA_FLOWER_KEY_ENC_OPT_PFCP_SEID,
+			 md->seid, 0))
+		goto nla_put_failure;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int fl_dump_key_ct(struct sk_buff *skb,
 			  struct flow_dissector_key_ct *key,
 			  struct flow_dissector_key_ct *mask)
@@ -3223,6 +3325,11 @@ static int fl_dump_key_options(struct sk_buff *skb, int enc_opt_type,
 		if (err)
 			goto nla_put_failure;
 		break;
+	case IP_TUNNEL_PFCP_OPT_BIT:
+		err = fl_dump_key_pfcp_opt(skb, enc_opts);
+		if (err)
+			goto nla_put_failure;
+		break;
 	default:
 		goto nla_put_failure;
 	}
-- 
cgit v1.2.3


From b9495b564d91a0afe4125db64d2bc25c310bda9c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:42:18 +0000
Subject: net: move kick_defer_list_purge() to net/core/dev.h

kick_defer_list_purge() is defined in net/core/dev.c
and used from net/core/skubff.c

Because we need softnet_data, include <linux/netdevice.h>
from net/core/dev.h

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 1 -
 net/core/dev.h            | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 55c7cf9404a4..15e70527b703 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3288,7 +3288,6 @@ static inline void dev_xmit_recursion_dec(void)
 	__this_cpu_dec(softnet_data.xmit.recursion);
 }
 
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index 2bcaf8eee50c..9d0f8b4587f8 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -4,11 +4,9 @@
 
 #include <linux/types.h>
 #include <linux/rwsem.h>
+#include <linux/netdevice.h>
 
 struct net;
-struct net_device;
-struct netdev_bpf;
-struct netdev_phys_item_id;
 struct netlink_ext_ack;
 struct cpumask;
 
@@ -150,4 +148,6 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
 #endif
 
 struct napi_struct *napi_by_id(unsigned int napi_id);
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+
 #endif
-- 
cgit v1.2.3


From 2fe50a4d7225cf10748775e290361896637091a9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:42:19 +0000
Subject: net: move dev_xmit_recursion() helpers to net/core/dev.h

Move dev_xmit_recursion() and friends to net/core/dev.h

They are only used from net/core/dev.c and net/core/filter.c.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 17 -----------------
 net/core/dev.h            | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 15e70527b703..0cd9ee83f554 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3271,23 +3271,6 @@ static inline int dev_recursion_level(void)
 	return this_cpu_read(softnet_data.xmit.recursion);
 }
 
-#define XMIT_RECURSION_LIMIT	8
-static inline bool dev_xmit_recursion(void)
-{
-	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
-			XMIT_RECURSION_LIMIT);
-}
-
-static inline void dev_xmit_recursion_inc(void)
-{
-	__this_cpu_inc(softnet_data.xmit.recursion);
-}
-
-static inline void dev_xmit_recursion_dec(void)
-{
-	__this_cpu_dec(softnet_data.xmit.recursion);
-}
-
 void __netif_schedule(struct Qdisc *q);
 void netif_schedule_queue(struct netdev_queue *txq);
 
diff --git a/net/core/dev.h b/net/core/dev.h
index 9d0f8b4587f8..8572d2c8dc4a 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -150,4 +150,21 @@ static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
 struct napi_struct *napi_by_id(unsigned int napi_id);
 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 
+#define XMIT_RECURSION_LIMIT	8
+static inline bool dev_xmit_recursion(void)
+{
+	return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+			XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+	__this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+	__this_cpu_dec(softnet_data.xmit.recursion);
+}
+
 #endif
-- 
cgit v1.2.3


From a7ae7b0b2ea014ff3ed4be812c3efa1b1d86e153 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:42:21 +0000
Subject: net: make softnet_data.dropped an atomic_t

If under extreme cpu backlog pressure enqueue_to_backlog() has
to drop a packet, it could do this without dirtying a cache line
and potentially slowing down the target cpu.

Move sd->dropped into a separate cache line, and make it atomic.

In non pressure mode, this field is not touched, no need to consume
valuable space in a hot cache line.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 ++-
 net/core/dev.c            | 13 +++++++++----
 net/core/net-procfs.c     |  3 ++-
 3 files changed, 13 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0cd9ee83f554..be9f071a340d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3237,10 +3237,11 @@ struct softnet_data {
 	unsigned int		input_queue_tail;
 #endif
 	unsigned int		received_rps;
-	unsigned int		dropped;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 
+	atomic_t		dropped ____cacheline_aligned_in_smp;
+
 	/* Another possibly contended cache line */
 	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
 	int			defer_count;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4ad7836365e6..02c98f115243 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4800,17 +4800,22 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	struct softnet_data *sd;
 	unsigned long flags;
 	unsigned int qlen;
+	int max_backlog;
 
 	reason = SKB_DROP_REASON_DEV_READY;
 	if (!netif_running(skb->dev))
 		goto bad_dev;
 
+	reason = SKB_DROP_REASON_CPU_BACKLOG;
 	sd = &per_cpu(softnet_data, cpu);
 
+	qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
+	max_backlog = READ_ONCE(net_hotdata.max_backlog);
+	if (unlikely(qlen > max_backlog))
+		goto cpu_backlog_drop;
 	backlog_lock_irq_save(sd, &flags);
 	qlen = skb_queue_len(&sd->input_pkt_queue);
-	if (qlen <= READ_ONCE(net_hotdata.max_backlog) &&
-	    !skb_flow_limit(skb, qlen)) {
+	if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
 		if (qlen) {
 enqueue:
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
@@ -4826,11 +4831,11 @@ enqueue:
 			napi_schedule_rps(sd);
 		goto enqueue;
 	}
-	reason = SKB_DROP_REASON_CPU_BACKLOG;
 
-	sd->dropped++;
 	backlog_unlock_irq_restore(sd, &flags);
 
+cpu_backlog_drop:
+	atomic_inc(&sd->dropped);
 bad_dev:
 	dev_core_stats_rx_dropped_inc(skb->dev);
 	kfree_skb_reason(skb, reason);
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index a97eceb84e61..fa6d3969734a 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -144,7 +144,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	seq_printf(seq,
 		   "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
 		   "%08x %08x\n",
-		   sd->processed, sd->dropped, sd->time_squeeze, 0,
+		   sd->processed, atomic_read(&sd->dropped),
+		   sd->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
 		   0,	/* was cpu_collision */
 		   sd->received_rps, flow_limit_count,
-- 
cgit v1.2.3


From 36b83ffcf209a2e6099dae1070df6a2001dfab27 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:42:23 +0000
Subject: net: rps: change input_queue_tail_incr_save()

input_queue_tail_incr_save() is incrementing the sd queue_tail
and save it in the flow last_qtail.

Two issues here :

- no lock protects the write on last_qtail, we should use appropriate
  annotations.

- We can perform this write after releasing the per-cpu backlog lock,
  to decrease this lock hold duration (move away the cache line miss)

Also move input_queue_head_incr() and rps helpers to include/net/rps.h,
while adding rps_ prefix to better reflect their role.

v2: Fixed a build issue (Jakub and kernel build bots)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 15 ---------------
 include/net/rps.h         | 23 +++++++++++++++++++++++
 net/core/dev.c            | 20 ++++++++++++--------
 3 files changed, 35 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index be9f071a340d..2c11c295cf64 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3250,21 +3250,6 @@ struct softnet_data {
 	call_single_data_t	defer_csd;
 };
 
-static inline void input_queue_head_incr(struct softnet_data *sd)
-{
-#ifdef CONFIG_RPS
-	sd->input_queue_head++;
-#endif
-}
-
-static inline void input_queue_tail_incr_save(struct softnet_data *sd,
-					      unsigned int *qtail)
-{
-#ifdef CONFIG_RPS
-	*qtail = ++sd->input_queue_tail;
-#endif
-}
-
 DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 
 static inline int dev_recursion_level(void)
diff --git a/include/net/rps.h b/include/net/rps.h
index 7660243e905b..10ca25731c1e 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -122,4 +122,27 @@ static inline void sock_rps_record_flow(const struct sock *sk)
 #endif
 }
 
+static inline u32 rps_input_queue_tail_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	return ++sd->input_queue_tail;
+#else
+	return 0;
+#endif
+}
+
+static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
+{
+#ifdef CONFIG_RPS
+	WRITE_ONCE(*dest, tail);
+#endif
+}
+
+static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+{
+#ifdef CONFIG_RPS
+	sd->input_queue_head++;
+#endif
+}
+
 #endif /* _NET_RPS_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 0a8ccb0451c3..79073bbc9a64 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4611,7 +4611,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0)) {
+		      READ_ONCE(rflow->last_qtail))) >= 0)) {
 			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
 		}
@@ -4666,7 +4666,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 		cpu = READ_ONCE(rflow->cpu);
 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
-			   rflow->last_qtail) <
+			   READ_ONCE(rflow->last_qtail)) <
 		     (int)(10 * flow_table->mask)))
 			expire = false;
 	}
@@ -4801,6 +4801,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 	unsigned long flags;
 	unsigned int qlen;
 	int max_backlog;
+	u32 tail;
 
 	reason = SKB_DROP_REASON_DEV_READY;
 	if (!netif_running(skb->dev))
@@ -4825,8 +4826,11 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 				napi_schedule_rps(sd);
 		}
 		__skb_queue_tail(&sd->input_pkt_queue, skb);
-		input_queue_tail_incr_save(sd, qtail);
+		tail = rps_input_queue_tail_incr(sd);
 		backlog_unlock_irq_restore(sd, &flags);
+
+		/* save the tail outside of the critical section */
+		rps_input_queue_tail_save(qtail, tail);
 		return NET_RX_SUCCESS;
 	}
 
@@ -5904,7 +5908,7 @@ static void flush_backlog(struct work_struct *work)
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->input_pkt_queue);
 			dev_kfree_skb_irq(skb);
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 		}
 	}
 	backlog_unlock_irq_enable(sd);
@@ -5913,7 +5917,7 @@ static void flush_backlog(struct work_struct *work)
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 			__skb_unlink(skb, &sd->process_queue);
 			kfree_skb(skb);
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 		}
 	}
 	local_bh_enable();
@@ -6041,7 +6045,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			rcu_read_lock();
 			__netif_receive_skb(skb);
 			rcu_read_unlock();
-			input_queue_head_incr(sd);
+			rps_input_queue_head_incr(sd);
 			if (++work >= quota)
 				return work;
 
@@ -11455,11 +11459,11 @@ static int dev_cpu_dead(unsigned int oldcpu)
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		rps_input_queue_head_incr(oldsd);
 	}
 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
 		netif_rx(skb);
-		input_queue_head_incr(oldsd);
+		rps_input_queue_head_incr(oldsd);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From c62fdf5b11ef12b89ac2450c25c12bffc3d924c5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:42:24 +0000
Subject: net: rps: add rps_input_queue_head_add() helper

process_backlog() can batch increments of sd->input_queue_head,
saving some memory bandwidth.

Also add READ_ONCE()/WRITE_ONCE() annotations around
sd->input_queue_head accesses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/rps.h |  9 +++++++--
 net/core/dev.c    | 13 ++++++++-----
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/rps.h b/include/net/rps.h
index 10ca25731c1e..a93401d23d66 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h
@@ -138,11 +138,16 @@ static inline void rps_input_queue_tail_save(u32 *dest, u32 tail)
 #endif
 }
 
-static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+static inline void rps_input_queue_head_add(struct softnet_data *sd, int val)
 {
 #ifdef CONFIG_RPS
-	sd->input_queue_head++;
+	WRITE_ONCE(sd->input_queue_head, sd->input_queue_head + val);
 #endif
 }
 
+static inline void rps_input_queue_head_incr(struct softnet_data *sd)
+{
+	rps_input_queue_head_add(sd, 1);
+}
+
 #endif /* _NET_RPS_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 79073bbc9a64..818699dea9d7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4528,7 +4528,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	out:
 #endif
 		rflow->last_qtail =
-			per_cpu(softnet_data, next_cpu).input_queue_head;
+			READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
 	}
 
 	rflow->cpu = next_cpu;
@@ -4610,7 +4610,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		 */
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
-		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+		     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
 		      READ_ONCE(rflow->last_qtail))) >= 0)) {
 			tcpu = next_cpu;
 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
@@ -4665,7 +4665,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 		rflow = &flow_table->flows[flow_id];
 		cpu = READ_ONCE(rflow->cpu);
 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
-		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+		    ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
 			   READ_ONCE(rflow->last_qtail)) <
 		     (int)(10 * flow_table->mask)))
 			expire = false;
@@ -6045,9 +6045,10 @@ static int process_backlog(struct napi_struct *napi, int quota)
 			rcu_read_lock();
 			__netif_receive_skb(skb);
 			rcu_read_unlock();
-			rps_input_queue_head_incr(sd);
-			if (++work >= quota)
+			if (++work >= quota) {
+				rps_input_queue_head_add(sd, work);
 				return work;
+			}
 
 		}
 
@@ -6070,6 +6071,8 @@ static int process_backlog(struct napi_struct *napi, int quota)
 		backlog_unlock_irq_enable(sd);
 	}
 
+	if (work)
+		rps_input_queue_head_add(sd, work);
 	return work;
 }
 
-- 
cgit v1.2.3


From d3ae5f4632c107d3c2eeb97a60fecc6a6f9d6fbe Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:42:25 +0000
Subject: net: rps: move received_rps field to a better location

Commit 14d898f3c1b3 ("dev: Move received_rps counter next
to RPS members in softnet data") was unfortunate:

received_rps is dirtied by a cpu and never read by other
cpus in fast path.

Its presence in the hot RPS cache line (shared by many cpus)
is hurting RPS/RFS performance.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2c11c295cf64..7d12b5a9380f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3204,6 +3204,7 @@ struct softnet_data {
 	struct softnet_data	*rps_ipi_list;
 #endif
 
+	unsigned int		received_rps;
 	bool			in_net_rx_action;
 	bool			in_napi_threaded_poll;
 
@@ -3236,7 +3237,6 @@ struct softnet_data {
 	unsigned int		cpu;
 	unsigned int		input_queue_tail;
 #endif
-	unsigned int		received_rps;
 	struct sk_buff_head	input_pkt_queue;
 	struct napi_struct	backlog;
 
-- 
cgit v1.2.3


From c32c617de8076d8fb2a16a4a2f3b5da5f3df398d Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Mon, 25 Mar 2024 15:31:24 -0500
Subject: power: supply: bq27xxx: Move temperature reading out of update loop

Most of the functions that read values return a status and put the value
itself in an a function parameter. Update temperature reading to match.

As temp is not checked for changes as part of the update loop, remove
the read of the temperature from the periodic update loop. This saves
I2C/1W bandwidth. It also means we do not have to cache it, fresh
values are read when requested.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240325203129.150030-1-afd@ti.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 17 ++++++++++-------
 include/linux/power/bq27xxx_battery.h  |  1 -
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index abca56834468..00b2a56039ac 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1652,10 +1652,11 @@ static int bq27xxx_battery_read_energy(struct bq27xxx_device_info *di)
 }
 
 /*
- * Return the battery temperature in tenths of degree Kelvin
+ * Return the battery temperature in tenths of degree Celsius
  * Or < 0 if something fails.
  */
-static int bq27xxx_battery_read_temperature(struct bq27xxx_device_info *di)
+static int bq27xxx_battery_read_temperature(struct bq27xxx_device_info *di,
+					    union power_supply_propval *val)
 {
 	int temp;
 
@@ -1668,7 +1669,12 @@ static int bq27xxx_battery_read_temperature(struct bq27xxx_device_info *di)
 	if (di->opts & BQ27XXX_O_ZERO)
 		temp = 5 * temp / 2;
 
-	return temp;
+	/* Convert decidegree Kelvin to Celsius */
+	temp -= 2731;
+
+	val->intval = temp;
+
+	return 0;
 }
 
 /*
@@ -1851,7 +1857,6 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 	if ((cache.flags & 0xff) == 0xff)
 		cache.flags = -1; /* read error */
 	if (cache.flags >= 0) {
-		cache.temperature = bq27xxx_battery_read_temperature(di);
 		if (di->regs[BQ27XXX_REG_TTE] != INVALID_REG_ADDR)
 			cache.time_to_empty = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE);
 		if (di->regs[BQ27XXX_REG_TTECP] != INVALID_REG_ADDR)
@@ -2038,9 +2043,7 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 		ret = bq27xxx_battery_capacity_level(di, val);
 		break;
 	case POWER_SUPPLY_PROP_TEMP:
-		ret = bq27xxx_simple_value(di->cache.temperature, val);
-		if (ret == 0)
-			val->intval -= 2731; /* convert decidegree k to c */
+		ret = bq27xxx_battery_read_temperature(di, val);
 		break;
 	case POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW:
 		ret = bq27xxx_simple_value(di->cache.time_to_empty, val);
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index b9e5bd2b42d3..64b2749d9562 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -47,7 +47,6 @@ struct bq27xxx_access_methods {
 };
 
 struct bq27xxx_reg_cache {
-	int temperature;
 	int time_to_empty;
 	int time_to_empty_avg;
 	int time_to_full;
-- 
cgit v1.2.3


From 651a620aa4d49f5647e21e55fc71bb049bc03389 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Mon, 25 Mar 2024 15:31:25 -0500
Subject: power: supply: bq27xxx: Move time reading out of update loop

Most of the functions that read values return a status and put the value
itself in an a function parameter. Update time reading to match.

As time is not checked for changes as part of the update loop, remove
the read of the this from the periodic update loop. This saves
I2C/1W bandwidth. It also means we do not have to cache it, fresh
values are read when requested.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240325203129.150030-2-afd@ti.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 20 ++++++++------------
 include/linux/power/bq27xxx_battery.h  |  3 ---
 2 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 00b2a56039ac..dcfe3c10e7ed 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1696,7 +1696,8 @@ static int bq27xxx_battery_read_cyct(struct bq27xxx_device_info *di)
  * Read a time register.
  * Return < 0 if something fails.
  */
-static int bq27xxx_battery_read_time(struct bq27xxx_device_info *di, u8 reg)
+static int bq27xxx_battery_read_time(struct bq27xxx_device_info *di, u8 reg,
+				     union power_supply_propval *val)
 {
 	int tval;
 
@@ -1710,7 +1711,9 @@ static int bq27xxx_battery_read_time(struct bq27xxx_device_info *di, u8 reg)
 	if (tval == 65535)
 		return -ENODATA;
 
-	return tval * 60;
+	val->intval = tval * 60;
+
+	return 0;
 }
 
 /*
@@ -1857,13 +1860,6 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 	if ((cache.flags & 0xff) == 0xff)
 		cache.flags = -1; /* read error */
 	if (cache.flags >= 0) {
-		if (di->regs[BQ27XXX_REG_TTE] != INVALID_REG_ADDR)
-			cache.time_to_empty = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE);
-		if (di->regs[BQ27XXX_REG_TTECP] != INVALID_REG_ADDR)
-			cache.time_to_empty_avg = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTECP);
-		if (di->regs[BQ27XXX_REG_TTF] != INVALID_REG_ADDR)
-			cache.time_to_full = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTF);
-
 		cache.charge_full = bq27xxx_battery_read_fcc(di);
 		cache.capacity = bq27xxx_battery_read_soc(di);
 		if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR)
@@ -2046,13 +2042,13 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 		ret = bq27xxx_battery_read_temperature(di, val);
 		break;
 	case POWER_SUPPLY_PROP_TIME_TO_EMPTY_NOW:
-		ret = bq27xxx_simple_value(di->cache.time_to_empty, val);
+		ret = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTE, val);
 		break;
 	case POWER_SUPPLY_PROP_TIME_TO_EMPTY_AVG:
-		ret = bq27xxx_simple_value(di->cache.time_to_empty_avg, val);
+		ret = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTECP, val);
 		break;
 	case POWER_SUPPLY_PROP_TIME_TO_FULL_NOW:
-		ret = bq27xxx_simple_value(di->cache.time_to_full, val);
+		ret = bq27xxx_battery_read_time(di, BQ27XXX_REG_TTF, val);
 		break;
 	case POWER_SUPPLY_PROP_TECHNOLOGY:
 		if (di->opts & BQ27XXX_O_MUL_CHEM)
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 64b2749d9562..e89ef989a575 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -47,9 +47,6 @@ struct bq27xxx_access_methods {
 };
 
 struct bq27xxx_reg_cache {
-	int time_to_empty;
-	int time_to_empty_avg;
-	int time_to_full;
 	int charge_full;
 	int cycle_count;
 	int capacity;
-- 
cgit v1.2.3


From 8d846335204f25a2247e5e88e39e1604b6ecc133 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Mon, 25 Mar 2024 15:31:26 -0500
Subject: power: supply: bq27xxx: Move charge reading out of update loop

Most of the functions that read values return a status and put the value
itself in an a function parameter. Update charge reading to match.

As charge state is not checked for changes as part of the update loop,
remove the read of this from the periodic update loop. This saves
I2C/1W bandwidth. It also means we do not have to cache it, fresh
values are read when requested.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240325203129.150030-3-afd@ti.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 29 +++++++++++++++++------------
 include/linux/power/bq27xxx_battery.h  |  1 -
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index dcfe3c10e7ed..799ee0aa9ef7 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1545,7 +1545,8 @@ static int bq27xxx_battery_read_soc(struct bq27xxx_device_info *di)
  * Return a battery charge value in µAh
  * Or < 0 if something fails.
  */
-static int bq27xxx_battery_read_charge(struct bq27xxx_device_info *di, u8 reg)
+static int bq27xxx_battery_read_charge(struct bq27xxx_device_info *di, u8 reg,
+				       union power_supply_propval *val)
 {
 	int charge;
 
@@ -1561,34 +1562,39 @@ static int bq27xxx_battery_read_charge(struct bq27xxx_device_info *di, u8 reg)
 	else
 		charge *= 1000;
 
-	return charge;
+	val->intval = charge;
+
+	return 0;
 }
 
 /*
  * Return the battery Nominal available capacity in µAh
  * Or < 0 if something fails.
  */
-static inline int bq27xxx_battery_read_nac(struct bq27xxx_device_info *di)
+static inline int bq27xxx_battery_read_nac(struct bq27xxx_device_info *di,
+					   union power_supply_propval *val)
 {
-	return bq27xxx_battery_read_charge(di, BQ27XXX_REG_NAC);
+	return bq27xxx_battery_read_charge(di, BQ27XXX_REG_NAC, val);
 }
 
 /*
  * Return the battery Remaining Capacity in µAh
  * Or < 0 if something fails.
  */
-static inline int bq27xxx_battery_read_rc(struct bq27xxx_device_info *di)
+static inline int bq27xxx_battery_read_rc(struct bq27xxx_device_info *di,
+					  union power_supply_propval *val)
 {
-	return bq27xxx_battery_read_charge(di, BQ27XXX_REG_RC);
+	return bq27xxx_battery_read_charge(di, BQ27XXX_REG_RC, val);
 }
 
 /*
  * Return the battery Full Charge Capacity in µAh
  * Or < 0 if something fails.
  */
-static inline int bq27xxx_battery_read_fcc(struct bq27xxx_device_info *di)
+static inline int bq27xxx_battery_read_fcc(struct bq27xxx_device_info *di,
+					   union power_supply_propval *val)
 {
-	return bq27xxx_battery_read_charge(di, BQ27XXX_REG_FCC);
+	return bq27xxx_battery_read_charge(di, BQ27XXX_REG_FCC, val);
 }
 
 /*
@@ -1860,7 +1866,6 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 	if ((cache.flags & 0xff) == 0xff)
 		cache.flags = -1; /* read error */
 	if (cache.flags >= 0) {
-		cache.charge_full = bq27xxx_battery_read_fcc(di);
 		cache.capacity = bq27xxx_battery_read_soc(di);
 		if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR)
 			cache.energy = bq27xxx_battery_read_energy(di);
@@ -2058,12 +2063,12 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 		break;
 	case POWER_SUPPLY_PROP_CHARGE_NOW:
 		if (di->regs[BQ27XXX_REG_NAC] != INVALID_REG_ADDR)
-			ret = bq27xxx_simple_value(bq27xxx_battery_read_nac(di), val);
+			ret = bq27xxx_battery_read_nac(di, val);
 		else
-			ret = bq27xxx_simple_value(bq27xxx_battery_read_rc(di), val);
+			ret = bq27xxx_battery_read_rc(di, val);
 		break;
 	case POWER_SUPPLY_PROP_CHARGE_FULL:
-		ret = bq27xxx_simple_value(di->cache.charge_full, val);
+		ret = bq27xxx_battery_read_fcc(di, val);
 		break;
 	case POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN:
 		ret = bq27xxx_battery_read_dcap(di, val);
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index e89ef989a575..1c67fa46013b 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -47,7 +47,6 @@ struct bq27xxx_access_methods {
 };
 
 struct bq27xxx_reg_cache {
-	int charge_full;
 	int cycle_count;
 	int capacity;
 	int energy;
-- 
cgit v1.2.3


From 39cf1c4cd03254218a23ef955bd534e19328f618 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Mon, 25 Mar 2024 15:31:27 -0500
Subject: power: supply: bq27xxx: Move energy reading out of update loop

Most of the functions that read values return a status and put the value
itself in an a function parameter. Update energy reading to match.

As energy is not checked for changes as part of the update loop,
remove the read of this from the periodic update loop. This saves
I2C/1W bandwidth. It also means we do not have to cache it, fresh
values are read when requested.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240325203129.150030-4-afd@ti.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 11 ++++++-----
 include/linux/power/bq27xxx_battery.h  |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index 799ee0aa9ef7..eae2b9a60f40 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1639,7 +1639,8 @@ static int bq27xxx_battery_read_dcap(struct bq27xxx_device_info *di,
  * Return the battery Available energy in µWh
  * Or < 0 if something fails.
  */
-static int bq27xxx_battery_read_energy(struct bq27xxx_device_info *di)
+static int bq27xxx_battery_read_energy(struct bq27xxx_device_info *di,
+				       union power_supply_propval *val)
 {
 	int ae;
 
@@ -1654,7 +1655,9 @@ static int bq27xxx_battery_read_energy(struct bq27xxx_device_info *di)
 	else
 		ae *= 1000;
 
-	return ae;
+	val->intval = ae;
+
+	return 0;
 }
 
 /*
@@ -1867,8 +1870,6 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 		cache.flags = -1; /* read error */
 	if (cache.flags >= 0) {
 		cache.capacity = bq27xxx_battery_read_soc(di);
-		if (di->regs[BQ27XXX_REG_AE] != INVALID_REG_ADDR)
-			cache.energy = bq27xxx_battery_read_energy(di);
 		di->cache.flags = cache.flags;
 		cache.health = bq27xxx_battery_read_health(di);
 		if (di->regs[BQ27XXX_REG_CYCT] != INVALID_REG_ADDR)
@@ -2084,7 +2085,7 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 		ret = bq27xxx_simple_value(di->cache.cycle_count, val);
 		break;
 	case POWER_SUPPLY_PROP_ENERGY_NOW:
-		ret = bq27xxx_simple_value(di->cache.energy, val);
+		ret = bq27xxx_battery_read_energy(di, val);
 		break;
 	case POWER_SUPPLY_PROP_POWER_AVG:
 		ret = bq27xxx_battery_pwr_avg(di, val);
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 1c67fa46013b..5c75abf3cf06 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -49,7 +49,6 @@ struct bq27xxx_access_methods {
 struct bq27xxx_reg_cache {
 	int cycle_count;
 	int capacity;
-	int energy;
 	int flags;
 	int health;
 };
-- 
cgit v1.2.3


From 656489ac90f25f92190a1dd5c4e5c5293bd70323 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Mon, 25 Mar 2024 15:31:28 -0500
Subject: power: supply: bq27xxx: Move cycle count reading out of update loop

Most of the functions that read values return a status and put the value
itself in an a function parameter. Update cycle count reading to match.

As cycle count is not checked for changes as part of the update loop,
remove the read of this from the periodic update loop. This saves I2C/1W
bandwidth. It also means we do not have to cache it, fresh values are
read when requested.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240325203129.150030-5-afd@ti.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 11 ++++++-----
 include/linux/power/bq27xxx_battery.h  |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index eae2b9a60f40..fe1b0af143ca 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1690,7 +1690,8 @@ static int bq27xxx_battery_read_temperature(struct bq27xxx_device_info *di,
  * Return the battery Cycle count total
  * Or < 0 if something fails.
  */
-static int bq27xxx_battery_read_cyct(struct bq27xxx_device_info *di)
+static int bq27xxx_battery_read_cyct(struct bq27xxx_device_info *di,
+				     union power_supply_propval *val)
 {
 	int cyct;
 
@@ -1698,7 +1699,9 @@ static int bq27xxx_battery_read_cyct(struct bq27xxx_device_info *di)
 	if (cyct < 0)
 		dev_err(di->dev, "error reading cycle count total\n");
 
-	return cyct;
+	val->intval = cyct;
+
+	return 0;
 }
 
 /*
@@ -1872,8 +1875,6 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 		cache.capacity = bq27xxx_battery_read_soc(di);
 		di->cache.flags = cache.flags;
 		cache.health = bq27xxx_battery_read_health(di);
-		if (di->regs[BQ27XXX_REG_CYCT] != INVALID_REG_ADDR)
-			cache.cycle_count = bq27xxx_battery_read_cyct(di);
 
 		/*
 		 * On gauges with signed current reporting the current must be
@@ -2082,7 +2083,7 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 	case POWER_SUPPLY_PROP_VOLTAGE_MIN_DESIGN:
 		return -EINVAL;
 	case POWER_SUPPLY_PROP_CYCLE_COUNT:
-		ret = bq27xxx_simple_value(di->cache.cycle_count, val);
+		ret = bq27xxx_battery_read_cyct(di, val);
 		break;
 	case POWER_SUPPLY_PROP_ENERGY_NOW:
 		ret = bq27xxx_battery_read_energy(di, val);
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index 5c75abf3cf06..d743270799d7 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -47,7 +47,6 @@ struct bq27xxx_access_methods {
 };
 
 struct bq27xxx_reg_cache {
-	int cycle_count;
 	int capacity;
 	int flags;
 	int health;
-- 
cgit v1.2.3


From 50f0ff7c8cc4c1d10fabc4b3b3f3b9e942b08187 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Mon, 25 Mar 2024 15:31:29 -0500
Subject: power: supply: bq27xxx: Move health reading out of update loop

Most of the functions that read values return a status and put the value
itself in an a function parameter. Update health reading to match.

As health is not checked for changes as part of the update loop, remove
the read of this from the periodic update loop. This saves I2C/1W
bandwidth. It also means we do not have to cache it, fresh values are
read when requested.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240325203129.150030-6-afd@ti.com
Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
---
 drivers/power/supply/bq27xxx_battery.c | 30 ++++++++++++++++++------------
 include/linux/power/bq27xxx_battery.h  |  1 -
 2 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c
index fe1b0af143ca..750fda543308 100644
--- a/drivers/power/supply/bq27xxx_battery.c
+++ b/drivers/power/supply/bq27xxx_battery.c
@@ -1777,19 +1777,26 @@ static bool bq27xxx_battery_capacity_inaccurate(struct bq27xxx_device_info *di,
 		return false;
 }
 
-static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di)
+static int bq27xxx_battery_read_health(struct bq27xxx_device_info *di,
+				       union power_supply_propval *val)
 {
+	int health;
+
 	/* Unlikely but important to return first */
 	if (unlikely(bq27xxx_battery_overtemp(di, di->cache.flags)))
-		return POWER_SUPPLY_HEALTH_OVERHEAT;
-	if (unlikely(bq27xxx_battery_undertemp(di, di->cache.flags)))
-		return POWER_SUPPLY_HEALTH_COLD;
-	if (unlikely(bq27xxx_battery_dead(di, di->cache.flags)))
-		return POWER_SUPPLY_HEALTH_DEAD;
-	if (unlikely(bq27xxx_battery_capacity_inaccurate(di, di->cache.flags)))
-		return POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED;
-
-	return POWER_SUPPLY_HEALTH_GOOD;
+		health = POWER_SUPPLY_HEALTH_OVERHEAT;
+	else if (unlikely(bq27xxx_battery_undertemp(di, di->cache.flags)))
+		health = POWER_SUPPLY_HEALTH_COLD;
+	else if (unlikely(bq27xxx_battery_dead(di, di->cache.flags)))
+		health = POWER_SUPPLY_HEALTH_DEAD;
+	else if (unlikely(bq27xxx_battery_capacity_inaccurate(di, di->cache.flags)))
+		health = POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED;
+	else
+		health = POWER_SUPPLY_HEALTH_GOOD;
+
+	val->intval = health;
+
+	return 0;
 }
 
 static bool bq27xxx_battery_is_full(struct bq27xxx_device_info *di, int flags)
@@ -1874,7 +1881,6 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di)
 	if (cache.flags >= 0) {
 		cache.capacity = bq27xxx_battery_read_soc(di);
 		di->cache.flags = cache.flags;
-		cache.health = bq27xxx_battery_read_health(di);
 
 		/*
 		 * On gauges with signed current reporting the current must be
@@ -2092,7 +2098,7 @@ static int bq27xxx_battery_get_property(struct power_supply *psy,
 		ret = bq27xxx_battery_pwr_avg(di, val);
 		break;
 	case POWER_SUPPLY_PROP_HEALTH:
-		ret = bq27xxx_simple_value(di->cache.health, val);
+		ret = bq27xxx_battery_read_health(di, val);
 		break;
 	case POWER_SUPPLY_PROP_MANUFACTURER:
 		val->strval = BQ27XXX_MANUFACTURER;
diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h
index d743270799d7..5180dc9f1706 100644
--- a/include/linux/power/bq27xxx_battery.h
+++ b/include/linux/power/bq27xxx_battery.h
@@ -49,7 +49,6 @@ struct bq27xxx_access_methods {
 struct bq27xxx_reg_cache {
 	int capacity;
 	int flags;
-	int health;
 };
 
 struct bq27xxx_device_info {
-- 
cgit v1.2.3


From 813e0ae613d6ee1b3e11f1c41f8b9e9df8ef0493 Mon Sep 17 00:00:00 2001
From: Baochen Qiang <quic_bqiang@quicinc.com>
Date: Tue, 5 Mar 2024 10:13:18 +0800
Subject: bus: mhi: host: Add mhi_power_down_keep_dev() API to support system
 suspend/hibernation

Currently, ath11k fails to resume from system suspend/hibernation on some
the x86 host machines with below error message:

```
ath11k_pci 0000:06:00.0: timeout while waiting for restart complete
```

This happens because, ath11k powers down the MHI stack during suspend and
that leads to destruction of the struct device associated with the MHI
channels. And during resume, ath11k calls calling mhi_sync_power_up() to
power up the MHI subsystem and that eventually calls the driver framework's
device_add() API from mhi_create_devices(). But the PM framework blocks the
struct device creation during device_add() and this leads to probe deferral
as below:

```
mhi mhi0_IPCR: Driver qcom_mhi_qrtr force probe deferral
```

The reason for deferring device creation during resume is explained in
dpm_prepare():

        /*
         * It is unsafe if probing of devices will happen during suspend or
         * hibernation and system behavior will be unpredictable in this
         * case. So, let's prohibit device's probing here and defer their
         * probes instead. The normal behavior will be restored in
         * dpm_complete().
         */

Due to the device probe deferral, qcom_mhi_qrtr_probe() API is not getting
called during resume and thus MHI channels are not prepared. So this blocks
the QMI messages from being transferred between ath11k and firmware,
resulting in a firmware initialization failure.

After consulting with Rafael, it was decided to not destroy the struct
device for the MHI channels during system suspend/hibernation because the
device is bound to appear again during resume.

So to achieve this, a new API called mhi_power_down_keep_dev() is
introduced for MHI controllers to keep the struct device when required.
This API is similar to the existing mhi_power_down() API, except that it
keeps the struct device associated with MHI channels instead of destroying
them.

Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03125-QCAHSPSWPL_V1_V2_SILICONZ_LITE-3.6510.30

Signed-off-by: Baochen Qiang <quic_bqiang@quicinc.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Reviewed-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://lore.kernel.org/r/20240305021320.3367-2-quic_bqiang@quicinc.com
[mani: reworded the commit message and subject]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/host/internal.h |  4 +++-
 drivers/bus/mhi/host/pm.c       | 42 +++++++++++++++++++++++++++++++++++------
 include/linux/mhi.h             | 18 +++++++++++++++++-
 3 files changed, 56 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/host/internal.h b/drivers/bus/mhi/host/internal.h
index 5fe49311b8eb..aaad40a07f69 100644
--- a/drivers/bus/mhi/host/internal.h
+++ b/drivers/bus/mhi/host/internal.h
@@ -80,6 +80,7 @@ enum dev_st_transition {
 	DEV_ST_TRANSITION_FP,
 	DEV_ST_TRANSITION_SYS_ERR,
 	DEV_ST_TRANSITION_DISABLE,
+	DEV_ST_TRANSITION_DISABLE_DESTROY_DEVICE,
 	DEV_ST_TRANSITION_MAX,
 };
 
@@ -90,7 +91,8 @@ enum dev_st_transition {
 	dev_st_trans(MISSION_MODE,	"MISSION MODE")		\
 	dev_st_trans(FP,		"FLASH PROGRAMMER")	\
 	dev_st_trans(SYS_ERR,		"SYS ERROR")		\
-	dev_st_trans_end(DISABLE,	"DISABLE")
+	dev_st_trans(DISABLE,		"DISABLE")		\
+	dev_st_trans_end(DISABLE_DESTROY_DEVICE, "DISABLE (DESTROY DEVICE)")
 
 extern const char * const dev_state_tran_str[DEV_ST_TRANSITION_MAX];
 #define TO_DEV_STATE_TRANS_STR(state) (((state) >= DEV_ST_TRANSITION_MAX) ? \
diff --git a/drivers/bus/mhi/host/pm.c b/drivers/bus/mhi/host/pm.c
index 8b40d3f01acc..11c0e751f223 100644
--- a/drivers/bus/mhi/host/pm.c
+++ b/drivers/bus/mhi/host/pm.c
@@ -468,7 +468,8 @@ error_mission_mode:
 }
 
 /* Handle shutdown transitions */
-static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl)
+static void mhi_pm_disable_transition(struct mhi_controller *mhi_cntrl,
+				      bool destroy_device)
 {
 	enum mhi_pm_state cur_state;
 	struct mhi_event *mhi_event;
@@ -530,8 +531,16 @@ skip_mhi_reset:
 	dev_dbg(dev, "Waiting for all pending threads to complete\n");
 	wake_up_all(&mhi_cntrl->state_event);
 
-	dev_dbg(dev, "Reset all active channels and remove MHI devices\n");
-	device_for_each_child(&mhi_cntrl->mhi_dev->dev, NULL, mhi_destroy_device);
+	/*
+	 * Only destroy the 'struct device' for channels if indicated by the
+	 * 'destroy_device' flag. Because, during system suspend or hibernation
+	 * state, there is no need to destroy the 'struct device' as the endpoint
+	 * device would still be physically attached to the machine.
+	 */
+	if (destroy_device) {
+		dev_dbg(dev, "Reset all active channels and remove MHI devices\n");
+		device_for_each_child(&mhi_cntrl->mhi_dev->dev, NULL, mhi_destroy_device);
+	}
 
 	mutex_lock(&mhi_cntrl->pm_mutex);
 
@@ -821,7 +830,10 @@ void mhi_pm_st_worker(struct work_struct *work)
 			mhi_pm_sys_error_transition(mhi_cntrl);
 			break;
 		case DEV_ST_TRANSITION_DISABLE:
-			mhi_pm_disable_transition(mhi_cntrl);
+			mhi_pm_disable_transition(mhi_cntrl, false);
+			break;
+		case DEV_ST_TRANSITION_DISABLE_DESTROY_DEVICE:
+			mhi_pm_disable_transition(mhi_cntrl, true);
 			break;
 		default:
 			break;
@@ -1175,7 +1187,8 @@ error_exit:
 }
 EXPORT_SYMBOL_GPL(mhi_async_power_up);
 
-void mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful)
+static void __mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful,
+			     bool destroy_device)
 {
 	enum mhi_pm_state cur_state, transition_state;
 	struct device *dev = &mhi_cntrl->mhi_dev->dev;
@@ -1211,15 +1224,32 @@ void mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful)
 	write_unlock_irq(&mhi_cntrl->pm_lock);
 	mutex_unlock(&mhi_cntrl->pm_mutex);
 
-	mhi_queue_state_transition(mhi_cntrl, DEV_ST_TRANSITION_DISABLE);
+	if (destroy_device)
+		mhi_queue_state_transition(mhi_cntrl,
+					   DEV_ST_TRANSITION_DISABLE_DESTROY_DEVICE);
+	else
+		mhi_queue_state_transition(mhi_cntrl,
+					   DEV_ST_TRANSITION_DISABLE);
 
 	/* Wait for shutdown to complete */
 	flush_work(&mhi_cntrl->st_worker);
 
 	disable_irq(mhi_cntrl->irq[0]);
 }
+
+void mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful)
+{
+	__mhi_power_down(mhi_cntrl, graceful, true);
+}
 EXPORT_SYMBOL_GPL(mhi_power_down);
 
+void mhi_power_down_keep_dev(struct mhi_controller *mhi_cntrl,
+			       bool graceful)
+{
+	__mhi_power_down(mhi_cntrl, graceful, false);
+}
+EXPORT_SYMBOL_GPL(mhi_power_down_keep_dev);
+
 int mhi_sync_power_up(struct mhi_controller *mhi_cntrl)
 {
 	int ret = mhi_async_power_up(mhi_cntrl);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 77b8c0a26674..cde01e133a1b 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -630,12 +630,28 @@ int mhi_async_power_up(struct mhi_controller *mhi_cntrl);
 int mhi_sync_power_up(struct mhi_controller *mhi_cntrl);
 
 /**
- * mhi_power_down - Start MHI power down sequence
+ * mhi_power_down - Power down the MHI device and also destroy the
+ *                  'struct device' for the channels associated with it.
+ *                  See also mhi_power_down_keep_dev() which is a variant
+ *                  of this API that keeps the 'struct device' for channels
+ *                  (useful during suspend/hibernation).
  * @mhi_cntrl: MHI controller
  * @graceful: Link is still accessible, so do a graceful shutdown process
  */
 void mhi_power_down(struct mhi_controller *mhi_cntrl, bool graceful);
 
+/**
+ * mhi_power_down_keep_dev - Power down the MHI device but keep the 'struct
+ *                           device' for the channels associated with it.
+ *                           This is a variant of 'mhi_power_down()' and
+ *                           useful in scenarios such as suspend/hibernation
+ *                           where destroying of the 'struct device' is not
+ *                           needed.
+ * @mhi_cntrl: MHI controller
+ * @graceful: Link is still accessible, so do a graceful shutdown process
+ */
+void mhi_power_down_keep_dev(struct mhi_controller *mhi_cntrl, bool graceful);
+
 /**
  * mhi_unprepare_after_power_down - Free any allocated memory after power down
  * @mhi_cntrl: MHI controller
-- 
cgit v1.2.3


From c9418adfbabadf7530c034a52e54b31705568f95 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 28 Mar 2024 09:41:44 +0100
Subject: block: add a bio_list_merge_init helper

This is a simple combination of bio_list_merge + bio_list_init
similar to list_splice_init.  While it only saves a single
line in a callers, it makes the move all bios from one list to
another and reinitialize the original pattern a lot more obvious
in the callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matthew Sakai <msakai@redhat.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Link: https://lore.kernel.org/r/20240328084147.2954434-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/bio.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 875d792bffff..9b8a369f44bc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -615,6 +615,13 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2)
 	bl->tail = bl2->tail;
 }
 
+static inline void bio_list_merge_init(struct bio_list *bl,
+		struct bio_list *bl2)
+{
+	bio_list_merge(bl, bl2);
+	bio_list_init(bl2);
+}
+
 static inline void bio_list_merge_head(struct bio_list *bl,
 				       struct bio_list *bl2)
 {
-- 
cgit v1.2.3


From 1131f339089bdf7ef7aa0a026bdefe1c9a22e8a1 Mon Sep 17 00:00:00 2001
From: Alexander Aring <aahringo@redhat.com>
Date: Thu, 28 Mar 2024 11:48:37 -0400
Subject: dlm: remove lkb from callback tracepoints

Stop using lkb structs in the callback tracepoints so that lkb
references are not needed. This prepares for separating lkb
structs from callbacks.

Signed-off-by: Alexander Aring <aahringo@redhat.com>
Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/ast.c               |  9 +++++++--
 fs/dlm/user.c              |  9 +++++++--
 include/trace/events/dlm.h | 46 +++++++++++++++++++---------------------------
 3 files changed, 33 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index decedc4ee15f..dd7cca3c1472 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -171,6 +171,7 @@ void dlm_callback_work(struct work_struct *work)
 {
 	struct dlm_lkb *lkb = container_of(work, struct dlm_lkb, lkb_cb_work);
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
+	struct dlm_rsb *rsb = lkb->lkb_resource;
 	void (*castfn) (void *astparam);
 	void (*bastfn) (void *astparam, int mode);
 	struct dlm_callback *cb;
@@ -190,14 +191,18 @@ void dlm_callback_work(struct work_struct *work)
 		bastfn = lkb->lkb_bastfn;
 
 		if (cb->flags & DLM_CB_BAST) {
-			trace_dlm_bast(ls, lkb, cb->mode);
+			trace_dlm_bast(ls->ls_global_id, lkb->lkb_id,
+				       cb->mode, rsb->res_name,
+				       rsb->res_length);
 			lkb->lkb_last_bast_time = ktime_get();
 			lkb->lkb_last_bast_mode = cb->mode;
 			bastfn(lkb->lkb_astparam, cb->mode);
 		} else if (cb->flags & DLM_CB_CAST) {
 			lkb->lkb_lksb->sb_status = cb->sb_status;
 			lkb->lkb_lksb->sb_flags = cb->sb_flags;
-			trace_dlm_ast(ls, lkb);
+			trace_dlm_ast(ls->ls_global_id, lkb->lkb_id,
+				      cb->sb_flags, cb->sb_status,
+				      rsb->res_name, rsb->res_length);
 			lkb->lkb_last_cast_time = ktime_get();
 			castfn(lkb->lkb_astparam);
 		}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 12a483deeef5..6f99bbeeac9b 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -805,6 +805,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
 	struct dlm_callback *cb;
+	struct dlm_rsb *rsb;
 	int rv, ret;
 
 	if (count == sizeof(struct dlm_device_version)) {
@@ -887,12 +888,16 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	}
 	spin_unlock(&proc->asts_spin);
 
+	rsb = lkb->lkb_resource;
 	if (cb->flags & DLM_CB_BAST) {
-		trace_dlm_bast(lkb->lkb_resource->res_ls, lkb, cb->mode);
+		trace_dlm_bast(rsb->res_ls->ls_global_id, lkb->lkb_id,
+			       cb->mode, rsb->res_name, rsb->res_length);
 	} else if (cb->flags & DLM_CB_CAST) {
 		lkb->lkb_lksb->sb_status = cb->sb_status;
 		lkb->lkb_lksb->sb_flags = cb->sb_flags;
-		trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
+		trace_dlm_ast(rsb->res_ls->ls_global_id, lkb->lkb_id,
+			      cb->sb_flags, cb->sb_status, rsb->res_name,
+			      rsb->res_length);
 	}
 
 	ret = copy_result_to_user(lkb->lkb_ua,
diff --git a/include/trace/events/dlm.h b/include/trace/events/dlm.h
index c1a146f9fc91..af160082c9e3 100644
--- a/include/trace/events/dlm.h
+++ b/include/trace/events/dlm.h
@@ -189,29 +189,25 @@ TRACE_EVENT(dlm_lock_end,
 
 TRACE_EVENT(dlm_bast,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb, int mode),
+	TP_PROTO(__u32 ls_id, __u32 lkb_id, int mode,
+		 const char *res_name, size_t res_length),
 
-	TP_ARGS(ls, lkb, mode),
+	TP_ARGS(ls_id, lkb_id, mode, res_name, res_length),
 
 	TP_STRUCT__entry(
 		__field(__u32, ls_id)
 		__field(__u32, lkb_id)
 		__field(int, mode)
-		__dynamic_array(unsigned char, res_name,
-				lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
+		__dynamic_array(unsigned char, res_name, res_length)
 	),
 
 	TP_fast_assign(
-		struct dlm_rsb *r;
-
-		__entry->ls_id = ls->ls_global_id;
-		__entry->lkb_id = lkb->lkb_id;
+		__entry->ls_id = ls_id;
+		__entry->lkb_id = lkb_id;
 		__entry->mode = mode;
 
-		r = lkb->lkb_resource;
-		if (r)
-			memcpy(__get_dynamic_array(res_name), r->res_name,
-			       __get_dynamic_array_len(res_name));
+		memcpy(__get_dynamic_array(res_name), res_name,
+		       __get_dynamic_array_len(res_name));
 	),
 
 	TP_printk("ls_id=%u lkb_id=%x mode=%s res_name=%s",
@@ -224,31 +220,27 @@ TRACE_EVENT(dlm_bast,
 
 TRACE_EVENT(dlm_ast,
 
-	TP_PROTO(struct dlm_ls *ls, struct dlm_lkb *lkb),
+	TP_PROTO(__u32 ls_id, __u32 lkb_id, __u8 sb_flags, int sb_status,
+		 const char *res_name, size_t res_length),
 
-	TP_ARGS(ls, lkb),
+	TP_ARGS(ls_id, lkb_id, sb_flags, sb_status, res_name, res_length),
 
 	TP_STRUCT__entry(
 		__field(__u32, ls_id)
 		__field(__u32, lkb_id)
-		__field(u8, sb_flags)
+		__field(__u8, sb_flags)
 		__field(int, sb_status)
-		__dynamic_array(unsigned char, res_name,
-				lkb->lkb_resource ? lkb->lkb_resource->res_length : 0)
+		__dynamic_array(unsigned char, res_name, res_length)
 	),
 
 	TP_fast_assign(
-		struct dlm_rsb *r;
-
-		__entry->ls_id = ls->ls_global_id;
-		__entry->lkb_id = lkb->lkb_id;
-		__entry->sb_flags = lkb->lkb_lksb->sb_flags;
-		__entry->sb_status = lkb->lkb_lksb->sb_status;
+		__entry->ls_id = ls_id;
+		__entry->lkb_id = lkb_id;
+		__entry->sb_flags = sb_flags;
+		__entry->sb_status = sb_status;
 
-		r = lkb->lkb_resource;
-		if (r)
-			memcpy(__get_dynamic_array(res_name), r->res_name,
-			       __get_dynamic_array_len(res_name));
+		memcpy(__get_dynamic_array(res_name), res_name,
+		       __get_dynamic_array_len(res_name));
 	),
 
 	TP_printk("ls_id=%u lkb_id=%x sb_flags=%s sb_status=%d res_name=%s",
-- 
cgit v1.2.3


From 0fce55e5334d380d8a09f80ba9c9b68eeea6971d Mon Sep 17 00:00:00 2001
From: Xin Zeng <xin.zeng@intel.com>
Date: Wed, 6 Mar 2024 21:58:53 +0800
Subject: crypto: qat - add interface for live migration

Extend the driver with a new interface to be used for VF live migration.
This allows to create and destroy a qat_mig_dev object that contains
a set of methods to allow to save and restore the state of QAT VF.
This interface will be used by the qat-vfio-pci module.

Signed-off-by: Xin Zeng <xin.zeng@intel.com>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 drivers/crypto/intel/qat/qat_common/Makefile       |   2 +-
 .../intel/qat/qat_common/adf_accel_devices.h       |  17 +++
 .../crypto/intel/qat/qat_common/adf_gen4_vf_mig.h  |  10 ++
 drivers/crypto/intel/qat/qat_common/qat_mig_dev.c  | 130 +++++++++++++++++++++
 include/linux/qat/qat_mig_dev.h                    |  31 +++++
 5 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 drivers/crypto/intel/qat/qat_common/adf_gen4_vf_mig.h
 create mode 100644 drivers/crypto/intel/qat/qat_common/qat_mig_dev.c
 create mode 100644 include/linux/qat/qat_mig_dev.h

(limited to 'include')

diff --git a/drivers/crypto/intel/qat/qat_common/Makefile b/drivers/crypto/intel/qat/qat_common/Makefile
index ceaa685352ed..9fba31d4ac7f 100644
--- a/drivers/crypto/intel/qat/qat_common/Makefile
+++ b/drivers/crypto/intel/qat/qat_common/Makefile
@@ -54,6 +54,6 @@ intel_qat-$(CONFIG_DEBUG_FS) += adf_transport_debug.o \
 intel_qat-$(CONFIG_PCI_IOV) += adf_sriov.o adf_vf_isr.o adf_pfvf_utils.o \
 			       adf_pfvf_pf_msg.o adf_pfvf_pf_proto.o \
 			       adf_pfvf_vf_msg.o adf_pfvf_vf_proto.o \
-			       adf_gen2_pfvf.o adf_gen4_pfvf.o
+			       adf_gen2_pfvf.o adf_gen4_pfvf.o qat_mig_dev.o
 
 intel_qat-$(CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION) += adf_heartbeat_inject.o
diff --git a/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h b/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
index 986e63ec702d..b08fea10121e 100644
--- a/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
+++ b/drivers/crypto/intel/qat/qat_common/adf_accel_devices.h
@@ -9,6 +9,7 @@
 #include <linux/pci.h>
 #include <linux/ratelimit.h>
 #include <linux/types.h>
+#include <linux/qat/qat_mig_dev.h>
 #include "adf_cfg_common.h"
 #include "adf_rl.h"
 #include "adf_telemetry.h"
@@ -258,6 +259,20 @@ struct adf_dc_ops {
 	void (*build_deflate_ctx)(void *ctx);
 };
 
+struct qat_migdev_ops {
+	int (*init)(struct qat_mig_dev *mdev);
+	void (*cleanup)(struct qat_mig_dev *mdev);
+	void (*reset)(struct qat_mig_dev *mdev);
+	int (*open)(struct qat_mig_dev *mdev);
+	void (*close)(struct qat_mig_dev *mdev);
+	int (*suspend)(struct qat_mig_dev *mdev);
+	int (*resume)(struct qat_mig_dev *mdev);
+	int (*save_state)(struct qat_mig_dev *mdev);
+	int (*save_setup)(struct qat_mig_dev *mdev);
+	int (*load_state)(struct qat_mig_dev *mdev);
+	int (*load_setup)(struct qat_mig_dev *mdev, int size);
+};
+
 struct adf_dev_err_mask {
 	u32 cppagentcmdpar_mask;
 	u32 parerr_ath_cph_mask;
@@ -325,6 +340,7 @@ struct adf_hw_device_data {
 	struct adf_dev_err_mask dev_err_mask;
 	struct adf_rl_hw_data rl_data;
 	struct adf_tl_hw_data tl_data;
+	struct qat_migdev_ops vfmig_ops;
 	const char *fw_name;
 	const char *fw_mmp_name;
 	u32 fuses;
@@ -381,6 +397,7 @@ struct adf_hw_device_data {
 #define GET_CSR_OPS(accel_dev) (&(accel_dev)->hw_device->csr_ops)
 #define GET_PFVF_OPS(accel_dev) (&(accel_dev)->hw_device->pfvf_ops)
 #define GET_DC_OPS(accel_dev) (&(accel_dev)->hw_device->dc_ops)
+#define GET_VFMIG_OPS(accel_dev) (&(accel_dev)->hw_device->vfmig_ops)
 #define GET_TL_DATA(accel_dev) GET_HW_DATA(accel_dev)->tl_data
 #define accel_to_pci_dev(accel_ptr) accel_ptr->accel_pci_dev.pci_dev
 
diff --git a/drivers/crypto/intel/qat/qat_common/adf_gen4_vf_mig.h b/drivers/crypto/intel/qat/qat_common/adf_gen4_vf_mig.h
new file mode 100644
index 000000000000..72216d078ee1
--- /dev/null
+++ b/drivers/crypto/intel/qat/qat_common/adf_gen4_vf_mig.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation */
+#ifndef ADF_GEN4_VF_MIG_H_
+#define ADF_GEN4_VF_MIG_H_
+
+#include "adf_accel_devices.h"
+
+void adf_gen4_init_vf_mig_ops(struct qat_migdev_ops *vfmig_ops);
+
+#endif
diff --git a/drivers/crypto/intel/qat/qat_common/qat_mig_dev.c b/drivers/crypto/intel/qat/qat_common/qat_mig_dev.c
new file mode 100644
index 000000000000..892c2283a50e
--- /dev/null
+++ b/drivers/crypto/intel/qat/qat_common/qat_mig_dev.c
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright(c) 2024 Intel Corporation */
+#include <linux/dev_printk.h>
+#include <linux/export.h>
+#include <linux/pci.h>
+#include <linux/types.h>
+#include <linux/qat/qat_mig_dev.h>
+#include "adf_accel_devices.h"
+#include "adf_common_drv.h"
+
+struct qat_mig_dev *qat_vfmig_create(struct pci_dev *pdev, int vf_id)
+{
+	struct adf_accel_dev *accel_dev;
+	struct qat_migdev_ops *ops;
+	struct qat_mig_dev *mdev;
+
+	accel_dev = adf_devmgr_pci_to_accel_dev(pdev);
+	if (!accel_dev)
+		return ERR_PTR(-ENODEV);
+
+	ops = GET_VFMIG_OPS(accel_dev);
+	if (!ops || !ops->init || !ops->cleanup || !ops->reset || !ops->open ||
+	    !ops->close || !ops->suspend || !ops->resume || !ops->save_state ||
+	    !ops->load_state || !ops->save_setup || !ops->load_setup)
+		return ERR_PTR(-EINVAL);
+
+	mdev = kmalloc(sizeof(*mdev), GFP_KERNEL);
+	if (!mdev)
+		return ERR_PTR(-ENOMEM);
+
+	mdev->vf_id = vf_id;
+	mdev->parent_accel_dev = accel_dev;
+
+	return mdev;
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_create);
+
+int qat_vfmig_init(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->init(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_init);
+
+void qat_vfmig_cleanup(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->cleanup(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_cleanup);
+
+void qat_vfmig_reset(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->reset(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_reset);
+
+int qat_vfmig_open(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->open(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_open);
+
+void qat_vfmig_close(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	GET_VFMIG_OPS(accel_dev)->close(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_close);
+
+int qat_vfmig_suspend(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->suspend(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_suspend);
+
+int qat_vfmig_resume(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->resume(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_resume);
+
+int qat_vfmig_save_state(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->save_state(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_save_state);
+
+int qat_vfmig_save_setup(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->save_setup(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_save_setup);
+
+int qat_vfmig_load_state(struct qat_mig_dev *mdev)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->load_state(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_load_state);
+
+int qat_vfmig_load_setup(struct qat_mig_dev *mdev, int size)
+{
+	struct adf_accel_dev *accel_dev = mdev->parent_accel_dev;
+
+	return GET_VFMIG_OPS(accel_dev)->load_setup(mdev, size);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_load_setup);
+
+void qat_vfmig_destroy(struct qat_mig_dev *mdev)
+{
+	kfree(mdev);
+}
+EXPORT_SYMBOL_GPL(qat_vfmig_destroy);
diff --git a/include/linux/qat/qat_mig_dev.h b/include/linux/qat/qat_mig_dev.h
new file mode 100644
index 000000000000..dbbb6a063dd2
--- /dev/null
+++ b/include/linux/qat/qat_mig_dev.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright(c) 2024 Intel Corporation */
+#ifndef QAT_MIG_DEV_H_
+#define QAT_MIG_DEV_H_
+
+struct pci_dev;
+
+struct qat_mig_dev {
+	void *parent_accel_dev;
+	u8 *state;
+	u32 setup_size;
+	u32 remote_setup_size;
+	u32 state_size;
+	s32 vf_id;
+};
+
+struct qat_mig_dev *qat_vfmig_create(struct pci_dev *pdev, int vf_id);
+int qat_vfmig_init(struct qat_mig_dev *mdev);
+void qat_vfmig_cleanup(struct qat_mig_dev *mdev);
+void qat_vfmig_reset(struct qat_mig_dev *mdev);
+int qat_vfmig_open(struct qat_mig_dev *mdev);
+void qat_vfmig_close(struct qat_mig_dev *mdev);
+int qat_vfmig_suspend(struct qat_mig_dev *mdev);
+int qat_vfmig_resume(struct qat_mig_dev *mdev);
+int qat_vfmig_save_state(struct qat_mig_dev *mdev);
+int qat_vfmig_save_setup(struct qat_mig_dev *mdev);
+int qat_vfmig_load_state(struct qat_mig_dev *mdev);
+int qat_vfmig_load_setup(struct qat_mig_dev *mdev, int size);
+void qat_vfmig_destroy(struct qat_mig_dev *mdev);
+
+#endif /*QAT_MIG_DEV_H_*/
-- 
cgit v1.2.3


From 29ce50e078b857977202205394f200a25889636e Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 12 Mar 2024 20:48:21 -0700
Subject: crypto: remove CONFIG_CRYPTO_STATS

Remove support for the "Crypto usage statistics" feature
(CONFIG_CRYPTO_STATS).  This feature does not appear to have ever been
used, and it is harmful because it significantly reduces performance and
is a large maintenance burden.

Covering each of these points in detail:

1. Feature is not being used

Since these generic crypto statistics are only readable using netlink,
it's fairly straightforward to look for programs that use them.  I'm
unable to find any evidence that any such programs exist.  For example,
Debian Code Search returns no hits except the kernel header and kernel
code itself and translations of the kernel header:
https://codesearch.debian.net/search?q=CRYPTOCFGA_STAT&literal=1&perpkg=1

The patch series that added this feature in 2018
(https://lore.kernel.org/linux-crypto/1537351855-16618-1-git-send-email-clabbe@baylibre.com/)
said "The goal is to have an ifconfig for crypto device."  This doesn't
appear to have happened.

It's not clear that there is real demand for crypto statistics.  Just
because the kernel provides other types of statistics such as I/O and
networking statistics and some people find those useful does not mean
that crypto statistics are useful too.

Further evidence that programs are not using CONFIG_CRYPTO_STATS is that
it was able to be disabled in RHEL and Fedora as a bug fix
(https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/2947).

Even further evidence comes from the fact that there are and have been
bugs in how the stats work, but they were never reported.  For example,
before Linux v6.7 hash stats were double-counted in most cases.

There has also never been any documentation for this feature, so it
might be hard to use even if someone wanted to.

2. CONFIG_CRYPTO_STATS significantly reduces performance

Enabling CONFIG_CRYPTO_STATS significantly reduces the performance of
the crypto API, even if no program ever retrieves the statistics.  This
primarily affects systems with a large number of CPUs.  For example,
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2039576 reported
that Lustre client encryption performance improved from 21.7GB/s to
48.2GB/s by disabling CONFIG_CRYPTO_STATS.

It can be argued that this means that CONFIG_CRYPTO_STATS should be
optimized with per-cpu counters similar to many of the networking
counters.  But no one has done this in 5+ years.  This is consistent
with the fact that the feature appears to be unused, so there seems to
be little interest in improving it as opposed to just disabling it.

It can be argued that because CONFIG_CRYPTO_STATS is off by default,
performance doesn't matter.  But Linux distros tend to error on the side
of enabling options.  The option is enabled in Ubuntu and Arch Linux,
and until recently was enabled in RHEL and Fedora (see above).  So, even
just having the option available is harmful to users.

3. CONFIG_CRYPTO_STATS is a large maintenance burden

There are over 1000 lines of code associated with CONFIG_CRYPTO_STATS,
spread among 32 files.  It significantly complicates much of the
implementation of the crypto API.  After the initial submission, many
fixes and refactorings have consumed effort of multiple people to keep
this feature "working".  We should be spending this effort elsewhere.

Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Corentin Labbe <clabbe@baylibre.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/s390/configs/debug_defconfig    |   1 -
 arch/s390/configs/defconfig          |   1 -
 crypto/Kconfig                       |  20 --
 crypto/Makefile                      |   2 -
 crypto/acompress.c                   |  33 ---
 crypto/aead.c                        |  84 +-----
 crypto/ahash.c                       |  63 +----
 crypto/akcipher.c                    |  31 ---
 crypto/compress.h                    |   3 -
 crypto/crypto_user.c                 | 522 +++++++++++++++++++++++++++++++++++
 crypto/crypto_user_base.c            | 516 ----------------------------------
 crypto/crypto_user_stat.c            | 176 ------------
 crypto/hash.h                        |  30 --
 crypto/kpp.c                         |  30 --
 crypto/lskcipher.c                   |  73 +----
 crypto/rng.c                         |  44 +--
 crypto/scompress.c                   |   3 -
 crypto/shash.c                       |  75 +----
 crypto/sig.c                         |  13 -
 crypto/skcipher.c                    |  86 +-----
 crypto/skcipher.h                    |  10 -
 include/crypto/acompress.h           |  73 +----
 include/crypto/aead.h                |  21 --
 include/crypto/akcipher.h            |  78 +-----
 include/crypto/algapi.h              |   3 -
 include/crypto/hash.h                |  22 --
 include/crypto/internal/acompress.h  |   1 -
 include/crypto/internal/cryptouser.h |  16 --
 include/crypto/internal/scompress.h  |   1 -
 include/crypto/kpp.h                 |  58 +---
 include/crypto/rng.h                 |  51 +---
 include/crypto/skcipher.h            |  25 --
 include/uapi/linux/cryptouser.h      |  30 +-
 33 files changed, 585 insertions(+), 1610 deletions(-)
 create mode 100644 crypto/crypto_user.c
 delete mode 100644 crypto/crypto_user_base.c
 delete mode 100644 crypto/crypto_user_stat.c
 delete mode 100644 include/crypto/internal/cryptouser.h

(limited to 'include')

diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index f1fb01cd5a25..145342e46ea8 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -760,7 +760,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_STATS=y
 CONFIG_CRYPTO_CRC32_S390=y
 CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_SHA1_S390=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index d33f814f78b2..dc237896f99d 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -745,7 +745,6 @@ CONFIG_CRYPTO_USER_API_HASH=m
 CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
-CONFIG_CRYPTO_STATS=y
 CONFIG_CRYPTO_CRC32_S390=y
 CONFIG_CRYPTO_SHA512_S390=m
 CONFIG_CRYPTO_SHA1_S390=m
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 2903ce19f15c..5688d42a59c2 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1456,26 +1456,6 @@ config CRYPTO_USER_API_ENABLE_OBSOLETE
 	  already been phased out from internal use by the kernel, and are
 	  only useful for userspace clients that still rely on them.
 
-config CRYPTO_STATS
-	bool "Crypto usage statistics"
-	depends on CRYPTO_USER
-	help
-	  Enable the gathering of crypto stats.
-
-	  Enabling this option reduces the performance of the crypto API.  It
-	  should only be enabled when there is actually a use case for it.
-
-	  This collects data sizes, numbers of requests, and numbers
-	  of errors processed by:
-	  - AEAD ciphers (encrypt, decrypt)
-	  - asymmetric key ciphers (encrypt, decrypt, verify, sign)
-	  - symmetric key ciphers (encrypt, decrypt)
-	  - compression algorithms (compress, decompress)
-	  - hash algorithms (hash)
-	  - key-agreement protocol primitives (setsecret, generate
-	    public key, compute shared secret)
-	  - RNG (generate, seed)
-
 endmenu
 
 config CRYPTO_HASH_INFO
diff --git a/crypto/Makefile b/crypto/Makefile
index 408f0a1f9ab9..de9a3312a2c8 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -69,8 +69,6 @@ cryptomgr-y := algboss.o testmgr.o
 
 obj-$(CONFIG_CRYPTO_MANAGER2) += cryptomgr.o
 obj-$(CONFIG_CRYPTO_USER) += crypto_user.o
-crypto_user-y := crypto_user_base.o
-crypto_user-$(CONFIG_CRYPTO_STATS) += crypto_user_stat.o
 obj-$(CONFIG_CRYPTO_CMAC) += cmac.o
 obj-$(CONFIG_CRYPTO_HMAC) += hmac.o
 obj-$(CONFIG_CRYPTO_VMAC) += vmac.o
diff --git a/crypto/acompress.c b/crypto/acompress.c
index 1c682810a484..6fdf0ff9f3c0 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -93,32 +93,6 @@ static unsigned int crypto_acomp_extsize(struct crypto_alg *alg)
 	return extsize;
 }
 
-static inline int __crypto_acomp_report_stat(struct sk_buff *skb,
-					     struct crypto_alg *alg)
-{
-	struct comp_alg_common *calg = __crypto_comp_alg_common(alg);
-	struct crypto_istat_compress *istat = comp_get_stat(calg);
-	struct crypto_stat_compress racomp;
-
-	memset(&racomp, 0, sizeof(racomp));
-
-	strscpy(racomp.type, "acomp", sizeof(racomp.type));
-	racomp.stat_compress_cnt = atomic64_read(&istat->compress_cnt);
-	racomp.stat_compress_tlen = atomic64_read(&istat->compress_tlen);
-	racomp.stat_decompress_cnt =  atomic64_read(&istat->decompress_cnt);
-	racomp.stat_decompress_tlen = atomic64_read(&istat->decompress_tlen);
-	racomp.stat_err_cnt = atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_ACOMP, sizeof(racomp), &racomp);
-}
-
-#ifdef CONFIG_CRYPTO_STATS
-int crypto_acomp_report_stat(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return __crypto_acomp_report_stat(skb, alg);
-}
-#endif
-
 static const struct crypto_type crypto_acomp_type = {
 	.extsize = crypto_acomp_extsize,
 	.init_tfm = crypto_acomp_init_tfm,
@@ -127,9 +101,6 @@ static const struct crypto_type crypto_acomp_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_acomp_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_acomp_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_ACOMPRESS_MASK,
@@ -184,13 +155,9 @@ EXPORT_SYMBOL_GPL(acomp_request_free);
 
 void comp_prepare_alg(struct comp_alg_common *alg)
 {
-	struct crypto_istat_compress *istat = comp_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
 }
 
 int crypto_register_acomp(struct acomp_alg *alg)
diff --git a/crypto/aead.c b/crypto/aead.c
index 54906633566a..0e75a69189df 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -20,15 +20,6 @@
 
 #include "internal.h"
 
-static inline struct crypto_istat_aead *aead_get_stat(struct aead_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
 static int setkey_unaligned(struct crypto_aead *tfm, const u8 *key,
 			    unsigned int keylen)
 {
@@ -90,62 +81,28 @@ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize)
 }
 EXPORT_SYMBOL_GPL(crypto_aead_setauthsize);
 
-static inline int crypto_aead_errstat(struct crypto_istat_aead *istat, int err)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&istat->err_cnt);
-
-	return err;
-}
-
 int crypto_aead_encrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct aead_alg *alg = crypto_aead_alg(aead);
-	struct crypto_istat_aead *istat;
-	int ret;
-
-	istat = aead_get_stat(alg);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		atomic64_inc(&istat->encrypt_cnt);
-		atomic64_add(req->cryptlen, &istat->encrypt_tlen);
-	}
 
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
-		ret = -ENOKEY;
-	else
-		ret = alg->encrypt(req);
+		return -ENOKEY;
 
-	return crypto_aead_errstat(istat, ret);
+	return crypto_aead_alg(aead)->encrypt(req);
 }
 EXPORT_SYMBOL_GPL(crypto_aead_encrypt);
 
 int crypto_aead_decrypt(struct aead_request *req)
 {
 	struct crypto_aead *aead = crypto_aead_reqtfm(req);
-	struct aead_alg *alg = crypto_aead_alg(aead);
-	struct crypto_istat_aead *istat;
-	int ret;
-
-	istat = aead_get_stat(alg);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		atomic64_inc(&istat->encrypt_cnt);
-		atomic64_add(req->cryptlen, &istat->encrypt_tlen);
-	}
 
 	if (crypto_aead_get_flags(aead) & CRYPTO_TFM_NEED_KEY)
-		ret = -ENOKEY;
-	else if (req->cryptlen < crypto_aead_authsize(aead))
-		ret = -EINVAL;
-	else
-		ret = alg->decrypt(req);
+		return -ENOKEY;
+
+	if (req->cryptlen < crypto_aead_authsize(aead))
+		return -EINVAL;
 
-	return crypto_aead_errstat(istat, ret);
+	return crypto_aead_alg(aead)->decrypt(req);
 }
 EXPORT_SYMBOL_GPL(crypto_aead_decrypt);
 
@@ -215,26 +172,6 @@ static void crypto_aead_free_instance(struct crypto_instance *inst)
 	aead->free(aead);
 }
 
-static int __maybe_unused crypto_aead_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct aead_alg *aead = container_of(alg, struct aead_alg, base);
-	struct crypto_istat_aead *istat = aead_get_stat(aead);
-	struct crypto_stat_aead raead;
-
-	memset(&raead, 0, sizeof(raead));
-
-	strscpy(raead.type, "aead", sizeof(raead.type));
-
-	raead.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
-	raead.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
-	raead.stat_decrypt_cnt = atomic64_read(&istat->decrypt_cnt);
-	raead.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
-	raead.stat_err_cnt = atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_AEAD, sizeof(raead), &raead);
-}
-
 static const struct crypto_type crypto_aead_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_aead_init_tfm,
@@ -244,9 +181,6 @@ static const struct crypto_type crypto_aead_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_aead_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_aead_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
@@ -277,7 +211,6 @@ EXPORT_SYMBOL_GPL(crypto_has_aead);
 
 static int aead_prepare_alg(struct aead_alg *alg)
 {
-	struct crypto_istat_aead *istat = aead_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (max3(alg->maxauthsize, alg->ivsize, alg->chunksize) >
@@ -291,9 +224,6 @@ static int aead_prepare_alg(struct aead_alg *alg)
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_AEAD;
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
-
 	return 0;
 }
 
diff --git a/crypto/ahash.c b/crypto/ahash.c
index 0ac83f7f701d..bcd9de009a91 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -27,22 +27,6 @@
 
 #define CRYPTO_ALG_TYPE_AHASH_MASK	0x0000000e
 
-static inline struct crypto_istat_hash *ahash_get_stat(struct ahash_alg *alg)
-{
-	return hash_get_stat(&alg->halg);
-}
-
-static inline int crypto_ahash_errstat(struct ahash_alg *alg, int err)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&ahash_get_stat(alg)->err_cnt);
-
-	return err;
-}
-
 /*
  * For an ahash tfm that is using an shash algorithm (instead of an ahash
  * algorithm), this returns the underlying shash tfm.
@@ -344,75 +328,47 @@ static void ahash_restore_req(struct ahash_request *req, int err)
 int crypto_ahash_update(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ahash_alg *alg;
 
 	if (likely(tfm->using_shash))
 		return shash_ahash_update(req, ahash_request_ctx(req));
 
-	alg = crypto_ahash_alg(tfm);
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_add(req->nbytes, &ahash_get_stat(alg)->hash_tlen);
-	return crypto_ahash_errstat(alg, alg->update(req));
+	return crypto_ahash_alg(tfm)->update(req);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_update);
 
 int crypto_ahash_final(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ahash_alg *alg;
 
 	if (likely(tfm->using_shash))
 		return crypto_shash_final(ahash_request_ctx(req), req->result);
 
-	alg = crypto_ahash_alg(tfm);
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&ahash_get_stat(alg)->hash_cnt);
-	return crypto_ahash_errstat(alg, alg->final(req));
+	return crypto_ahash_alg(tfm)->final(req);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_final);
 
 int crypto_ahash_finup(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ahash_alg *alg;
 
 	if (likely(tfm->using_shash))
 		return shash_ahash_finup(req, ahash_request_ctx(req));
 
-	alg = crypto_ahash_alg(tfm);
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_hash *istat = ahash_get_stat(alg);
-
-		atomic64_inc(&istat->hash_cnt);
-		atomic64_add(req->nbytes, &istat->hash_tlen);
-	}
-	return crypto_ahash_errstat(alg, alg->finup(req));
+	return crypto_ahash_alg(tfm)->finup(req);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_finup);
 
 int crypto_ahash_digest(struct ahash_request *req)
 {
 	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
-	struct ahash_alg *alg;
-	int err;
 
 	if (likely(tfm->using_shash))
 		return shash_ahash_digest(req, prepare_shash_desc(req, tfm));
 
-	alg = crypto_ahash_alg(tfm);
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_hash *istat = ahash_get_stat(alg);
-
-		atomic64_inc(&istat->hash_cnt);
-		atomic64_add(req->nbytes, &istat->hash_tlen);
-	}
-
 	if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		err = -ENOKEY;
-	else
-		err = alg->digest(req);
+		return -ENOKEY;
 
-	return crypto_ahash_errstat(alg, err);
+	return crypto_ahash_alg(tfm)->digest(req);
 }
 EXPORT_SYMBOL_GPL(crypto_ahash_digest);
 
@@ -571,12 +527,6 @@ static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg)
 		   __crypto_hash_alg_common(alg)->digestsize);
 }
 
-static int __maybe_unused crypto_ahash_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return crypto_hash_report_stat(skb, alg, "ahash");
-}
-
 static const struct crypto_type crypto_ahash_type = {
 	.extsize = crypto_ahash_extsize,
 	.init_tfm = crypto_ahash_init_tfm,
@@ -586,9 +536,6 @@ static const struct crypto_type crypto_ahash_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_ahash_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_ahash_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_AHASH_MASK,
diff --git a/crypto/akcipher.c b/crypto/akcipher.c
index 52813f0b19e4..e0ff5f4dda6d 100644
--- a/crypto/akcipher.c
+++ b/crypto/akcipher.c
@@ -70,30 +70,6 @@ static void crypto_akcipher_free_instance(struct crypto_instance *inst)
 	akcipher->free(akcipher);
 }
 
-static int __maybe_unused crypto_akcipher_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct akcipher_alg *akcipher = __crypto_akcipher_alg(alg);
-	struct crypto_istat_akcipher *istat;
-	struct crypto_stat_akcipher rakcipher;
-
-	istat = akcipher_get_stat(akcipher);
-
-	memset(&rakcipher, 0, sizeof(rakcipher));
-
-	strscpy(rakcipher.type, "akcipher", sizeof(rakcipher.type));
-	rakcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
-	rakcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
-	rakcipher.stat_decrypt_cnt = atomic64_read(&istat->decrypt_cnt);
-	rakcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
-	rakcipher.stat_sign_cnt = atomic64_read(&istat->sign_cnt);
-	rakcipher.stat_verify_cnt = atomic64_read(&istat->verify_cnt);
-	rakcipher.stat_err_cnt = atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER,
-		       sizeof(rakcipher), &rakcipher);
-}
-
 static const struct crypto_type crypto_akcipher_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_akcipher_init_tfm,
@@ -103,9 +79,6 @@ static const struct crypto_type crypto_akcipher_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_akcipher_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_akcipher_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_AHASH_MASK,
@@ -131,15 +104,11 @@ EXPORT_SYMBOL_GPL(crypto_alloc_akcipher);
 
 static void akcipher_prepare_alg(struct akcipher_alg *alg)
 {
-	struct crypto_istat_akcipher *istat = akcipher_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	base->cra_type = &crypto_akcipher_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_AKCIPHER;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
 }
 
 static int akcipher_default_op(struct akcipher_request *req)
diff --git a/crypto/compress.h b/crypto/compress.h
index 19f65516d699..c3cedfb5e606 100644
--- a/crypto/compress.h
+++ b/crypto/compress.h
@@ -13,14 +13,11 @@
 
 struct acomp_req;
 struct comp_alg_common;
-struct sk_buff;
 
 int crypto_init_scomp_ops_async(struct crypto_tfm *tfm);
 struct acomp_req *crypto_acomp_scomp_alloc_ctx(struct acomp_req *req);
 void crypto_acomp_scomp_free_ctx(struct acomp_req *req);
 
-int crypto_acomp_report_stat(struct sk_buff *skb, struct crypto_alg *alg);
-
 void comp_prepare_alg(struct comp_alg_common *alg);
 
 #endif	/* _LOCAL_CRYPTO_COMPRESS_H */
diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c
new file mode 100644
index 000000000000..6c571834e86a
--- /dev/null
+++ b/crypto/crypto_user.c
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Crypto user configuration API.
+ *
+ * Copyright (C) 2011 secunet Security Networks AG
+ * Copyright (C) 2011 Steffen Klassert <steffen.klassert@secunet.com>
+ */
+
+#include <linux/module.h>
+#include <linux/crypto.h>
+#include <linux/cryptouser.h>
+#include <linux/sched.h>
+#include <linux/security.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/internal/rng.h>
+#include <crypto/akcipher.h>
+#include <crypto/kpp.h>
+
+#include "internal.h"
+
+#define null_terminated(x)	(strnlen(x, sizeof(x)) < sizeof(x))
+
+static DEFINE_MUTEX(crypto_cfg_mutex);
+
+struct crypto_dump_info {
+	struct sk_buff *in_skb;
+	struct sk_buff *out_skb;
+	u32 nlmsg_seq;
+	u16 nlmsg_flags;
+};
+
+static struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact)
+{
+	struct crypto_alg *q, *alg = NULL;
+
+	down_read(&crypto_alg_sem);
+
+	list_for_each_entry(q, &crypto_alg_list, cra_list) {
+		int match = 0;
+
+		if (crypto_is_larval(q))
+			continue;
+
+		if ((q->cra_flags ^ p->cru_type) & p->cru_mask)
+			continue;
+
+		if (strlen(p->cru_driver_name))
+			match = !strcmp(q->cra_driver_name,
+					p->cru_driver_name);
+		else if (!exact)
+			match = !strcmp(q->cra_name, p->cru_name);
+
+		if (!match)
+			continue;
+
+		if (unlikely(!crypto_mod_get(q)))
+			continue;
+
+		alg = q;
+		break;
+	}
+
+	up_read(&crypto_alg_sem);
+
+	return alg;
+}
+
+static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct crypto_report_cipher rcipher;
+
+	memset(&rcipher, 0, sizeof(rcipher));
+
+	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
+
+	rcipher.blocksize = alg->cra_blocksize;
+	rcipher.min_keysize = alg->cra_cipher.cia_min_keysize;
+	rcipher.max_keysize = alg->cra_cipher.cia_max_keysize;
+
+	return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER,
+		       sizeof(rcipher), &rcipher);
+}
+
+static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
+{
+	struct crypto_report_comp rcomp;
+
+	memset(&rcomp, 0, sizeof(rcomp));
+
+	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
+
+	return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rcomp), &rcomp);
+}
+
+static int crypto_report_one(struct crypto_alg *alg,
+			     struct crypto_user_alg *ualg, struct sk_buff *skb)
+{
+	memset(ualg, 0, sizeof(*ualg));
+
+	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
+	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
+		sizeof(ualg->cru_driver_name));
+	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
+		sizeof(ualg->cru_module_name));
+
+	ualg->cru_type = 0;
+	ualg->cru_mask = 0;
+	ualg->cru_flags = alg->cra_flags;
+	ualg->cru_refcnt = refcount_read(&alg->cra_refcnt);
+
+	if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
+		goto nla_put_failure;
+	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
+		struct crypto_report_larval rl;
+
+		memset(&rl, 0, sizeof(rl));
+		strscpy(rl.type, "larval", sizeof(rl.type));
+		if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl))
+			goto nla_put_failure;
+		goto out;
+	}
+
+	if (alg->cra_type && alg->cra_type->report) {
+		if (alg->cra_type->report(skb, alg))
+			goto nla_put_failure;
+
+		goto out;
+	}
+
+	switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
+	case CRYPTO_ALG_TYPE_CIPHER:
+		if (crypto_report_cipher(skb, alg))
+			goto nla_put_failure;
+
+		break;
+	case CRYPTO_ALG_TYPE_COMPRESS:
+		if (crypto_report_comp(skb, alg))
+			goto nla_put_failure;
+
+		break;
+	}
+
+out:
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static int crypto_report_alg(struct crypto_alg *alg,
+			     struct crypto_dump_info *info)
+{
+	struct sk_buff *in_skb = info->in_skb;
+	struct sk_buff *skb = info->out_skb;
+	struct nlmsghdr *nlh;
+	struct crypto_user_alg *ualg;
+	int err = 0;
+
+	nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq,
+			CRYPTO_MSG_GETALG, sizeof(*ualg), info->nlmsg_flags);
+	if (!nlh) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+
+	ualg = nlmsg_data(nlh);
+
+	err = crypto_report_one(alg, ualg, skb);
+	if (err) {
+		nlmsg_cancel(skb, nlh);
+		goto out;
+	}
+
+	nlmsg_end(skb, nlh);
+
+out:
+	return err;
+}
+
+static int crypto_report(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
+			 struct nlattr **attrs)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct crypto_user_alg *p = nlmsg_data(in_nlh);
+	struct crypto_alg *alg;
+	struct sk_buff *skb;
+	struct crypto_dump_info info;
+	int err;
+
+	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
+		return -EINVAL;
+
+	alg = crypto_alg_match(p, 0);
+	if (!alg)
+		return -ENOENT;
+
+	err = -ENOMEM;
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		goto drop_alg;
+
+	info.in_skb = in_skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = in_nlh->nlmsg_seq;
+	info.nlmsg_flags = 0;
+
+	err = crypto_report_alg(alg, &info);
+
+drop_alg:
+	crypto_mod_put(alg);
+
+	if (err) {
+		kfree_skb(skb);
+		return err;
+	}
+
+	return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid);
+}
+
+static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const size_t start_pos = cb->args[0];
+	size_t pos = 0;
+	struct crypto_dump_info info;
+	struct crypto_alg *alg;
+	int res;
+
+	info.in_skb = cb->skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = cb->nlh->nlmsg_seq;
+	info.nlmsg_flags = NLM_F_MULTI;
+
+	down_read(&crypto_alg_sem);
+	list_for_each_entry(alg, &crypto_alg_list, cra_list) {
+		if (pos >= start_pos) {
+			res = crypto_report_alg(alg, &info);
+			if (res == -EMSGSIZE)
+				break;
+			if (res)
+				goto out;
+		}
+		pos++;
+	}
+	cb->args[0] = pos;
+	res = skb->len;
+out:
+	up_read(&crypto_alg_sem);
+	return res;
+}
+
+static int crypto_dump_report_done(struct netlink_callback *cb)
+{
+	return 0;
+}
+
+static int crypto_update_alg(struct sk_buff *skb, struct nlmsghdr *nlh,
+			     struct nlattr **attrs)
+{
+	struct crypto_alg *alg;
+	struct crypto_user_alg *p = nlmsg_data(nlh);
+	struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL];
+	LIST_HEAD(list);
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
+		return -EINVAL;
+
+	if (priority && !strlen(p->cru_driver_name))
+		return -EINVAL;
+
+	alg = crypto_alg_match(p, 1);
+	if (!alg)
+		return -ENOENT;
+
+	down_write(&crypto_alg_sem);
+
+	crypto_remove_spawns(alg, &list, NULL);
+
+	if (priority)
+		alg->cra_priority = nla_get_u32(priority);
+
+	up_write(&crypto_alg_sem);
+
+	crypto_mod_put(alg);
+	crypto_remove_final(&list);
+
+	return 0;
+}
+
+static int crypto_del_alg(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct nlattr **attrs)
+{
+	struct crypto_alg *alg;
+	struct crypto_user_alg *p = nlmsg_data(nlh);
+	int err;
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
+		return -EINVAL;
+
+	alg = crypto_alg_match(p, 1);
+	if (!alg)
+		return -ENOENT;
+
+	/* We can not unregister core algorithms such as aes-generic.
+	 * We would loose the reference in the crypto_alg_list to this algorithm
+	 * if we try to unregister. Unregistering such an algorithm without
+	 * removing the module is not possible, so we restrict to crypto
+	 * instances that are build from templates. */
+	err = -EINVAL;
+	if (!(alg->cra_flags & CRYPTO_ALG_INSTANCE))
+		goto drop_alg;
+
+	err = -EBUSY;
+	if (refcount_read(&alg->cra_refcnt) > 2)
+		goto drop_alg;
+
+	crypto_unregister_instance((struct crypto_instance *)alg);
+	err = 0;
+
+drop_alg:
+	crypto_mod_put(alg);
+	return err;
+}
+
+static int crypto_add_alg(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct nlattr **attrs)
+{
+	int exact = 0;
+	const char *name;
+	struct crypto_alg *alg;
+	struct crypto_user_alg *p = nlmsg_data(nlh);
+	struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL];
+
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
+		return -EINVAL;
+
+	if (strlen(p->cru_driver_name))
+		exact = 1;
+
+	if (priority && !exact)
+		return -EINVAL;
+
+	alg = crypto_alg_match(p, exact);
+	if (alg) {
+		crypto_mod_put(alg);
+		return -EEXIST;
+	}
+
+	if (strlen(p->cru_driver_name))
+		name = p->cru_driver_name;
+	else
+		name = p->cru_name;
+
+	alg = crypto_alg_mod_lookup(name, p->cru_type, p->cru_mask);
+	if (IS_ERR(alg))
+		return PTR_ERR(alg);
+
+	down_write(&crypto_alg_sem);
+
+	if (priority)
+		alg->cra_priority = nla_get_u32(priority);
+
+	up_write(&crypto_alg_sem);
+
+	crypto_mod_put(alg);
+
+	return 0;
+}
+
+static int crypto_del_rng(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct nlattr **attrs)
+{
+	if (!netlink_capable(skb, CAP_NET_ADMIN))
+		return -EPERM;
+	return crypto_del_default_rng();
+}
+
+static int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
+			     struct nlattr **attrs)
+{
+	/* No longer supported */
+	return -ENOTSUPP;
+}
+
+#define MSGSIZE(type) sizeof(struct type)
+
+static const int crypto_msg_min[CRYPTO_NR_MSGTYPES] = {
+	[CRYPTO_MSG_NEWALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
+	[CRYPTO_MSG_DELALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
+	[CRYPTO_MSG_UPDATEALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
+	[CRYPTO_MSG_GETALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
+	[CRYPTO_MSG_DELRNG	- CRYPTO_MSG_BASE] = 0,
+	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
+};
+
+static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
+	[CRYPTOCFGA_PRIORITY_VAL]   = { .type = NLA_U32},
+};
+
+#undef MSGSIZE
+
+static const struct crypto_link {
+	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
+	int (*dump)(struct sk_buff *, struct netlink_callback *);
+	int (*done)(struct netlink_callback *);
+} crypto_dispatch[CRYPTO_NR_MSGTYPES] = {
+	[CRYPTO_MSG_NEWALG	- CRYPTO_MSG_BASE] = { .doit = crypto_add_alg},
+	[CRYPTO_MSG_DELALG	- CRYPTO_MSG_BASE] = { .doit = crypto_del_alg},
+	[CRYPTO_MSG_UPDATEALG	- CRYPTO_MSG_BASE] = { .doit = crypto_update_alg},
+	[CRYPTO_MSG_GETALG	- CRYPTO_MSG_BASE] = { .doit = crypto_report,
+						       .dump = crypto_dump_report,
+						       .done = crypto_dump_report_done},
+	[CRYPTO_MSG_DELRNG	- CRYPTO_MSG_BASE] = { .doit = crypto_del_rng },
+	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = { .doit = crypto_reportstat},
+};
+
+static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
+			       struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nlattr *attrs[CRYPTOCFGA_MAX+1];
+	const struct crypto_link *link;
+	int type, err;
+
+	type = nlh->nlmsg_type;
+	if (type > CRYPTO_MSG_MAX)
+		return -EINVAL;
+
+	type -= CRYPTO_MSG_BASE;
+	link = &crypto_dispatch[type];
+
+	if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) &&
+	    (nlh->nlmsg_flags & NLM_F_DUMP))) {
+		struct crypto_alg *alg;
+		unsigned long dump_alloc = 0;
+
+		if (link->dump == NULL)
+			return -EINVAL;
+
+		down_read(&crypto_alg_sem);
+		list_for_each_entry(alg, &crypto_alg_list, cra_list)
+			dump_alloc += CRYPTO_REPORT_MAXSIZE;
+		up_read(&crypto_alg_sem);
+
+		{
+			struct netlink_dump_control c = {
+				.dump = link->dump,
+				.done = link->done,
+				.min_dump_alloc = min(dump_alloc, 65535UL),
+			};
+			err = netlink_dump_start(net->crypto_nlsk, skb, nlh, &c);
+		}
+
+		return err;
+	}
+
+	err = nlmsg_parse_deprecated(nlh, crypto_msg_min[type], attrs,
+				     CRYPTOCFGA_MAX, crypto_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (link->doit == NULL)
+		return -EINVAL;
+
+	return link->doit(skb, nlh, attrs);
+}
+
+static void crypto_netlink_rcv(struct sk_buff *skb)
+{
+	mutex_lock(&crypto_cfg_mutex);
+	netlink_rcv_skb(skb, &crypto_user_rcv_msg);
+	mutex_unlock(&crypto_cfg_mutex);
+}
+
+static int __net_init crypto_netlink_init(struct net *net)
+{
+	struct netlink_kernel_cfg cfg = {
+		.input	= crypto_netlink_rcv,
+	};
+
+	net->crypto_nlsk = netlink_kernel_create(net, NETLINK_CRYPTO, &cfg);
+	return net->crypto_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __net_exit crypto_netlink_exit(struct net *net)
+{
+	netlink_kernel_release(net->crypto_nlsk);
+	net->crypto_nlsk = NULL;
+}
+
+static struct pernet_operations crypto_netlink_net_ops = {
+	.init = crypto_netlink_init,
+	.exit = crypto_netlink_exit,
+};
+
+static int __init crypto_user_init(void)
+{
+	return register_pernet_subsys(&crypto_netlink_net_ops);
+}
+
+static void __exit crypto_user_exit(void)
+{
+	unregister_pernet_subsys(&crypto_netlink_net_ops);
+}
+
+module_init(crypto_user_init);
+module_exit(crypto_user_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
+MODULE_DESCRIPTION("Crypto userspace configuration API");
+MODULE_ALIAS("net-pf-16-proto-21");
diff --git a/crypto/crypto_user_base.c b/crypto/crypto_user_base.c
deleted file mode 100644
index 3fa20f12989f..000000000000
--- a/crypto/crypto_user_base.c
+++ /dev/null
@@ -1,516 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Crypto user configuration API.
- *
- * Copyright (C) 2011 secunet Security Networks AG
- * Copyright (C) 2011 Steffen Klassert <steffen.klassert@secunet.com>
- */
-
-#include <linux/module.h>
-#include <linux/crypto.h>
-#include <linux/cryptouser.h>
-#include <linux/sched.h>
-#include <linux/security.h>
-#include <net/netlink.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <crypto/internal/skcipher.h>
-#include <crypto/internal/rng.h>
-#include <crypto/akcipher.h>
-#include <crypto/kpp.h>
-#include <crypto/internal/cryptouser.h>
-
-#include "internal.h"
-
-#define null_terminated(x)	(strnlen(x, sizeof(x)) < sizeof(x))
-
-static DEFINE_MUTEX(crypto_cfg_mutex);
-
-struct crypto_dump_info {
-	struct sk_buff *in_skb;
-	struct sk_buff *out_skb;
-	u32 nlmsg_seq;
-	u16 nlmsg_flags;
-};
-
-struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact)
-{
-	struct crypto_alg *q, *alg = NULL;
-
-	down_read(&crypto_alg_sem);
-
-	list_for_each_entry(q, &crypto_alg_list, cra_list) {
-		int match = 0;
-
-		if (crypto_is_larval(q))
-			continue;
-
-		if ((q->cra_flags ^ p->cru_type) & p->cru_mask)
-			continue;
-
-		if (strlen(p->cru_driver_name))
-			match = !strcmp(q->cra_driver_name,
-					p->cru_driver_name);
-		else if (!exact)
-			match = !strcmp(q->cra_name, p->cru_name);
-
-		if (!match)
-			continue;
-
-		if (unlikely(!crypto_mod_get(q)))
-			continue;
-
-		alg = q;
-		break;
-	}
-
-	up_read(&crypto_alg_sem);
-
-	return alg;
-}
-
-static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_cipher rcipher;
-
-	memset(&rcipher, 0, sizeof(rcipher));
-
-	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
-
-	rcipher.blocksize = alg->cra_blocksize;
-	rcipher.min_keysize = alg->cra_cipher.cia_min_keysize;
-	rcipher.max_keysize = alg->cra_cipher.cia_max_keysize;
-
-	return nla_put(skb, CRYPTOCFGA_REPORT_CIPHER,
-		       sizeof(rcipher), &rcipher);
-}
-
-static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_report_comp rcomp;
-
-	memset(&rcomp, 0, sizeof(rcomp));
-
-	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-
-	return nla_put(skb, CRYPTOCFGA_REPORT_COMPRESS, sizeof(rcomp), &rcomp);
-}
-
-static int crypto_report_one(struct crypto_alg *alg,
-			     struct crypto_user_alg *ualg, struct sk_buff *skb)
-{
-	memset(ualg, 0, sizeof(*ualg));
-
-	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
-	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
-		sizeof(ualg->cru_driver_name));
-	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
-		sizeof(ualg->cru_module_name));
-
-	ualg->cru_type = 0;
-	ualg->cru_mask = 0;
-	ualg->cru_flags = alg->cra_flags;
-	ualg->cru_refcnt = refcount_read(&alg->cra_refcnt);
-
-	if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
-		goto nla_put_failure;
-	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
-		struct crypto_report_larval rl;
-
-		memset(&rl, 0, sizeof(rl));
-		strscpy(rl.type, "larval", sizeof(rl.type));
-		if (nla_put(skb, CRYPTOCFGA_REPORT_LARVAL, sizeof(rl), &rl))
-			goto nla_put_failure;
-		goto out;
-	}
-
-	if (alg->cra_type && alg->cra_type->report) {
-		if (alg->cra_type->report(skb, alg))
-			goto nla_put_failure;
-
-		goto out;
-	}
-
-	switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
-	case CRYPTO_ALG_TYPE_CIPHER:
-		if (crypto_report_cipher(skb, alg))
-			goto nla_put_failure;
-
-		break;
-	case CRYPTO_ALG_TYPE_COMPRESS:
-		if (crypto_report_comp(skb, alg))
-			goto nla_put_failure;
-
-		break;
-	}
-
-out:
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_report_alg(struct crypto_alg *alg,
-			     struct crypto_dump_info *info)
-{
-	struct sk_buff *in_skb = info->in_skb;
-	struct sk_buff *skb = info->out_skb;
-	struct nlmsghdr *nlh;
-	struct crypto_user_alg *ualg;
-	int err = 0;
-
-	nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq,
-			CRYPTO_MSG_GETALG, sizeof(*ualg), info->nlmsg_flags);
-	if (!nlh) {
-		err = -EMSGSIZE;
-		goto out;
-	}
-
-	ualg = nlmsg_data(nlh);
-
-	err = crypto_report_one(alg, ualg, skb);
-	if (err) {
-		nlmsg_cancel(skb, nlh);
-		goto out;
-	}
-
-	nlmsg_end(skb, nlh);
-
-out:
-	return err;
-}
-
-static int crypto_report(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
-			 struct nlattr **attrs)
-{
-	struct net *net = sock_net(in_skb->sk);
-	struct crypto_user_alg *p = nlmsg_data(in_nlh);
-	struct crypto_alg *alg;
-	struct sk_buff *skb;
-	struct crypto_dump_info info;
-	int err;
-
-	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
-		return -EINVAL;
-
-	alg = crypto_alg_match(p, 0);
-	if (!alg)
-		return -ENOENT;
-
-	err = -ENOMEM;
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!skb)
-		goto drop_alg;
-
-	info.in_skb = in_skb;
-	info.out_skb = skb;
-	info.nlmsg_seq = in_nlh->nlmsg_seq;
-	info.nlmsg_flags = 0;
-
-	err = crypto_report_alg(alg, &info);
-
-drop_alg:
-	crypto_mod_put(alg);
-
-	if (err) {
-		kfree_skb(skb);
-		return err;
-	}
-
-	return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid);
-}
-
-static int crypto_dump_report(struct sk_buff *skb, struct netlink_callback *cb)
-{
-	const size_t start_pos = cb->args[0];
-	size_t pos = 0;
-	struct crypto_dump_info info;
-	struct crypto_alg *alg;
-	int res;
-
-	info.in_skb = cb->skb;
-	info.out_skb = skb;
-	info.nlmsg_seq = cb->nlh->nlmsg_seq;
-	info.nlmsg_flags = NLM_F_MULTI;
-
-	down_read(&crypto_alg_sem);
-	list_for_each_entry(alg, &crypto_alg_list, cra_list) {
-		if (pos >= start_pos) {
-			res = crypto_report_alg(alg, &info);
-			if (res == -EMSGSIZE)
-				break;
-			if (res)
-				goto out;
-		}
-		pos++;
-	}
-	cb->args[0] = pos;
-	res = skb->len;
-out:
-	up_read(&crypto_alg_sem);
-	return res;
-}
-
-static int crypto_dump_report_done(struct netlink_callback *cb)
-{
-	return 0;
-}
-
-static int crypto_update_alg(struct sk_buff *skb, struct nlmsghdr *nlh,
-			     struct nlattr **attrs)
-{
-	struct crypto_alg *alg;
-	struct crypto_user_alg *p = nlmsg_data(nlh);
-	struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL];
-	LIST_HEAD(list);
-
-	if (!netlink_capable(skb, CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
-		return -EINVAL;
-
-	if (priority && !strlen(p->cru_driver_name))
-		return -EINVAL;
-
-	alg = crypto_alg_match(p, 1);
-	if (!alg)
-		return -ENOENT;
-
-	down_write(&crypto_alg_sem);
-
-	crypto_remove_spawns(alg, &list, NULL);
-
-	if (priority)
-		alg->cra_priority = nla_get_u32(priority);
-
-	up_write(&crypto_alg_sem);
-
-	crypto_mod_put(alg);
-	crypto_remove_final(&list);
-
-	return 0;
-}
-
-static int crypto_del_alg(struct sk_buff *skb, struct nlmsghdr *nlh,
-			  struct nlattr **attrs)
-{
-	struct crypto_alg *alg;
-	struct crypto_user_alg *p = nlmsg_data(nlh);
-	int err;
-
-	if (!netlink_capable(skb, CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
-		return -EINVAL;
-
-	alg = crypto_alg_match(p, 1);
-	if (!alg)
-		return -ENOENT;
-
-	/* We can not unregister core algorithms such as aes-generic.
-	 * We would loose the reference in the crypto_alg_list to this algorithm
-	 * if we try to unregister. Unregistering such an algorithm without
-	 * removing the module is not possible, so we restrict to crypto
-	 * instances that are build from templates. */
-	err = -EINVAL;
-	if (!(alg->cra_flags & CRYPTO_ALG_INSTANCE))
-		goto drop_alg;
-
-	err = -EBUSY;
-	if (refcount_read(&alg->cra_refcnt) > 2)
-		goto drop_alg;
-
-	crypto_unregister_instance((struct crypto_instance *)alg);
-	err = 0;
-
-drop_alg:
-	crypto_mod_put(alg);
-	return err;
-}
-
-static int crypto_add_alg(struct sk_buff *skb, struct nlmsghdr *nlh,
-			  struct nlattr **attrs)
-{
-	int exact = 0;
-	const char *name;
-	struct crypto_alg *alg;
-	struct crypto_user_alg *p = nlmsg_data(nlh);
-	struct nlattr *priority = attrs[CRYPTOCFGA_PRIORITY_VAL];
-
-	if (!netlink_capable(skb, CAP_NET_ADMIN))
-		return -EPERM;
-
-	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
-		return -EINVAL;
-
-	if (strlen(p->cru_driver_name))
-		exact = 1;
-
-	if (priority && !exact)
-		return -EINVAL;
-
-	alg = crypto_alg_match(p, exact);
-	if (alg) {
-		crypto_mod_put(alg);
-		return -EEXIST;
-	}
-
-	if (strlen(p->cru_driver_name))
-		name = p->cru_driver_name;
-	else
-		name = p->cru_name;
-
-	alg = crypto_alg_mod_lookup(name, p->cru_type, p->cru_mask);
-	if (IS_ERR(alg))
-		return PTR_ERR(alg);
-
-	down_write(&crypto_alg_sem);
-
-	if (priority)
-		alg->cra_priority = nla_get_u32(priority);
-
-	up_write(&crypto_alg_sem);
-
-	crypto_mod_put(alg);
-
-	return 0;
-}
-
-static int crypto_del_rng(struct sk_buff *skb, struct nlmsghdr *nlh,
-			  struct nlattr **attrs)
-{
-	if (!netlink_capable(skb, CAP_NET_ADMIN))
-		return -EPERM;
-	return crypto_del_default_rng();
-}
-
-#define MSGSIZE(type) sizeof(struct type)
-
-static const int crypto_msg_min[CRYPTO_NR_MSGTYPES] = {
-	[CRYPTO_MSG_NEWALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
-	[CRYPTO_MSG_DELALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
-	[CRYPTO_MSG_UPDATEALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
-	[CRYPTO_MSG_GETALG	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
-	[CRYPTO_MSG_DELRNG	- CRYPTO_MSG_BASE] = 0,
-	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = MSGSIZE(crypto_user_alg),
-};
-
-static const struct nla_policy crypto_policy[CRYPTOCFGA_MAX+1] = {
-	[CRYPTOCFGA_PRIORITY_VAL]   = { .type = NLA_U32},
-};
-
-#undef MSGSIZE
-
-static const struct crypto_link {
-	int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **);
-	int (*dump)(struct sk_buff *, struct netlink_callback *);
-	int (*done)(struct netlink_callback *);
-} crypto_dispatch[CRYPTO_NR_MSGTYPES] = {
-	[CRYPTO_MSG_NEWALG	- CRYPTO_MSG_BASE] = { .doit = crypto_add_alg},
-	[CRYPTO_MSG_DELALG	- CRYPTO_MSG_BASE] = { .doit = crypto_del_alg},
-	[CRYPTO_MSG_UPDATEALG	- CRYPTO_MSG_BASE] = { .doit = crypto_update_alg},
-	[CRYPTO_MSG_GETALG	- CRYPTO_MSG_BASE] = { .doit = crypto_report,
-						       .dump = crypto_dump_report,
-						       .done = crypto_dump_report_done},
-	[CRYPTO_MSG_DELRNG	- CRYPTO_MSG_BASE] = { .doit = crypto_del_rng },
-	[CRYPTO_MSG_GETSTAT	- CRYPTO_MSG_BASE] = { .doit = crypto_reportstat},
-};
-
-static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
-			       struct netlink_ext_ack *extack)
-{
-	struct net *net = sock_net(skb->sk);
-	struct nlattr *attrs[CRYPTOCFGA_MAX+1];
-	const struct crypto_link *link;
-	int type, err;
-
-	type = nlh->nlmsg_type;
-	if (type > CRYPTO_MSG_MAX)
-		return -EINVAL;
-
-	type -= CRYPTO_MSG_BASE;
-	link = &crypto_dispatch[type];
-
-	if ((type == (CRYPTO_MSG_GETALG - CRYPTO_MSG_BASE) &&
-	    (nlh->nlmsg_flags & NLM_F_DUMP))) {
-		struct crypto_alg *alg;
-		unsigned long dump_alloc = 0;
-
-		if (link->dump == NULL)
-			return -EINVAL;
-
-		down_read(&crypto_alg_sem);
-		list_for_each_entry(alg, &crypto_alg_list, cra_list)
-			dump_alloc += CRYPTO_REPORT_MAXSIZE;
-		up_read(&crypto_alg_sem);
-
-		{
-			struct netlink_dump_control c = {
-				.dump = link->dump,
-				.done = link->done,
-				.min_dump_alloc = min(dump_alloc, 65535UL),
-			};
-			err = netlink_dump_start(net->crypto_nlsk, skb, nlh, &c);
-		}
-
-		return err;
-	}
-
-	err = nlmsg_parse_deprecated(nlh, crypto_msg_min[type], attrs,
-				     CRYPTOCFGA_MAX, crypto_policy, extack);
-	if (err < 0)
-		return err;
-
-	if (link->doit == NULL)
-		return -EINVAL;
-
-	return link->doit(skb, nlh, attrs);
-}
-
-static void crypto_netlink_rcv(struct sk_buff *skb)
-{
-	mutex_lock(&crypto_cfg_mutex);
-	netlink_rcv_skb(skb, &crypto_user_rcv_msg);
-	mutex_unlock(&crypto_cfg_mutex);
-}
-
-static int __net_init crypto_netlink_init(struct net *net)
-{
-	struct netlink_kernel_cfg cfg = {
-		.input	= crypto_netlink_rcv,
-	};
-
-	net->crypto_nlsk = netlink_kernel_create(net, NETLINK_CRYPTO, &cfg);
-	return net->crypto_nlsk == NULL ? -ENOMEM : 0;
-}
-
-static void __net_exit crypto_netlink_exit(struct net *net)
-{
-	netlink_kernel_release(net->crypto_nlsk);
-	net->crypto_nlsk = NULL;
-}
-
-static struct pernet_operations crypto_netlink_net_ops = {
-	.init = crypto_netlink_init,
-	.exit = crypto_netlink_exit,
-};
-
-static int __init crypto_user_init(void)
-{
-	return register_pernet_subsys(&crypto_netlink_net_ops);
-}
-
-static void __exit crypto_user_exit(void)
-{
-	unregister_pernet_subsys(&crypto_netlink_net_ops);
-}
-
-module_init(crypto_user_init);
-module_exit(crypto_user_exit);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
-MODULE_DESCRIPTION("Crypto userspace configuration API");
-MODULE_ALIAS("net-pf-16-proto-21");
diff --git a/crypto/crypto_user_stat.c b/crypto/crypto_user_stat.c
deleted file mode 100644
index d4f3d39b5137..000000000000
--- a/crypto/crypto_user_stat.c
+++ /dev/null
@@ -1,176 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Crypto user configuration API.
- *
- * Copyright (C) 2017-2018 Corentin Labbe <clabbe@baylibre.com>
- *
- */
-
-#include <crypto/algapi.h>
-#include <crypto/internal/cryptouser.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <net/netlink.h>
-#include <net/sock.h>
-
-#define null_terminated(x)	(strnlen(x, sizeof(x)) < sizeof(x))
-
-struct crypto_dump_info {
-	struct sk_buff *in_skb;
-	struct sk_buff *out_skb;
-	u32 nlmsg_seq;
-	u16 nlmsg_flags;
-};
-
-static int crypto_report_cipher(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_cipher rcipher;
-
-	memset(&rcipher, 0, sizeof(rcipher));
-
-	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
-
-	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
-}
-
-static int crypto_report_comp(struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct crypto_stat_compress rcomp;
-
-	memset(&rcomp, 0, sizeof(rcomp));
-
-	strscpy(rcomp.type, "compression", sizeof(rcomp.type));
-
-	return nla_put(skb, CRYPTOCFGA_STAT_COMPRESS, sizeof(rcomp), &rcomp);
-}
-
-static int crypto_reportstat_one(struct crypto_alg *alg,
-				 struct crypto_user_alg *ualg,
-				 struct sk_buff *skb)
-{
-	memset(ualg, 0, sizeof(*ualg));
-
-	strscpy(ualg->cru_name, alg->cra_name, sizeof(ualg->cru_name));
-	strscpy(ualg->cru_driver_name, alg->cra_driver_name,
-		sizeof(ualg->cru_driver_name));
-	strscpy(ualg->cru_module_name, module_name(alg->cra_module),
-		sizeof(ualg->cru_module_name));
-
-	ualg->cru_type = 0;
-	ualg->cru_mask = 0;
-	ualg->cru_flags = alg->cra_flags;
-	ualg->cru_refcnt = refcount_read(&alg->cra_refcnt);
-
-	if (nla_put_u32(skb, CRYPTOCFGA_PRIORITY_VAL, alg->cra_priority))
-		goto nla_put_failure;
-	if (alg->cra_flags & CRYPTO_ALG_LARVAL) {
-		struct crypto_stat_larval rl;
-
-		memset(&rl, 0, sizeof(rl));
-		strscpy(rl.type, "larval", sizeof(rl.type));
-		if (nla_put(skb, CRYPTOCFGA_STAT_LARVAL, sizeof(rl), &rl))
-			goto nla_put_failure;
-		goto out;
-	}
-
-	if (alg->cra_type && alg->cra_type->report_stat) {
-		if (alg->cra_type->report_stat(skb, alg))
-			goto nla_put_failure;
-		goto out;
-	}
-
-	switch (alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL)) {
-	case CRYPTO_ALG_TYPE_CIPHER:
-		if (crypto_report_cipher(skb, alg))
-			goto nla_put_failure;
-		break;
-	case CRYPTO_ALG_TYPE_COMPRESS:
-		if (crypto_report_comp(skb, alg))
-			goto nla_put_failure;
-		break;
-	default:
-		pr_err("ERROR: Unhandled alg %d in %s\n",
-		       alg->cra_flags & (CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_LARVAL),
-		       __func__);
-	}
-
-out:
-	return 0;
-
-nla_put_failure:
-	return -EMSGSIZE;
-}
-
-static int crypto_reportstat_alg(struct crypto_alg *alg,
-				 struct crypto_dump_info *info)
-{
-	struct sk_buff *in_skb = info->in_skb;
-	struct sk_buff *skb = info->out_skb;
-	struct nlmsghdr *nlh;
-	struct crypto_user_alg *ualg;
-	int err = 0;
-
-	nlh = nlmsg_put(skb, NETLINK_CB(in_skb).portid, info->nlmsg_seq,
-			CRYPTO_MSG_GETSTAT, sizeof(*ualg), info->nlmsg_flags);
-	if (!nlh) {
-		err = -EMSGSIZE;
-		goto out;
-	}
-
-	ualg = nlmsg_data(nlh);
-
-	err = crypto_reportstat_one(alg, ualg, skb);
-	if (err) {
-		nlmsg_cancel(skb, nlh);
-		goto out;
-	}
-
-	nlmsg_end(skb, nlh);
-
-out:
-	return err;
-}
-
-int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh,
-		      struct nlattr **attrs)
-{
-	struct net *net = sock_net(in_skb->sk);
-	struct crypto_user_alg *p = nlmsg_data(in_nlh);
-	struct crypto_alg *alg;
-	struct sk_buff *skb;
-	struct crypto_dump_info info;
-	int err;
-
-	if (!null_terminated(p->cru_name) || !null_terminated(p->cru_driver_name))
-		return -EINVAL;
-
-	alg = crypto_alg_match(p, 0);
-	if (!alg)
-		return -ENOENT;
-
-	err = -ENOMEM;
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
-	if (!skb)
-		goto drop_alg;
-
-	info.in_skb = in_skb;
-	info.out_skb = skb;
-	info.nlmsg_seq = in_nlh->nlmsg_seq;
-	info.nlmsg_flags = 0;
-
-	err = crypto_reportstat_alg(alg, &info);
-
-drop_alg:
-	crypto_mod_put(alg);
-
-	if (err) {
-		kfree_skb(skb);
-		return err;
-	}
-
-	return nlmsg_unicast(net->crypto_nlsk, skb, NETLINK_CB(in_skb).portid);
-}
-
-MODULE_LICENSE("GPL");
diff --git a/crypto/hash.h b/crypto/hash.h
index 93f6ba0df263..cf9aee07f77d 100644
--- a/crypto/hash.h
+++ b/crypto/hash.h
@@ -8,39 +8,9 @@
 #define _LOCAL_CRYPTO_HASH_H
 
 #include <crypto/internal/hash.h>
-#include <linux/cryptouser.h>
 
 #include "internal.h"
 
-static inline struct crypto_istat_hash *hash_get_stat(
-	struct hash_alg_common *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
-static inline int crypto_hash_report_stat(struct sk_buff *skb,
-					  struct crypto_alg *alg,
-					  const char *type)
-{
-	struct hash_alg_common *halg = __crypto_hash_alg_common(alg);
-	struct crypto_istat_hash *istat = hash_get_stat(halg);
-	struct crypto_stat_hash rhash;
-
-	memset(&rhash, 0, sizeof(rhash));
-
-	strscpy(rhash.type, type, sizeof(rhash.type));
-
-	rhash.stat_hash_cnt = atomic64_read(&istat->hash_cnt);
-	rhash.stat_hash_tlen = atomic64_read(&istat->hash_tlen);
-	rhash.stat_err_cnt = atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_HASH, sizeof(rhash), &rhash);
-}
-
 extern const struct crypto_type crypto_shash_type;
 
 int hash_prepare_alg(struct hash_alg_common *alg);
diff --git a/crypto/kpp.c b/crypto/kpp.c
index 33d44e59387f..ecc63a1a948d 100644
--- a/crypto/kpp.c
+++ b/crypto/kpp.c
@@ -66,29 +66,6 @@ static void crypto_kpp_free_instance(struct crypto_instance *inst)
 	kpp->free(kpp);
 }
 
-static int __maybe_unused crypto_kpp_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct kpp_alg *kpp = __crypto_kpp_alg(alg);
-	struct crypto_istat_kpp *istat;
-	struct crypto_stat_kpp rkpp;
-
-	istat = kpp_get_stat(kpp);
-
-	memset(&rkpp, 0, sizeof(rkpp));
-
-	strscpy(rkpp.type, "kpp", sizeof(rkpp.type));
-
-	rkpp.stat_setsecret_cnt = atomic64_read(&istat->setsecret_cnt);
-	rkpp.stat_generate_public_key_cnt =
-		atomic64_read(&istat->generate_public_key_cnt);
-	rkpp.stat_compute_shared_secret_cnt =
-		atomic64_read(&istat->compute_shared_secret_cnt);
-	rkpp.stat_err_cnt = atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_KPP, sizeof(rkpp), &rkpp);
-}
-
 static const struct crypto_type crypto_kpp_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_kpp_init_tfm,
@@ -98,9 +75,6 @@ static const struct crypto_type crypto_kpp_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_kpp_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_kpp_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
@@ -131,15 +105,11 @@ EXPORT_SYMBOL_GPL(crypto_has_kpp);
 
 static void kpp_prepare_alg(struct kpp_alg *alg)
 {
-	struct crypto_istat_kpp *istat = kpp_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	base->cra_type = &crypto_kpp_type;
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_KPP;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
 }
 
 int crypto_register_kpp(struct kpp_alg *alg)
diff --git a/crypto/lskcipher.c b/crypto/lskcipher.c
index 0f1bd7dcde24..cdb4897c63e6 100644
--- a/crypto/lskcipher.c
+++ b/crypto/lskcipher.c
@@ -29,25 +29,6 @@ static inline struct lskcipher_alg *__crypto_lskcipher_alg(
 	return container_of(alg, struct lskcipher_alg, co.base);
 }
 
-static inline struct crypto_istat_cipher *lskcipher_get_stat(
-	struct lskcipher_alg *alg)
-{
-	return skcipher_get_stat_common(&alg->co);
-}
-
-static inline int crypto_lskcipher_errstat(struct lskcipher_alg *alg, int err)
-{
-	struct crypto_istat_cipher *istat = lskcipher_get_stat(alg);
-
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err)
-		atomic64_inc(&istat->err_cnt);
-
-	return err;
-}
-
 static int lskcipher_setkey_unaligned(struct crypto_lskcipher *tfm,
 				      const u8 *key, unsigned int keylen)
 {
@@ -147,20 +128,13 @@ static int crypto_lskcipher_crypt(struct crypto_lskcipher *tfm, const u8 *src,
 					       u32 flags))
 {
 	unsigned long alignmask = crypto_lskcipher_alignmask(tfm);
-	struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm);
-	int ret;
 
 	if (((unsigned long)src | (unsigned long)dst | (unsigned long)iv) &
-	    alignmask) {
-		ret = crypto_lskcipher_crypt_unaligned(tfm, src, dst, len, iv,
-						       crypt);
-		goto out;
-	}
+	    alignmask)
+		return crypto_lskcipher_crypt_unaligned(tfm, src, dst, len, iv,
+							crypt);
 
-	ret = crypt(tfm, src, dst, len, iv, CRYPTO_LSKCIPHER_FLAG_FINAL);
-
-out:
-	return crypto_lskcipher_errstat(alg, ret);
+	return crypt(tfm, src, dst, len, iv, CRYPTO_LSKCIPHER_FLAG_FINAL);
 }
 
 int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src,
@@ -168,13 +142,6 @@ int crypto_lskcipher_encrypt(struct crypto_lskcipher *tfm, const u8 *src,
 {
 	struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm);
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_cipher *istat = lskcipher_get_stat(alg);
-
-		atomic64_inc(&istat->encrypt_cnt);
-		atomic64_add(len, &istat->encrypt_tlen);
-	}
-
 	return crypto_lskcipher_crypt(tfm, src, dst, len, iv, alg->encrypt);
 }
 EXPORT_SYMBOL_GPL(crypto_lskcipher_encrypt);
@@ -184,13 +151,6 @@ int crypto_lskcipher_decrypt(struct crypto_lskcipher *tfm, const u8 *src,
 {
 	struct lskcipher_alg *alg = crypto_lskcipher_alg(tfm);
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_cipher *istat = lskcipher_get_stat(alg);
-
-		atomic64_inc(&istat->decrypt_cnt);
-		atomic64_add(len, &istat->decrypt_tlen);
-	}
-
 	return crypto_lskcipher_crypt(tfm, src, dst, len, iv, alg->decrypt);
 }
 EXPORT_SYMBOL_GPL(crypto_lskcipher_decrypt);
@@ -320,28 +280,6 @@ static int __maybe_unused crypto_lskcipher_report(
 		       sizeof(rblkcipher), &rblkcipher);
 }
 
-static int __maybe_unused crypto_lskcipher_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct lskcipher_alg *skcipher = __crypto_lskcipher_alg(alg);
-	struct crypto_istat_cipher *istat;
-	struct crypto_stat_cipher rcipher;
-
-	istat = lskcipher_get_stat(skcipher);
-
-	memset(&rcipher, 0, sizeof(rcipher));
-
-	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
-
-	rcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
-	rcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
-	rcipher.stat_decrypt_cnt =  atomic64_read(&istat->decrypt_cnt);
-	rcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
-	rcipher.stat_err_cnt =  atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
-}
-
 static const struct crypto_type crypto_lskcipher_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_lskcipher_init_tfm,
@@ -351,9 +289,6 @@ static const struct crypto_type crypto_lskcipher_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_lskcipher_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_lskcipher_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
diff --git a/crypto/rng.c b/crypto/rng.c
index 279dffdebf59..9d8804e46422 100644
--- a/crypto/rng.c
+++ b/crypto/rng.c
@@ -30,30 +30,24 @@ static int crypto_default_rng_refcnt;
 
 int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen)
 {
-	struct rng_alg *alg = crypto_rng_alg(tfm);
 	u8 *buf = NULL;
 	int err;
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&rng_get_stat(alg)->seed_cnt);
-
 	if (!seed && slen) {
 		buf = kmalloc(slen, GFP_KERNEL);
-		err = -ENOMEM;
 		if (!buf)
-			goto out;
+			return -ENOMEM;
 
 		err = get_random_bytes_wait(buf, slen);
 		if (err)
-			goto free_buf;
+			goto out;
 		seed = buf;
 	}
 
-	err = alg->seed(tfm, seed, slen);
-free_buf:
-	kfree_sensitive(buf);
+	err = crypto_rng_alg(tfm)->seed(tfm, seed, slen);
 out:
-	return crypto_rng_errstat(alg, err);
+	kfree_sensitive(buf);
+	return err;
 }
 EXPORT_SYMBOL_GPL(crypto_rng_reset);
 
@@ -91,27 +85,6 @@ static void crypto_rng_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "seedsize     : %u\n", seedsize(alg));
 }
 
-static int __maybe_unused crypto_rng_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct rng_alg *rng = __crypto_rng_alg(alg);
-	struct crypto_istat_rng *istat;
-	struct crypto_stat_rng rrng;
-
-	istat = rng_get_stat(rng);
-
-	memset(&rrng, 0, sizeof(rrng));
-
-	strscpy(rrng.type, "rng", sizeof(rrng.type));
-
-	rrng.stat_generate_cnt = atomic64_read(&istat->generate_cnt);
-	rrng.stat_generate_tlen = atomic64_read(&istat->generate_tlen);
-	rrng.stat_seed_cnt = atomic64_read(&istat->seed_cnt);
-	rrng.stat_err_cnt = atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_RNG, sizeof(rrng), &rrng);
-}
-
 static const struct crypto_type crypto_rng_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_rng_init_tfm,
@@ -120,9 +93,6 @@ static const struct crypto_type crypto_rng_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_rng_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_rng_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
@@ -199,7 +169,6 @@ EXPORT_SYMBOL_GPL(crypto_del_default_rng);
 
 int crypto_register_rng(struct rng_alg *alg)
 {
-	struct crypto_istat_rng *istat = rng_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (alg->seedsize > PAGE_SIZE / 8)
@@ -209,9 +178,6 @@ int crypto_register_rng(struct rng_alg *alg)
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 	base->cra_flags |= CRYPTO_ALG_TYPE_RNG;
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
-
 	return crypto_register_alg(base);
 }
 EXPORT_SYMBOL_GPL(crypto_register_rng);
diff --git a/crypto/scompress.c b/crypto/scompress.c
index 60bbb7ea4060..1cef6bb06a81 100644
--- a/crypto/scompress.c
+++ b/crypto/scompress.c
@@ -270,9 +270,6 @@ static const struct crypto_type crypto_scomp_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_scomp_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_acomp_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
diff --git a/crypto/shash.c b/crypto/shash.c
index c3f7f6a25280..301ab42bf849 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -16,18 +16,6 @@
 
 #include "hash.h"
 
-static inline struct crypto_istat_hash *shash_get_stat(struct shash_alg *alg)
-{
-	return hash_get_stat(&alg->halg);
-}
-
-static inline int crypto_shash_errstat(struct shash_alg *alg, int err)
-{
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS) && err)
-		atomic64_inc(&shash_get_stat(alg)->err_cnt);
-	return err;
-}
-
 int shash_no_setkey(struct crypto_shash *tfm, const u8 *key,
 		    unsigned int keylen)
 {
@@ -61,29 +49,13 @@ EXPORT_SYMBOL_GPL(crypto_shash_setkey);
 int crypto_shash_update(struct shash_desc *desc, const u8 *data,
 			unsigned int len)
 {
-	struct shash_alg *shash = crypto_shash_alg(desc->tfm);
-	int err;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_add(len, &shash_get_stat(shash)->hash_tlen);
-
-	err = shash->update(desc, data, len);
-
-	return crypto_shash_errstat(shash, err);
+	return crypto_shash_alg(desc->tfm)->update(desc, data, len);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_update);
 
 int crypto_shash_final(struct shash_desc *desc, u8 *out)
 {
-	struct shash_alg *shash = crypto_shash_alg(desc->tfm);
-	int err;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&shash_get_stat(shash)->hash_cnt);
-
-	err = shash->final(desc, out);
-
-	return crypto_shash_errstat(shash, err);
+	return crypto_shash_alg(desc->tfm)->final(desc, out);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_final);
 
@@ -99,20 +71,7 @@ static int shash_default_finup(struct shash_desc *desc, const u8 *data,
 int crypto_shash_finup(struct shash_desc *desc, const u8 *data,
 		       unsigned int len, u8 *out)
 {
-	struct crypto_shash *tfm = desc->tfm;
-	struct shash_alg *shash = crypto_shash_alg(tfm);
-	int err;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_hash *istat = shash_get_stat(shash);
-
-		atomic64_inc(&istat->hash_cnt);
-		atomic64_add(len, &istat->hash_tlen);
-	}
-
-	err = shash->finup(desc, data, len, out);
-
-	return crypto_shash_errstat(shash, err);
+	return crypto_shash_alg(desc->tfm)->finup(desc, data, len, out);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_finup);
 
@@ -129,22 +88,11 @@ int crypto_shash_digest(struct shash_desc *desc, const u8 *data,
 			unsigned int len, u8 *out)
 {
 	struct crypto_shash *tfm = desc->tfm;
-	struct shash_alg *shash = crypto_shash_alg(tfm);
-	int err;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_hash *istat = shash_get_stat(shash);
-
-		atomic64_inc(&istat->hash_cnt);
-		atomic64_add(len, &istat->hash_tlen);
-	}
 
 	if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		err = -ENOKEY;
-	else
-		err = shash->digest(desc, data, len, out);
+		return -ENOKEY;
 
-	return crypto_shash_errstat(shash, err);
+	return crypto_shash_alg(tfm)->digest(desc, data, len, out);
 }
 EXPORT_SYMBOL_GPL(crypto_shash_digest);
 
@@ -265,12 +213,6 @@ static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg)
 	seq_printf(m, "digestsize   : %u\n", salg->digestsize);
 }
 
-static int __maybe_unused crypto_shash_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	return crypto_hash_report_stat(skb, alg, "shash");
-}
-
 const struct crypto_type crypto_shash_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_shash_init_tfm,
@@ -280,9 +222,6 @@ const struct crypto_type crypto_shash_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_shash_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_shash_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_MASK,
@@ -350,7 +289,6 @@ EXPORT_SYMBOL_GPL(crypto_clone_shash);
 
 int hash_prepare_alg(struct hash_alg_common *alg)
 {
-	struct crypto_istat_hash *istat = hash_get_stat(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (alg->digestsize > HASH_MAX_DIGESTSIZE)
@@ -362,9 +300,6 @@ int hash_prepare_alg(struct hash_alg_common *alg)
 
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
-
 	return 0;
 }
 
diff --git a/crypto/sig.c b/crypto/sig.c
index 224c47019297..7645bedf3a1f 100644
--- a/crypto/sig.c
+++ b/crypto/sig.c
@@ -45,16 +45,6 @@ static int __maybe_unused crypto_sig_report(struct sk_buff *skb,
 	return nla_put(skb, CRYPTOCFGA_REPORT_AKCIPHER, sizeof(rsig), &rsig);
 }
 
-static int __maybe_unused crypto_sig_report_stat(struct sk_buff *skb,
-						 struct crypto_alg *alg)
-{
-	struct crypto_stat_akcipher rsig = {};
-
-	strscpy(rsig.type, "sig", sizeof(rsig.type));
-
-	return nla_put(skb, CRYPTOCFGA_STAT_AKCIPHER, sizeof(rsig), &rsig);
-}
-
 static const struct crypto_type crypto_sig_type = {
 	.extsize = crypto_alg_extsize,
 	.init_tfm = crypto_sig_init_tfm,
@@ -63,9 +53,6 @@ static const struct crypto_type crypto_sig_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_sig_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_sig_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_SIG_MASK,
diff --git a/crypto/skcipher.c b/crypto/skcipher.c
index bc70e159d27d..ceed7f33a67b 100644
--- a/crypto/skcipher.c
+++ b/crypto/skcipher.c
@@ -89,25 +89,6 @@ static inline struct skcipher_alg *__crypto_skcipher_alg(
 	return container_of(alg, struct skcipher_alg, base);
 }
 
-static inline struct crypto_istat_cipher *skcipher_get_stat(
-	struct skcipher_alg *alg)
-{
-	return skcipher_get_stat_common(&alg->co);
-}
-
-static inline int crypto_skcipher_errstat(struct skcipher_alg *alg, int err)
-{
-	struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
-
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&istat->err_cnt);
-
-	return err;
-}
-
 static int skcipher_done_slow(struct skcipher_walk *walk, unsigned int bsize)
 {
 	u8 *addr;
@@ -654,23 +635,12 @@ int crypto_skcipher_encrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
-	int ret;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
-
-		atomic64_inc(&istat->encrypt_cnt);
-		atomic64_add(req->cryptlen, &istat->encrypt_tlen);
-	}
 
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		ret = -ENOKEY;
-	else if (alg->co.base.cra_type != &crypto_skcipher_type)
-		ret = crypto_lskcipher_encrypt_sg(req);
-	else
-		ret = alg->encrypt(req);
-
-	return crypto_skcipher_errstat(alg, ret);
+		return -ENOKEY;
+	if (alg->co.base.cra_type != &crypto_skcipher_type)
+		return crypto_lskcipher_encrypt_sg(req);
+	return alg->encrypt(req);
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_encrypt);
 
@@ -678,23 +648,12 @@ int crypto_skcipher_decrypt(struct skcipher_request *req)
 {
 	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
 	struct skcipher_alg *alg = crypto_skcipher_alg(tfm);
-	int ret;
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_cipher *istat = skcipher_get_stat(alg);
-
-		atomic64_inc(&istat->decrypt_cnt);
-		atomic64_add(req->cryptlen, &istat->decrypt_tlen);
-	}
 
 	if (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
-		ret = -ENOKEY;
-	else if (alg->co.base.cra_type != &crypto_skcipher_type)
-		ret = crypto_lskcipher_decrypt_sg(req);
-	else
-		ret = alg->decrypt(req);
-
-	return crypto_skcipher_errstat(alg, ret);
+		return -ENOKEY;
+	if (alg->co.base.cra_type != &crypto_skcipher_type)
+		return crypto_lskcipher_decrypt_sg(req);
+	return alg->decrypt(req);
 }
 EXPORT_SYMBOL_GPL(crypto_skcipher_decrypt);
 
@@ -846,28 +805,6 @@ static int __maybe_unused crypto_skcipher_report(
 		       sizeof(rblkcipher), &rblkcipher);
 }
 
-static int __maybe_unused crypto_skcipher_report_stat(
-	struct sk_buff *skb, struct crypto_alg *alg)
-{
-	struct skcipher_alg *skcipher = __crypto_skcipher_alg(alg);
-	struct crypto_istat_cipher *istat;
-	struct crypto_stat_cipher rcipher;
-
-	istat = skcipher_get_stat(skcipher);
-
-	memset(&rcipher, 0, sizeof(rcipher));
-
-	strscpy(rcipher.type, "cipher", sizeof(rcipher.type));
-
-	rcipher.stat_encrypt_cnt = atomic64_read(&istat->encrypt_cnt);
-	rcipher.stat_encrypt_tlen = atomic64_read(&istat->encrypt_tlen);
-	rcipher.stat_decrypt_cnt =  atomic64_read(&istat->decrypt_cnt);
-	rcipher.stat_decrypt_tlen = atomic64_read(&istat->decrypt_tlen);
-	rcipher.stat_err_cnt =  atomic64_read(&istat->err_cnt);
-
-	return nla_put(skb, CRYPTOCFGA_STAT_CIPHER, sizeof(rcipher), &rcipher);
-}
-
 static const struct crypto_type crypto_skcipher_type = {
 	.extsize = crypto_skcipher_extsize,
 	.init_tfm = crypto_skcipher_init_tfm,
@@ -877,9 +814,6 @@ static const struct crypto_type crypto_skcipher_type = {
 #endif
 #if IS_ENABLED(CONFIG_CRYPTO_USER)
 	.report = crypto_skcipher_report,
-#endif
-#ifdef CONFIG_CRYPTO_STATS
-	.report_stat = crypto_skcipher_report_stat,
 #endif
 	.maskclear = ~CRYPTO_ALG_TYPE_MASK,
 	.maskset = CRYPTO_ALG_TYPE_SKCIPHER_MASK,
@@ -935,7 +869,6 @@ EXPORT_SYMBOL_GPL(crypto_has_skcipher);
 
 int skcipher_prepare_alg_common(struct skcipher_alg_common *alg)
 {
-	struct crypto_istat_cipher *istat = skcipher_get_stat_common(alg);
 	struct crypto_alg *base = &alg->base;
 
 	if (alg->ivsize > PAGE_SIZE / 8 || alg->chunksize > PAGE_SIZE / 8 ||
@@ -948,9 +881,6 @@ int skcipher_prepare_alg_common(struct skcipher_alg_common *alg)
 
 	base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK;
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		memset(istat, 0, sizeof(*istat));
-
 	return 0;
 }
 
diff --git a/crypto/skcipher.h b/crypto/skcipher.h
index 16c9484360da..703651367dd8 100644
--- a/crypto/skcipher.h
+++ b/crypto/skcipher.h
@@ -10,16 +10,6 @@
 #include <crypto/internal/skcipher.h>
 #include "internal.h"
 
-static inline struct crypto_istat_cipher *skcipher_get_stat_common(
-	struct skcipher_alg_common *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
 int crypto_lskcipher_encrypt_sg(struct skcipher_request *req);
 int crypto_lskcipher_decrypt_sg(struct skcipher_request *req);
 int crypto_init_lskcipher_ops_sg(struct crypto_tfm *tfm);
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 80e243611fe2..54937b615239 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -56,31 +56,7 @@ struct crypto_acomp {
 	struct crypto_tfm base;
 };
 
-/*
- * struct crypto_istat_compress - statistics for compress algorithm
- * @compress_cnt:	number of compress requests
- * @compress_tlen:	total data size handled by compress requests
- * @decompress_cnt:	number of decompress requests
- * @decompress_tlen:	total data size handled by decompress requests
- * @err_cnt:		number of error for compress requests
- */
-struct crypto_istat_compress {
-	atomic64_t compress_cnt;
-	atomic64_t compress_tlen;
-	atomic64_t decompress_cnt;
-	atomic64_t decompress_tlen;
-	atomic64_t err_cnt;
-};
-
-#ifdef CONFIG_CRYPTO_STATS
-#define COMP_ALG_COMMON_STATS struct crypto_istat_compress stat;
-#else
-#define COMP_ALG_COMMON_STATS
-#endif
-
 #define COMP_ALG_COMMON {			\
-	COMP_ALG_COMMON_STATS			\
-						\
 	struct crypto_alg base;			\
 }
 struct comp_alg_common COMP_ALG_COMMON;
@@ -261,27 +237,6 @@ static inline void acomp_request_set_params(struct acomp_req *req,
 		req->flags |= CRYPTO_ACOMP_ALLOC_OUTPUT;
 }
 
-static inline struct crypto_istat_compress *comp_get_stat(
-	struct comp_alg_common *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
-static inline int crypto_comp_errstat(struct comp_alg_common *alg, int err)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&comp_get_stat(alg)->err_cnt);
-
-	return err;
-}
-
 /**
  * crypto_acomp_compress() -- Invoke asynchronous compress operation
  *
@@ -293,19 +248,7 @@ static inline int crypto_comp_errstat(struct comp_alg_common *alg, int err)
  */
 static inline int crypto_acomp_compress(struct acomp_req *req)
 {
-	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-	struct comp_alg_common *alg;
-
-	alg = crypto_comp_alg_common(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_compress *istat = comp_get_stat(alg);
-
-		atomic64_inc(&istat->compress_cnt);
-		atomic64_add(req->slen, &istat->compress_tlen);
-	}
-
-	return crypto_comp_errstat(alg, tfm->compress(req));
+	return crypto_acomp_reqtfm(req)->compress(req);
 }
 
 /**
@@ -319,19 +262,7 @@ static inline int crypto_acomp_compress(struct acomp_req *req)
  */
 static inline int crypto_acomp_decompress(struct acomp_req *req)
 {
-	struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
-	struct comp_alg_common *alg;
-
-	alg = crypto_comp_alg_common(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_compress *istat = comp_get_stat(alg);
-
-		atomic64_inc(&istat->decompress_cnt);
-		atomic64_add(req->slen, &istat->decompress_tlen);
-	}
-
-	return crypto_comp_errstat(alg, tfm->decompress(req));
+	return crypto_acomp_reqtfm(req)->decompress(req);
 }
 
 #endif
diff --git a/include/crypto/aead.h b/include/crypto/aead.h
index 51382befbe37..0e8a41638678 100644
--- a/include/crypto/aead.h
+++ b/include/crypto/aead.h
@@ -101,22 +101,6 @@ struct aead_request {
 	void *__ctx[] CRYPTO_MINALIGN_ATTR;
 };
 
-/*
- * struct crypto_istat_aead - statistics for AEAD algorithm
- * @encrypt_cnt:	number of encrypt requests
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @err_cnt:		number of error for AEAD requests
- */
-struct crypto_istat_aead {
-	atomic64_t encrypt_cnt;
-	atomic64_t encrypt_tlen;
-	atomic64_t decrypt_cnt;
-	atomic64_t decrypt_tlen;
-	atomic64_t err_cnt;
-};
-
 /**
  * struct aead_alg - AEAD cipher definition
  * @maxauthsize: Set the maximum authentication tag size supported by the
@@ -135,7 +119,6 @@ struct crypto_istat_aead {
  * @setkey: see struct skcipher_alg
  * @encrypt: see struct skcipher_alg
  * @decrypt: see struct skcipher_alg
- * @stat: statistics for AEAD algorithm
  * @ivsize: see struct skcipher_alg
  * @chunksize: see struct skcipher_alg
  * @init: Initialize the cryptographic transformation object. This function
@@ -162,10 +145,6 @@ struct aead_alg {
 	int (*init)(struct crypto_aead *tfm);
 	void (*exit)(struct crypto_aead *tfm);
 
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_istat_aead stat;
-#endif
-
 	unsigned int ivsize;
 	unsigned int maxauthsize;
 	unsigned int chunksize;
diff --git a/include/crypto/akcipher.h b/include/crypto/akcipher.h
index 31c111bebb68..18a10cad07aa 100644
--- a/include/crypto/akcipher.h
+++ b/include/crypto/akcipher.h
@@ -54,26 +54,6 @@ struct crypto_akcipher {
 	struct crypto_tfm base;
 };
 
-/*
- * struct crypto_istat_akcipher - statistics for akcipher algorithm
- * @encrypt_cnt:	number of encrypt requests
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @verify_cnt:		number of verify operation
- * @sign_cnt:		number of sign requests
- * @err_cnt:		number of error for akcipher requests
- */
-struct crypto_istat_akcipher {
-	atomic64_t encrypt_cnt;
-	atomic64_t encrypt_tlen;
-	atomic64_t decrypt_cnt;
-	atomic64_t decrypt_tlen;
-	atomic64_t verify_cnt;
-	atomic64_t sign_cnt;
-	atomic64_t err_cnt;
-};
-
 /**
  * struct akcipher_alg - generic public key algorithm
  *
@@ -110,7 +90,6 @@ struct crypto_istat_akcipher {
  * @exit:	Deinitialize the cryptographic transformation object. This is a
  *		counterpart to @init, used to remove various changes set in
  *		@init.
- * @stat:	Statistics for akcipher algorithm
  *
  * @base:	Common crypto API algorithm data structure
  */
@@ -127,10 +106,6 @@ struct akcipher_alg {
 	int (*init)(struct crypto_akcipher *tfm);
 	void (*exit)(struct crypto_akcipher *tfm);
 
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_istat_akcipher stat;
-#endif
-
 	struct crypto_alg base;
 };
 
@@ -302,27 +277,6 @@ static inline unsigned int crypto_akcipher_maxsize(struct crypto_akcipher *tfm)
 	return alg->max_size(tfm);
 }
 
-static inline struct crypto_istat_akcipher *akcipher_get_stat(
-	struct akcipher_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
-static inline int crypto_akcipher_errstat(struct akcipher_alg *alg, int err)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&akcipher_get_stat(alg)->err_cnt);
-
-	return err;
-}
-
 /**
  * crypto_akcipher_encrypt() - Invoke public key encrypt operation
  *
@@ -336,16 +290,8 @@ static inline int crypto_akcipher_errstat(struct akcipher_alg *alg, int err)
 static inline int crypto_akcipher_encrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_akcipher *istat = akcipher_get_stat(alg);
-
-		atomic64_inc(&istat->encrypt_cnt);
-		atomic64_add(req->src_len, &istat->encrypt_tlen);
-	}
-
-	return crypto_akcipher_errstat(alg, alg->encrypt(req));
+	return crypto_akcipher_alg(tfm)->encrypt(req);
 }
 
 /**
@@ -361,16 +307,8 @@ static inline int crypto_akcipher_encrypt(struct akcipher_request *req)
 static inline int crypto_akcipher_decrypt(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_akcipher *istat = akcipher_get_stat(alg);
-
-		atomic64_inc(&istat->decrypt_cnt);
-		atomic64_add(req->src_len, &istat->decrypt_tlen);
-	}
-
-	return crypto_akcipher_errstat(alg, alg->decrypt(req));
+	return crypto_akcipher_alg(tfm)->decrypt(req);
 }
 
 /**
@@ -422,12 +360,8 @@ int crypto_akcipher_sync_decrypt(struct crypto_akcipher *tfm,
 static inline int crypto_akcipher_sign(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
 
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&akcipher_get_stat(alg)->sign_cnt);
-
-	return crypto_akcipher_errstat(alg, alg->sign(req));
+	return crypto_akcipher_alg(tfm)->sign(req);
 }
 
 /**
@@ -447,12 +381,8 @@ static inline int crypto_akcipher_sign(struct akcipher_request *req)
 static inline int crypto_akcipher_verify(struct akcipher_request *req)
 {
 	struct crypto_akcipher *tfm = crypto_akcipher_reqtfm(req);
-	struct akcipher_alg *alg = crypto_akcipher_alg(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&akcipher_get_stat(alg)->verify_cnt);
 
-	return crypto_akcipher_errstat(alg, alg->verify(req));
+	return crypto_akcipher_alg(tfm)->verify(req);
 }
 
 /**
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 7a4a71af653f..156de41ca760 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -61,9 +61,6 @@ struct crypto_type {
 	void (*show)(struct seq_file *m, struct crypto_alg *alg);
 	int (*report)(struct sk_buff *skb, struct crypto_alg *alg);
 	void (*free)(struct crypto_instance *inst);
-#ifdef CONFIG_CRYPTO_STATS
-	int (*report_stat)(struct sk_buff *skb, struct crypto_alg *alg);
-#endif
 
 	unsigned int type;
 	unsigned int maskclear;
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 5d61f576cfc8..0014bdd81ab7 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -23,27 +23,8 @@ struct crypto_ahash;
  * crypto_unregister_shash().
  */
 
-/*
- * struct crypto_istat_hash - statistics for has algorithm
- * @hash_cnt:		number of hash requests
- * @hash_tlen:		total data size hashed
- * @err_cnt:		number of error for hash requests
- */
-struct crypto_istat_hash {
-	atomic64_t hash_cnt;
-	atomic64_t hash_tlen;
-	atomic64_t err_cnt;
-};
-
-#ifdef CONFIG_CRYPTO_STATS
-#define HASH_ALG_COMMON_STAT struct crypto_istat_hash stat;
-#else
-#define HASH_ALG_COMMON_STAT
-#endif
-
 /*
  * struct hash_alg_common - define properties of message digest
- * @stat: Statistics for hash algorithm.
  * @digestsize: Size of the result of the transformation. A buffer of this size
  *	        must be available to the @final and @finup calls, so they can
  *	        store the resulting hash into it. For various predefined sizes,
@@ -60,8 +41,6 @@ struct crypto_istat_hash {
  *	  information.
  */
 #define HASH_ALG_COMMON {		\
-	HASH_ALG_COMMON_STAT		\
-					\
 	unsigned int digestsize;	\
 	unsigned int statesize;		\
 					\
@@ -243,7 +222,6 @@ struct shash_alg {
 	};
 };
 #undef HASH_ALG_COMMON
-#undef HASH_ALG_COMMON_STAT
 
 struct crypto_ahash {
 	bool using_shash; /* Underlying algorithm is shash, not ahash */
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 4ac46bafba9d..d00392d1988e 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -31,7 +31,6 @@
  *		@init.
  *
  * @reqsize:	Context size for (de)compression requests
- * @stat:	Statistics for compress algorithm
  * @base:	Common crypto API algorithm data structure
  * @calg:	Cmonn algorithm data structure shared with scomp
  */
diff --git a/include/crypto/internal/cryptouser.h b/include/crypto/internal/cryptouser.h
deleted file mode 100644
index fd54074332f5..000000000000
--- a/include/crypto/internal/cryptouser.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#include <linux/cryptouser.h>
-#include <net/netlink.h>
-
-struct crypto_alg *crypto_alg_match(struct crypto_user_alg *p, int exact);
-
-#ifdef CONFIG_CRYPTO_STATS
-int crypto_reportstat(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, struct nlattr **attrs);
-#else
-static inline int crypto_reportstat(struct sk_buff *in_skb,
-				    struct nlmsghdr *in_nlh,
-				    struct nlattr **attrs)
-{
-	return -ENOTSUPP;
-}
-#endif
diff --git a/include/crypto/internal/scompress.h b/include/crypto/internal/scompress.h
index 858fe3965ae3..07a10fd2d321 100644
--- a/include/crypto/internal/scompress.h
+++ b/include/crypto/internal/scompress.h
@@ -27,7 +27,6 @@ struct crypto_scomp {
  * @free_ctx:	Function frees context allocated with alloc_ctx
  * @compress:	Function performs a compress operation
  * @decompress:	Function performs a de-compress operation
- * @stat:	Statistics for compress algorithm
  * @base:	Common crypto API algorithm data structure
  * @calg:	Cmonn algorithm data structure shared with acomp
  */
diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h
index 1988e24a0d1d..2d9c4de57b69 100644
--- a/include/crypto/kpp.h
+++ b/include/crypto/kpp.h
@@ -51,20 +51,6 @@ struct crypto_kpp {
 	struct crypto_tfm base;
 };
 
-/*
- * struct crypto_istat_kpp - statistics for KPP algorithm
- * @setsecret_cnt:		number of setsecrey operation
- * @generate_public_key_cnt:	number of generate_public_key operation
- * @compute_shared_secret_cnt:	number of compute_shared_secret operation
- * @err_cnt:			number of error for KPP requests
- */
-struct crypto_istat_kpp {
-	atomic64_t setsecret_cnt;
-	atomic64_t generate_public_key_cnt;
-	atomic64_t compute_shared_secret_cnt;
-	atomic64_t err_cnt;
-};
-
 /**
  * struct kpp_alg - generic key-agreement protocol primitives
  *
@@ -87,7 +73,6 @@ struct crypto_istat_kpp {
  * @exit:		Undo everything @init did.
  *
  * @base:		Common crypto API algorithm data structure
- * @stat:		Statistics for KPP algorithm
  */
 struct kpp_alg {
 	int (*set_secret)(struct crypto_kpp *tfm, const void *buffer,
@@ -100,10 +85,6 @@ struct kpp_alg {
 	int (*init)(struct crypto_kpp *tfm);
 	void (*exit)(struct crypto_kpp *tfm);
 
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_istat_kpp stat;
-#endif
-
 	struct crypto_alg base;
 };
 
@@ -291,26 +272,6 @@ struct kpp_secret {
 	unsigned short len;
 };
 
-static inline struct crypto_istat_kpp *kpp_get_stat(struct kpp_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
-static inline int crypto_kpp_errstat(struct kpp_alg *alg, int err)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&kpp_get_stat(alg)->err_cnt);
-
-	return err;
-}
-
 /**
  * crypto_kpp_set_secret() - Invoke kpp operation
  *
@@ -329,12 +290,7 @@ static inline int crypto_kpp_errstat(struct kpp_alg *alg, int err)
 static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm,
 					const void *buffer, unsigned int len)
 {
-	struct kpp_alg *alg = crypto_kpp_alg(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&kpp_get_stat(alg)->setsecret_cnt);
-
-	return crypto_kpp_errstat(alg, alg->set_secret(tfm, buffer, len));
+	return crypto_kpp_alg(tfm)->set_secret(tfm, buffer, len);
 }
 
 /**
@@ -353,12 +309,8 @@ static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm,
 static inline int crypto_kpp_generate_public_key(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-	struct kpp_alg *alg = crypto_kpp_alg(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&kpp_get_stat(alg)->generate_public_key_cnt);
 
-	return crypto_kpp_errstat(alg, alg->generate_public_key(req));
+	return crypto_kpp_alg(tfm)->generate_public_key(req);
 }
 
 /**
@@ -374,12 +326,8 @@ static inline int crypto_kpp_generate_public_key(struct kpp_request *req)
 static inline int crypto_kpp_compute_shared_secret(struct kpp_request *req)
 {
 	struct crypto_kpp *tfm = crypto_kpp_reqtfm(req);
-	struct kpp_alg *alg = crypto_kpp_alg(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS))
-		atomic64_inc(&kpp_get_stat(alg)->compute_shared_secret_cnt);
 
-	return crypto_kpp_errstat(alg, alg->compute_shared_secret(req));
+	return crypto_kpp_alg(tfm)->compute_shared_secret(req);
 }
 
 /**
diff --git a/include/crypto/rng.h b/include/crypto/rng.h
index 6abe5102e5fb..5ac4388f50e1 100644
--- a/include/crypto/rng.h
+++ b/include/crypto/rng.h
@@ -15,20 +15,6 @@
 
 struct crypto_rng;
 
-/*
- * struct crypto_istat_rng: statistics for RNG algorithm
- * @generate_cnt:	number of RNG generate requests
- * @generate_tlen:	total data size of generated data by the RNG
- * @seed_cnt:		number of times the RNG was seeded
- * @err_cnt:		number of error for RNG requests
- */
-struct crypto_istat_rng {
-	atomic64_t generate_cnt;
-	atomic64_t generate_tlen;
-	atomic64_t seed_cnt;
-	atomic64_t err_cnt;
-};
-
 /**
  * struct rng_alg - random number generator definition
  *
@@ -46,7 +32,6 @@ struct crypto_istat_rng {
  *		size of the seed is defined with @seedsize .
  * @set_ent:	Set entropy that would otherwise be obtained from
  *		entropy source.  Internal use only.
- * @stat:	Statistics for rng algorithm
  * @seedsize:	The seed size required for a random number generator
  *		initialization defined with this variable. Some
  *		random number generators does not require a seed
@@ -63,10 +48,6 @@ struct rng_alg {
 	void (*set_ent)(struct crypto_rng *tfm, const u8 *data,
 			unsigned int len);
 
-#ifdef CONFIG_CRYPTO_STATS
-	struct crypto_istat_rng stat;
-#endif
-
 	unsigned int seedsize;
 
 	struct crypto_alg base;
@@ -144,26 +125,6 @@ static inline void crypto_free_rng(struct crypto_rng *tfm)
 	crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm));
 }
 
-static inline struct crypto_istat_rng *rng_get_stat(struct rng_alg *alg)
-{
-#ifdef CONFIG_CRYPTO_STATS
-	return &alg->stat;
-#else
-	return NULL;
-#endif
-}
-
-static inline int crypto_rng_errstat(struct rng_alg *alg, int err)
-{
-	if (!IS_ENABLED(CONFIG_CRYPTO_STATS))
-		return err;
-
-	if (err && err != -EINPROGRESS && err != -EBUSY)
-		atomic64_inc(&rng_get_stat(alg)->err_cnt);
-
-	return err;
-}
-
 /**
  * crypto_rng_generate() - get random number
  * @tfm: cipher handle
@@ -182,17 +143,7 @@ static inline int crypto_rng_generate(struct crypto_rng *tfm,
 				      const u8 *src, unsigned int slen,
 				      u8 *dst, unsigned int dlen)
 {
-	struct rng_alg *alg = crypto_rng_alg(tfm);
-
-	if (IS_ENABLED(CONFIG_CRYPTO_STATS)) {
-		struct crypto_istat_rng *istat = rng_get_stat(alg);
-
-		atomic64_inc(&istat->generate_cnt);
-		atomic64_add(dlen, &istat->generate_tlen);
-	}
-
-	return crypto_rng_errstat(alg,
-				  alg->generate(tfm, src, slen, dst, dlen));
+	return crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen);
 }
 
 /**
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index c8857d7bdb37..74d47e23374e 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -64,28 +64,6 @@ struct crypto_lskcipher {
 	struct crypto_tfm base;
 };
 
-/*
- * struct crypto_istat_cipher - statistics for cipher algorithm
- * @encrypt_cnt:	number of encrypt requests
- * @encrypt_tlen:	total data size handled by encrypt requests
- * @decrypt_cnt:	number of decrypt requests
- * @decrypt_tlen:	total data size handled by decrypt requests
- * @err_cnt:		number of error for cipher requests
- */
-struct crypto_istat_cipher {
-	atomic64_t encrypt_cnt;
-	atomic64_t encrypt_tlen;
-	atomic64_t decrypt_cnt;
-	atomic64_t decrypt_tlen;
-	atomic64_t err_cnt;
-};
-
-#ifdef CONFIG_CRYPTO_STATS
-#define SKCIPHER_ALG_COMMON_STAT struct crypto_istat_cipher stat;
-#else
-#define SKCIPHER_ALG_COMMON_STAT
-#endif
-
 /*
  * struct skcipher_alg_common - common properties of skcipher_alg
  * @min_keysize: Minimum key size supported by the transformation. This is the
@@ -103,7 +81,6 @@ struct crypto_istat_cipher {
  * @chunksize: Equal to the block size except for stream ciphers such as
  *	       CTR where it is set to the underlying block size.
  * @statesize: Size of the internal state for the algorithm.
- * @stat: Statistics for cipher algorithm
  * @base: Definition of a generic crypto algorithm.
  */
 #define SKCIPHER_ALG_COMMON {		\
@@ -113,8 +90,6 @@ struct crypto_istat_cipher {
 	unsigned int chunksize;		\
 	unsigned int statesize;		\
 					\
-	SKCIPHER_ALG_COMMON_STAT	\
-					\
 	struct crypto_alg base;		\
 }
 struct skcipher_alg_common SKCIPHER_ALG_COMMON;
diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h
index 5730c67f0617..20a6c0fc149e 100644
--- a/include/uapi/linux/cryptouser.h
+++ b/include/uapi/linux/cryptouser.h
@@ -32,7 +32,7 @@ enum {
 	CRYPTO_MSG_UPDATEALG,
 	CRYPTO_MSG_GETALG,
 	CRYPTO_MSG_DELRNG,
-	CRYPTO_MSG_GETSTAT,
+	CRYPTO_MSG_GETSTAT, /* No longer supported, do not use. */
 	__CRYPTO_MSG_MAX
 };
 #define CRYPTO_MSG_MAX (__CRYPTO_MSG_MAX - 1)
@@ -54,16 +54,16 @@ enum crypto_attr_type_t {
 	CRYPTOCFGA_REPORT_AKCIPHER,	/* struct crypto_report_akcipher */
 	CRYPTOCFGA_REPORT_KPP,		/* struct crypto_report_kpp */
 	CRYPTOCFGA_REPORT_ACOMP,	/* struct crypto_report_acomp */
-	CRYPTOCFGA_STAT_LARVAL,		/* struct crypto_stat */
-	CRYPTOCFGA_STAT_HASH,		/* struct crypto_stat */
-	CRYPTOCFGA_STAT_BLKCIPHER,	/* struct crypto_stat */
-	CRYPTOCFGA_STAT_AEAD,		/* struct crypto_stat */
-	CRYPTOCFGA_STAT_COMPRESS,	/* struct crypto_stat */
-	CRYPTOCFGA_STAT_RNG,		/* struct crypto_stat */
-	CRYPTOCFGA_STAT_CIPHER,		/* struct crypto_stat */
-	CRYPTOCFGA_STAT_AKCIPHER,	/* struct crypto_stat */
-	CRYPTOCFGA_STAT_KPP,		/* struct crypto_stat */
-	CRYPTOCFGA_STAT_ACOMP,		/* struct crypto_stat */
+	CRYPTOCFGA_STAT_LARVAL,		/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_HASH,		/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_BLKCIPHER,	/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_AEAD,		/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_COMPRESS,	/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_RNG,		/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_CIPHER,		/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_AKCIPHER,	/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_KPP,		/* No longer supported, do not use. */
+	CRYPTOCFGA_STAT_ACOMP,		/* No longer supported, do not use. */
 	__CRYPTOCFGA_MAX
 
 #define CRYPTOCFGA_MAX (__CRYPTOCFGA_MAX - 1)
@@ -79,6 +79,7 @@ struct crypto_user_alg {
 	__u32 cru_flags;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_aead {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_encrypt_cnt;
@@ -88,6 +89,7 @@ struct crypto_stat_aead {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_akcipher {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_encrypt_cnt;
@@ -99,6 +101,7 @@ struct crypto_stat_akcipher {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_cipher {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_encrypt_cnt;
@@ -108,6 +111,7 @@ struct crypto_stat_cipher {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_compress {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_compress_cnt;
@@ -117,6 +121,7 @@ struct crypto_stat_compress {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_hash {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_hash_cnt;
@@ -124,6 +129,7 @@ struct crypto_stat_hash {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_kpp {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_setsecret_cnt;
@@ -132,6 +138,7 @@ struct crypto_stat_kpp {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_rng {
 	char type[CRYPTO_MAX_NAME];
 	__u64 stat_generate_cnt;
@@ -140,6 +147,7 @@ struct crypto_stat_rng {
 	__u64 stat_err_cnt;
 };
 
+/* No longer supported, do not use. */
 struct crypto_stat_larval {
 	char type[CRYPTO_MAX_NAME];
 };
-- 
cgit v1.2.3


From 58169ec9c40309541509181e068177eab72e6caa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 14:49:31 +0000
Subject: inet: preserve const qualifier in inet_csk()

We can change inet_csk() to propagate its argument const qualifier,
thanks to container_of_const().

We have to fix few places that had mistakes, like tcp_bound_rto().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240329144931.295800-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/espintcp.h             | 2 +-
 include/net/inet_connection_sock.h | 5 +----
 include/net/tcp.h                  | 2 +-
 include/net/tls.h                  | 2 +-
 net/ipv4/tcp_input.c               | 2 +-
 net/ipv4/tcp_timer.c               | 4 ++--
 net/mptcp/protocol.h               | 2 +-
 7 files changed, 8 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/espintcp.h b/include/net/espintcp.h
index 0335bbd76552..c70efd704b6d 100644
--- a/include/net/espintcp.h
+++ b/include/net/espintcp.h
@@ -32,7 +32,7 @@ struct espintcp_ctx {
 
 static inline struct espintcp_ctx *espintcp_getctx(const struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	/* RCU is only needed for diag */
 	return (__force void *)icsk->icsk_ulp_data;
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index ccf171f7eb60..d96c871b0d58 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -147,10 +147,7 @@ struct inet_connection_sock {
 #define ICSK_TIME_LOSS_PROBE	5	/* Tail loss probe timer */
 #define ICSK_TIME_REO_TIMEOUT	6	/* Reordering timer */
 
-static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
-{
-	return (struct inet_connection_sock *)sk;
-}
+#define inet_csk(ptr) container_of_const(ptr, struct inet_connection_sock, icsk_inet.sk)
 
 static inline void *inet_csk_ca(const struct sock *sk)
 {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6ae35199d3b3..6c27195619ab 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -742,7 +742,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu);
 int tcp_mss_to_mtu(struct sock *sk, int mss);
 void tcp_mtup_init(struct sock *sk);
 
-static inline void tcp_bound_rto(const struct sock *sk)
+static inline void tcp_bound_rto(struct sock *sk)
 {
 	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
 		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
diff --git a/include/net/tls.h b/include/net/tls.h
index 340ad43971e4..400f4857dc11 100644
--- a/include/net/tls.h
+++ b/include/net/tls.h
@@ -361,7 +361,7 @@ static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb)
 
 static inline struct tls_context *tls_get_ctx(const struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	/* Use RCU on icsk_ulp_data only for sock diag code,
 	 * TLS data path doesn't need rcu_dereference().
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5d874817a78d..1b6cd3840012 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6999,7 +6999,7 @@ EXPORT_SYMBOL(inet_reqsk_alloc);
 /*
  * Return true if a syncookie should be sent
  */
-static bool tcp_syn_flood_action(const struct sock *sk, const char *proto)
+static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
 {
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	const char *msg = "Dropping request";
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index d1ad20ce1c8c..976db57b95d4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -25,7 +25,7 @@
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	const struct tcp_sock *tp = tcp_sk(sk);
 	u32 elapsed, user_timeout;
 	s32 remaining;
@@ -47,7 +47,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 
 u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 remaining, user_timeout;
 	s32 elapsed;
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index a10ebf3ee10a..46f4655b7123 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -558,7 +558,7 @@ struct mptcp_subflow_context {
 static inline struct mptcp_subflow_context *
 mptcp_subflow_ctx(const struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	/* Use RCU on icsk_ulp_data only for sock diag code */
 	return (__force struct mptcp_subflow_context *)icsk->icsk_ulp_data;
-- 
cgit v1.2.3


From 1eeb5043573981f3a1278876515851b7f6b1df1b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 29 Mar 2024 15:32:03 +0000
Subject: tcp/dccp: do not care about families in inet_twsk_purge()

We lost ability to unload ipv6 module a long time ago.

Instead of calling expensive inet_twsk_purge() twice,
we can handle all families in one round.

Also remove an extra line added in my prior patch,
per Kuniyuki Iwashima feedback.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/20240327192934.6843-1-kuniyu@amazon.com/
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240329153203.345203-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_timewait_sock.h | 2 +-
 include/net/tcp.h                | 2 +-
 net/dccp/ipv4.c                  | 2 +-
 net/dccp/ipv6.c                  | 6 ------
 net/ipv4/inet_timewait_sock.c    | 9 +++------
 net/ipv4/tcp_ipv4.c              | 2 +-
 net/ipv4/tcp_minisocks.c         | 6 +++---
 net/ipv6/tcp_ipv6.c              | 6 ------
 8 files changed, 10 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index f28da08a37b4..2a536eea9424 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -111,7 +111,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo
 
 void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
 
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family);
+void inet_twsk_purge(struct inet_hashinfo *hashinfo);
 
 static inline
 struct net *twsk_net(const struct inet_timewait_sock *twsk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6c27195619ab..9ab5b37e9d53 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -353,7 +353,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
 void tcp_rcv_space_adjust(struct sock *sk);
 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
 void tcp_twsk_destructor(struct sock *sk);
-void tcp_twsk_purge(struct list_head *net_exit_list, int family);
+void tcp_twsk_purge(struct list_head *net_exit_list);
 ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
 			struct pipe_inode_info *pipe, size_t len,
 			unsigned int flags);
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 44b033fe1ef6..9fc9cea4c251 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1039,7 +1039,7 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
 
 static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
 {
-	inet_twsk_purge(&dccp_hashinfo, AF_INET);
+	inet_twsk_purge(&dccp_hashinfo);
 }
 
 static struct pernet_operations dccp_v4_ops = {
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index ded07e09f813..c8ca703dc331 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1119,15 +1119,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
 	inet_ctl_sock_destroy(pn->v6_ctl_sk);
 }
 
-static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
-{
-	inet_twsk_purge(&dccp_hashinfo, AF_INET6);
-}
-
 static struct pernet_operations dccp_v6_ops = {
 	.init   = dccp_v6_init_net,
 	.exit   = dccp_v6_exit_net,
-	.exit_batch = dccp_v6_exit_batch,
 	.id	= &dccp_v6_pernet_id,
 	.size   = sizeof(struct dccp_v6_pernet),
 };
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index b0cc07d9a568..e28075f0006e 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -264,7 +264,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
 EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
 
 /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+void inet_twsk_purge(struct inet_hashinfo *hashinfo)
 {
 	struct inet_ehash_bucket *head = &hashinfo->ehash[0];
 	unsigned int ehash_mask = hashinfo->ehash_mask;
@@ -273,7 +273,6 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
 	struct sock *sk;
 
 	for (slot = 0; slot <= ehash_mask; slot++, head++) {
-
 		if (hlist_nulls_empty(&head->chain))
 			continue;
 
@@ -288,15 +287,13 @@ restart:
 					     TCPF_NEW_SYN_RECV))
 				continue;
 
-			if (sk->sk_family != family ||
-			    refcount_read(&sock_net(sk)->ns.count))
+			if (refcount_read(&sock_net(sk)->ns.count))
 				continue;
 
 			if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
 				continue;
 
-			if (unlikely(sk->sk_family != family ||
-				     refcount_read(&sock_net(sk)->ns.count))) {
+			if (refcount_read(&sock_net(sk)->ns.count)) {
 				sock_gen_put(sk);
 				goto restart;
 			}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a22ee5838751..1e0a9762f92e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3501,7 +3501,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
 {
 	struct net *net;
 
-	tcp_twsk_purge(net_exit_list, AF_INET);
+	tcp_twsk_purge(net_exit_list);
 
 	list_for_each_entry(net, net_exit_list, exit_list) {
 		inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f0761f060a83..5b21a07ddf9a 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -388,7 +388,7 @@ void tcp_twsk_destructor(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
 
-void tcp_twsk_purge(struct list_head *net_exit_list, int family)
+void tcp_twsk_purge(struct list_head *net_exit_list)
 {
 	bool purged_once = false;
 	struct net *net;
@@ -396,9 +396,9 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family)
 	list_for_each_entry(net, net_exit_list, exit_list) {
 		if (net->ipv4.tcp_death_row.hashinfo->pernet) {
 			/* Even if tw_refcount == 1, we must clean up kernel reqsk */
-			inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family);
+			inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
 		} else if (!purged_once) {
-			inet_twsk_purge(&tcp_hashinfo, family);
+			inet_twsk_purge(&tcp_hashinfo);
 			purged_once = true;
 		}
 	}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3f4cba49e9ee..5ae74f661d25 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2389,15 +2389,9 @@ static void __net_exit tcpv6_net_exit(struct net *net)
 	inet_ctl_sock_destroy(net->ipv6.tcp_sk);
 }
 
-static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
-{
-	tcp_twsk_purge(net_exit_list, AF_INET6);
-}
-
 static struct pernet_operations tcpv6_net_ops = {
 	.init	    = tcpv6_net_init,
 	.exit	    = tcpv6_net_exit,
-	.exit_batch = tcpv6_net_exit_batch,
 };
 
 int __init tcpv6_init(void)
-- 
cgit v1.2.3


From 5bc63d3a6f466add504f283d9f743f20ca9ec334 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 29 Mar 2024 10:57:08 -0700
Subject: netlink: create a new header for internal genetlink symbols

There are things in linux/genetlink.h which are only used
under net/netlink/. Move them to a new local header.
A new header with just 2 externs isn't great, but alternative
would be to include af_netlink.h in genetlink.c which feels
even worse.

Link: https://lore.kernel.org/r/20240329175710.291749-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/genetlink.h |  5 -----
 net/netlink/af_netlink.c  |  2 +-
 net/netlink/genetlink.c   |  2 ++
 net/netlink/genetlink.h   | 11 +++++++++++
 4 files changed, 14 insertions(+), 6 deletions(-)
 create mode 100644 net/netlink/genetlink.h

(limited to 'include')

diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
index c285968e437a..9dbd7ba9b858 100644
--- a/include/linux/genetlink.h
+++ b/include/linux/genetlink.h
@@ -4,15 +4,10 @@
 
 #include <uapi/linux/genetlink.h>
 
-
 /* All generic netlink requests are serialized by a global lock.  */
 extern void genl_lock(void);
 extern void genl_unlock(void);
 
-/* for synchronisation between af_netlink and genetlink */
-extern atomic_t genl_sk_destructing_cnt;
-extern wait_queue_head_t genl_sk_destructing_waitq;
-
 #define MODULE_ALIAS_GENL_FAMILY(family)\
  MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family)
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 7554803218a2..dc8c3c01d51b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -59,7 +59,6 @@
 #include <linux/rhashtable.h>
 #include <asm/cacheflush.h>
 #include <linux/hash.h>
-#include <linux/genetlink.h>
 #include <linux/net_namespace.h>
 #include <linux/nospec.h>
 #include <linux/btf_ids.h>
@@ -73,6 +72,7 @@
 #include <trace/events/netlink.h>
 
 #include "af_netlink.h"
+#include "genetlink.h"
 
 struct listeners {
 	struct rcu_head		rcu;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 3b7666944b11..feb54c63a116 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -22,6 +22,8 @@
 #include <net/sock.h>
 #include <net/genetlink.h>
 
+#include "genetlink.h"
+
 static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
 static DECLARE_RWSEM(cb_lock);
 
diff --git a/net/netlink/genetlink.h b/net/netlink/genetlink.h
new file mode 100644
index 000000000000..89bd9d2631c3
--- /dev/null
+++ b/net/netlink/genetlink.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NET_GENETLINK_H
+#define __NET_GENETLINK_H
+
+#include <linux/wait.h>
+
+/* for synchronisation between af_netlink and genetlink */
+extern atomic_t genl_sk_destructing_cnt;
+extern wait_queue_head_t genl_sk_destructing_waitq;
+
+#endif	/* __LINUX_GENERIC_NETLINK_H */
-- 
cgit v1.2.3


From cd7209628cdb2a7edd7656c126d2455e7102e949 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 29 Mar 2024 10:57:10 -0700
Subject: genetlink: remove linux/genetlink.h

genetlink.h is a shell of what used to be a combined uAPI
and kernel header over a decade ago. It has fewer than
10 lines of code. Merge it into net/genetlink.h.
In some ways it'd be better to keep the combined header
under linux/ but it would make looking through git history
harder.

Acked-by: Sven Eckelmann <sven@narfation.org>
Link: https://lore.kernel.org/r/20240329175710.291749-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/wireguard/main.c      |  2 +-
 include/linux/genetlink.h         | 14 --------------
 include/linux/genl_magic_struct.h |  2 +-
 include/net/genetlink.h           | 10 +++++++++-
 net/batman-adv/main.c             |  2 +-
 net/batman-adv/netlink.c          |  1 -
 net/openvswitch/datapath.c        |  1 -
 7 files changed, 12 insertions(+), 20 deletions(-)
 delete mode 100644 include/linux/genetlink.h

(limited to 'include')

diff --git a/drivers/net/wireguard/main.c b/drivers/net/wireguard/main.c
index ee4da9ab8013..a00671b58701 100644
--- a/drivers/net/wireguard/main.c
+++ b/drivers/net/wireguard/main.c
@@ -14,7 +14,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/genetlink.h>
+#include <net/genetlink.h>
 #include <net/rtnetlink.h>
 
 static int __init wg_mod_init(void)
diff --git a/include/linux/genetlink.h b/include/linux/genetlink.h
deleted file mode 100644
index 9dbd7ba9b858..000000000000
--- a/include/linux/genetlink.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __LINUX_GENERIC_NETLINK_H
-#define __LINUX_GENERIC_NETLINK_H
-
-#include <uapi/linux/genetlink.h>
-
-/* All generic netlink requests are serialized by a global lock.  */
-extern void genl_lock(void);
-extern void genl_unlock(void);
-
-#define MODULE_ALIAS_GENL_FAMILY(family)\
- MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family)
-
-#endif	/* __LINUX_GENERIC_NETLINK_H */
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h
index a419d93789ff..621b87a87d74 100644
--- a/include/linux/genl_magic_struct.h
+++ b/include/linux/genl_magic_struct.h
@@ -15,8 +15,8 @@
 #endif
 
 #include <linux/args.h>
-#include <linux/genetlink.h>
 #include <linux/types.h>
+#include <net/genetlink.h>
 
 extern int CONCATENATE(GENL_MAGIC_FAMILY, _genl_register)(void);
 extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void);
diff --git a/include/net/genetlink.h b/include/net/genetlink.h
index 9ece6e5a3ea8..9ab49bfeae78 100644
--- a/include/net/genetlink.h
+++ b/include/net/genetlink.h
@@ -2,12 +2,20 @@
 #ifndef __NET_GENERIC_NETLINK_H
 #define __NET_GENERIC_NETLINK_H
 
-#include <linux/genetlink.h>
+#include <linux/net.h>
 #include <net/netlink.h>
 #include <net/net_namespace.h>
+#include <uapi/linux/genetlink.h>
 
 #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN)
 
+/* Non-parallel generic netlink requests are serialized by a global lock. */
+void genl_lock(void);
+void genl_unlock(void);
+
+#define MODULE_ALIAS_GENL_FAMILY(family) \
+ MODULE_ALIAS_NET_PF_PROTO_NAME(PF_NETLINK, NETLINK_GENERIC, "-family-" family)
+
 /* Binding to multicast group requires %CAP_NET_ADMIN */
 #define GENL_MCAST_CAP_NET_ADMIN	BIT(0)
 /* Binding to multicast group requires %CAP_SYS_ADMIN */
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 75119f1ffccc..8e0f44c71696 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -14,7 +14,6 @@
 #include <linux/crc32c.h>
 #include <linux/device.h>
 #include <linux/errno.h>
-#include <linux/genetlink.h>
 #include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
@@ -38,6 +37,7 @@
 #include <linux/string.h>
 #include <linux/workqueue.h>
 #include <net/dsfield.h>
+#include <net/genetlink.h>
 #include <net/rtnetlink.h>
 #include <uapi/linux/batadv_packet.h>
 #include <uapi/linux/batman_adv.h>
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 0954757f0b8b..9362cd9d6f3d 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -15,7 +15,6 @@
 #include <linux/cache.h>
 #include <linux/err.h>
 #include <linux/errno.h>
-#include <linux/genetlink.h>
 #include <linux/gfp.h>
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 11c69415c605..99d72543abd3 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -15,7 +15,6 @@
 #include <linux/delay.h>
 #include <linux/time.h>
 #include <linux/etherdevice.h>
-#include <linux/genetlink.h>
 #include <linux/kernel.h>
 #include <linux/kthread.h>
 #include <linux/mutex.h>
-- 
cgit v1.2.3


From 225a8d0bd93eb87fe49947069075260031bad8af Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 25 Mar 2024 21:06:57 +0100
Subject: drm/ast: Implement polling for VGA and SIL164 connectors

Implement polling for VGA and SIL164 connectors. Set the flag
DRM_CONNECTOR_POLL_DISCONNECT for each to detect the removal of the
monitor cable. Implement struct drm_connector_helper_funcs.detect_ctx
for each type of connector by testing for EDID data.

The helper drm_connector_helper_detect_ctx() implements .detect_ctx()
on top of the connector's DDC channel. The function can be used by
other drivers as companion to drm_connector_helper_get_modes().

v6:
- change helper name to drm_connector_helper_detec_from_ddc()
  (Maxime, Sui)
v5:
- share implementation in drm_connector_helper_detect_ctx() (Maxime)
- test for DDC presence with drm_probe_ddc() (Maxime, Jani)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Jocelyn Falempe <jfalempe@redhat.com>
Acked-by: Sui Jingfeng <sui.jingfeng@linux.dev>
Acked-by: Maxime Ripard <mripard@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240325200855.21150-13-tzimmermann@suse.de
---
 drivers/gpu/drm/ast/ast_mode.c     |  6 ++++--
 drivers/gpu/drm/drm_probe_helper.c | 29 +++++++++++++++++++++++++++++
 include/drm/drm_probe_helper.h     |  4 ++++
 3 files changed, 37 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index 71cc681d6188..b4e1c812520c 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -1346,6 +1346,7 @@ static int ast_crtc_init(struct drm_device *dev)
 
 static const struct drm_connector_helper_funcs ast_vga_connector_helper_funcs = {
 	.get_modes = drm_connector_helper_get_modes,
+	.detect_ctx = drm_connector_helper_detect_from_ddc,
 };
 
 static const struct drm_connector_funcs ast_vga_connector_funcs = {
@@ -1379,7 +1380,7 @@ static int ast_vga_connector_init(struct drm_device *dev, struct drm_connector *
 	connector->interlace_allowed = 0;
 	connector->doublescan_allowed = 0;
 
-	connector->polled = DRM_CONNECTOR_POLL_CONNECT;
+	connector->polled = DRM_CONNECTOR_POLL_CONNECT | DRM_CONNECTOR_POLL_DISCONNECT;
 
 	return 0;
 }
@@ -1414,6 +1415,7 @@ static int ast_vga_output_init(struct ast_device *ast)
 
 static const struct drm_connector_helper_funcs ast_sil164_connector_helper_funcs = {
 	.get_modes = drm_connector_helper_get_modes,
+	.detect_ctx = drm_connector_helper_detect_from_ddc,
 };
 
 static const struct drm_connector_funcs ast_sil164_connector_funcs = {
@@ -1447,7 +1449,7 @@ static int ast_sil164_connector_init(struct drm_device *dev, struct drm_connecto
 	connector->interlace_allowed = 0;
 	connector->doublescan_allowed = 0;
 
-	connector->polled = DRM_CONNECTOR_POLL_CONNECT;
+	connector->polled = DRM_CONNECTOR_POLL_CONNECT | DRM_CONNECTOR_POLL_DISCONNECT;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/drm_probe_helper.c b/drivers/gpu/drm/drm_probe_helper.c
index bf2dd1f46b6c..8fb842c6c638 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -1279,3 +1279,32 @@ int drm_connector_helper_tv_get_modes(struct drm_connector *connector)
 	return i;
 }
 EXPORT_SYMBOL(drm_connector_helper_tv_get_modes);
+
+/**
+ * drm_connector_helper_detect_from_ddc - Read EDID and detect connector status.
+ * @connector: The connector
+ * @ctx: Acquire context
+ * @force: Perform screen-destructive operations, if necessary
+ *
+ * Detects the connector status by reading the EDID using drm_probe_ddc(),
+ * which requires connector->ddc to be set. Returns connector_status_connected
+ * on success or connector_status_disconnected on failure.
+ *
+ * Returns:
+ * The connector status as defined by enum drm_connector_status.
+ */
+int drm_connector_helper_detect_from_ddc(struct drm_connector *connector,
+					 struct drm_modeset_acquire_ctx *ctx,
+					 bool force)
+{
+	struct i2c_adapter *ddc = connector->ddc;
+
+	if (!ddc)
+		return connector_status_unknown;
+
+	if (drm_probe_ddc(ddc))
+		return connector_status_connected;
+
+	return connector_status_disconnected;
+}
+EXPORT_SYMBOL(drm_connector_helper_detect_from_ddc);
diff --git a/include/drm/drm_probe_helper.h b/include/drm/drm_probe_helper.h
index 62741a88796b..e9dc32433631 100644
--- a/include/drm/drm_probe_helper.h
+++ b/include/drm/drm_probe_helper.h
@@ -37,4 +37,8 @@ int drm_connector_helper_get_modes_fixed(struct drm_connector *connector,
 int drm_connector_helper_get_modes(struct drm_connector *connector);
 int drm_connector_helper_tv_get_modes(struct drm_connector *connector);
 
+int drm_connector_helper_detect_from_ddc(struct drm_connector *connector,
+					 struct drm_modeset_acquire_ctx *ctx,
+					 bool force);
+
 #endif
-- 
cgit v1.2.3


From 10021ef27310279c850cf6cb38542c443a995e92 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Mon, 25 Mar 2024 21:06:58 +0100
Subject: drm/ast: Automatically clean up poll helper

Automatically clean up the conncetor-poll thread as part of the DRM
device release. The new helper drmm_kms_helper_poll_init() provides
a shared implementation for all drivers.

v6:
- fix kernel doc comment (Sui, kernel test robot)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Jocelyn Falempe <jfalempe@redhat.com>
Acked-by: Sui Jingfeng <sui.jingfeng@linux.dev>
Link: https://patchwork.freedesktop.org/patch/msgid/20240325200855.21150-14-tzimmermann@suse.de
---
 drivers/gpu/drm/ast/ast_mode.c     |  4 +++-
 drivers/gpu/drm/drm_probe_helper.c | 27 +++++++++++++++++++++++++++
 include/drm/drm_probe_helper.h     |  2 ++
 3 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c
index b4e1c812520c..bb9b66aba9ee 100644
--- a/drivers/gpu/drm/ast/ast_mode.c
+++ b/drivers/gpu/drm/ast/ast_mode.c
@@ -1905,7 +1905,9 @@ int ast_mode_config_init(struct ast_device *ast)
 
 	drm_mode_config_reset(dev);
 
-	drm_kms_helper_poll_init(dev);
+	ret = drmm_kms_helper_poll_init(dev);
+	if (ret)
+		return ret;
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/drm_probe_helper.c b/drivers/gpu/drm/drm_probe_helper.c
index 8fb842c6c638..968a3ee66b1e 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -37,6 +37,7 @@
 #include <drm/drm_crtc.h>
 #include <drm/drm_edid.h>
 #include <drm/drm_fourcc.h>
+#include <drm/drm_managed.h>
 #include <drm/drm_modeset_helper_vtables.h>
 #include <drm/drm_print.h>
 #include <drm/drm_probe_helper.h>
@@ -951,6 +952,32 @@ void drm_kms_helper_poll_fini(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_kms_helper_poll_fini);
 
+static void drm_kms_helper_poll_init_release(struct drm_device *dev, void *res)
+{
+	drm_kms_helper_poll_fini(dev);
+}
+
+/**
+ * drmm_kms_helper_poll_init - initialize and enable output polling
+ * @dev: drm_device
+ *
+ * This function initializes and then also enables output polling support for
+ * @dev similar to drm_kms_helper_poll_init(). Polling will automatically be
+ * cleaned up when the DRM device goes away.
+ *
+ * See drm_kms_helper_poll_init() for more information.
+ *
+ * Returns:
+ * 0 on success, or a negative errno code otherwise.
+ */
+int drmm_kms_helper_poll_init(struct drm_device *dev)
+{
+	drm_kms_helper_poll_init(dev);
+
+	return drmm_add_action_or_reset(dev, drm_kms_helper_poll_init_release, dev);
+}
+EXPORT_SYMBOL(drmm_kms_helper_poll_init);
+
 static bool check_connector_changed(struct drm_connector *connector)
 {
 	struct drm_device *dev = connector->dev;
diff --git a/include/drm/drm_probe_helper.h b/include/drm/drm_probe_helper.h
index e9dc32433631..d6ce7b218b77 100644
--- a/include/drm/drm_probe_helper.h
+++ b/include/drm/drm_probe_helper.h
@@ -16,6 +16,8 @@ int drm_helper_probe_single_connector_modes(struct drm_connector
 int drm_helper_probe_detect(struct drm_connector *connector,
 			    struct drm_modeset_acquire_ctx *ctx,
 			    bool force);
+
+int drmm_kms_helper_poll_init(struct drm_device *dev);
 void drm_kms_helper_poll_init(struct drm_device *dev);
 void drm_kms_helper_poll_fini(struct drm_device *dev);
 bool drm_helper_hpd_irq_event(struct drm_device *dev);
-- 
cgit v1.2.3


From 94cd1011002ee9aabfd5b7d96b366384b6de3707 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Fri, 29 Mar 2024 18:24:31 +0100
Subject: mmc: sdio: store owner from modules with sdio_register_driver()

Modules registering driver with sdio_register_driver() might
forget to set .owner field.  The field is used by some of other kernel
parts for reference counting (try_module_get()), so it is expected that
drivers will set it.

Solve the problem by moving this task away from the drivers to the core
code, just like we did for platform_driver in
commit 9447057eaff8 ("platform_device: use a macro instead of
platform_driver_register").

Since many drivers forget to set the .owner, this effectively will fix
them.  Examples of fixed drivers are: ath6kl, b43, btsdio.c, ks7010,
libertas, MediaTek WiFi drivers, Realtek WiFi drivers, rsi, siano,
wilc1000, wl1251 and more.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Francesco Dolcini <francesco.dolcini@toradex.com>
Link: https://lore.kernel.org/r/20240329-module-owner-sdio-v1-1-e4010b11ccaa@linaro.org
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/sdio_bus.c   | 9 ++++++---
 include/linux/mmc/sdio_func.h | 5 ++++-
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 71d885fbc228..c5fdfe2325f8 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -265,16 +265,19 @@ void sdio_unregister_bus(void)
 }
 
 /**
- *	sdio_register_driver - register a function driver
+ *	__sdio_register_driver - register a function driver
  *	@drv: SDIO function driver
+ *	@owner: owning module/driver
  */
-int sdio_register_driver(struct sdio_driver *drv)
+int __sdio_register_driver(struct sdio_driver *drv, struct module *owner)
 {
 	drv->drv.name = drv->name;
 	drv->drv.bus = &sdio_bus_type;
+	drv->drv.owner = owner;
+
 	return driver_register(&drv->drv);
 }
-EXPORT_SYMBOL_GPL(sdio_register_driver);
+EXPORT_SYMBOL_GPL(__sdio_register_driver);
 
 /**
  *	sdio_unregister_driver - unregister a function driver
diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h
index 478855b8e406..fed1f5f4a8d3 100644
--- a/include/linux/mmc/sdio_func.h
+++ b/include/linux/mmc/sdio_func.h
@@ -106,7 +106,10 @@ struct sdio_driver {
 	.class = (dev_class), \
 	.vendor = SDIO_ANY_ID, .device = SDIO_ANY_ID
 
-extern int sdio_register_driver(struct sdio_driver *);
+/* use a macro to avoid include chaining to get THIS_MODULE */
+#define sdio_register_driver(drv) \
+	__sdio_register_driver(drv, THIS_MODULE)
+extern int __sdio_register_driver(struct sdio_driver *, struct module *);
 extern void sdio_unregister_driver(struct sdio_driver *);
 
 /**
-- 
cgit v1.2.3


From ca4ddc26f8acaa9cb451fcb20f7ab0f02e4970cb Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Fri, 29 Mar 2024 10:28:46 -0500
Subject: bpf: Fix typo in uapi doc comments

In a few places in the bpf uapi headers, EOPNOTSUPP is missing a "P" in
the doc comments. This adds the missing "P".

Signed-off-by: David Lechner <dlechner@baylibre.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240329152900.398260-2-dlechner@baylibre.com
---
 include/uapi/linux/bpf.h       | 4 ++--
 tools/include/uapi/linux/bpf.h | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 96d57e483133..79c548276b6b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5026,7 +5026,7 @@ union bpf_attr {
  *		bytes will be copied to *dst*
  *	Return
  *		The **hash_algo** is returned on success,
- *		**-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
+ *		**-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if
  *		invalid arguments are passed.
  *
  * struct socket *bpf_sock_from_file(struct file *file)
@@ -5512,7 +5512,7 @@ union bpf_attr {
  *		bytes will be copied to *dst*
  *	Return
  *		The **hash_algo** is returned on success,
- *		**-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if
+ *		**-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
  *		invalid arguments are passed.
  *
  * void *bpf_kptr_xchg(void *map_value, void *ptr)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 96d57e483133..79c548276b6b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5026,7 +5026,7 @@ union bpf_attr {
  *		bytes will be copied to *dst*
  *	Return
  *		The **hash_algo** is returned on success,
- *		**-EOPNOTSUP** if IMA is disabled or **-EINVAL** if
+ *		**-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if
  *		invalid arguments are passed.
  *
  * struct socket *bpf_sock_from_file(struct file *file)
@@ -5512,7 +5512,7 @@ union bpf_attr {
  *		bytes will be copied to *dst*
  *	Return
  *		The **hash_algo** is returned on success,
- *		**-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if
+ *		**-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if
  *		invalid arguments are passed.
  *
  * void *bpf_kptr_xchg(void *map_value, void *ptr)
-- 
cgit v1.2.3


From ce09cbdd988887662546a1175bcfdfc6c8fdd150 Mon Sep 17 00:00:00 2001
From: Jose Fernandez <josef@netflix.com>
Date: Mon, 1 Apr 2024 21:40:10 -0600
Subject: bpf: Improve program stats run-time calculation

This patch improves the run-time calculation for program stats by
capturing the duration as soon as possible after the program returns.

Previously, the duration included u64_stats_t operations. While the
instrumentation overhead is part of the total time spent when stats are
enabled, distinguishing between the program's native execution time and
the time spent due to instrumentation is crucial for accurate
performance analysis.

By making this change, the patch facilitates more precise optimization
of BPF programs, enabling users to understand their performance in
environments without stats enabled.

I used a virtualized environment to measure the run-time over one minute
for a basic raw_tracepoint/sys_enter program, which just increments a
local counter. Although the virtualization introduced some performance
degradation that could affect the results, I observed approximately a
16% decrease in average run-time reported by stats with this change
(310 -> 260 nsec).

Signed-off-by: Jose Fernandez <josef@netflix.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240402034010.25060-1-josef@netflix.com
---
 include/linux/filter.h  | 6 ++++--
 kernel/bpf/trampoline.c | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 44934b968b57..531b36090122 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -654,14 +654,16 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
 	cant_migrate();
 	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
 		struct bpf_prog_stats *stats;
-		u64 start = sched_clock();
+		u64 duration, start = sched_clock();
 		unsigned long flags;
 
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
+
+		duration = sched_clock() - start;
 		stats = this_cpu_ptr(prog->stats);
 		flags = u64_stats_update_begin_irqsave(&stats->syncp);
 		u64_stats_inc(&stats->cnt);
-		u64_stats_add(&stats->nsecs, sched_clock() - start);
+		u64_stats_add(&stats->nsecs, duration);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	} else {
 		ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index cc50607f8d8c..26ae703d3c3b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -885,12 +885,13 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
 	     * Hence check that 'start' is valid.
 	     */
 	    start > NO_START_TIME) {
+		u64 duration = sched_clock() - start;
 		unsigned long flags;
 
 		stats = this_cpu_ptr(prog->stats);
 		flags = u64_stats_update_begin_irqsave(&stats->syncp);
 		u64_stats_inc(&stats->cnt);
-		u64_stats_add(&stats->nsecs, sched_clock() - start);
+		u64_stats_add(&stats->nsecs, duration);
 		u64_stats_update_end_irqrestore(&stats->syncp, flags);
 	}
 }
-- 
cgit v1.2.3


From 08ea486a61451189b190c7b89e406b889cf693fa Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 1 Apr 2024 10:02:08 +0000
Subject: ALSA: control: Introduce snd_ctl_find_id_mixer_locked()

Adds wrapper function snd_ctl_find_id_mixer_locked(). This is
identical to snd_ctl_find_id_mixer() except that it can be called
from code that is already holding controls_rwsem.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://msgid.link/r/20240401100210.61277-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/control.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/sound/control.h b/include/sound/control.h
index 9a4f4f7138da..c1659036c4a7 100644
--- a/include/sound/control.h
+++ b/include/sound/control.h
@@ -167,6 +167,29 @@ snd_ctl_find_id_mixer(struct snd_card *card, const char *name)
 	return snd_ctl_find_id(card, &id);
 }
 
+/**
+ * snd_ctl_find_id_mixer_locked - find the control instance with the given name string
+ * @card: the card instance
+ * @name: the name string
+ *
+ * Finds the control instance with the given name and
+ * @SNDRV_CTL_ELEM_IFACE_MIXER. Other fields are set to zero.
+ *
+ * This is merely a wrapper to snd_ctl_find_id_locked().
+ * The caller must down card->controls_rwsem before calling this function.
+ *
+ * Return: The pointer of the instance if found, or %NULL if not.
+ */
+static inline struct snd_kcontrol *
+snd_ctl_find_id_mixer_locked(struct snd_card *card, const char *name)
+{
+	struct snd_ctl_elem_id id = {};
+
+	id.iface = SNDRV_CTL_ELEM_IFACE_MIXER;
+	strscpy(id.name, name, sizeof(id.name));
+	return snd_ctl_find_id_locked(card, &id);
+}
+
 int snd_ctl_create(struct snd_card *card);
 
 int snd_ctl_register_ioctl(snd_kctl_ioctl_func_t fcn);
-- 
cgit v1.2.3


From 416bdb89605d960405178b9bf04df512d1ace1a3 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 22 Dec 2023 21:05:11 -0800
Subject: counter: linux/counter.h: fix Excess kernel-doc description warning

Remove the @priv: line to prevent the kernel-doc warning:

include/linux/counter.h:400: warning: Excess struct member 'priv' description in 'counter_device'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Fixes: f2ee4759fb70 ("counter: remove old and now unused registration API")
Link: https://lore.kernel.org/r/20231223050511.13849-1-rdunlap@infradead.org
Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
---
 include/linux/counter.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index 702e9108bbb4..b767b5c821f5 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -359,7 +359,6 @@ struct counter_ops {
  * @num_counts:		number of Counts specified in @counts
  * @ext:		optional array of Counter device extensions
  * @num_ext:		number of Counter device extensions specified in @ext
- * @priv:		optional private data supplied by driver
  * @dev:		internal device structure
  * @chrdev:		internal character device structure
  * @events_list:	list of current watching Counter events
-- 
cgit v1.2.3


From 2f48aba356a004d854bc6d77fbc032b2fc666911 Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Date: Wed, 6 Mar 2024 16:36:31 +0100
Subject: counter: Introduce the COUNTER_COMP_FREQUENCY() macro

Now that there are two users for the "frequency" extension, introduce a
new COUNTER_COMP_FREQUENCY() macro.
This extension is intended to be a read-only signal attribute.

Suggested-by: William Breathitt Gray <william.gray@linaro.org>
Signed-off-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Link: https://lore.kernel.org/r/20240306153631.4051115-1-fabrice.gasnier@foss.st.com
Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
---
 include/linux/counter.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index b767b5c821f5..cd35d8574ee2 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -601,6 +601,9 @@ struct counter_array {
 #define COUNTER_COMP_FLOOR(_read, _write) \
 	COUNTER_COMP_COUNT_U64("floor", _read, _write)
 
+#define COUNTER_COMP_FREQUENCY(_read) \
+	COUNTER_COMP_SIGNAL_U64("frequency", _read, NULL)
+
 #define COUNTER_COMP_POLARITY(_read, _write, _available) \
 { \
 	.type = COUNTER_COMP_SIGNAL_POLARITY, \
-- 
cgit v1.2.3


From 1aed15275b7ce17b5ebdfc112a76e0d7165ed46b Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Date: Thu, 7 Mar 2024 14:33:06 +0100
Subject: counter: stm32-timer-cnt: add support for capture events

Add support for capture events. Captured counter value for each channel
can be retrieved through CCRx register.
STM32 timers can have up to 4 capture channels (on input channel 1 to
channel 4), hence need to check the number of channels before reading
the capture data.
The capture configuration is hard-coded to capture signals on both edges
(non-inverted). Interrupts are used to report events independently for
each channel.

Reviewed-by: William Breathitt Gray <william.gray@linaro.org>
Acked-by: Lee Jones <lee@kernel.org>
Signed-off-by: Fabrice Gasnier <fabrice.gasnier@foss.st.com>
Link: https://lore.kernel.org/r/20240307133306.383045-11-fabrice.gasnier@foss.st.com
Signed-off-by: William Breathitt Gray <william.gray@linaro.org>
---
 drivers/counter/stm32-timer-cnt.c | 134 +++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/stm32-timers.h  |  13 ++++
 2 files changed, 144 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/counter/stm32-timer-cnt.c b/drivers/counter/stm32-timer-cnt.c
index 9fcafec682b7..0664ef969f79 100644
--- a/drivers/counter/stm32-timer-cnt.c
+++ b/drivers/counter/stm32-timer-cnt.c
@@ -262,6 +262,40 @@ static int stm32_count_prescaler_write(struct counter_device *counter,
 	return regmap_write(priv->regmap, TIM_PSC, psc);
 }
 
+static int stm32_count_cap_read(struct counter_device *counter,
+				struct counter_count *count,
+				size_t ch, u64 *cap)
+{
+	struct stm32_timer_cnt *const priv = counter_priv(counter);
+	u32 ccrx;
+
+	if (ch >= priv->nchannels)
+		return -EOPNOTSUPP;
+
+	switch (ch) {
+	case 0:
+		regmap_read(priv->regmap, TIM_CCR1, &ccrx);
+		break;
+	case 1:
+		regmap_read(priv->regmap, TIM_CCR2, &ccrx);
+		break;
+	case 2:
+		regmap_read(priv->regmap, TIM_CCR3, &ccrx);
+		break;
+	case 3:
+		regmap_read(priv->regmap, TIM_CCR4, &ccrx);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	dev_dbg(counter->parent, "CCR%zu: 0x%08x\n", ch + 1, ccrx);
+
+	*cap = ccrx;
+
+	return 0;
+}
+
 static int stm32_count_nb_ovf_read(struct counter_device *counter,
 				   struct counter_count *count, u64 *val)
 {
@@ -288,6 +322,8 @@ static int stm32_count_nb_ovf_write(struct counter_device *counter,
 	return 0;
 }
 
+static DEFINE_COUNTER_ARRAY_CAPTURE(stm32_count_cap_array, 4);
+
 static struct counter_comp stm32_count_ext[] = {
 	COUNTER_COMP_DIRECTION(stm32_count_direction_read),
 	COUNTER_COMP_ENABLE(stm32_count_enable_read, stm32_count_enable_write),
@@ -295,6 +331,7 @@ static struct counter_comp stm32_count_ext[] = {
 			     stm32_count_ceiling_write),
 	COUNTER_COMP_COUNT_U64("prescaler", stm32_count_prescaler_read,
 			       stm32_count_prescaler_write),
+	COUNTER_COMP_ARRAY_CAPTURE(stm32_count_cap_read, NULL, stm32_count_cap_array),
 	COUNTER_COMP_COUNT_U64("num_overflows", stm32_count_nb_ovf_read, stm32_count_nb_ovf_write),
 };
 
@@ -353,11 +390,68 @@ static int stm32_action_read(struct counter_device *counter,
 	}
 }
 
+struct stm32_count_cc_regs {
+	u32 ccmr_reg;
+	u32 ccmr_mask;
+	u32 ccmr_bits;
+	u32 ccer_bits;
+};
+
+static const struct stm32_count_cc_regs stm32_cc[] = {
+	{ TIM_CCMR1, TIM_CCMR_CC1S, TIM_CCMR_CC1S_TI1,
+		TIM_CCER_CC1E | TIM_CCER_CC1P | TIM_CCER_CC1NP },
+	{ TIM_CCMR1, TIM_CCMR_CC2S, TIM_CCMR_CC2S_TI2,
+		TIM_CCER_CC2E | TIM_CCER_CC2P | TIM_CCER_CC2NP },
+	{ TIM_CCMR2, TIM_CCMR_CC3S, TIM_CCMR_CC3S_TI3,
+		TIM_CCER_CC3E | TIM_CCER_CC3P | TIM_CCER_CC3NP },
+	{ TIM_CCMR2, TIM_CCMR_CC4S, TIM_CCMR_CC4S_TI4,
+		TIM_CCER_CC4E | TIM_CCER_CC4P | TIM_CCER_CC4NP },
+};
+
+static int stm32_count_capture_configure(struct counter_device *counter, unsigned int ch,
+					 bool enable)
+{
+	struct stm32_timer_cnt *const priv = counter_priv(counter);
+	const struct stm32_count_cc_regs *cc;
+	u32 ccmr, ccer;
+
+	if (ch >= ARRAY_SIZE(stm32_cc) || ch >= priv->nchannels) {
+		dev_err(counter->parent, "invalid ch: %d\n", ch);
+		return -EINVAL;
+	}
+
+	cc = &stm32_cc[ch];
+
+	/*
+	 * configure channel in input capture mode, map channel 1 on TI1, channel2 on TI2...
+	 * Select both edges / non-inverted to trigger a capture.
+	 */
+	if (enable) {
+		/* first clear possibly latched capture flag upon enabling */
+		if (!regmap_test_bits(priv->regmap, TIM_CCER, cc->ccer_bits))
+			regmap_write(priv->regmap, TIM_SR, ~TIM_SR_CC_IF(ch));
+		regmap_update_bits(priv->regmap, cc->ccmr_reg, cc->ccmr_mask,
+				   cc->ccmr_bits);
+		regmap_set_bits(priv->regmap, TIM_CCER, cc->ccer_bits);
+	} else {
+		regmap_clear_bits(priv->regmap, TIM_CCER, cc->ccer_bits);
+		regmap_clear_bits(priv->regmap, cc->ccmr_reg, cc->ccmr_mask);
+	}
+
+	regmap_read(priv->regmap, cc->ccmr_reg, &ccmr);
+	regmap_read(priv->regmap, TIM_CCER, &ccer);
+	dev_dbg(counter->parent, "%s(%s) ch%d 0x%08x 0x%08x\n", __func__, enable ? "ena" : "dis",
+		ch, ccmr, ccer);
+
+	return 0;
+}
+
 static int stm32_count_events_configure(struct counter_device *counter)
 {
 	struct stm32_timer_cnt *const priv = counter_priv(counter);
 	struct counter_event_node *event_node;
 	u32 dier = 0;
+	int i, ret;
 
 	list_for_each_entry(event_node, &counter->events_list, l) {
 		switch (event_node->event) {
@@ -367,6 +461,12 @@ static int stm32_count_events_configure(struct counter_device *counter)
 				regmap_write(priv->regmap, TIM_SR, (u32)~TIM_SR_UIF);
 			dier |= TIM_DIER_UIE;
 			break;
+		case COUNTER_EVENT_CAPTURE:
+			ret = stm32_count_capture_configure(counter, event_node->channel, true);
+			if (ret)
+				return ret;
+			dier |= TIM_DIER_CC_IE(event_node->channel);
+			break;
 		default:
 			/* should never reach this path */
 			return -EINVAL;
@@ -376,6 +476,15 @@ static int stm32_count_events_configure(struct counter_device *counter)
 	/* Enable / disable all events at once, from events_list, so write all DIER bits */
 	regmap_write(priv->regmap, TIM_DIER, dier);
 
+	/* check for disabled capture events */
+	for (i = 0 ; i < priv->nchannels; i++) {
+		if (!(dier & TIM_DIER_CC_IE(i))) {
+			ret = stm32_count_capture_configure(counter, i, false);
+			if (ret)
+				return ret;
+		}
+	}
+
 	return 0;
 }
 
@@ -389,6 +498,12 @@ static int stm32_count_watch_validate(struct counter_device *counter,
 		return -EOPNOTSUPP;
 
 	switch (watch->event) {
+	case COUNTER_EVENT_CAPTURE:
+		if (watch->channel >= priv->nchannels) {
+			dev_err(counter->parent, "Invalid channel %d\n", watch->channel);
+			return -EINVAL;
+		}
+		return 0;
 	case COUNTER_EVENT_OVERFLOW_UNDERFLOW:
 		return 0;
 	default:
@@ -499,6 +614,7 @@ static irqreturn_t stm32_timer_cnt_isr(int irq, void *ptr)
 	struct stm32_timer_cnt *const priv = counter_priv(counter);
 	u32 clr = GENMASK(31, 0); /* SR flags can be cleared by writing 0 (wr 1 has no effect) */
 	u32 sr, dier;
+	int i;
 
 	regmap_read(priv->regmap, TIM_SR, &sr);
 	regmap_read(priv->regmap, TIM_DIER, &dier);
@@ -506,7 +622,7 @@ static irqreturn_t stm32_timer_cnt_isr(int irq, void *ptr)
 	 * Some status bits in SR don't match with the enable bits in DIER. Only take care of
 	 * the possibly enabled bits in DIER (that matches in between SR and DIER).
 	 */
-	dier &= TIM_DIER_UIE;
+	dier &= (TIM_DIER_UIE | TIM_DIER_CC1IE | TIM_DIER_CC2IE | TIM_DIER_CC3IE | TIM_DIER_CC4IE);
 	sr &= dier;
 
 	if (sr & TIM_SR_UIF) {
@@ -519,6 +635,15 @@ static irqreturn_t stm32_timer_cnt_isr(int irq, void *ptr)
 		clr &= ~TIM_SR_UIF;
 	}
 
+	/* Check capture events */
+	for (i = 0 ; i < priv->nchannels; i++) {
+		if (sr & TIM_SR_CC_IF(i)) {
+			counter_push_event(counter, COUNTER_EVENT_CAPTURE, i);
+			clr &= ~TIM_SR_CC_IF(i);
+			dev_dbg(counter->parent, "COUNTER_EVENT_CAPTURE, %d\n", i);
+		}
+	}
+
 	regmap_write(priv->regmap, TIM_SR, clr);
 
 	return IRQ_HANDLED;
@@ -633,8 +758,11 @@ static int stm32_timer_cnt_probe(struct platform_device *pdev)
 		}
 	} else {
 		for (i = 0; i < priv->nr_irqs; i++) {
-			/* Only take care of update IRQ for overflow events */
-			if (i != STM32_TIMERS_IRQ_UP)
+			/*
+			 * Only take care of update IRQ for overflow events, and cc for
+			 * capture events.
+			 */
+			if (i != STM32_TIMERS_IRQ_UP && i != STM32_TIMERS_IRQ_CC)
 				continue;
 
 			ret = devm_request_irq(&pdev->dev, ddata->irq[i], stm32_timer_cnt_isr,
diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h
index ca35af30745f..9eb17481b07f 100644
--- a/include/linux/mfd/stm32-timers.h
+++ b/include/linux/mfd/stm32-timers.h
@@ -41,6 +41,11 @@
 #define TIM_SMCR_SMS	(BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */
 #define TIM_SMCR_TS	(BIT(4) | BIT(5) | BIT(6)) /* Trigger selection */
 #define TIM_DIER_UIE	BIT(0)	/* Update interrupt	   */
+#define TIM_DIER_CC1IE	BIT(1)  /* CC1 Interrupt Enable    */
+#define TIM_DIER_CC2IE	BIT(2)  /* CC2 Interrupt Enable    */
+#define TIM_DIER_CC3IE	BIT(3)  /* CC3 Interrupt Enable    */
+#define TIM_DIER_CC4IE	BIT(4)  /* CC4 Interrupt Enable    */
+#define TIM_DIER_CC_IE(x)	BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt enable */
 #define TIM_DIER_UDE	BIT(8)  /* Update DMA request Enable */
 #define TIM_DIER_CC1DE	BIT(9)  /* CC1 DMA request Enable  */
 #define TIM_DIER_CC2DE	BIT(10) /* CC2 DMA request Enable  */
@@ -49,6 +54,7 @@
 #define TIM_DIER_COMDE	BIT(13) /* COM DMA request Enable  */
 #define TIM_DIER_TDE	BIT(14) /* Trigger DMA request Enable */
 #define TIM_SR_UIF	BIT(0)	/* Update interrupt flag   */
+#define TIM_SR_CC_IF(x)	BIT((x) + 1) /* CC1, CC2, CC3, CC4 interrupt flag */
 #define TIM_EGR_UG	BIT(0)	/* Update Generation       */
 #define TIM_CCMR_PE	BIT(3)	/* Channel Preload Enable  */
 #define TIM_CCMR_M1	(BIT(6) | BIT(5))  /* Channel PWM Mode 1 */
@@ -60,16 +66,23 @@
 #define TIM_CCMR_CC1S_TI2	BIT(1)	/* IC1/IC3 selects TI2/TI4 */
 #define TIM_CCMR_CC2S_TI2	BIT(8)	/* IC2/IC4 selects TI2/TI4 */
 #define TIM_CCMR_CC2S_TI1	BIT(9)	/* IC2/IC4 selects TI1/TI3 */
+#define TIM_CCMR_CC3S		(BIT(0) | BIT(1)) /* Capture/compare 3 sel */
+#define TIM_CCMR_CC4S		(BIT(8) | BIT(9)) /* Capture/compare 4 sel */
+#define TIM_CCMR_CC3S_TI3	BIT(0)	/* IC3 selects TI3 */
+#define TIM_CCMR_CC4S_TI4	BIT(8)	/* IC4 selects TI4 */
 #define TIM_CCER_CC1E	BIT(0)	/* Capt/Comp 1  out Ena    */
 #define TIM_CCER_CC1P	BIT(1)	/* Capt/Comp 1  Polarity   */
 #define TIM_CCER_CC1NE	BIT(2)	/* Capt/Comp 1N out Ena    */
 #define TIM_CCER_CC1NP	BIT(3)	/* Capt/Comp 1N Polarity   */
 #define TIM_CCER_CC2E	BIT(4)	/* Capt/Comp 2  out Ena    */
 #define TIM_CCER_CC2P	BIT(5)	/* Capt/Comp 2  Polarity   */
+#define TIM_CCER_CC2NP	BIT(7)	/* Capt/Comp 2N Polarity   */
 #define TIM_CCER_CC3E	BIT(8)	/* Capt/Comp 3  out Ena    */
 #define TIM_CCER_CC3P	BIT(9)	/* Capt/Comp 3  Polarity   */
+#define TIM_CCER_CC3NP	BIT(11)	/* Capt/Comp 3N Polarity   */
 #define TIM_CCER_CC4E	BIT(12)	/* Capt/Comp 4  out Ena    */
 #define TIM_CCER_CC4P	BIT(13)	/* Capt/Comp 4  Polarity   */
+#define TIM_CCER_CC4NP	BIT(15)	/* Capt/Comp 4N Polarity   */
 #define TIM_CCER_CCXE	(BIT(0) | BIT(4) | BIT(8) | BIT(12))
 #define TIM_BDTR_BKE(x)	BIT(12 + (x) * 12) /* Break input enable */
 #define TIM_BDTR_BKP(x)	BIT(13 + (x) * 12) /* Break input polarity */
-- 
cgit v1.2.3


From 8db2509faa331865903a81a92f15c449e821b1d7 Mon Sep 17 00:00:00 2001
From: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Date: Fri, 29 Mar 2024 17:26:27 +0100
Subject: rhashtable: Improve grammar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Change "a" to "an" according to the usual rules, fix an "if" that
was mistyped as "in", improve grammar in "considerable slow" ->
"considerably slower".

Signed-off-by: Jonathan Neuschäfer <j.neuschaefer@gmx.net>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240329-misc-rhashtable-v1-1-5862383ff798@gmx.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/rhashtable.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 5b5357c0bd8c..8463a128e2f4 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -633,7 +633,7 @@ restart:
  * @params:	hash table parameters
  *
  * Computes the hash value for the key and traverses the bucket chain looking
- * for a entry with an identical key. The first matching entry is returned.
+ * for an entry with an identical key. The first matching entry is returned.
  *
  * This must only be called under the RCU read lock.
  *
@@ -655,7 +655,7 @@ static inline void *rhashtable_lookup(
  * @params:	hash table parameters
  *
  * Computes the hash value for the key and traverses the bucket chain looking
- * for a entry with an identical key. The first matching entry is returned.
+ * for an entry with an identical key. The first matching entry is returned.
  *
  * Only use this function when you have other mechanisms guaranteeing
  * that the object won't go away after the RCU read lock is released.
@@ -682,7 +682,7 @@ static inline void *rhashtable_lookup_fast(
  * @params:	hash table parameters
  *
  * Computes the hash value for the key and traverses the bucket chain looking
- * for a entry with an identical key.  All matching entries are returned
+ * for an entry with an identical key.  All matching entries are returned
  * in a list.
  *
  * This must only be called under the RCU read lock.
@@ -699,7 +699,7 @@ static inline struct rhlist_head *rhltable_lookup(
 }
 
 /* Internal function, please use rhashtable_insert_fast() instead. This
- * function returns the existing element already in hashes in there is a clash,
+ * function returns the existing element already in hashes if there is a clash,
  * otherwise it returns an error via ERR_PTR().
  */
 static inline void *__rhashtable_insert_fast(
@@ -1130,7 +1130,7 @@ static inline int rhashtable_remove_fast(
  *
  * Since the hash chain is single linked, the removal operation needs to
  * walk the bucket chain upon removal. The removal operation is thus
- * considerable slow if the hash table is not correctly sized.
+ * considerably slower if the hash table is not correctly sized.
  *
  * Will automatically shrink the table if permitted when residency drops
  * below 30%
-- 
cgit v1.2.3


From 4a96a4e807c390a9d91b450ebe04eeb2e0ecc076 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 29 Mar 2024 17:55:06 +0100
Subject: page_pool: check for PP direct cache locality later

Since we have pool->p.napi (Jakub) and pool->cpuid (Lorenzo) to check
whether it's safe to use direct recycling, we can use both globally for
each page instead of relying solely on @allow_direct argument.
Let's assume that @allow_direct means "I'm sure it's local, don't waste
time rechecking this" and when it's false, try the mentioned params to
still recycle the page directly. If neither is true, we'll lose some
CPU cycles, but then it surely won't be hotpath. On the other hand,
paths where it's possible to use direct cache, but not possible to
safely set @allow_direct, will benefit from this move.
The whole propagation of @napi_safe through a dozen of skb freeing
functions can now go away, which saves us some stack space.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://lore.kernel.org/r/20240329165507.3240110-2-aleksander.lobakin@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 12 ++++-----
 net/core/page_pool.c   | 31 +++++++++++++++++++---
 net/core/skbuff.c      | 70 ++++++++++++++++----------------------------------
 net/ipv4/esp4.c        |  2 +-
 net/ipv6/esp6.c        |  2 +-
 5 files changed, 58 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b7f1ecdaec38..03ea36a82cdd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3510,25 +3510,25 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 		    unsigned int headroom);
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog);
-bool napi_pp_put_page(struct page *page, bool napi_safe);
+bool napi_pp_put_page(struct page *page);
 
 static inline void
-skb_page_unref(const struct sk_buff *skb, struct page *page, bool napi_safe)
+skb_page_unref(const struct sk_buff *skb, struct page *page)
 {
 #ifdef CONFIG_PAGE_POOL
-	if (skb->pp_recycle && napi_pp_put_page(page, napi_safe))
+	if (skb->pp_recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
 }
 
 static inline void
-napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
+napi_frag_unref(skb_frag_t *frag, bool recycle)
 {
 	struct page *page = skb_frag_page(frag);
 
 #ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_put_page(page, napi_safe))
+	if (recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
@@ -3544,7 +3544,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	napi_frag_unref(frag, recycle, false);
+	napi_frag_unref(frag, recycle);
 }
 
 /**
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dd364d738c00..9d56257e444b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -690,8 +690,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);
 
-		if (allow_direct && in_softirq() &&
-		    page_pool_recycle_in_cache(page, pool))
+		if (allow_direct && page_pool_recycle_in_cache(page, pool))
 			return NULL;
 
 		/* Page found as candidate for recycling */
@@ -716,9 +715,35 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	return NULL;
 }
 
+static bool page_pool_napi_local(const struct page_pool *pool)
+{
+	const struct napi_struct *napi;
+	u32 cpuid;
+
+	if (unlikely(!in_softirq()))
+		return false;
+
+	/* Allow direct recycle if we have reasons to believe that we are
+	 * in the same context as the consumer would run, so there's
+	 * no possible race.
+	 * __page_pool_put_page() makes sure we're not in hardirq context
+	 * and interrupts are enabled prior to accessing the cache.
+	 */
+	cpuid = smp_processor_id();
+	if (READ_ONCE(pool->cpuid) == cpuid)
+		return true;
+
+	napi = READ_ONCE(pool->p.napi);
+
+	return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
 void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
 				unsigned int dma_sync_size, bool allow_direct)
 {
+	if (!allow_direct)
+		allow_direct = page_pool_napi_local(pool);
+
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
 		/* Cache full, fallback to free pages */
@@ -969,7 +994,7 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
 static void page_pool_disable_direct_recycling(struct page_pool *pool)
 {
 	/* Disable direct recycling based on pool->cpuid.
-	 * Paired with READ_ONCE() in napi_pp_put_page().
+	 * Paired with READ_ONCE() in page_pool_napi_local().
 	 */
 	WRITE_ONCE(pool->cpuid, -1);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a1be84be5d35..2a5ce6667bbb 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1004,11 +1004,8 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 EXPORT_SYMBOL(skb_cow_data_for_xdp);
 
 #if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+bool napi_pp_put_page(struct page *page)
 {
-	bool allow_direct = false;
-	struct page_pool *pp;
-
 	page = compound_head(page);
 
 	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
@@ -1021,39 +1018,18 @@ bool napi_pp_put_page(struct page *page, bool napi_safe)
 	if (unlikely(!is_pp_page(page)))
 		return false;
 
-	pp = page->pp;
-
-	/* Allow direct recycle if we have reasons to believe that we are
-	 * in the same context as the consumer would run, so there's
-	 * no possible race.
-	 * __page_pool_put_page() makes sure we're not in hardirq context
-	 * and interrupts are enabled prior to accessing the cache.
-	 */
-	if (napi_safe || in_softirq()) {
-		const struct napi_struct *napi = READ_ONCE(pp->p.napi);
-		unsigned int cpuid = smp_processor_id();
-
-		allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
-		allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
-	}
-
-	/* Driver set this to memory recycling info. Reset it on recycle.
-	 * This will *not* work for NIC using a split-page memory model.
-	 * The page will be returned to the pool here regardless of the
-	 * 'flipped' fragment being in use or not.
-	 */
-	page_pool_put_full_page(pp, page, allow_direct);
+	page_pool_put_full_page(page->pp, page, false);
 
 	return true;
 }
 EXPORT_SYMBOL(napi_pp_put_page);
 #endif
 
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 {
 	if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 		return false;
-	return napi_pp_put_page(virt_to_page(data), napi_safe);
+	return napi_pp_put_page(virt_to_page(data));
 }
 
 /**
@@ -1095,12 +1071,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
 		kfree(head);
 }
 
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
 
 	if (skb->head_frag) {
-		if (skb_pp_recycle(skb, head, napi_safe))
+		if (skb_pp_recycle(skb, head))
 			return;
 		skb_free_frag(head);
 	} else {
@@ -1108,8 +1084,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
 	}
 }
 
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
-			     bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	int i;
@@ -1126,13 +1101,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
 free_head:
 	if (shinfo->frag_list)
 		kfree_skb_list_reason(shinfo->frag_list, reason);
 
-	skb_free_head(skb, napi_safe);
+	skb_free_head(skb);
 exit:
 	/* When we clone an SKB we copy the reycling bit. The pp_recycle
 	 * bit is only set on the head though, so in order to avoid races
@@ -1193,12 +1168,11 @@ void skb_release_head_state(struct sk_buff *skb)
 }
 
 /* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
-			    bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
 {
 	skb_release_head_state(skb);
 	if (likely(skb->head))
-		skb_release_data(skb, reason, napi_safe);
+		skb_release_data(skb, reason);
 }
 
 /**
@@ -1212,7 +1186,7 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
 
 void __kfree_skb(struct sk_buff *skb)
 {
-	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+	skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
 	kfree_skbmem(skb);
 }
 EXPORT_SYMBOL(__kfree_skb);
@@ -1269,7 +1243,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
 		return;
 	}
 
-	skb_release_all(skb, reason, false);
+	skb_release_all(skb, reason);
 	sa->skb_array[sa->skb_count++] = skb;
 
 	if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1443,7 +1417,7 @@ EXPORT_SYMBOL(consume_skb);
 void __consume_stateless_skb(struct sk_buff *skb)
 {
 	trace_consume_skb(skb, __builtin_return_address(0));
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 	kfree_skbmem(skb);
 }
 
@@ -1470,7 +1444,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
 
 void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
 {
-	skb_release_all(skb, reason, true);
+	skb_release_all(skb, reason);
 	napi_skb_cache_put(skb);
 }
 
@@ -1508,7 +1482,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}
 
-	skb_release_all(skb, SKB_CONSUMED, !!budget);
+	skb_release_all(skb, SKB_CONSUMED);
 	napi_skb_cache_put(skb);
 }
 EXPORT_SYMBOL(napi_consume_skb);
@@ -1639,7 +1613,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
  */
 struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 {
-	skb_release_all(dst, SKB_CONSUMED, false);
+	skb_release_all(dst, SKB_CONSUMED);
 	return __skb_clone(dst, src);
 }
 EXPORT_SYMBOL_GPL(skb_morph);
@@ -2271,9 +2245,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
 
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 	off = (data + nhead) - skb->head;
 
@@ -6574,12 +6548,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 			skb_frag_ref(skb, i);
 		if (skb_has_frag_list(skb))
 			skb_clone_fraglist(skb);
-		skb_release_data(skb, SKB_CONSUMED, false);
+		skb_release_data(skb, SKB_CONSUMED);
 	} else {
 		/* we can reuse existing recount- all we did was
 		 * relocate values
 		 */
-		skb_free_head(skb, false);
+		skb_free_head(skb);
 	}
 
 	skb->head = data;
@@ -6714,7 +6688,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
-	skb_release_data(skb, SKB_CONSUMED, false);
+	skb_release_data(skb, SKB_CONSUMED);
 
 	skb->head = data;
 	skb->head_frag = 0;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d33d12421814..3d647c9a7a21 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg), false);
+			skb_page_unref(skb, sg_page(sg));
 }
 
 #ifdef CONFIG_INET_ESPINTCP
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 7371886d4f9f..fe8d53f5a5ee 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg), false);
+			skb_page_unref(skb, sg_page(sg));
 }
 
 #ifdef CONFIG_INET6_ESPINTCP
-- 
cgit v1.2.3


From 9a79c65f00e2b036e17af3a3a607d7d732b7affb Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Sun, 31 Mar 2024 17:05:21 +0800
Subject: tcp/dccp: complete lockless accesses to sk->sk_max_ack_backlog

Since commit 099ecf59f05b ("net: annotate lockless accesses to
sk->sk_max_ack_backlog") decided to handle the sk_max_ack_backlog
locklessly, there is one more function mostly called in TCP/DCCP
cases. So this patch completes it:)

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240331090521.71965-1-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/inet_connection_sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index d96c871b0d58..20e7b0c0b3d1 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -281,7 +281,7 @@ static inline int inet_csk_reqsk_queue_len(const struct sock *sk)
 
 static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
 {
-	return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
+	return inet_csk_reqsk_queue_len(sk) >= READ_ONCE(sk->sk_max_ack_backlog);
 }
 
 bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
-- 
cgit v1.2.3


From e57ba7e3d7bccd12b6ac9298a9fe0fd8d92ea31d Mon Sep 17 00:00:00 2001
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 1 Apr 2024 11:10:04 +0800
Subject: uapi: team: use header file generated from YAML spec

generated with:

 $ ./tools/net/ynl/ynl-gen-c.py --mode uapi \
 > --spec Documentation/netlink/specs/team.yaml \
 > --header -o include/uapi/linux/if_team.h

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://lore.kernel.org/r/20240401031004.1159713-5-liuhangbin@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/if_team.h | 116 ++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_team.h b/include/uapi/linux/if_team.h
index 13c61fecb78b..a5c06243a435 100644
--- a/include/uapi/linux/if_team.h
+++ b/include/uapi/linux/if_team.h
@@ -1,108 +1,78 @@
-/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
-/*
- * include/linux/if_team.h - Network team device driver header
- * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*	Documentation/netlink/specs/team.yaml */
+/* YNL-GEN uapi header */
 
-#ifndef _UAPI_LINUX_IF_TEAM_H_
-#define _UAPI_LINUX_IF_TEAM_H_
+#ifndef _UAPI_LINUX_IF_TEAM_H
+#define _UAPI_LINUX_IF_TEAM_H
 
+#define TEAM_GENL_NAME		"team"
+#define TEAM_GENL_VERSION	1
 
-#define TEAM_STRING_MAX_LEN 32
-
-/**********************************
- * NETLINK_GENERIC netlink family.
- **********************************/
-
-enum {
-	TEAM_CMD_NOOP,
-	TEAM_CMD_OPTIONS_SET,
-	TEAM_CMD_OPTIONS_GET,
-	TEAM_CMD_PORT_LIST_GET,
-
-	__TEAM_CMD_MAX,
-	TEAM_CMD_MAX = (__TEAM_CMD_MAX - 1),
-};
+#define TEAM_STRING_MAX_LEN			32
+#define TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME	"change_event"
 
 enum {
 	TEAM_ATTR_UNSPEC,
-	TEAM_ATTR_TEAM_IFINDEX,		/* u32 */
-	TEAM_ATTR_LIST_OPTION,		/* nest */
-	TEAM_ATTR_LIST_PORT,		/* nest */
+	TEAM_ATTR_TEAM_IFINDEX,
+	TEAM_ATTR_LIST_OPTION,
+	TEAM_ATTR_LIST_PORT,
 
 	__TEAM_ATTR_MAX,
-	TEAM_ATTR_MAX = __TEAM_ATTR_MAX - 1,
+	TEAM_ATTR_MAX = (__TEAM_ATTR_MAX - 1)
 };
 
-/* Nested layout of get/set msg:
- *
- *	[TEAM_ATTR_LIST_OPTION]
- *		[TEAM_ATTR_ITEM_OPTION]
- *			[TEAM_ATTR_OPTION_*], ...
- *		[TEAM_ATTR_ITEM_OPTION]
- *			[TEAM_ATTR_OPTION_*], ...
- *		...
- *	[TEAM_ATTR_LIST_PORT]
- *		[TEAM_ATTR_ITEM_PORT]
- *			[TEAM_ATTR_PORT_*], ...
- *		[TEAM_ATTR_ITEM_PORT]
- *			[TEAM_ATTR_PORT_*], ...
- *		...
- */
-
 enum {
 	TEAM_ATTR_ITEM_OPTION_UNSPEC,
-	TEAM_ATTR_ITEM_OPTION,		/* nest */
+	TEAM_ATTR_ITEM_OPTION,
 
 	__TEAM_ATTR_ITEM_OPTION_MAX,
-	TEAM_ATTR_ITEM_OPTION_MAX = __TEAM_ATTR_ITEM_OPTION_MAX - 1,
+	TEAM_ATTR_ITEM_OPTION_MAX = (__TEAM_ATTR_ITEM_OPTION_MAX - 1)
 };
 
 enum {
 	TEAM_ATTR_OPTION_UNSPEC,
-	TEAM_ATTR_OPTION_NAME,		/* string */
-	TEAM_ATTR_OPTION_CHANGED,	/* flag */
-	TEAM_ATTR_OPTION_TYPE,		/* u8 */
-	TEAM_ATTR_OPTION_DATA,		/* dynamic */
-	TEAM_ATTR_OPTION_REMOVED,	/* flag */
-	TEAM_ATTR_OPTION_PORT_IFINDEX,	/* u32 */ /* for per-port options */
-	TEAM_ATTR_OPTION_ARRAY_INDEX,	/* u32 */ /* for array options */
+	TEAM_ATTR_OPTION_NAME,
+	TEAM_ATTR_OPTION_CHANGED,
+	TEAM_ATTR_OPTION_TYPE,
+	TEAM_ATTR_OPTION_DATA,
+	TEAM_ATTR_OPTION_REMOVED,
+	TEAM_ATTR_OPTION_PORT_IFINDEX,
+	TEAM_ATTR_OPTION_ARRAY_INDEX,
 
 	__TEAM_ATTR_OPTION_MAX,
-	TEAM_ATTR_OPTION_MAX = __TEAM_ATTR_OPTION_MAX - 1,
+	TEAM_ATTR_OPTION_MAX = (__TEAM_ATTR_OPTION_MAX - 1)
 };
 
 enum {
 	TEAM_ATTR_ITEM_PORT_UNSPEC,
-	TEAM_ATTR_ITEM_PORT,		/* nest */
+	TEAM_ATTR_ITEM_PORT,
 
 	__TEAM_ATTR_ITEM_PORT_MAX,
-	TEAM_ATTR_ITEM_PORT_MAX = __TEAM_ATTR_ITEM_PORT_MAX - 1,
+	TEAM_ATTR_ITEM_PORT_MAX = (__TEAM_ATTR_ITEM_PORT_MAX - 1)
 };
 
 enum {
 	TEAM_ATTR_PORT_UNSPEC,
-	TEAM_ATTR_PORT_IFINDEX,		/* u32 */
-	TEAM_ATTR_PORT_CHANGED,		/* flag */
-	TEAM_ATTR_PORT_LINKUP,		/* flag */
-	TEAM_ATTR_PORT_SPEED,		/* u32 */
-	TEAM_ATTR_PORT_DUPLEX,		/* u8 */
-	TEAM_ATTR_PORT_REMOVED,		/* flag */
+	TEAM_ATTR_PORT_IFINDEX,
+	TEAM_ATTR_PORT_CHANGED,
+	TEAM_ATTR_PORT_LINKUP,
+	TEAM_ATTR_PORT_SPEED,
+	TEAM_ATTR_PORT_DUPLEX,
+	TEAM_ATTR_PORT_REMOVED,
 
 	__TEAM_ATTR_PORT_MAX,
-	TEAM_ATTR_PORT_MAX = __TEAM_ATTR_PORT_MAX - 1,
+	TEAM_ATTR_PORT_MAX = (__TEAM_ATTR_PORT_MAX - 1)
 };
 
-/*
- * NETLINK_GENERIC related info
- */
-#define TEAM_GENL_NAME "team"
-#define TEAM_GENL_VERSION 0x1
-#define TEAM_GENL_CHANGE_EVENT_MC_GRP_NAME "change_event"
+enum {
+	TEAM_CMD_NOOP,
+	TEAM_CMD_OPTIONS_SET,
+	TEAM_CMD_OPTIONS_GET,
+	TEAM_CMD_PORT_LIST_GET,
+
+	__TEAM_CMD_MAX,
+	TEAM_CMD_MAX = (__TEAM_CMD_MAX - 1)
+};
 
-#endif /* _UAPI_LINUX_IF_TEAM_H_ */
+#endif /* _UAPI_LINUX_IF_TEAM_H */
-- 
cgit v1.2.3


From 0439fcff304acdedfc493dbf900dce86922fd31f Mon Sep 17 00:00:00 2001
From: Sumit Garg <sumit.garg@linaro.org>
Date: Mon, 25 Mar 2024 16:11:01 +0100
Subject: tee: Refactor TEE subsystem header files

Since commit 25559c22cef8 ("tee: add kernel internal client interface"),
it has been a common include/linux/tee_drv.h header file which is shared
to hold TEE subsystem internal bits along with the APIs exposed to the
TEE client drivers. However, this practice is prone to TEE subsystem
internal APIs abuse and especially so with the new TEE implementation
drivers being added to reuse existing functionality.

In order to address this split TEE subsystem internal bits as a separate
header file: include/linux/tee_core.h which should be the one used by
TEE implementation drivers. With that include/linux/tee_drv.h lists only
APIs exposed by TEE subsystem to the TEE client drivers.

Signed-off-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Balint Dobszay <balint.dobszay@arm.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 MAINTAINERS                         |   1 +
 drivers/tee/amdtee/amdtee_private.h |   2 +-
 drivers/tee/amdtee/call.c           |   2 +-
 drivers/tee/amdtee/core.c           |   3 +-
 drivers/tee/amdtee/shm_pool.c       |   2 +-
 drivers/tee/optee/call.c            |   2 +-
 drivers/tee/optee/core.c            |   2 +-
 drivers/tee/optee/device.c          |   2 +-
 drivers/tee/optee/ffa_abi.c         |   2 +-
 drivers/tee/optee/notif.c           |   2 +-
 drivers/tee/optee/optee_private.h   |   2 +-
 drivers/tee/optee/rpc.c             |   2 +-
 drivers/tee/optee/smc_abi.c         |   2 +-
 drivers/tee/tee_core.c              |   2 +-
 drivers/tee/tee_private.h           |  35 -----
 drivers/tee/tee_shm.c               |   2 +-
 drivers/tee/tee_shm_pool.c          |   2 +-
 include/linux/tee_core.h            | 296 ++++++++++++++++++++++++++++++++++++
 include/linux/tee_drv.h             | 285 ++++------------------------------
 19 files changed, 343 insertions(+), 305 deletions(-)
 create mode 100644 include/linux/tee_core.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index aa3b947fb080..fd221a7d4d1c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21684,6 +21684,7 @@ F:	Documentation/driver-api/tee.rst
 F:	Documentation/tee/
 F:	Documentation/userspace-api/tee.rst
 F:	drivers/tee/
+F:	include/linux/tee_core.h
 F:	include/linux/tee_drv.h
 F:	include/uapi/linux/tee.h
 
diff --git a/drivers/tee/amdtee/amdtee_private.h b/drivers/tee/amdtee/amdtee_private.h
index 6d0f7062bb87..d87050033894 100644
--- a/drivers/tee/amdtee/amdtee_private.h
+++ b/drivers/tee/amdtee/amdtee_private.h
@@ -9,7 +9,7 @@
 
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/kref.h>
 #include <linux/types.h>
 #include "amdtee_if.h"
diff --git a/drivers/tee/amdtee/call.c b/drivers/tee/amdtee/call.c
index e9b63dcb3194..4c21b02be4af 100644
--- a/drivers/tee/amdtee/call.c
+++ b/drivers/tee/amdtee/call.c
@@ -5,7 +5,7 @@
 
 #include <linux/device.h>
 #include <linux/tee.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/psp-tee.h>
 #include <linux/slab.h>
 #include <linux/psp.h>
diff --git a/drivers/tee/amdtee/core.c b/drivers/tee/amdtee/core.c
index 3c15f6a9e91c..e487231d25dc 100644
--- a/drivers/tee/amdtee/core.c
+++ b/drivers/tee/amdtee/core.c
@@ -9,13 +9,12 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/device.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 #include <linux/firmware.h>
 #include "amdtee_private.h"
-#include "../tee_private.h"
 #include <linux/psp-tee.h>
 
 static struct amdtee_driver_data *drv_data;
diff --git a/drivers/tee/amdtee/shm_pool.c b/drivers/tee/amdtee/shm_pool.c
index f0303126f199..6346e0bc8a64 100644
--- a/drivers/tee/amdtee/shm_pool.c
+++ b/drivers/tee/amdtee/shm_pool.c
@@ -4,7 +4,7 @@
  */
 
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/psp.h>
 #include "amdtee_private.h"
 
diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c
index a91e50be11be..16eb953e14bb 100644
--- a/drivers/tee/optee/call.c
+++ b/drivers/tee/optee/call.c
@@ -7,7 +7,7 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/types.h>
 #include "optee_private.h"
 
diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c
index 3aed554bc8d8..f762e3a25119 100644
--- a/drivers/tee/optee/core.c
+++ b/drivers/tee/optee/core.c
@@ -13,7 +13,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/types.h>
 #include "optee_private.h"
 
diff --git a/drivers/tee/optee/device.c b/drivers/tee/optee/device.c
index 1892e49a8e6a..d296c70ddfdc 100644
--- a/drivers/tee/optee/device.c
+++ b/drivers/tee/optee/device.c
@@ -7,7 +7,7 @@
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/uuid.h>
 #include "optee_private.h"
 
diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c
index ecb5eb079408..cee8ccb84cb8 100644
--- a/drivers/tee/optee/ffa_abi.c
+++ b/drivers/tee/optee/ffa_abi.c
@@ -11,7 +11,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/types.h>
 #include "optee_private.h"
 #include "optee_ffa.h"
diff --git a/drivers/tee/optee/notif.c b/drivers/tee/optee/notif.c
index 05212842b0a5..0d7878e770cd 100644
--- a/drivers/tee/optee/notif.c
+++ b/drivers/tee/optee/notif.c
@@ -9,7 +9,7 @@
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include "optee_private.h"
 
 struct notif_entry {
diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h
index 7a5243c78b55..a0698ac18993 100644
--- a/drivers/tee/optee/optee_private.h
+++ b/drivers/tee/optee/optee_private.h
@@ -9,7 +9,7 @@
 #include <linux/arm-smccc.h>
 #include <linux/rhashtable.h>
 #include <linux/semaphore.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/types.h>
 #include "optee_msg.h"
 
diff --git a/drivers/tee/optee/rpc.c b/drivers/tee/optee/rpc.c
index e69bc6380683..f086812f1179 100644
--- a/drivers/tee/optee/rpc.c
+++ b/drivers/tee/optee/rpc.c
@@ -8,7 +8,7 @@
 #include <linux/delay.h>
 #include <linux/i2c.h>
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include "optee_private.h"
 #include "optee_rpc_cmd.h"
 
diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c
index a37f87087e5c..9c296b887dc1 100644
--- a/drivers/tee/optee/smc_abi.c
+++ b/drivers/tee/optee/smc_abi.c
@@ -23,7 +23,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 #include "optee_private.h"
diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c
index e59c20d74b36..82ad095d2b1c 100644
--- a/drivers/tee/tee_core.c
+++ b/drivers/tee/tee_core.c
@@ -11,7 +11,7 @@
 #include <linux/idr.h>
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/uaccess.h>
 #include <crypto/hash.h>
 #include <crypto/sha1.h>
diff --git a/drivers/tee/tee_private.h b/drivers/tee/tee_private.h
index 754e11dcb240..9bc50605227c 100644
--- a/drivers/tee/tee_private.h
+++ b/drivers/tee/tee_private.h
@@ -12,41 +12,6 @@
 #include <linux/mutex.h>
 #include <linux/types.h>
 
-#define TEE_DEVICE_FLAG_REGISTERED	0x1
-#define TEE_MAX_DEV_NAME_LEN		32
-
-/**
- * struct tee_device - TEE Device representation
- * @name:	name of device
- * @desc:	description of device
- * @id:		unique id of device
- * @flags:	represented by TEE_DEVICE_FLAG_REGISTERED above
- * @dev:	embedded basic device structure
- * @cdev:	embedded cdev
- * @num_users:	number of active users of this device
- * @c_no_user:	completion used when unregistering the device
- * @mutex:	mutex protecting @num_users and @idr
- * @idr:	register of user space shared memory objects allocated or
- *		registered on this device
- * @pool:	shared memory pool
- */
-struct tee_device {
-	char name[TEE_MAX_DEV_NAME_LEN];
-	const struct tee_desc *desc;
-	int id;
-	unsigned int flags;
-
-	struct device dev;
-	struct cdev cdev;
-
-	size_t num_users;
-	struct completion c_no_users;
-	struct mutex mutex;	/* protects num_users and idr */
-
-	struct idr idr;
-	struct tee_shm_pool *pool;
-};
-
 int tee_shm_get_fd(struct tee_shm *shm);
 
 bool tee_device_get(struct tee_device *teedev);
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 731d9028b67f..96a45c817427 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -8,7 +8,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include <linux/uaccess.h>
 #include <linux/uio.h>
 #include <linux/highmem.h>
diff --git a/drivers/tee/tee_shm_pool.c b/drivers/tee/tee_shm_pool.c
index 058bfbac657a..80004b55628d 100644
--- a/drivers/tee/tee_shm_pool.c
+++ b/drivers/tee/tee_shm_pool.c
@@ -6,7 +6,7 @@
 #include <linux/dma-buf.h>
 #include <linux/genalloc.h>
 #include <linux/slab.h>
-#include <linux/tee_drv.h>
+#include <linux/tee_core.h>
 #include "tee_private.h"
 
 static int pool_op_gen_alloc(struct tee_shm_pool *pool, struct tee_shm *shm,
diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h
new file mode 100644
index 000000000000..d9b3ba8e8fa9
--- /dev/null
+++ b/include/linux/tee_core.h
@@ -0,0 +1,296 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024 Linaro Limited
+ */
+
+#ifndef __TEE_CORE_H
+#define __TEE_CORE_H
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/tee.h>
+#include <linux/tee_drv.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+
+/*
+ * The file describes the API provided by the generic TEE driver to the
+ * specific TEE driver.
+ */
+
+#define TEE_SHM_DYNAMIC		BIT(0)  /* Dynamic shared memory registered */
+					/* in secure world */
+#define TEE_SHM_USER_MAPPED	BIT(1)  /* Memory mapped in user space */
+#define TEE_SHM_POOL		BIT(2)  /* Memory allocated from pool */
+#define TEE_SHM_PRIV		BIT(3)  /* Memory private to TEE driver */
+
+#define TEE_DEVICE_FLAG_REGISTERED	0x1
+#define TEE_MAX_DEV_NAME_LEN		32
+
+/**
+ * struct tee_device - TEE Device representation
+ * @name:	name of device
+ * @desc:	description of device
+ * @id:		unique id of device
+ * @flags:	represented by TEE_DEVICE_FLAG_REGISTERED above
+ * @dev:	embedded basic device structure
+ * @cdev:	embedded cdev
+ * @num_users:	number of active users of this device
+ * @c_no_user:	completion used when unregistering the device
+ * @mutex:	mutex protecting @num_users and @idr
+ * @idr:	register of user space shared memory objects allocated or
+ *		registered on this device
+ * @pool:	shared memory pool
+ */
+struct tee_device {
+	char name[TEE_MAX_DEV_NAME_LEN];
+	const struct tee_desc *desc;
+	int id;
+	unsigned int flags;
+
+	struct device dev;
+	struct cdev cdev;
+
+	size_t num_users;
+	struct completion c_no_users;
+	struct mutex mutex;	/* protects num_users and idr */
+
+	struct idr idr;
+	struct tee_shm_pool *pool;
+};
+
+/**
+ * struct tee_driver_ops - driver operations vtable
+ * @get_version:	returns version of driver
+ * @open:		called when the device file is opened
+ * @release:		release this open file
+ * @open_session:	open a new session
+ * @close_session:	close a session
+ * @system_session:	declare session as a system session
+ * @invoke_func:	invoke a trusted function
+ * @cancel_req:		request cancel of an ongoing invoke or open
+ * @supp_recv:		called for supplicant to get a command
+ * @supp_send:		called for supplicant to send a response
+ * @shm_register:	register shared memory buffer in TEE
+ * @shm_unregister:	unregister shared memory buffer in TEE
+ */
+struct tee_driver_ops {
+	void (*get_version)(struct tee_device *teedev,
+			    struct tee_ioctl_version_data *vers);
+	int (*open)(struct tee_context *ctx);
+	void (*release)(struct tee_context *ctx);
+	int (*open_session)(struct tee_context *ctx,
+			    struct tee_ioctl_open_session_arg *arg,
+			    struct tee_param *param);
+	int (*close_session)(struct tee_context *ctx, u32 session);
+	int (*system_session)(struct tee_context *ctx, u32 session);
+	int (*invoke_func)(struct tee_context *ctx,
+			   struct tee_ioctl_invoke_arg *arg,
+			   struct tee_param *param);
+	int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session);
+	int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params,
+			 struct tee_param *param);
+	int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params,
+			 struct tee_param *param);
+	int (*shm_register)(struct tee_context *ctx, struct tee_shm *shm,
+			    struct page **pages, size_t num_pages,
+			    unsigned long start);
+	int (*shm_unregister)(struct tee_context *ctx, struct tee_shm *shm);
+};
+
+/**
+ * struct tee_desc - Describes the TEE driver to the subsystem
+ * @name:	name of driver
+ * @ops:	driver operations vtable
+ * @owner:	module providing the driver
+ * @flags:	Extra properties of driver, defined by TEE_DESC_* below
+ */
+#define TEE_DESC_PRIVILEGED	0x1
+struct tee_desc {
+	const char *name;
+	const struct tee_driver_ops *ops;
+	struct module *owner;
+	u32 flags;
+};
+
+/**
+ * tee_device_alloc() - Allocate a new struct tee_device instance
+ * @teedesc:	Descriptor for this driver
+ * @dev:	Parent device for this device
+ * @pool:	Shared memory pool, NULL if not used
+ * @driver_data: Private driver data for this device
+ *
+ * Allocates a new struct tee_device instance. The device is
+ * removed by tee_device_unregister().
+ *
+ * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure
+ */
+struct tee_device *tee_device_alloc(const struct tee_desc *teedesc,
+				    struct device *dev,
+				    struct tee_shm_pool *pool,
+				    void *driver_data);
+
+/**
+ * tee_device_register() - Registers a TEE device
+ * @teedev:	Device to register
+ *
+ * tee_device_unregister() need to be called to remove the @teedev if
+ * this function fails.
+ *
+ * @returns < 0 on failure
+ */
+int tee_device_register(struct tee_device *teedev);
+
+/**
+ * tee_device_unregister() - Removes a TEE device
+ * @teedev:	Device to unregister
+ *
+ * This function should be called to remove the @teedev even if
+ * tee_device_register() hasn't been called yet. Does nothing if
+ * @teedev is NULL.
+ */
+void tee_device_unregister(struct tee_device *teedev);
+
+/**
+ * tee_session_calc_client_uuid() - Calculates client UUID for session
+ * @uuid:		Resulting UUID
+ * @connection_method:	Connection method for session (TEE_IOCTL_LOGIN_*)
+ * @connectuon_data:	Connection data for opening session
+ *
+ * Based on connection method calculates UUIDv5 based client UUID.
+ *
+ * For group based logins verifies that calling process has specified
+ * credentials.
+ *
+ * @return < 0 on failure
+ */
+int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method,
+				 const u8 connection_data[TEE_IOCTL_UUID_LEN]);
+
+/**
+ * struct tee_shm_pool - shared memory pool
+ * @ops:		operations
+ * @private_data:	private data for the shared memory manager
+ */
+struct tee_shm_pool {
+	const struct tee_shm_pool_ops *ops;
+	void *private_data;
+};
+
+/**
+ * struct tee_shm_pool_ops - shared memory pool operations
+ * @alloc:		called when allocating shared memory
+ * @free:		called when freeing shared memory
+ * @destroy_pool:	called when destroying the pool
+ */
+struct tee_shm_pool_ops {
+	int (*alloc)(struct tee_shm_pool *pool, struct tee_shm *shm,
+		     size_t size, size_t align);
+	void (*free)(struct tee_shm_pool *pool, struct tee_shm *shm);
+	void (*destroy_pool)(struct tee_shm_pool *pool);
+};
+
+/*
+ * tee_shm_pool_alloc_res_mem() - Create a shm manager for reserved memory
+ * @vaddr:	Virtual address of start of pool
+ * @paddr:	Physical address of start of pool
+ * @size:	Size in bytes of the pool
+ *
+ * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
+ */
+struct tee_shm_pool *tee_shm_pool_alloc_res_mem(unsigned long vaddr,
+						phys_addr_t paddr, size_t size,
+						int min_alloc_order);
+
+/**
+ * tee_shm_pool_free() - Free a shared memory pool
+ * @pool:	The shared memory pool to free
+ *
+ * The must be no remaining shared memory allocated from this pool when
+ * this function is called.
+ */
+static inline void tee_shm_pool_free(struct tee_shm_pool *pool)
+{
+	pool->ops->destroy_pool(pool);
+}
+
+/**
+ * tee_get_drvdata() - Return driver_data pointer
+ * @returns the driver_data pointer supplied to tee_register().
+ */
+void *tee_get_drvdata(struct tee_device *teedev);
+
+/**
+ * tee_shm_alloc_priv_buf() - Allocate shared memory for private use by specific
+ *                            TEE driver
+ * @ctx:	The TEE context for shared memory allocation
+ * @size:	Shared memory allocation size
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
+ */
+struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size);
+
+/**
+ * tee_shm_is_dynamic() - Check if shared memory object is of the dynamic kind
+ * @shm:	Shared memory handle
+ * @returns true if object is dynamic shared memory
+ */
+static inline bool tee_shm_is_dynamic(struct tee_shm *shm)
+{
+	return shm && (shm->flags & TEE_SHM_DYNAMIC);
+}
+
+/**
+ * tee_shm_put() - Decrease reference count on a shared memory handle
+ * @shm:	Shared memory handle
+ */
+void tee_shm_put(struct tee_shm *shm);
+
+/**
+ * tee_shm_get_id() - Get id of a shared memory object
+ * @shm:	Shared memory handle
+ * @returns id
+ */
+static inline int tee_shm_get_id(struct tee_shm *shm)
+{
+	return shm->id;
+}
+
+/**
+ * tee_shm_get_from_id() - Find shared memory object and increase reference
+ * count
+ * @ctx:	Context owning the shared memory
+ * @id:		Id of shared memory object
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
+ */
+struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id);
+
+static inline bool tee_param_is_memref(struct tee_param *param)
+{
+	switch (param->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
+	case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT:
+	case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
+	case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * teedev_open() - Open a struct tee_device
+ * @teedev:	Device to open
+ *
+ * @return a pointer to struct tee_context on success or an ERR_PTR on failure.
+ */
+struct tee_context *teedev_open(struct tee_device *teedev);
+
+/**
+ * teedev_close_context() - closes a struct tee_context
+ * @ctx:	The struct tee_context to close
+ */
+void teedev_close_context(struct tee_context *ctx);
+
+#endif /*__TEE_CORE_H*/
diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h
index 71632e3c5f18..786b9ae6cf4d 100644
--- a/include/linux/tee_drv.h
+++ b/include/linux/tee_drv.h
@@ -1,40 +1,28 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2015-2022 Linaro Limited
+ * Copyright (c) 2015-2024 Linaro Limited
  */
 
 #ifndef __TEE_DRV_H
 #define __TEE_DRV_H
 
 #include <linux/device.h>
-#include <linux/idr.h>
 #include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/mod_devicetable.h>
 #include <linux/tee.h>
 #include <linux/types.h>
-#include <linux/uuid.h>
 
 /*
- * The file describes the API provided by the generic TEE driver to the
- * specific TEE driver.
+ * The file describes the API provided by the TEE subsystem to the
+ * TEE client drivers.
  */
 
-#define TEE_SHM_DYNAMIC		BIT(0)  /* Dynamic shared memory registered */
-					/* in secure world */
-#define TEE_SHM_USER_MAPPED	BIT(1)  /* Memory mapped in user space */
-#define TEE_SHM_POOL		BIT(2)  /* Memory allocated from pool */
-#define TEE_SHM_PRIV		BIT(3)  /* Memory private to TEE driver */
-
-struct device;
 struct tee_device;
-struct tee_shm;
-struct tee_shm_pool;
 
 /**
  * struct tee_context - driver specific context on file pointer data
  * @teedev:	pointer to this drivers struct tee_device
- * @list_shm:	List of shared memory object owned by this context
  * @data:	driver specific context data, managed by the driver
  * @refcount:	reference counter for this structure
  * @releasing:  flag that indicates if context is being released right now.
@@ -57,134 +45,6 @@ struct tee_context {
 	bool cap_memref_null;
 };
 
-struct tee_param_memref {
-	size_t shm_offs;
-	size_t size;
-	struct tee_shm *shm;
-};
-
-struct tee_param_value {
-	u64 a;
-	u64 b;
-	u64 c;
-};
-
-struct tee_param {
-	u64 attr;
-	union {
-		struct tee_param_memref memref;
-		struct tee_param_value value;
-	} u;
-};
-
-/**
- * struct tee_driver_ops - driver operations vtable
- * @get_version:	returns version of driver
- * @open:		called when the device file is opened
- * @release:		release this open file
- * @open_session:	open a new session
- * @close_session:	close a session
- * @system_session:	declare session as a system session
- * @invoke_func:	invoke a trusted function
- * @cancel_req:		request cancel of an ongoing invoke or open
- * @supp_recv:		called for supplicant to get a command
- * @supp_send:		called for supplicant to send a response
- * @shm_register:	register shared memory buffer in TEE
- * @shm_unregister:	unregister shared memory buffer in TEE
- */
-struct tee_driver_ops {
-	void (*get_version)(struct tee_device *teedev,
-			    struct tee_ioctl_version_data *vers);
-	int (*open)(struct tee_context *ctx);
-	void (*release)(struct tee_context *ctx);
-	int (*open_session)(struct tee_context *ctx,
-			    struct tee_ioctl_open_session_arg *arg,
-			    struct tee_param *param);
-	int (*close_session)(struct tee_context *ctx, u32 session);
-	int (*system_session)(struct tee_context *ctx, u32 session);
-	int (*invoke_func)(struct tee_context *ctx,
-			   struct tee_ioctl_invoke_arg *arg,
-			   struct tee_param *param);
-	int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session);
-	int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params,
-			 struct tee_param *param);
-	int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params,
-			 struct tee_param *param);
-	int (*shm_register)(struct tee_context *ctx, struct tee_shm *shm,
-			    struct page **pages, size_t num_pages,
-			    unsigned long start);
-	int (*shm_unregister)(struct tee_context *ctx, struct tee_shm *shm);
-};
-
-/**
- * struct tee_desc - Describes the TEE driver to the subsystem
- * @name:	name of driver
- * @ops:	driver operations vtable
- * @owner:	module providing the driver
- * @flags:	Extra properties of driver, defined by TEE_DESC_* below
- */
-#define TEE_DESC_PRIVILEGED	0x1
-struct tee_desc {
-	const char *name;
-	const struct tee_driver_ops *ops;
-	struct module *owner;
-	u32 flags;
-};
-
-/**
- * tee_device_alloc() - Allocate a new struct tee_device instance
- * @teedesc:	Descriptor for this driver
- * @dev:	Parent device for this device
- * @pool:	Shared memory pool, NULL if not used
- * @driver_data: Private driver data for this device
- *
- * Allocates a new struct tee_device instance. The device is
- * removed by tee_device_unregister().
- *
- * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure
- */
-struct tee_device *tee_device_alloc(const struct tee_desc *teedesc,
-				    struct device *dev,
-				    struct tee_shm_pool *pool,
-				    void *driver_data);
-
-/**
- * tee_device_register() - Registers a TEE device
- * @teedev:	Device to register
- *
- * tee_device_unregister() need to be called to remove the @teedev if
- * this function fails.
- *
- * @returns < 0 on failure
- */
-int tee_device_register(struct tee_device *teedev);
-
-/**
- * tee_device_unregister() - Removes a TEE device
- * @teedev:	Device to unregister
- *
- * This function should be called to remove the @teedev even if
- * tee_device_register() hasn't been called yet. Does nothing if
- * @teedev is NULL.
- */
-void tee_device_unregister(struct tee_device *teedev);
-
-/**
- * tee_session_calc_client_uuid() - Calculates client UUID for session
- * @uuid:		Resulting UUID
- * @connection_method:	Connection method for session (TEE_IOCTL_LOGIN_*)
- * @connectuon_data:	Connection data for opening session
- *
- * Based on connection method calculates UUIDv5 based client UUID.
- *
- * For group based logins verifies that calling process has specified
- * credentials.
- *
- * @return < 0 on failure
- */
-int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method,
-				 const u8 connection_data[TEE_IOCTL_UUID_LEN]);
-
 /**
  * struct tee_shm - shared memory object
  * @ctx:	context using the object
@@ -195,15 +55,12 @@ int tee_session_calc_client_uuid(uuid_t *uuid, u32 connection_method,
  * @pages:	locked pages from userspace
  * @num_pages:	number of locked pages
  * @refcount:	reference counter
- * @flags:	defined by TEE_SHM_* in tee_drv.h
+ * @flags:	defined by TEE_SHM_* in tee_core.h
  * @id:		unique id of a shared memory object on this device, shared
  *		with user space
  * @sec_world_id:
  *		secure world assigned id of this shared memory object, not
  *		used by all drivers
- *
- * This pool is only supposed to be accessed directly from the TEE
- * subsystem and from drivers that implements their own shm pool manager.
  */
 struct tee_shm {
 	struct tee_context *ctx;
@@ -219,87 +76,52 @@ struct tee_shm {
 	u64 sec_world_id;
 };
 
-/**
- * struct tee_shm_pool - shared memory pool
- * @ops:		operations
- * @private_data:	private data for the shared memory manager
- */
-struct tee_shm_pool {
-	const struct tee_shm_pool_ops *ops;
-	void *private_data;
+struct tee_param_memref {
+	size_t shm_offs;
+	size_t size;
+	struct tee_shm *shm;
 };
 
-/**
- * struct tee_shm_pool_ops - shared memory pool operations
- * @alloc:		called when allocating shared memory
- * @free:		called when freeing shared memory
- * @destroy_pool:	called when destroying the pool
- */
-struct tee_shm_pool_ops {
-	int (*alloc)(struct tee_shm_pool *pool, struct tee_shm *shm,
-		     size_t size, size_t align);
-	void (*free)(struct tee_shm_pool *pool, struct tee_shm *shm);
-	void (*destroy_pool)(struct tee_shm_pool *pool);
+struct tee_param_value {
+	u64 a;
+	u64 b;
+	u64 c;
 };
 
-/*
- * tee_shm_pool_alloc_res_mem() - Create a shm manager for reserved memory
- * @vaddr:	Virtual address of start of pool
- * @paddr:	Physical address of start of pool
- * @size:	Size in bytes of the pool
- *
- * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure.
- */
-struct tee_shm_pool *tee_shm_pool_alloc_res_mem(unsigned long vaddr,
-						phys_addr_t paddr, size_t size,
-						int min_alloc_order);
+struct tee_param {
+	u64 attr;
+	union {
+		struct tee_param_memref memref;
+		struct tee_param_value value;
+	} u;
+};
 
 /**
- * tee_shm_pool_free() - Free a shared memory pool
- * @pool:	The shared memory pool to free
- *
- * The must be no remaining shared memory allocated from this pool when
- * this function is called.
+ * tee_shm_alloc_kernel_buf() - Allocate kernel shared memory for a
+ *                              particular TEE client driver
+ * @ctx:	The TEE context for shared memory allocation
+ * @size:	Shared memory allocation size
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
  */
-static inline void tee_shm_pool_free(struct tee_shm_pool *pool)
-{
-	pool->ops->destroy_pool(pool);
-}
+struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size);
 
 /**
- * tee_get_drvdata() - Return driver_data pointer
- * @returns the driver_data pointer supplied to tee_register().
+ * tee_shm_register_kernel_buf() - Register kernel shared memory for a
+ *                                 particular TEE client driver
+ * @ctx:	The TEE context for shared memory registration
+ * @addr:	Kernel buffer address
+ * @length:	Kernel buffer length
+ * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
  */
-void *tee_get_drvdata(struct tee_device *teedev);
-
-struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size);
-struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size);
-
 struct tee_shm *tee_shm_register_kernel_buf(struct tee_context *ctx,
 					    void *addr, size_t length);
 
-/**
- * tee_shm_is_dynamic() - Check if shared memory object is of the dynamic kind
- * @shm:	Shared memory handle
- * @returns true if object is dynamic shared memory
- */
-static inline bool tee_shm_is_dynamic(struct tee_shm *shm)
-{
-	return shm && (shm->flags & TEE_SHM_DYNAMIC);
-}
-
 /**
  * tee_shm_free() - Free shared memory
  * @shm:	Handle to shared memory to free
  */
 void tee_shm_free(struct tee_shm *shm);
 
-/**
- * tee_shm_put() - Decrease reference count on a shared memory handle
- * @shm:	Shared memory handle
- */
-void tee_shm_put(struct tee_shm *shm);
-
 /**
  * tee_shm_get_va() - Get virtual address of a shared memory plus an offset
  * @shm:	Shared memory handle
@@ -352,25 +174,6 @@ static inline size_t tee_shm_get_page_offset(struct tee_shm *shm)
 	return shm->offset;
 }
 
-/**
- * tee_shm_get_id() - Get id of a shared memory object
- * @shm:	Shared memory handle
- * @returns id
- */
-static inline int tee_shm_get_id(struct tee_shm *shm)
-{
-	return shm->id;
-}
-
-/**
- * tee_shm_get_from_id() - Find shared memory object and increase reference
- * count
- * @ctx:	Context owning the shared memory
- * @id:		Id of shared memory object
- * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure
- */
-struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id);
-
 /**
  * tee_client_open_context() - Open a TEE context
  * @start:	if not NULL, continue search after this context
@@ -470,18 +273,6 @@ int tee_client_invoke_func(struct tee_context *ctx,
 int tee_client_cancel_req(struct tee_context *ctx,
 			  struct tee_ioctl_cancel_arg *arg);
 
-static inline bool tee_param_is_memref(struct tee_param *param)
-{
-	switch (param->attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK) {
-	case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT:
-	case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT:
-	case TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT:
-		return true;
-	default:
-		return false;
-	}
-}
-
 extern const struct bus_type tee_bus_type;
 
 /**
@@ -509,18 +300,4 @@ struct tee_client_driver {
 #define to_tee_client_driver(d) \
 		container_of(d, struct tee_client_driver, driver)
 
-/**
- * teedev_open() - Open a struct tee_device
- * @teedev:	Device to open
- *
- * @return a pointer to struct tee_context on success or an ERR_PTR on failure.
- */
-struct tee_context *teedev_open(struct tee_device *teedev);
-
-/**
- * teedev_close_context() - closes a struct tee_context
- * @ctx:	The struct tee_context to close
- */
-void teedev_close_context(struct tee_context *ctx);
-
 #endif /*__TEE_DRV_H*/
-- 
cgit v1.2.3


From 464eb03c4a7cfb32cb3324249193cf6bb5b35152 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 28 Mar 2024 08:27:49 +0100
Subject: rtnetlink: add guard for RTNL

The new guard/scoped_gard can be useful for the RTNL as well,
so add a guard definition for it. It gets used like

 {
   guard(rtnl)();
   // RTNL held until end of block
 }

or

  scoped_guard(rtnl) {
    // RTNL held in this block
  }

as with any other guard/scoped_guard.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rtnetlink.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index cdfc897f1e3c..a7da7dfc06a2 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -7,6 +7,7 @@
 #include <linux/netdevice.h>
 #include <linux/wait.h>
 #include <linux/refcount.h>
+#include <linux/cleanup.h>
 #include <uapi/linux/rtnetlink.h>
 
 extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
@@ -46,6 +47,8 @@ extern int rtnl_is_locked(void);
 extern int rtnl_lock_killable(void);
 extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
 
+DEFINE_LOCK_GUARD_0(rtnl, rtnl_lock(), rtnl_unlock())
+
 extern wait_queue_head_t netdev_unregistering_wq;
 extern atomic_t dev_unreg_count;
 extern struct rw_semaphore pernet_ops_rwsem;
-- 
cgit v1.2.3


From b1f81b9a535b48b2c9ca460720a2bc73fd2001de Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 28 Mar 2024 08:27:50 +0100
Subject: netdevice: add DEFINE_FREE() for dev_put

For short netdev holds within a function there are still a lot of
users of dev_put() rather than netdev_put(). Add DEFINE_FREE() to
allow making those safer.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7d12b5a9380f..0c198620ac93 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4097,6 +4097,8 @@ static inline void dev_put(struct net_device *dev)
 	netdev_put(dev, NULL);
 }
 
+DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T))
+
 static inline void netdev_ref_replace(struct net_device *odev,
 				      struct net_device *ndev,
 				      netdevice_tracker *tracker,
-- 
cgit v1.2.3


From dbcedec3a31119d7594baacc743300d127c99c56 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 7 Mar 2024 15:49:05 +0200
Subject: gpiolib: legacy: Remove unused gpio_request_array() and
 gpio_free_array()

No more users.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Yanteng Si <siyanteng@loongson.cn>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 Documentation/driver-api/gpio/legacy.rst           | 16 ---------
 .../translations/zh_CN/driver-api/gpio/legacy.rst  | 16 ---------
 Documentation/translations/zh_TW/gpio.txt          | 17 ----------
 drivers/gpio/gpiolib-legacy.c                      | 39 ----------------------
 include/linux/gpio.h                               | 15 ---------
 5 files changed, 103 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/gpio/legacy.rst b/Documentation/driver-api/gpio/legacy.rst
index b6505914791c..534dfe95d128 100644
--- a/Documentation/driver-api/gpio/legacy.rst
+++ b/Documentation/driver-api/gpio/legacy.rst
@@ -225,8 +225,6 @@ setup or driver probe/teardown code, so this is an easy constraint.)::
                 gpio_request()
 
         ## 	gpio_request_one()
-        ##	gpio_request_array()
-        ## 	gpio_free_array()
 
                 gpio_free()
 
@@ -295,14 +293,6 @@ are claimed, three additional calls are defined::
 	 */
 	int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
 
-	/* request multiple GPIOs in a single call
-	 */
-	int gpio_request_array(struct gpio *array, size_t num);
-
-	/* release multiple GPIOs in a single call
-	 */
-	void gpio_free_array(struct gpio *array, size_t num);
-
 where 'flags' is currently defined to specify the following properties:
 
 	* GPIOF_DIR_IN		- to configure direction as input
@@ -341,12 +331,6 @@ A typical example of usage::
 	if (err)
 		...
 
-	err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-	if (err)
-		...
-
-	gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-
 
 GPIOs mapped to IRQs
 --------------------
diff --git a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
index aeccff777170..0faf042001d2 100644
--- a/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
+++ b/Documentation/translations/zh_CN/driver-api/gpio/legacy.rst
@@ -208,8 +208,6 @@ GPIO 值的命令需要等待其信息排到队首才发送命令，再获得其
                 gpio_request()
 
         ## 	gpio_request_one()
-        ##	gpio_request_array()
-        ## 	gpio_free_array()
 
                 gpio_free()
 
@@ -272,14 +270,6 @@ gpio_request()前将这类细节配置好，例如使用引脚控制子系统的
 	 */
 	int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
 
-	/* 在单个函数中申请多个 GPIO
-	 */
-	int gpio_request_array(struct gpio *array, size_t num);
-
-	/* 在单个函数中释放多个 GPIO
-	 */
-	void gpio_free_array(struct gpio *array, size_t num);
-
 这里 'flags' 当前定义可指定以下属性:
 
 	* GPIOF_DIR_IN		- 配置方向为输入
@@ -317,12 +307,6 @@ gpio_request()前将这类细节配置好，例如使用引脚控制子系统的
 	if (err)
 		...
 
-	err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-	if (err)
-		...
-
-	gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-
 
 GPIO 映射到 IRQ
 ----------------
diff --git a/Documentation/translations/zh_TW/gpio.txt b/Documentation/translations/zh_TW/gpio.txt
index b9b48012c62e..77d69d381316 100644
--- a/Documentation/translations/zh_TW/gpio.txt
+++ b/Documentation/translations/zh_TW/gpio.txt
@@ -215,13 +215,10 @@ GPIO 值的命令需要等待其信息排到隊首才發送命令，再獲得其
 	gpio_request()
 
 ## 	gpio_request_one()
-##	gpio_request_array()
-## 	gpio_free_array()
 
 	gpio_free()
 
 
-
 聲明和釋放 GPIO
 ----------------------------
 爲了有助於捕獲系統配置錯誤,定義了兩個函數。
@@ -278,14 +275,6 @@ gpio_request()前將這類細節配置好，例如使用 pinctrl 子系統的映
 	 */
 	int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
 
-	/* 在單個函數中申請多個 GPIO
-	 */
-	int gpio_request_array(struct gpio *array, size_t num);
-
-	/* 在單個函數中釋放多個 GPIO
-	 */
-	void gpio_free_array(struct gpio *array, size_t num);
-
 這裡 'flags' 當前定義可指定以下屬性:
 
 	* GPIOF_DIR_IN		- 配置方向爲輸入
@@ -323,12 +312,6 @@ gpio_request()前將這類細節配置好，例如使用 pinctrl 子系統的映
 	if (err)
 		...
 
-	err = gpio_request_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-	if (err)
-		...
-
-	gpio_free_array(leds_gpios, ARRAY_SIZE(leds_gpios));
-
 
 GPIO 映射到 IRQ
 --------------------
diff --git a/drivers/gpio/gpiolib-legacy.c b/drivers/gpio/gpiolib-legacy.c
index b138682fec3d..616d60187f85 100644
--- a/drivers/gpio/gpiolib-legacy.c
+++ b/drivers/gpio/gpiolib-legacy.c
@@ -72,42 +72,3 @@ int gpio_request(unsigned gpio, const char *label)
 	return gpiod_request(desc, label);
 }
 EXPORT_SYMBOL_GPL(gpio_request);
-
-/**
- * gpio_request_array - request multiple GPIOs in a single call
- * @array:	array of the 'struct gpio'
- * @num:	how many GPIOs in the array
- *
- * **DEPRECATED** This function is deprecated and must not be used in new code.
- */
-int gpio_request_array(const struct gpio *array, size_t num)
-{
-	int i, err;
-
-	for (i = 0; i < num; i++, array++) {
-		err = gpio_request_one(array->gpio, array->flags, array->label);
-		if (err)
-			goto err_free;
-	}
-	return 0;
-
-err_free:
-	while (i--)
-		gpio_free((--array)->gpio);
-	return err;
-}
-EXPORT_SYMBOL_GPL(gpio_request_array);
-
-/**
- * gpio_free_array - release multiple GPIOs in a single call
- * @array:	array of the 'struct gpio'
- * @num:	how many GPIOs in the array
- *
- * **DEPRECATED** This function is deprecated and must not be used in new code.
- */
-void gpio_free_array(const struct gpio *array, size_t num)
-{
-	while (num--)
-		gpio_free((array++)->gpio);
-}
-EXPORT_SYMBOL_GPL(gpio_free_array);
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 7ecc25c543ce..4aaedcf424ce 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -114,8 +114,6 @@ static inline int gpio_to_irq(unsigned gpio)
 }
 
 int gpio_request_one(unsigned gpio, unsigned long flags, const char *label);
-int gpio_request_array(const struct gpio *array, size_t num);
-void gpio_free_array(const struct gpio *array, size_t num);
 
 /* CONFIG_GPIOLIB: bindings for managed devices that want to request gpios */
 
@@ -146,11 +144,6 @@ static inline int gpio_request_one(unsigned gpio,
 	return -ENOSYS;
 }
 
-static inline int gpio_request_array(const struct gpio *array, size_t num)
-{
-	return -ENOSYS;
-}
-
 static inline void gpio_free(unsigned gpio)
 {
 	might_sleep();
@@ -159,14 +152,6 @@ static inline void gpio_free(unsigned gpio)
 	WARN_ON(1);
 }
 
-static inline void gpio_free_array(const struct gpio *array, size_t num)
-{
-	might_sleep();
-
-	/* GPIO can never have been requested */
-	WARN_ON(1);
-}
-
 static inline int gpio_direction_input(unsigned gpio)
 {
 	return -ENOSYS;
-- 
cgit v1.2.3


From cf4441503e20a0c0a36e0a98dcf57de3d3844c91 Mon Sep 17 00:00:00 2001
From: Balint Dobszay <balint.dobszay@arm.com>
Date: Mon, 25 Mar 2024 16:11:02 +0100
Subject: tee: optee: Move pool_op helper functions

Move the pool alloc and free helper functions from the OP-TEE driver to
the TEE subsystem, since these could be reused in other TEE drivers.
This patch is not supposed to change behavior, it's only reorganizing
the code.

Reviewed-by: Sumit Garg <sumit.garg@linaro.org>
Suggested-by: Jens Wiklander <jens.wiklander@linaro.org>
Signed-off-by: Balint Dobszay <balint.dobszay@arm.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/optee/core.c          | 64 --------------------------------------
 drivers/tee/optee/ffa_abi.c       |  6 ++--
 drivers/tee/optee/optee_private.h | 12 --------
 drivers/tee/optee/smc_abi.c       |  9 +++---
 drivers/tee/tee_shm.c             | 65 +++++++++++++++++++++++++++++++++++++++
 include/linux/tee_core.h          | 10 ++++++
 6 files changed, 82 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c
index f762e3a25119..39e688d4e974 100644
--- a/drivers/tee/optee/core.c
+++ b/drivers/tee/optee/core.c
@@ -9,7 +9,6 @@
 #include <linux/crash_dump.h>
 #include <linux/errno.h>
 #include <linux/io.h>
-#include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -17,69 +16,6 @@
 #include <linux/types.h>
 #include "optee_private.h"
 
-int optee_pool_op_alloc_helper(struct tee_shm_pool *pool, struct tee_shm *shm,
-			       size_t size, size_t align,
-			       int (*shm_register)(struct tee_context *ctx,
-						   struct tee_shm *shm,
-						   struct page **pages,
-						   size_t num_pages,
-						   unsigned long start))
-{
-	size_t nr_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
-	struct page **pages;
-	unsigned int i;
-	int rc = 0;
-
-	/*
-	 * Ignore alignment since this is already going to be page aligned
-	 * and there's no need for any larger alignment.
-	 */
-	shm->kaddr = alloc_pages_exact(nr_pages * PAGE_SIZE,
-				       GFP_KERNEL | __GFP_ZERO);
-	if (!shm->kaddr)
-		return -ENOMEM;
-
-	shm->paddr = virt_to_phys(shm->kaddr);
-	shm->size = nr_pages * PAGE_SIZE;
-
-	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
-	if (!pages) {
-		rc = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < nr_pages; i++)
-		pages[i] = virt_to_page((u8 *)shm->kaddr + i * PAGE_SIZE);
-
-	shm->pages = pages;
-	shm->num_pages = nr_pages;
-
-	if (shm_register) {
-		rc = shm_register(shm->ctx, shm, pages, nr_pages,
-				  (unsigned long)shm->kaddr);
-		if (rc)
-			goto err;
-	}
-
-	return 0;
-err:
-	free_pages_exact(shm->kaddr, shm->size);
-	shm->kaddr = NULL;
-	return rc;
-}
-
-void optee_pool_op_free_helper(struct tee_shm_pool *pool, struct tee_shm *shm,
-			       int (*shm_unregister)(struct tee_context *ctx,
-						     struct tee_shm *shm))
-{
-	if (shm_unregister)
-		shm_unregister(shm->ctx, shm);
-	free_pages_exact(shm->kaddr, shm->size);
-	shm->kaddr = NULL;
-	kfree(shm->pages);
-	shm->pages = NULL;
-}
-
 static void optee_bus_scan(struct work_struct *work)
 {
 	WARN_ON(optee_enumerate_devices(PTA_CMD_GET_DEVICES_SUPP));
diff --git a/drivers/tee/optee/ffa_abi.c b/drivers/tee/optee/ffa_abi.c
index cee8ccb84cb8..3235e1c719e8 100644
--- a/drivers/tee/optee/ffa_abi.c
+++ b/drivers/tee/optee/ffa_abi.c
@@ -374,14 +374,14 @@ static int optee_ffa_shm_unregister_supp(struct tee_context *ctx,
 static int pool_ffa_op_alloc(struct tee_shm_pool *pool,
 			     struct tee_shm *shm, size_t size, size_t align)
 {
-	return optee_pool_op_alloc_helper(pool, shm, size, align,
-					  optee_ffa_shm_register);
+	return tee_dyn_shm_alloc_helper(shm, size, align,
+					optee_ffa_shm_register);
 }
 
 static void pool_ffa_op_free(struct tee_shm_pool *pool,
 			     struct tee_shm *shm)
 {
-	optee_pool_op_free_helper(pool, shm, optee_ffa_shm_unregister);
+	tee_dyn_shm_free_helper(shm, optee_ffa_shm_unregister);
 }
 
 static void pool_ffa_op_destroy_pool(struct tee_shm_pool *pool)
diff --git a/drivers/tee/optee/optee_private.h b/drivers/tee/optee/optee_private.h
index a0698ac18993..429cc20be5cc 100644
--- a/drivers/tee/optee/optee_private.h
+++ b/drivers/tee/optee/optee_private.h
@@ -283,18 +283,6 @@ int optee_cancel_req(struct tee_context *ctx, u32 cancel_id, u32 session);
 int optee_enumerate_devices(u32 func);
 void optee_unregister_devices(void);
 
-int optee_pool_op_alloc_helper(struct tee_shm_pool *pool, struct tee_shm *shm,
-			       size_t size, size_t align,
-			       int (*shm_register)(struct tee_context *ctx,
-						   struct tee_shm *shm,
-						   struct page **pages,
-						   size_t num_pages,
-						   unsigned long start));
-void optee_pool_op_free_helper(struct tee_shm_pool *pool, struct tee_shm *shm,
-			       int (*shm_unregister)(struct tee_context *ctx,
-						     struct tee_shm *shm));
-
-
 void optee_remove_common(struct optee *optee);
 int optee_open(struct tee_context *ctx, bool cap_memref_null);
 void optee_release(struct tee_context *ctx);
diff --git a/drivers/tee/optee/smc_abi.c b/drivers/tee/optee/smc_abi.c
index 9c296b887dc1..734c484ed0f6 100644
--- a/drivers/tee/optee/smc_abi.c
+++ b/drivers/tee/optee/smc_abi.c
@@ -592,19 +592,18 @@ static int pool_op_alloc(struct tee_shm_pool *pool,
 	 * to be registered with OP-TEE.
 	 */
 	if (shm->flags & TEE_SHM_PRIV)
-		return optee_pool_op_alloc_helper(pool, shm, size, align, NULL);
+		return tee_dyn_shm_alloc_helper(shm, size, align, NULL);
 
-	return optee_pool_op_alloc_helper(pool, shm, size, align,
-					  optee_shm_register);
+	return tee_dyn_shm_alloc_helper(shm, size, align, optee_shm_register);
 }
 
 static void pool_op_free(struct tee_shm_pool *pool,
 			 struct tee_shm *shm)
 {
 	if (!(shm->flags & TEE_SHM_PRIV))
-		optee_pool_op_free_helper(pool, shm, optee_shm_unregister);
+		tee_dyn_shm_free_helper(shm, optee_shm_unregister);
 	else
-		optee_pool_op_free_helper(pool, shm, NULL);
+		tee_dyn_shm_free_helper(shm, NULL);
 }
 
 static void pool_op_destroy_pool(struct tee_shm_pool *pool)
diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 96a45c817427..daf6e5cfd59a 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -5,6 +5,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
 #include <linux/idr.h>
+#include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -202,6 +203,70 @@ struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size)
 }
 EXPORT_SYMBOL_GPL(tee_shm_alloc_priv_buf);
 
+int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align,
+			     int (*shm_register)(struct tee_context *ctx,
+						 struct tee_shm *shm,
+						 struct page **pages,
+						 size_t num_pages,
+						 unsigned long start))
+{
+	size_t nr_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
+	struct page **pages;
+	unsigned int i;
+	int rc = 0;
+
+	/*
+	 * Ignore alignment since this is already going to be page aligned
+	 * and there's no need for any larger alignment.
+	 */
+	shm->kaddr = alloc_pages_exact(nr_pages * PAGE_SIZE,
+				       GFP_KERNEL | __GFP_ZERO);
+	if (!shm->kaddr)
+		return -ENOMEM;
+
+	shm->paddr = virt_to_phys(shm->kaddr);
+	shm->size = nr_pages * PAGE_SIZE;
+
+	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < nr_pages; i++)
+		pages[i] = virt_to_page((u8 *)shm->kaddr + i * PAGE_SIZE);
+
+	shm->pages = pages;
+	shm->num_pages = nr_pages;
+
+	if (shm_register) {
+		rc = shm_register(shm->ctx, shm, pages, nr_pages,
+				  (unsigned long)shm->kaddr);
+		if (rc)
+			goto err;
+	}
+
+	return 0;
+err:
+	free_pages_exact(shm->kaddr, shm->size);
+	shm->kaddr = NULL;
+	return rc;
+}
+EXPORT_SYMBOL_GPL(tee_dyn_shm_alloc_helper);
+
+void tee_dyn_shm_free_helper(struct tee_shm *shm,
+			     int (*shm_unregister)(struct tee_context *ctx,
+						   struct tee_shm *shm))
+{
+	if (shm_unregister)
+		shm_unregister(shm->ctx, shm);
+	free_pages_exact(shm->kaddr, shm->size);
+	shm->kaddr = NULL;
+	kfree(shm->pages);
+	shm->pages = NULL;
+}
+EXPORT_SYMBOL_GPL(tee_dyn_shm_free_helper);
+
 static struct tee_shm *
 register_shm_helper(struct tee_context *ctx, struct iov_iter *iter, u32 flags,
 		    int id)
diff --git a/include/linux/tee_core.h b/include/linux/tee_core.h
index d9b3ba8e8fa9..efd16ed52315 100644
--- a/include/linux/tee_core.h
+++ b/include/linux/tee_core.h
@@ -232,6 +232,16 @@ void *tee_get_drvdata(struct tee_device *teedev);
  */
 struct tee_shm *tee_shm_alloc_priv_buf(struct tee_context *ctx, size_t size);
 
+int tee_dyn_shm_alloc_helper(struct tee_shm *shm, size_t size, size_t align,
+			     int (*shm_register)(struct tee_context *ctx,
+						 struct tee_shm *shm,
+						 struct page **pages,
+						 size_t num_pages,
+						 unsigned long start));
+void tee_dyn_shm_free_helper(struct tee_shm *shm,
+			     int (*shm_unregister)(struct tee_context *ctx,
+						   struct tee_shm *shm));
+
 /**
  * tee_shm_is_dynamic() - Check if shared memory object is of the dynamic kind
  * @shm:	Shared memory handle
-- 
cgit v1.2.3


From c835e5a3153cae2956efdbb9948b90f2b9e5e64d Mon Sep 17 00:00:00 2001
From: Balint Dobszay <balint.dobszay@arm.com>
Date: Mon, 25 Mar 2024 16:11:03 +0100
Subject: tee: tstee: Add Trusted Services TEE driver

The Trusted Services project provides a framework for developing and
deploying device Root of Trust services in FF-A Secure Partitions. The
FF-A SPs are accessible through the FF-A driver, but this doesn't
provide a user space interface. The goal of this TEE driver is to make
Trusted Services SPs accessible for user space clients.

All TS SPs have the same FF-A UUID, it identifies the RPC protocol used
by TS. A TS SP can host one or more services, a service is identified by
its service UUID. The same type of service cannot be present twice in
the same SP. During SP boot each service in an SP is assigned an
interface ID, this is just a short ID to simplify message addressing.
There is 1:1 mapping between TS SPs and TEE devices, i.e. a separate TEE
device is registered for each TS SP. This is required since contrary to
the generic TEE design where memory is shared with the whole TEE
implementation, in case of FF-A, memory is shared with a specific SP. A
user space client has to be able to separately share memory with each SP
based on its endpoint ID.

Acked-by: Sumit Garg <sumit.garg@linaro.org>
Signed-off-by: Balint Dobszay <balint.dobszay@arm.com>
Signed-off-by: Jens Wiklander <jens.wiklander@linaro.org>
---
 drivers/tee/Kconfig               |   1 +
 drivers/tee/Makefile              |   1 +
 drivers/tee/tstee/Kconfig         |  11 +
 drivers/tee/tstee/Makefile        |   3 +
 drivers/tee/tstee/core.c          | 480 ++++++++++++++++++++++++++++++++++++++
 drivers/tee/tstee/tstee_private.h |  92 ++++++++
 include/uapi/linux/tee.h          |   1 +
 7 files changed, 589 insertions(+)
 create mode 100644 drivers/tee/tstee/Kconfig
 create mode 100644 drivers/tee/tstee/Makefile
 create mode 100644 drivers/tee/tstee/core.c
 create mode 100644 drivers/tee/tstee/tstee_private.h

(limited to 'include')

diff --git a/drivers/tee/Kconfig b/drivers/tee/Kconfig
index 73a147202e88..61b507c18780 100644
--- a/drivers/tee/Kconfig
+++ b/drivers/tee/Kconfig
@@ -15,5 +15,6 @@ if TEE
 
 source "drivers/tee/optee/Kconfig"
 source "drivers/tee/amdtee/Kconfig"
+source "drivers/tee/tstee/Kconfig"
 
 endif
diff --git a/drivers/tee/Makefile b/drivers/tee/Makefile
index 68da044afbfa..5488cba30bd2 100644
--- a/drivers/tee/Makefile
+++ b/drivers/tee/Makefile
@@ -5,3 +5,4 @@ tee-objs += tee_shm.o
 tee-objs += tee_shm_pool.o
 obj-$(CONFIG_OPTEE) += optee/
 obj-$(CONFIG_AMDTEE) += amdtee/
+obj-$(CONFIG_ARM_TSTEE) += tstee/
diff --git a/drivers/tee/tstee/Kconfig b/drivers/tee/tstee/Kconfig
new file mode 100644
index 000000000000..d32f91d47398
--- /dev/null
+++ b/drivers/tee/tstee/Kconfig
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config ARM_TSTEE
+	tristate "Arm Trusted Services TEE driver"
+	depends on ARM_FFA_TRANSPORT
+	default n
+	help
+	  The Trusted Services project provides a framework for developing and
+	  deploying device Root of Trust services in FF-A Secure Partitions.
+	  This driver provides an interface to make Trusted Services Secure
+	  Partitions accessible for user space clients, since the FF-A driver
+	  doesn't implement a user space interface directly.
diff --git a/drivers/tee/tstee/Makefile b/drivers/tee/tstee/Makefile
new file mode 100644
index 000000000000..5227020ebd30
--- /dev/null
+++ b/drivers/tee/tstee/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+arm-tstee-objs := core.o
+obj-$(CONFIG_ARM_TSTEE) = arm-tstee.o
diff --git a/drivers/tee/tstee/core.c b/drivers/tee/tstee/core.c
new file mode 100644
index 000000000000..533425e9e9e7
--- /dev/null
+++ b/drivers/tee/tstee/core.c
@@ -0,0 +1,480 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2023, Arm Limited
+ */
+
+#include <linux/arm_ffa.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/tee_core.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+#include <linux/xarray.h>
+#include "tstee_private.h"
+
+#define FFA_DIRECT_REQ_ARG_NUM 5
+#define FFA_INVALID_MEM_HANDLE U64_MAX
+
+static void arg_list_to_ffa_data(const u32 *args,
+				 struct ffa_send_direct_data *data)
+{
+	data->data0 = args[0];
+	data->data1 = args[1];
+	data->data2 = args[2];
+	data->data3 = args[3];
+	data->data4 = args[4];
+}
+
+static void arg_list_from_ffa_data(const struct ffa_send_direct_data *data,
+				   u32 *args)
+{
+	args[0] = lower_32_bits(data->data0);
+	args[1] = lower_32_bits(data->data1);
+	args[2] = lower_32_bits(data->data2);
+	args[3] = lower_32_bits(data->data3);
+	args[4] = lower_32_bits(data->data4);
+}
+
+static void tstee_get_version(struct tee_device *teedev,
+			      struct tee_ioctl_version_data *vers)
+{
+	struct tstee *tstee = tee_get_drvdata(teedev);
+	struct tee_ioctl_version_data v = {
+		.impl_id = TEE_IMPL_ID_TSTEE,
+		/* FF-A endpoint ID only uses the lower 16 bits */
+		.impl_caps = lower_16_bits(tstee->ffa_dev->vm_id),
+		.gen_caps = 0,
+	};
+
+	*vers = v;
+}
+
+static int tstee_open(struct tee_context *ctx)
+{
+	struct ts_context_data *ctxdata;
+
+	ctxdata = kzalloc(sizeof(*ctxdata), GFP_KERNEL);
+	if (!ctxdata)
+		return -ENOMEM;
+
+	xa_init_flags(&ctxdata->sess_list, XA_FLAGS_ALLOC);
+
+	ctx->data = ctxdata;
+
+	return 0;
+}
+
+static void tstee_release(struct tee_context *ctx)
+{
+	struct ts_context_data *ctxdata = ctx->data;
+	struct ts_session *sess;
+	unsigned long idx;
+
+	if (!ctxdata)
+		return;
+
+	xa_for_each(&ctxdata->sess_list, idx, sess) {
+		xa_erase(&ctxdata->sess_list, idx);
+		kfree(sess);
+	}
+
+	xa_destroy(&ctxdata->sess_list);
+
+	kfree(ctxdata);
+	ctx->data = NULL;
+}
+
+static int tstee_open_session(struct tee_context *ctx,
+			      struct tee_ioctl_open_session_arg *arg,
+			      struct tee_param *param __always_unused)
+{
+	struct tstee *tstee = tee_get_drvdata(ctx->teedev);
+	struct ffa_device *ffa_dev = tstee->ffa_dev;
+	struct ts_context_data *ctxdata = ctx->data;
+	struct ffa_send_direct_data ffa_data;
+	struct ts_session *sess = NULL;
+	u32 ffa_args[FFA_DIRECT_REQ_ARG_NUM] = {};
+	u32 sess_id;
+	int rc;
+
+	ffa_args[TS_RPC_CTRL_REG] =
+		TS_RPC_CTRL_PACK_IFACE_OPCODE(TS_RPC_MGMT_IFACE_ID,
+					      TS_RPC_OP_SERVICE_INFO);
+
+	memcpy(ffa_args + TS_RPC_SERVICE_INFO_UUID0, arg->uuid, UUID_SIZE);
+
+	arg_list_to_ffa_data(ffa_args, &ffa_data);
+	rc = ffa_dev->ops->msg_ops->sync_send_receive(ffa_dev, &ffa_data);
+	if (rc)
+		return rc;
+
+	arg_list_from_ffa_data(&ffa_data, ffa_args);
+
+	if (ffa_args[TS_RPC_SERVICE_INFO_RPC_STATUS] != TS_RPC_OK)
+		return -ENODEV;
+
+	if (ffa_args[TS_RPC_SERVICE_INFO_IFACE] > U8_MAX)
+		return -EINVAL;
+
+	sess = kzalloc(sizeof(*sess), GFP_KERNEL);
+	if (!sess)
+		return -ENOMEM;
+
+	sess->iface_id = ffa_args[TS_RPC_SERVICE_INFO_IFACE];
+
+	rc = xa_alloc(&ctxdata->sess_list, &sess_id, sess, xa_limit_32b,
+		      GFP_KERNEL);
+	if (rc) {
+		kfree(sess);
+		return rc;
+	}
+
+	arg->session = sess_id;
+	arg->ret = 0;
+
+	return 0;
+}
+
+static int tstee_close_session(struct tee_context *ctx, u32 session)
+{
+	struct ts_context_data *ctxdata = ctx->data;
+	struct ts_session *sess;
+
+	/* Calls xa_lock() internally */
+	sess = xa_erase(&ctxdata->sess_list, session);
+	if (!sess)
+		return -EINVAL;
+
+	kfree(sess);
+
+	return 0;
+}
+
+static int tstee_invoke_func(struct tee_context *ctx,
+			     struct tee_ioctl_invoke_arg *arg,
+			     struct tee_param *param)
+{
+	struct tstee *tstee = tee_get_drvdata(ctx->teedev);
+	struct ffa_device *ffa_dev = tstee->ffa_dev;
+	struct ts_context_data *ctxdata = ctx->data;
+	struct ffa_send_direct_data ffa_data;
+	struct tee_shm *shm = NULL;
+	struct ts_session *sess;
+	u32 req_len, ffa_args[FFA_DIRECT_REQ_ARG_NUM] = {};
+	int shm_id, rc;
+	u8 iface_id;
+	u64 handle;
+	u16 opcode;
+
+	xa_lock(&ctxdata->sess_list);
+	sess = xa_load(&ctxdata->sess_list, arg->session);
+
+	/*
+	 * Do this while holding the lock to make sure that the session wasn't
+	 * closed meanwhile
+	 */
+	if (sess)
+		iface_id = sess->iface_id;
+
+	xa_unlock(&ctxdata->sess_list);
+	if (!sess)
+		return -EINVAL;
+
+	opcode = lower_16_bits(arg->func);
+	shm_id = lower_32_bits(param[0].u.value.a);
+	req_len = lower_32_bits(param[0].u.value.b);
+
+	if (shm_id != 0) {
+		shm = tee_shm_get_from_id(ctx, shm_id);
+		if (IS_ERR(shm))
+			return PTR_ERR(shm);
+
+		if (shm->size < req_len) {
+			dev_err(&ffa_dev->dev,
+				"request doesn't fit into shared memory buffer\n");
+			rc = -EINVAL;
+			goto out;
+		}
+
+		handle = shm->sec_world_id;
+	} else {
+		handle = FFA_INVALID_MEM_HANDLE;
+	}
+
+	ffa_args[TS_RPC_CTRL_REG] = TS_RPC_CTRL_PACK_IFACE_OPCODE(iface_id,
+								  opcode);
+	ffa_args[TS_RPC_SERVICE_MEM_HANDLE_LSW] = lower_32_bits(handle);
+	ffa_args[TS_RPC_SERVICE_MEM_HANDLE_MSW] = upper_32_bits(handle);
+	ffa_args[TS_RPC_SERVICE_REQ_LEN] = req_len;
+	ffa_args[TS_RPC_SERVICE_CLIENT_ID] = 0;
+
+	arg_list_to_ffa_data(ffa_args, &ffa_data);
+	rc = ffa_dev->ops->msg_ops->sync_send_receive(ffa_dev, &ffa_data);
+	if (rc)
+		goto out;
+
+	arg_list_from_ffa_data(&ffa_data, ffa_args);
+
+	if (ffa_args[TS_RPC_SERVICE_RPC_STATUS] != TS_RPC_OK) {
+		dev_err(&ffa_dev->dev, "invoke_func rpc status: %d\n",
+			ffa_args[TS_RPC_SERVICE_RPC_STATUS]);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	arg->ret = ffa_args[TS_RPC_SERVICE_STATUS];
+	if (shm && shm->size >= ffa_args[TS_RPC_SERVICE_RESP_LEN])
+		param[0].u.value.a = ffa_args[TS_RPC_SERVICE_RESP_LEN];
+
+out:
+	if (shm)
+		tee_shm_put(shm);
+
+	return rc;
+}
+
+static int tstee_shm_register(struct tee_context *ctx, struct tee_shm *shm,
+			      struct page **pages, size_t num_pages,
+			      unsigned long start __always_unused)
+{
+	struct tstee *tstee = tee_get_drvdata(ctx->teedev);
+	struct ffa_device *ffa_dev = tstee->ffa_dev;
+	struct ffa_mem_region_attributes mem_attr = {
+		.receiver = tstee->ffa_dev->vm_id,
+		.attrs = FFA_MEM_RW,
+		.flag = 0,
+	};
+	struct ffa_mem_ops_args mem_args = {
+		.attrs = &mem_attr,
+		.use_txbuf = true,
+		.nattrs = 1,
+		.flags = 0,
+	};
+	struct ffa_send_direct_data ffa_data;
+	struct sg_table sgt;
+	u32 ffa_args[FFA_DIRECT_REQ_ARG_NUM] = {};
+	int rc;
+
+	rc = sg_alloc_table_from_pages(&sgt, pages, num_pages, 0,
+				       num_pages * PAGE_SIZE, GFP_KERNEL);
+	if (rc)
+		return rc;
+
+	mem_args.sg = sgt.sgl;
+	rc = ffa_dev->ops->mem_ops->memory_share(&mem_args);
+	sg_free_table(&sgt);
+	if (rc)
+		return rc;
+
+	shm->sec_world_id = mem_args.g_handle;
+
+	ffa_args[TS_RPC_CTRL_REG] =
+			TS_RPC_CTRL_PACK_IFACE_OPCODE(TS_RPC_MGMT_IFACE_ID,
+						      TS_RPC_OP_RETRIEVE_MEM);
+	ffa_args[TS_RPC_RETRIEVE_MEM_HANDLE_LSW] =
+			lower_32_bits(shm->sec_world_id);
+	ffa_args[TS_RPC_RETRIEVE_MEM_HANDLE_MSW] =
+			upper_32_bits(shm->sec_world_id);
+	ffa_args[TS_RPC_RETRIEVE_MEM_TAG_LSW] = 0;
+	ffa_args[TS_RPC_RETRIEVE_MEM_TAG_MSW] = 0;
+
+	arg_list_to_ffa_data(ffa_args, &ffa_data);
+	rc = ffa_dev->ops->msg_ops->sync_send_receive(ffa_dev, &ffa_data);
+	if (rc) {
+		(void)ffa_dev->ops->mem_ops->memory_reclaim(shm->sec_world_id,
+							    0);
+		return rc;
+	}
+
+	arg_list_from_ffa_data(&ffa_data, ffa_args);
+
+	if (ffa_args[TS_RPC_RETRIEVE_MEM_RPC_STATUS] != TS_RPC_OK) {
+		dev_err(&ffa_dev->dev, "shm_register rpc status: %d\n",
+			ffa_args[TS_RPC_RETRIEVE_MEM_RPC_STATUS]);
+		ffa_dev->ops->mem_ops->memory_reclaim(shm->sec_world_id, 0);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int tstee_shm_unregister(struct tee_context *ctx, struct tee_shm *shm)
+{
+	struct tstee *tstee = tee_get_drvdata(ctx->teedev);
+	struct ffa_device *ffa_dev = tstee->ffa_dev;
+	struct ffa_send_direct_data ffa_data;
+	u32 ffa_args[FFA_DIRECT_REQ_ARG_NUM] = {};
+	int rc;
+
+	ffa_args[TS_RPC_CTRL_REG] =
+			TS_RPC_CTRL_PACK_IFACE_OPCODE(TS_RPC_MGMT_IFACE_ID,
+						      TS_RPC_OP_RELINQ_MEM);
+	ffa_args[TS_RPC_RELINQ_MEM_HANDLE_LSW] =
+			lower_32_bits(shm->sec_world_id);
+	ffa_args[TS_RPC_RELINQ_MEM_HANDLE_MSW] =
+			upper_32_bits(shm->sec_world_id);
+
+	arg_list_to_ffa_data(ffa_args, &ffa_data);
+	rc = ffa_dev->ops->msg_ops->sync_send_receive(ffa_dev, &ffa_data);
+	if (rc)
+		return rc;
+	arg_list_from_ffa_data(&ffa_data, ffa_args);
+
+	if (ffa_args[TS_RPC_RELINQ_MEM_RPC_STATUS] != TS_RPC_OK) {
+		dev_err(&ffa_dev->dev, "shm_unregister rpc status: %d\n",
+			ffa_args[TS_RPC_RELINQ_MEM_RPC_STATUS]);
+		return -EINVAL;
+	}
+
+	rc = ffa_dev->ops->mem_ops->memory_reclaim(shm->sec_world_id, 0);
+
+	return rc;
+}
+
+static const struct tee_driver_ops tstee_ops = {
+	.get_version = tstee_get_version,
+	.open = tstee_open,
+	.release = tstee_release,
+	.open_session = tstee_open_session,
+	.close_session = tstee_close_session,
+	.invoke_func = tstee_invoke_func,
+};
+
+static const struct tee_desc tstee_desc = {
+	.name = "tstee-clnt",
+	.ops = &tstee_ops,
+	.owner = THIS_MODULE,
+};
+
+static int pool_op_alloc(struct tee_shm_pool *pool, struct tee_shm *shm,
+			 size_t size, size_t align)
+{
+	return tee_dyn_shm_alloc_helper(shm, size, align, tstee_shm_register);
+}
+
+static void pool_op_free(struct tee_shm_pool *pool, struct tee_shm *shm)
+{
+	tee_dyn_shm_free_helper(shm, tstee_shm_unregister);
+}
+
+static void pool_op_destroy_pool(struct tee_shm_pool *pool)
+{
+	kfree(pool);
+}
+
+static const struct tee_shm_pool_ops pool_ops = {
+	.alloc = pool_op_alloc,
+	.free = pool_op_free,
+	.destroy_pool = pool_op_destroy_pool,
+};
+
+static struct tee_shm_pool *tstee_create_shm_pool(void)
+{
+	struct tee_shm_pool *pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	pool->ops = &pool_ops;
+
+	return pool;
+}
+
+static bool tstee_check_rpc_compatible(struct ffa_device *ffa_dev)
+{
+	struct ffa_send_direct_data ffa_data;
+	u32 ffa_args[FFA_DIRECT_REQ_ARG_NUM] = {};
+
+	ffa_args[TS_RPC_CTRL_REG] =
+			TS_RPC_CTRL_PACK_IFACE_OPCODE(TS_RPC_MGMT_IFACE_ID,
+						      TS_RPC_OP_GET_VERSION);
+
+	arg_list_to_ffa_data(ffa_args, &ffa_data);
+	if (ffa_dev->ops->msg_ops->sync_send_receive(ffa_dev, &ffa_data))
+		return false;
+
+	arg_list_from_ffa_data(&ffa_data, ffa_args);
+
+	return ffa_args[TS_RPC_GET_VERSION_RESP] == TS_RPC_PROTOCOL_VERSION;
+}
+
+static int tstee_probe(struct ffa_device *ffa_dev)
+{
+	struct tstee *tstee;
+	int rc;
+
+	ffa_dev->ops->msg_ops->mode_32bit_set(ffa_dev);
+
+	if (!tstee_check_rpc_compatible(ffa_dev))
+		return -EINVAL;
+
+	tstee = kzalloc(sizeof(*tstee), GFP_KERNEL);
+	if (!tstee)
+		return -ENOMEM;
+
+	tstee->ffa_dev = ffa_dev;
+
+	tstee->pool = tstee_create_shm_pool();
+	if (IS_ERR(tstee->pool)) {
+		rc = PTR_ERR(tstee->pool);
+		tstee->pool = NULL;
+		goto err_free_tstee;
+	}
+
+	tstee->teedev = tee_device_alloc(&tstee_desc, NULL, tstee->pool, tstee);
+	if (IS_ERR(tstee->teedev)) {
+		rc = PTR_ERR(tstee->teedev);
+		tstee->teedev = NULL;
+		goto err_free_pool;
+	}
+
+	rc = tee_device_register(tstee->teedev);
+	if (rc)
+		goto err_unreg_teedev;
+
+	ffa_dev_set_drvdata(ffa_dev, tstee);
+
+	return 0;
+
+err_unreg_teedev:
+	tee_device_unregister(tstee->teedev);
+err_free_pool:
+	tee_shm_pool_free(tstee->pool);
+err_free_tstee:
+	kfree(tstee);
+	return rc;
+}
+
+static void tstee_remove(struct ffa_device *ffa_dev)
+{
+	struct tstee *tstee = ffa_dev->dev.driver_data;
+
+	tee_device_unregister(tstee->teedev);
+	tee_shm_pool_free(tstee->pool);
+	kfree(tstee);
+}
+
+static const struct ffa_device_id tstee_device_ids[] = {
+	/* TS RPC protocol UUID: bdcd76d7-825e-4751-963b-86d4f84943ac */
+	{ TS_RPC_UUID },
+	{}
+};
+
+static struct ffa_driver tstee_driver = {
+	.name = "arm_tstee",
+	.probe = tstee_probe,
+	.remove = tstee_remove,
+	.id_table = tstee_device_ids,
+};
+
+module_ffa_driver(tstee_driver);
+
+MODULE_AUTHOR("Balint Dobszay <balint.dobszay@arm.com>");
+MODULE_DESCRIPTION("Arm Trusted Services TEE driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/tee/tstee/tstee_private.h b/drivers/tee/tstee/tstee_private.h
new file mode 100644
index 000000000000..8e58725b57eb
--- /dev/null
+++ b/drivers/tee/tstee/tstee_private.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2023, Arm Limited
+ */
+
+#ifndef TSTEE_PRIVATE_H
+#define TSTEE_PRIVATE_H
+
+#include <linux/arm_ffa.h>
+#include <linux/bitops.h>
+#include <linux/tee_core.h>
+#include <linux/types.h>
+#include <linux/uuid.h>
+#include <linux/xarray.h>
+
+/*
+ * The description of the ABI implemented in this file is available at
+ * https://trusted-services.readthedocs.io/en/v1.0.0/developer/service-access-protocols.html#abi
+ */
+
+/* UUID of this protocol */
+#define TS_RPC_UUID UUID_INIT(0xbdcd76d7, 0x825e, 0x4751, \
+			      0x96, 0x3b, 0x86, 0xd4, 0xf8, 0x49, 0x43, 0xac)
+
+/* Protocol version*/
+#define TS_RPC_PROTOCOL_VERSION		(1)
+
+/* Status codes */
+#define TS_RPC_OK			(0)
+
+/* RPC control register */
+#define TS_RPC_CTRL_REG			(0)
+#define OPCODE_MASK			GENMASK(15, 0)
+#define IFACE_ID_MASK			GENMASK(23, 16)
+#define TS_RPC_CTRL_OPCODE(x)		((u16)(FIELD_GET(OPCODE_MASK, (x))))
+#define TS_RPC_CTRL_IFACE_ID(x)		((u8)(FIELD_GET(IFACE_ID_MASK, (x))))
+#define TS_RPC_CTRL_PACK_IFACE_OPCODE(i, o)	\
+	(FIELD_PREP(IFACE_ID_MASK, (i)) | FIELD_PREP(OPCODE_MASK, (o)))
+#define TS_RPC_CTRL_SAP_RC		BIT(30)
+#define TS_RPC_CTRL_SAP_ERR		BIT(31)
+
+/* Interface ID for RPC management operations */
+#define TS_RPC_MGMT_IFACE_ID		(0xff)
+
+/* Management calls */
+#define TS_RPC_OP_GET_VERSION		(0x0000)
+#define TS_RPC_GET_VERSION_RESP		(1)
+
+#define TS_RPC_OP_RETRIEVE_MEM		(0x0001)
+#define TS_RPC_RETRIEVE_MEM_HANDLE_LSW	(1)
+#define TS_RPC_RETRIEVE_MEM_HANDLE_MSW	(2)
+#define TS_RPC_RETRIEVE_MEM_TAG_LSW	(3)
+#define TS_RPC_RETRIEVE_MEM_TAG_MSW	(4)
+#define TS_RPC_RETRIEVE_MEM_RPC_STATUS	(1)
+
+#define TS_RPC_OP_RELINQ_MEM		(0x0002)
+#define TS_RPC_RELINQ_MEM_HANDLE_LSW	(1)
+#define TS_RPC_RELINQ_MEM_HANDLE_MSW	(2)
+#define TS_RPC_RELINQ_MEM_RPC_STATUS	(1)
+
+#define TS_RPC_OP_SERVICE_INFO		(0x0003)
+#define TS_RPC_SERVICE_INFO_UUID0	(1)
+#define TS_RPC_SERVICE_INFO_UUID1	(2)
+#define TS_RPC_SERVICE_INFO_UUID2	(3)
+#define TS_RPC_SERVICE_INFO_UUID3	(4)
+#define TS_RPC_SERVICE_INFO_RPC_STATUS	(1)
+#define TS_RPC_SERVICE_INFO_IFACE	(2)
+
+/* Service call */
+#define TS_RPC_SERVICE_MEM_HANDLE_LSW	(1)
+#define TS_RPC_SERVICE_MEM_HANDLE_MSW	(2)
+#define TS_RPC_SERVICE_REQ_LEN		(3)
+#define TS_RPC_SERVICE_CLIENT_ID	(4)
+#define TS_RPC_SERVICE_RPC_STATUS	(1)
+#define TS_RPC_SERVICE_STATUS		(2)
+#define TS_RPC_SERVICE_RESP_LEN		(3)
+
+struct tstee {
+	struct ffa_device *ffa_dev;
+	struct tee_device *teedev;
+	struct tee_shm_pool *pool;
+};
+
+struct ts_session {
+	u8 iface_id;
+};
+
+struct ts_context_data {
+	struct xarray sess_list;
+};
+
+#endif /* TSTEE_PRIVATE_H */
diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h
index 23e57164693c..d0430bee8292 100644
--- a/include/uapi/linux/tee.h
+++ b/include/uapi/linux/tee.h
@@ -56,6 +56,7 @@
  */
 #define TEE_IMPL_ID_OPTEE	1
 #define TEE_IMPL_ID_AMDTEE	2
+#define TEE_IMPL_ID_TSTEE	3
 
 /*
  * OP-TEE specific capabilities
-- 
cgit v1.2.3


From afde996a33ee4dbe3692e1eff28b56c820331428 Mon Sep 17 00:00:00 2001
From: Dhruva Gole <d-gole@ti.com>
Date: Mon, 18 Mar 2024 20:46:32 +0530
Subject: PM: wakeup: make device_wakeup_disable() return void

The device_wakeup_disable() call only returns an error if no dev exists,
but there's not much a user can do at that point.

Rather, make this function return void.

Signed-off-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c       | 11 +++++++----
 drivers/mmc/host/sdhci-pci-core.c |  2 +-
 include/linux/pm_wakeup.h         |  5 ++---
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index a917219feea6..752b417e8129 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -451,16 +451,15 @@ static struct wakeup_source *device_wakeup_detach(struct device *dev)
  * Detach the @dev's wakeup source object from it, unregister this wakeup source
  * object and destroy it.
  */
-int device_wakeup_disable(struct device *dev)
+void device_wakeup_disable(struct device *dev)
 {
 	struct wakeup_source *ws;
 
 	if (!dev || !dev->power.can_wakeup)
-		return -EINVAL;
+		return;
 
 	ws = device_wakeup_detach(dev);
 	wakeup_source_unregister(ws);
-	return 0;
 }
 EXPORT_SYMBOL_GPL(device_wakeup_disable);
 
@@ -502,7 +501,11 @@ EXPORT_SYMBOL_GPL(device_set_wakeup_capable);
  */
 int device_set_wakeup_enable(struct device *dev, bool enable)
 {
-	return enable ? device_wakeup_enable(dev) : device_wakeup_disable(dev);
+	if (enable)
+		return device_wakeup_enable(dev);
+
+	device_wakeup_disable(dev);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(device_set_wakeup_enable);
 
diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c
index 025b31aa712c..ef89ec382bfe 100644
--- a/drivers/mmc/host/sdhci-pci-core.c
+++ b/drivers/mmc/host/sdhci-pci-core.c
@@ -63,7 +63,7 @@ static int sdhci_pci_init_wakeup(struct sdhci_pci_chip *chip)
 	if ((pm_flags & MMC_PM_KEEP_POWER) && (pm_flags & MMC_PM_WAKE_SDIO_IRQ))
 		return device_wakeup_enable(&chip->pdev->dev);
 	else if (!cap_cd_wake)
-		return device_wakeup_disable(&chip->pdev->dev);
+		device_wakeup_disable(&chip->pdev->dev);
 
 	return 0;
 }
diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 6eb9adaef52b..428803eed798 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -107,7 +107,7 @@ extern void wakeup_sources_read_unlock(int idx);
 extern struct wakeup_source *wakeup_sources_walk_start(void);
 extern struct wakeup_source *wakeup_sources_walk_next(struct wakeup_source *ws);
 extern int device_wakeup_enable(struct device *dev);
-extern int device_wakeup_disable(struct device *dev);
+extern void device_wakeup_disable(struct device *dev);
 extern void device_set_wakeup_capable(struct device *dev, bool capable);
 extern int device_set_wakeup_enable(struct device *dev, bool enable);
 extern void __pm_stay_awake(struct wakeup_source *ws);
@@ -154,10 +154,9 @@ static inline int device_wakeup_enable(struct device *dev)
 	return 0;
 }
 
-static inline int device_wakeup_disable(struct device *dev)
+static inline void device_wakeup_disable(struct device *dev)
 {
 	dev->power.should_wakeup = false;
-	return 0;
 }
 
 static inline int device_set_wakeup_enable(struct device *dev, bool enable)
-- 
cgit v1.2.3


From 3642c7ed52312ac2b95c9aba45c40e50bd8798ad Mon Sep 17 00:00:00 2001
From: Dhruva Gole <d-gole@ti.com>
Date: Mon, 18 Mar 2024 20:46:33 +0530
Subject: PM: wakeup: Remove unnecessary else from device_init_wakeup()

Checkpatch warns that else is generally not necessary after a return
condition which exists in the if part of this function.

Hence, just to abide by what checkpatch recommends, follow it's
guidelines.

Signed-off-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_wakeup.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 428803eed798..76cd1f9f1365 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -234,11 +234,10 @@ static inline int device_init_wakeup(struct device *dev, bool enable)
 	if (enable) {
 		device_set_wakeup_capable(dev, true);
 		return device_wakeup_enable(dev);
-	} else {
-		device_wakeup_disable(dev);
-		device_set_wakeup_capable(dev, false);
-		return 0;
 	}
+	device_wakeup_disable(dev);
+	device_set_wakeup_capable(dev, false);
+	return 0;
 }
 
 #endif /* _LINUX_PM_WAKEUP_H */
-- 
cgit v1.2.3


From 4d0333798ebbfa1683cc3bc056d1b25b8c24344c Mon Sep 17 00:00:00 2001
From: Simon Trimmer <simont@opensource.cirrus.com>
Date: Mon, 25 Mar 2024 11:31:23 +0000
Subject: firmware: cs_dsp: Add locked wrappers for coeff read and write

It is a common pattern for functions to take and release the DSP
pwr_lock over the cs_dsp calls to read and write firmware controls.
Add wrapper functions to do this sequence so that the calling code can
be simplified to a single function call..

Signed-off-by: Simon Trimmer <simont@opensource.cirrus.com>
Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Reviewed-by: Takashi Iwai <tiwai@suse.de>
Link: https://msgid.link/r/20240325113127.112783-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/firmware/cirrus/cs_dsp.c       | 54 ++++++++++++++++++++++++++++++++++
 include/linux/firmware/cirrus/cs_dsp.h |  4 +++
 2 files changed, 58 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/cirrus/cs_dsp.c b/drivers/firmware/cirrus/cs_dsp.c
index 79d4254d1f9b..1263382e0d10 100644
--- a/drivers/firmware/cirrus/cs_dsp.c
+++ b/drivers/firmware/cirrus/cs_dsp.c
@@ -819,6 +819,33 @@ int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl,
 }
 EXPORT_SYMBOL_NS_GPL(cs_dsp_coeff_write_ctrl, FW_CS_DSP);
 
+/**
+ * cs_dsp_coeff_lock_and_write_ctrl() - Writes the given buffer to the given coefficient control
+ * @ctl: pointer to coefficient control
+ * @off: word offset at which data should be written
+ * @buf: the buffer to write to the given control
+ * @len: the length of the buffer in bytes
+ *
+ * Same as cs_dsp_coeff_write_ctrl() but takes pwr_lock.
+ *
+ * Return: A negative number on error, 1 when the control value changed and 0 when it has not.
+ */
+int cs_dsp_coeff_lock_and_write_ctrl(struct cs_dsp_coeff_ctl *ctl,
+				     unsigned int off, const void *buf, size_t len)
+{
+	struct cs_dsp *dsp = ctl->dsp;
+	int ret;
+
+	lockdep_assert_not_held(&dsp->pwr_lock);
+
+	mutex_lock(&dsp->pwr_lock);
+	ret = cs_dsp_coeff_write_ctrl(ctl, off, buf, len);
+	mutex_unlock(&dsp->pwr_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cs_dsp_coeff_lock_and_write_ctrl);
+
 static int cs_dsp_coeff_read_ctrl_raw(struct cs_dsp_coeff_ctl *ctl,
 				      unsigned int off, void *buf, size_t len)
 {
@@ -891,6 +918,33 @@ int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl,
 }
 EXPORT_SYMBOL_NS_GPL(cs_dsp_coeff_read_ctrl, FW_CS_DSP);
 
+/**
+ * cs_dsp_coeff_lock_and_read_ctrl() - Reads the given coefficient control into the given buffer
+ * @ctl: pointer to coefficient control
+ * @off: word offset at which data should be read
+ * @buf: the buffer to store to the given control
+ * @len: the length of the buffer in bytes
+ *
+ * Same as cs_dsp_coeff_read_ctrl() but takes pwr_lock.
+ *
+ * Return: Zero for success, a negative number on error.
+ */
+int cs_dsp_coeff_lock_and_read_ctrl(struct cs_dsp_coeff_ctl *ctl,
+				    unsigned int off, void *buf, size_t len)
+{
+	struct cs_dsp *dsp = ctl->dsp;
+	int ret;
+
+	lockdep_assert_not_held(&dsp->pwr_lock);
+
+	mutex_lock(&dsp->pwr_lock);
+	ret = cs_dsp_coeff_read_ctrl(ctl, off, buf, len);
+	mutex_unlock(&dsp->pwr_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(cs_dsp_coeff_lock_and_read_ctrl);
+
 static int cs_dsp_coeff_init_control_caches(struct cs_dsp *dsp)
 {
 	struct cs_dsp_coeff_ctl *ctl;
diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h
index 29cd11d5a3cf..6097b1f701f1 100644
--- a/include/linux/firmware/cirrus/cs_dsp.h
+++ b/include/linux/firmware/cirrus/cs_dsp.h
@@ -239,8 +239,12 @@ void cs_dsp_cleanup_debugfs(struct cs_dsp *dsp);
 int cs_dsp_coeff_write_acked_control(struct cs_dsp_coeff_ctl *ctl, unsigned int event_id);
 int cs_dsp_coeff_write_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off,
 			    const void *buf, size_t len);
+int cs_dsp_coeff_lock_and_write_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off,
+				     const void *buf, size_t len);
 int cs_dsp_coeff_read_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off,
 			   void *buf, size_t len);
+int cs_dsp_coeff_lock_and_read_ctrl(struct cs_dsp_coeff_ctl *ctl, unsigned int off,
+				    void *buf, size_t len);
 struct cs_dsp_coeff_ctl *cs_dsp_get_ctl(struct cs_dsp *dsp, const char *name, int type,
 					unsigned int alg);
 
-- 
cgit v1.2.3


From 7bdbf7446305cb65c510c16d57cde82bc76b234a Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Mon, 1 Apr 2024 19:13:02 -0700
Subject: bpf: add special internal-only MOV instruction to resolve per-CPU
 addrs

Add a new BPF instruction for resolving absolute addresses of per-CPU
data from their per-CPU offsets. This instruction is internal-only and
users are not allowed to use them directly. They will only be used for
internal inlining optimizations for now between BPF verifier and BPF JITs.

We use a special BPF_MOV | BPF_ALU64 | BPF_X form with insn->off field
set to BPF_ADDR_PERCPU = -1. I used negative offset value to distinguish
them from positive ones used by user-exposed instructions.

Such instruction performs a resolution of a per-CPU offset stored in
a register to a valid kernel address which can be dereferenced. It is
useful in any use case where absolute address of a per-CPU data has to
be resolved (e.g., in inlining bpf_map_lookup_elem()).

BPF disassembler is also taught to recognize them to support dumping
final BPF assembly code (non-JIT'ed version).

Add arch-specific way for BPF JITs to mark support for this instructions.

This patch also adds support for these instructions in x86-64 BPF JIT.

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/r/20240402021307.1012571-2-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 16 ++++++++++++++++
 include/linux/filter.h      | 20 ++++++++++++++++++++
 kernel/bpf/core.c           |  5 +++++
 kernel/bpf/disasm.c         | 14 ++++++++++++++
 4 files changed, 55 insertions(+)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 3b639d6f2f54..af89dd117dce 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1382,6 +1382,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 				maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
 				EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
 				break;
+			} else if (insn_is_mov_percpu_addr(insn)) {
+				u32 off = (u32)(unsigned long)&this_cpu_off;
+
+				/* mov <dst>, <src> (if necessary) */
+				EMIT_mov(dst_reg, src_reg);
+
+				/* add <dst>, gs:[<off>] */
+				EMIT2(0x65, add_1mod(0x48, dst_reg));
+				EMIT3(0x03, add_1reg(0x04, dst_reg), 0x25);
+				EMIT(off, 4);
+				break;
 			}
 			fallthrough;
 		case BPF_ALU | BPF_MOV | BPF_X:
@@ -3365,6 +3376,11 @@ bool bpf_jit_supports_subprog_tailcalls(void)
 	return true;
 }
 
+bool bpf_jit_supports_percpu_insn(void)
+{
+	return true;
+}
+
 void bpf_jit_free(struct bpf_prog *prog)
 {
 	if (prog->jited) {
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 531b36090122..161d5f7b64ed 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -178,6 +178,25 @@ struct ctl_table_header;
 		.off   = 0,					\
 		.imm   = 0 })
 
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU	(-1)
+
+#define BPF_MOV64_PERCPU_REG(DST, SRC)				\
+	((struct bpf_insn) {					\
+		.code  = BPF_ALU64 | BPF_MOV | BPF_X,		\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = BPF_ADDR_PERCPU,			\
+		.imm   = 0 })
+
+static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
 /* Short form of mov, dst_reg = imm32 */
 
 #define BPF_MOV64_IMM(DST, IMM)					\
@@ -972,6 +991,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
 bool bpf_jit_supports_subprog_tailcalls(void);
+bool bpf_jit_supports_percpu_insn(void);
 bool bpf_jit_supports_kfunc_call(void);
 bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ae406a2814db..7a33a3a7e63c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2945,6 +2945,11 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_percpu_insn(void)
+{
+	return false;
+}
+
 bool __weak bpf_jit_supports_kfunc_call(void)
 {
 	return false;
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index bd2e2dd04740..309c4aa1b026 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn)
 		insn->off == BPF_ADDR_SPACE_CAST;
 }
 
+/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
+ * dst_reg = src_reg + <percpu_base_off>
+ * BPF_ADDR_PERCPU is used as a special insn->off value.
+ */
+#define BPF_ADDR_PERCPU	(-1)
+
+static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
+{
+	return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
+}
+
 void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 		    const struct bpf_insn *insn,
 		    bool allow_ptr_leaks)
@@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
 			verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
 				insn->code, insn->dst_reg,
 				insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
+		} else if (is_mov_percpu_addr(insn)) {
+			verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
+				insn->code, insn->dst_reg, insn->src_reg);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
 				insn->code, class == BPF_ALU ? 'w' : 'r',
-- 
cgit v1.2.3


From 9807080e2170a9e5a7428d0a0e8d082585b6451f Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 1 Apr 2024 15:36:04 +0800
Subject: trace: adjust TP_STORE_ADDR_PORTS_SKB() parameters

Introducing entry_saddr and entry_daddr parameters in this macro
for later use can help us record the reverse 4-tuple by analyzing
the 4-tuple of the incoming skb when receiving.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240401073605.37335-2-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/net_probe_common.h | 20 +++++++++++---------
 include/trace/events/tcp.h              |  2 +-
 include/trace/events/udp.h              |  2 +-
 3 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/net_probe_common.h b/include/trace/events/net_probe_common.h
index 5e33f91bdea3..976a58364bff 100644
--- a/include/trace/events/net_probe_common.h
+++ b/include/trace/events/net_probe_common.h
@@ -70,14 +70,14 @@
 	TP_STORE_V4MAPPED(__entry, saddr, daddr)
 #endif
 
-#define TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh)		\
+#define TP_STORE_ADDR_PORTS_SKB_V4(skb, protoh, entry_saddr, entry_daddr) \
 	do {								\
-		struct sockaddr_in *v4 = (void *)__entry->saddr;	\
+		struct sockaddr_in *v4 = (void *)entry_saddr;		\
 									\
 		v4->sin_family = AF_INET;				\
 		v4->sin_port = protoh->source;				\
 		v4->sin_addr.s_addr = ip_hdr(skb)->saddr;		\
-		v4 = (void *)__entry->daddr;				\
+		v4 = (void *)entry_daddr;				\
 		v4->sin_family = AF_INET;				\
 		v4->sin_port = protoh->dest;				\
 		v4->sin_addr.s_addr = ip_hdr(skb)->daddr;		\
@@ -85,28 +85,30 @@
 
 #if IS_ENABLED(CONFIG_IPV6)
 
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh)			\
+#define TP_STORE_ADDR_PORTS_SKB(skb, protoh, entry_saddr, entry_daddr)	\
 	do {								\
 		const struct iphdr *iph = ip_hdr(skb);			\
 									\
 		if (iph->version == 6) {				\
-			struct sockaddr_in6 *v6 = (void *)__entry->saddr; \
+			struct sockaddr_in6 *v6 = (void *)entry_saddr;	\
 									\
 			v6->sin6_family = AF_INET6;			\
 			v6->sin6_port = protoh->source;			\
 			v6->sin6_addr = ipv6_hdr(skb)->saddr;		\
-			v6 = (void *)__entry->daddr;			\
+			v6 = (void *)entry_daddr;			\
 			v6->sin6_family = AF_INET6;			\
 			v6->sin6_port = protoh->dest;			\
 			v6->sin6_addr = ipv6_hdr(skb)->daddr;		\
 		} else							\
-			TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh); \
+			TP_STORE_ADDR_PORTS_SKB_V4(skb, protoh,		\
+						   entry_saddr,		\
+						   entry_daddr);	\
 	} while (0)
 
 #else
 
-#define TP_STORE_ADDR_PORTS_SKB(__entry, skb, protoh)		\
-	TP_STORE_ADDR_PORTS_SKB_V4(__entry, skb, protoh)
+#define TP_STORE_ADDR_PORTS_SKB(skb, protoh, entry_saddr, entry_daddr)	\
+	TP_STORE_ADDR_PORTS_SKB_V4(skb, protoh, entry_saddr, entry_daddr)
 
 #endif
 
diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 1db95175c1e5..cf14b6fcbeed 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -295,7 +295,7 @@ DECLARE_EVENT_CLASS(tcp_event_skb,
 		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
 		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
 
-		TP_STORE_ADDR_PORTS_SKB(__entry, skb, th);
+		TP_STORE_ADDR_PORTS_SKB(skb, th, __entry->saddr, __entry->daddr);
 	),
 
 	TP_printk("skbaddr=%p src=%pISpc dest=%pISpc",
diff --git a/include/trace/events/udp.h b/include/trace/events/udp.h
index 62bebe2a6ece..6142be4068e2 100644
--- a/include/trace/events/udp.h
+++ b/include/trace/events/udp.h
@@ -38,7 +38,7 @@ TRACE_EVENT(udp_fail_queue_rcv_skb,
 		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
 		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
 
-		TP_STORE_ADDR_PORTS_SKB(__entry, skb, uh);
+		TP_STORE_ADDR_PORTS_SKB(skb, uh, __entry->saddr, __entry->daddr);
 	),
 
 	TP_printk("rc=%d family=%s src=%pISpc dest=%pISpc", __entry->rc,
-- 
cgit v1.2.3


From 19822a980e1956a6572998887a7df5a0607a32f6 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Mon, 1 Apr 2024 15:36:05 +0800
Subject: trace: tcp: fully support trace_tcp_send_reset

Prior to this patch, what we can see by enabling trace_tcp_send is
only happening under two circumstances:
1) active rst mode
2) non-active rst mode and based on the full socket

That means the inconsistency occurs if we use tcpdump and trace
simultaneously to see how rst happens.

It's necessary that we should take into other cases into considerations,
say:
1) time-wait socket
2) no socket
...

By parsing the incoming skb and reversing its 4-tuple can
we know the exact 'flow' which might not exist.

Samples after applied this patch:
1. tcp_send_reset: skbaddr=XXX skaddr=XXX src=ip:port dest=ip:port
state=TCP_ESTABLISHED
2. tcp_send_reset: skbaddr=000...000 skaddr=XXX src=ip:port dest=ip:port
state=UNKNOWN
Note:
1) UNKNOWN means we cannot extract the right information from skb.
2) skbaddr/skaddr could be 0

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Link: https://lore.kernel.org/r/20240401073605.37335-3-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/tcp.h | 40 ++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_ipv4.c        |  7 +++----
 net/ipv6/tcp_ipv6.c        |  3 ++-
 3 files changed, 43 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index cf14b6fcbeed..5c04a61a11c2 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -78,11 +78,47 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
-DEFINE_EVENT(tcp_event_sk_skb, tcp_send_reset,
+TRACE_EVENT(tcp_send_reset,
 
 	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
 
-	TP_ARGS(sk, skb)
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		__field(const void *, skbaddr)
+		__field(const void *, skaddr)
+		__field(int, state)
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		__entry->skaddr = sk;
+		/* Zero means unknown state. */
+		__entry->state = sk ? sk->sk_state : 0;
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		if (sk && sk_fullsock(sk)) {
+			const struct inet_sock *inet = inet_sk(sk);
+
+			TP_STORE_ADDR_PORTS(__entry, inet, sk);
+		} else if (skb) {
+			const struct tcphdr *th = (const struct tcphdr *)skb->data;
+			/*
+			 * We should reverse the 4-tuple of skb, so later
+			 * it can print the right flow direction of rst.
+			 */
+			TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, entry->saddr);
+		}
+	),
+
+	TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+		  __entry->skbaddr, __entry->skaddr,
+		  __entry->saddr, __entry->daddr,
+		  __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN")
 );
 
 /*
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1e0a9762f92e..56b75efcfd12 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -866,11 +866,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
 	 * routing might fail in this case. No choice here, if we choose to force
 	 * input interface, we will misroute in case of asymmetric route.
 	 */
-	if (sk) {
+	if (sk)
 		arg.bound_dev_if = sk->sk_bound_dev_if;
-		if (sk_fullsock(sk))
-			trace_tcp_send_reset(sk, skb);
-	}
+
+	trace_tcp_send_reset(sk, skb);
 
 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5ae74f661d25..cffebaec66f1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1113,7 +1113,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 	if (sk) {
 		oif = sk->sk_bound_dev_if;
 		if (sk_fullsock(sk)) {
-			trace_tcp_send_reset(sk, skb);
 			if (inet6_test_bit(REPFLOW, sk))
 				label = ip6_flowlabel(ipv6h);
 			priority = READ_ONCE(sk->sk_priority);
@@ -1129,6 +1128,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
 			label = ip6_flowlabel(ipv6h);
 	}
 
+	trace_tcp_send_reset(sk, skb);
+
 	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 			     ipv6_get_dsfield(ipv6h), label, priority, txhash,
 			     &key);
-- 
cgit v1.2.3


From 118f457da9ed58a79e24b73c2ef0aa1987241f0e Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 1 Apr 2024 10:31:25 -0700
Subject: af_unix: Remove lock dance in unix_peek_fds().

In the previous GC implementation, the shape of the inflight socket
graph was not expected to change while GC was in progress.

MSG_PEEK was tricky because it could install inflight fd silently
and transform the graph.

Let's say we peeked a fd, which was a listening socket, and accept()ed
some embryo sockets from it.  The garbage collection algorithm would
have been confused because the set of sockets visited in scan_inflight()
would change within the same GC invocation.

That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in
unix_peek_fds() with a fat comment.

In the new GC implementation, we no longer garbage-collect the socket
if it exists in another queue, that is, if it has a bridge to another
SCC.  Also, accept() will require the lock if it has edges.

Thus, we need not do the complicated lock dance.

Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/af_unix.h |  1 -
 net/unix/af_unix.c    | 42 ------------------------------------------
 net/unix/garbage.c    |  2 +-
 3 files changed, 1 insertion(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 226a8da2cbe3..7311b77edfc7 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
 }
 #endif
 
-extern spinlock_t unix_gc_lock;
 extern unsigned int unix_tot_inflight;
 void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
 void unix_del_edges(struct scm_fp_list *fpl);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 78be8b520cef..61ecfa9c9c6b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1814,48 +1814,6 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
 {
 	scm->fp = scm_fp_dup(UNIXCB(skb).fp);
-
-	/*
-	 * Garbage collection of unix sockets starts by selecting a set of
-	 * candidate sockets which have reference only from being in flight
-	 * (total_refs == inflight_refs).  This condition is checked once during
-	 * the candidate collection phase, and candidates are marked as such, so
-	 * that non-candidates can later be ignored.  While inflight_refs is
-	 * protected by unix_gc_lock, total_refs (file count) is not, hence this
-	 * is an instantaneous decision.
-	 *
-	 * Once a candidate, however, the socket must not be reinstalled into a
-	 * file descriptor while the garbage collection is in progress.
-	 *
-	 * If the above conditions are met, then the directed graph of
-	 * candidates (*) does not change while unix_gc_lock is held.
-	 *
-	 * Any operations that changes the file count through file descriptors
-	 * (dup, close, sendmsg) does not change the graph since candidates are
-	 * not installed in fds.
-	 *
-	 * Dequeing a candidate via recvmsg would install it into an fd, but
-	 * that takes unix_gc_lock to decrement the inflight count, so it's
-	 * serialized with garbage collection.
-	 *
-	 * MSG_PEEK is special in that it does not change the inflight count,
-	 * yet does install the socket into an fd.  The following lock/unlock
-	 * pair is to ensure serialization with garbage collection.  It must be
-	 * done between incrementing the file count and installing the file into
-	 * an fd.
-	 *
-	 * If garbage collection starts after the barrier provided by the
-	 * lock/unlock, then it will see the elevated refcount and not mark this
-	 * as a candidate.  If a garbage collection is already in progress
-	 * before the file count was incremented, then the lock/unlock pair will
-	 * ensure that garbage collection is finished before progressing to
-	 * installing the fd.
-	 *
-	 * (*) A -> B where B is on the queue of A or B is on the queue of C
-	 * which is on the queue of listening socket A.
-	 */
-	spin_lock(&unix_gc_lock);
-	spin_unlock(&unix_gc_lock);
 }
 
 static void unix_destruct_scm(struct sk_buff *skb)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 89ea71d9297b..12a4ec27e0d4 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -183,7 +183,7 @@ static void unix_free_vertices(struct scm_fp_list *fpl)
 	}
 }
 
-DEFINE_SPINLOCK(unix_gc_lock);
+static DEFINE_SPINLOCK(unix_gc_lock);
 unsigned int unix_tot_inflight;
 
 void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
-- 
cgit v1.2.3


From ada9841e3e665bb320d2eedcf33f640198e2378d Mon Sep 17 00:00:00 2001
From: Pawel Dembicki <paweldembicki@gmail.com>
Date: Tue, 2 Apr 2024 22:11:18 +0200
Subject: net: phy: marvell: add basic support of 88E308X/88E609X family

This patch implements only basic support.

It covers PHY used in multiple IC:
PHY: 88E3082, 88E3083
Switch: 88E6096, 88E6097

Signed-off-by: Pawel Dembicki <paweldembicki@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/20240402201123.2961909-1-paweldembicki@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/marvell.c   | 13 +++++++++++++
 include/linux/marvell_phy.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 42ed013385bf..fae7eb57ee2c 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -3289,6 +3289,18 @@ static struct phy_driver marvell_drivers[] = {
 		.get_strings = marvell_get_strings,
 		.get_stats = marvell_get_stats,
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E3082,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E308X/88E609X Family",
+		/* PHY_BASIC_FEATURES */
+		.probe = marvell_probe,
+		.config_init = marvell_config_init,
+		.aneg_done = marvell_aneg_done,
+		.read_status = marvell_read_status,
+		.resume = genphy_resume,
+		.suspend = genphy_suspend,
+	},
 	{
 		.phy_id = MARVELL_PHY_ID_88E1112,
 		.phy_id_mask = MARVELL_PHY_ID_MASK,
@@ -3742,6 +3754,7 @@ module_phy_driver(marvell_drivers);
 
 static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1101, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E3082, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1112, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1111, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1111_FINISAR, MARVELL_PHY_ID_MASK },
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 693eba9869e4..88254f9aec2b 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -7,6 +7,7 @@
 
 /* Known PHY IDs */
 #define MARVELL_PHY_ID_88E1101		0x01410c60
+#define MARVELL_PHY_ID_88E3082		0x01410c80
 #define MARVELL_PHY_ID_88E1112		0x01410c90
 #define MARVELL_PHY_ID_88E1111		0x01410cc0
 #define MARVELL_PHY_ID_88E1118		0x01410e10
-- 
cgit v1.2.3


From 9cc8a6e62624e4a0e0bcf8dfd10cc89db6e0f14b Mon Sep 17 00:00:00 2001
From: Pawel Dembicki <paweldembicki@gmail.com>
Date: Tue, 2 Apr 2024 22:11:19 +0200
Subject: net: ethtool: Add impedance mismatch result code to cable test

Some PHYs can recognize during a cable test if the impedance in the cable
is okay. They can detect reflections caused by impedance discontinuity
between a regular 100 Ohm cable and an abnormal part with a higher or
lower impedance.

This commit introduces a new result code:
ETHTOOL_A_CABLE_RESULT_CODE_IMPEDANCE_MISMATCH,
which represents the results of a cable test indicating issues with
impedance integrity.

Signed-off-by: Pawel Dembicki <paweldembicki@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240402201123.2961909-2-paweldembicki@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 3f89074aa06c..accbb1a231df 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -515,6 +515,10 @@ enum {
 	ETHTOOL_A_CABLE_RESULT_CODE_OPEN,
 	ETHTOOL_A_CABLE_RESULT_CODE_SAME_SHORT,
 	ETHTOOL_A_CABLE_RESULT_CODE_CROSS_SHORT,
+	/* detected reflection caused by the impedance discontinuity between
+	 * a regular 100 Ohm cable and a part with the abnormal impedance value
+	 */
+	ETHTOOL_A_CABLE_RESULT_CODE_IMPEDANCE_MISMATCH,
 };
 
 enum {
-- 
cgit v1.2.3


From 30f8d23814ea4bb0229a78f7f8f36500c2e2d962 Mon Sep 17 00:00:00 2001
From: Gal Pressman <gal@nvidia.com>
Date: Tue, 2 Apr 2024 16:30:39 +0300
Subject: net/mlx5: Convert uintX_t to uX

In the kernel, the preferred types are uX.

Signed-off-by: Gal Pressman <gal@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240402133043.56322-8-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h                  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c     | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h     | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h       | 4 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_stats.c | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c              | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/fw.c                  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h           | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c  | 2 +-
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c  | 4 ++--
 include/linux/mlx5/device.h                                   | 2 +-
 11 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 84db05fb9389..f5a3ac40f6e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -1160,7 +1160,7 @@ void mlx5e_vxlan_set_netdev_info(struct mlx5e_priv *priv);
 void mlx5e_ethtool_get_drvinfo(struct mlx5e_priv *priv,
 			       struct ethtool_drvinfo *drvinfo);
 void mlx5e_ethtool_get_strings(struct mlx5e_priv *priv,
-			       uint32_t stringset, uint8_t *data);
+			       u32 stringset, u8 *data);
 int mlx5e_ethtool_get_sset_count(struct mlx5e_priv *priv, int sset);
 void mlx5e_ethtool_get_ethtool_stats(struct mlx5e_priv *priv,
 				     struct ethtool_stats *stats, u64 *data);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c
index c7d191f66ad1..4f83e3172767 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.c
@@ -73,7 +73,7 @@ void mlx5e_accel_fs_del_sk(struct mlx5_flow_handle *rule)
 
 struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_flow_steering *fs,
 					       struct sock *sk, u32 tirn,
-					       uint32_t flow_tag)
+					       u32 flow_tag)
 {
 	struct mlx5e_accel_fs_tcp *fs_tcp = mlx5e_fs_get_accel_tcp(fs);
 	struct mlx5_flow_destination dest = {};
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h
index a032bff482a6..7e899c716267 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/fs_tcp.h
@@ -11,14 +11,14 @@ int mlx5e_accel_fs_tcp_create(struct mlx5e_flow_steering *fs);
 void mlx5e_accel_fs_tcp_destroy(struct mlx5e_flow_steering *fs);
 struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_flow_steering *fs,
 					       struct sock *sk, u32 tirn,
-					       uint32_t flow_tag);
+					       u32 flow_tag);
 void mlx5e_accel_fs_del_sk(struct mlx5_flow_handle *rule);
 #else
 static inline int mlx5e_accel_fs_tcp_create(struct mlx5e_flow_steering *fs) { return 0; }
 static inline void mlx5e_accel_fs_tcp_destroy(struct mlx5e_flow_steering *fs) {}
 static inline struct mlx5_flow_handle *mlx5e_accel_fs_add_sk(struct mlx5e_flow_steering *fs,
 							     struct sock *sk, u32 tirn,
-							     uint32_t flow_tag)
+							     u32 flow_tag)
 { return ERR_PTR(-EOPNOTSUPP); }
 static inline void mlx5e_accel_fs_del_sk(struct mlx5_flow_handle *rule) {}
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
index c1844128effa..07a04a142a2e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls.h
@@ -95,7 +95,7 @@ int mlx5e_ktls_init(struct mlx5e_priv *priv);
 void mlx5e_ktls_cleanup(struct mlx5e_priv *priv);
 
 int mlx5e_ktls_get_count(struct mlx5e_priv *priv);
-void mlx5e_ktls_get_strings(struct mlx5e_priv *priv, uint8_t **data);
+void mlx5e_ktls_get_strings(struct mlx5e_priv *priv, u8 **data);
 void mlx5e_ktls_get_stats(struct mlx5e_priv *priv, u64 **data);
 
 #else
@@ -144,7 +144,7 @@ static inline bool mlx5e_is_ktls_rx(struct mlx5_core_dev *mdev)
 static inline int mlx5e_ktls_init(struct mlx5e_priv *priv) { return 0; }
 static inline void mlx5e_ktls_cleanup(struct mlx5e_priv *priv) { }
 static inline int mlx5e_ktls_get_count(struct mlx5e_priv *priv) { return 0; }
-static inline void mlx5e_ktls_get_strings(struct mlx5e_priv *priv, uint8_t **data) { }
+static inline void mlx5e_ktls_get_strings(struct mlx5e_priv *priv, u8 **data) { }
 
 static inline void mlx5e_ktls_get_stats(struct mlx5e_priv *priv, u64 **data) { }
 #endif
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_stats.c
index 7bf79973128b..60be2d72eb9e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_stats.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_stats.c
@@ -58,7 +58,7 @@ int mlx5e_ktls_get_count(struct mlx5e_priv *priv)
 	return ARRAY_SIZE(mlx5e_ktls_sw_stats_desc);
 }
 
-void mlx5e_ktls_get_strings(struct mlx5e_priv *priv, uint8_t **data)
+void mlx5e_ktls_get_strings(struct mlx5e_priv *priv, u8 **data)
 {
 	unsigned int i, n;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 55b7efe21624..a74ee698671c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -274,7 +274,7 @@ out:
 }
 
 static void mlx5e_rep_get_strings(struct net_device *dev,
-				  u32 stringset, uint8_t *data)
+				  u32 stringset, u8 *data)
 {
 	struct mlx5e_priv *priv = netdev_priv(dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index e7faf7e73ca4..2d95a9b7b44e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -283,7 +283,7 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev)
 	return 0;
 }
 
-int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id)
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, u32 *sw_owner_id)
 {
 	u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {};
 	int i;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 58732f44940f..c38342b9f320 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -205,7 +205,7 @@ int mlx5_cmd_enable(struct mlx5_core_dev *dev);
 void mlx5_cmd_disable(struct mlx5_core_dev *dev);
 void mlx5_cmd_set_state(struct mlx5_core_dev *dev,
 			enum mlx5_cmdif_state cmdif_state);
-int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id);
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, u32 *sw_owner_id);
 int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev);
 int mlx5_cmd_fast_teardown_hca(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c
index f708b029425a..e9f6c7ed7a7b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c
@@ -1883,7 +1883,7 @@ dr_ste_v0_build_tnl_gtpu_flex_parser_1_init(struct mlx5dr_ste_build *sb,
 
 static int dr_ste_v0_build_tnl_header_0_1_tag(struct mlx5dr_match_param *value,
 					      struct mlx5dr_ste_build *sb,
-					      uint8_t *tag)
+					      u8 *tag)
 {
 	struct mlx5dr_match_misc5 *misc5 = &value->misc5;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
index dd856cde188d..1d49704b9542 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
@@ -1897,7 +1897,7 @@ void dr_ste_v1_build_flex_parser_tnl_geneve_init(struct mlx5dr_ste_build *sb,
 
 static int dr_ste_v1_build_tnl_header_0_1_tag(struct mlx5dr_match_param *value,
 					      struct mlx5dr_ste_build *sb,
-					      uint8_t *tag)
+					      u8 *tag)
 {
 	struct mlx5dr_match_misc5 *misc5 = &value->misc5;
 
@@ -2129,7 +2129,7 @@ dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_init(struct mlx5dr_ste_build *sb,
 static int
 dr_ste_v1_build_flex_parser_tnl_geneve_tlv_opt_exist_tag(struct mlx5dr_match_param *value,
 							 struct mlx5dr_ste_build *sb,
-							 uint8_t *tag)
+							 u8 *tag)
 {
 	u8 parser_id = sb->caps->flex_parser_id_geneve_tlv_option_0;
 	struct mlx5dr_match_misc *misc = &value->misc;
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 01275c6e8468..da61be87a12e 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -68,7 +68,7 @@
 #define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8)
 #define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32)
 #define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8)
-#define MLX5_ADDR_OF(typ, p, fld) ((void *)((uint8_t *)(p) + MLX5_BYTE_OFF(typ, fld)))
+#define MLX5_ADDR_OF(typ, p, fld) ((void *)((u8 *)(p) + MLX5_BYTE_OFF(typ, fld)))
 
 /* insert a value to a struct */
 #define MLX5_SET(typ, p, fld, v) do { \
-- 
cgit v1.2.3


From 137f3d50ad2a0f2e1ebe5181d6b32a5541786b99 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Tue, 2 Apr 2024 16:30:41 +0300
Subject: net/mlx5: Support matching on l4_type for ttc_table

Replace matching on TCP and UDP protocols with new l4_type field which
is parsed by steering for ttc_table. It is enabled by the
outer_l4_type or inner_l4_type bits in nic_rx or port_sel flow table
capabilities and used only if pcc_ifa2 bit in HCA capabilities is set.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240402133043.56322-10-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_fs.c    |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |   3 +-
 .../net/ethernet/mellanox/mlx5/core/lag/port_sel.c |   8 +-
 .../net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c   | 254 +++++++++++++++++----
 .../net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h   |   2 +-
 include/linux/mlx5/device.h                        |   6 +
 include/linux/mlx5/mlx5_ifc.h                      |  32 ++-
 7 files changed, 244 insertions(+), 67 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
index 777d311d44ef..8c5b291a171f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_fs.c
@@ -896,8 +896,7 @@ static void mlx5e_set_inner_ttc_params(struct mlx5e_flow_steering *fs,
 	int tt;
 
 	memset(ttc_params, 0, sizeof(*ttc_params));
-	ttc_params->ns = mlx5_get_flow_namespace(fs->mdev,
-						 MLX5_FLOW_NAMESPACE_KERNEL);
+	ttc_params->ns_type = MLX5_FLOW_NAMESPACE_KERNEL;
 	ft_attr->level = MLX5E_INNER_TTC_FT_LEVEL;
 	ft_attr->prio = MLX5E_NIC_PRIO;
 
@@ -920,8 +919,7 @@ void mlx5e_set_ttc_params(struct mlx5e_flow_steering *fs,
 	int tt;
 
 	memset(ttc_params, 0, sizeof(*ttc_params));
-	ttc_params->ns = mlx5_get_flow_namespace(fs->mdev,
-						 MLX5_FLOW_NAMESPACE_KERNEL);
+	ttc_params->ns_type = MLX5_FLOW_NAMESPACE_KERNEL;
 	ft_attr->level = MLX5E_TTC_FT_LEVEL;
 	ft_attr->prio = MLX5E_NIC_PRIO;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index f2f19fa37d91..aeb32cb27182 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -835,8 +835,7 @@ static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp,
 
 	memset(ttc_params, 0, sizeof(*ttc_params));
 
-	ttc_params->ns = mlx5_get_flow_namespace(hp->func_mdev,
-						 MLX5_FLOW_NAMESPACE_KERNEL);
+	ttc_params->ns_type = MLX5_FLOW_NAMESPACE_KERNEL;
 	for (tt = 0; tt < MLX5_NUM_TT; tt++) {
 		ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 		ttc_params->dests[tt].tir_num =
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c
index 101b3bb90863..c16b462ddedf 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c
@@ -449,13 +449,11 @@ static void set_tt_map(struct mlx5_lag_port_sel *port_sel,
 static void mlx5_lag_set_inner_ttc_params(struct mlx5_lag *ldev,
 					  struct ttc_params *ttc_params)
 {
-	struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev;
 	struct mlx5_lag_port_sel *port_sel = &ldev->port_sel;
 	struct mlx5_flow_table_attr *ft_attr;
 	int tt;
 
-	ttc_params->ns = mlx5_get_flow_namespace(dev,
-						 MLX5_FLOW_NAMESPACE_PORT_SEL);
+	ttc_params->ns_type = MLX5_FLOW_NAMESPACE_PORT_SEL;
 	ft_attr = &ttc_params->ft_attr;
 	ft_attr->level = MLX5_LAG_FT_LEVEL_INNER_TTC;
 
@@ -470,13 +468,11 @@ static void mlx5_lag_set_inner_ttc_params(struct mlx5_lag *ldev,
 static void mlx5_lag_set_outer_ttc_params(struct mlx5_lag *ldev,
 					  struct ttc_params *ttc_params)
 {
-	struct mlx5_core_dev *dev = ldev->pf[MLX5_LAG_P1].dev;
 	struct mlx5_lag_port_sel *port_sel = &ldev->port_sel;
 	struct mlx5_flow_table_attr *ft_attr;
 	int tt;
 
-	ttc_params->ns = mlx5_get_flow_namespace(dev,
-						 MLX5_FLOW_NAMESPACE_PORT_SEL);
+	ttc_params->ns_type = MLX5_FLOW_NAMESPACE_PORT_SEL;
 	ft_attr = &ttc_params->ft_attr;
 	ft_attr->level = MLX5_LAG_FT_LEVEL_TTC;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c
index b78f2ba25c19..9f13cea16446 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c
@@ -9,21 +9,24 @@
 #include "mlx5_core.h"
 #include "lib/fs_ttc.h"
 
-#define MLX5_TTC_NUM_GROUPS	3
-#define MLX5_TTC_GROUP1_SIZE	(BIT(3) + MLX5_NUM_TUNNEL_TT)
-#define MLX5_TTC_GROUP2_SIZE	 BIT(1)
-#define MLX5_TTC_GROUP3_SIZE	 BIT(0)
-#define MLX5_TTC_TABLE_SIZE	(MLX5_TTC_GROUP1_SIZE +\
-				 MLX5_TTC_GROUP2_SIZE +\
-				 MLX5_TTC_GROUP3_SIZE)
-
-#define MLX5_INNER_TTC_NUM_GROUPS	3
-#define MLX5_INNER_TTC_GROUP1_SIZE	BIT(3)
-#define MLX5_INNER_TTC_GROUP2_SIZE	BIT(1)
-#define MLX5_INNER_TTC_GROUP3_SIZE	BIT(0)
-#define MLX5_INNER_TTC_TABLE_SIZE	(MLX5_INNER_TTC_GROUP1_SIZE +\
-					 MLX5_INNER_TTC_GROUP2_SIZE +\
-					 MLX5_INNER_TTC_GROUP3_SIZE)
+#define MLX5_TTC_MAX_NUM_GROUPS		4
+#define MLX5_TTC_GROUP_TCPUDP_SIZE	(MLX5_TT_IPV6_UDP + 1)
+
+struct mlx5_fs_ttc_groups {
+	bool use_l4_type;
+	int num_groups;
+	int group_size[MLX5_TTC_MAX_NUM_GROUPS];
+};
+
+static int mlx5_fs_ttc_table_size(const struct mlx5_fs_ttc_groups *groups)
+{
+	int i, sz = 0;
+
+	for (i = 0; i < groups->num_groups; i++)
+		sz += groups->group_size[i];
+
+	return sz;
+}
 
 /* L3/L4 traffic type classifier */
 struct mlx5_ttc_table {
@@ -138,6 +141,53 @@ static struct mlx5_etype_proto ttc_tunnel_rules[] = {
 
 };
 
+enum TTC_GROUP_TYPE {
+	TTC_GROUPS_DEFAULT = 0,
+	TTC_GROUPS_USE_L4_TYPE = 1,
+};
+
+static const struct mlx5_fs_ttc_groups ttc_groups[] = {
+	[TTC_GROUPS_DEFAULT] = {
+		.num_groups = 3,
+		.group_size = {
+			BIT(3) + MLX5_NUM_TUNNEL_TT,
+			BIT(1),
+			BIT(0),
+		},
+	},
+	[TTC_GROUPS_USE_L4_TYPE] = {
+		.use_l4_type = true,
+		.num_groups = 4,
+		.group_size = {
+			MLX5_TTC_GROUP_TCPUDP_SIZE,
+			BIT(3) + MLX5_NUM_TUNNEL_TT - MLX5_TTC_GROUP_TCPUDP_SIZE,
+			BIT(1),
+			BIT(0),
+		},
+	},
+};
+
+static const struct mlx5_fs_ttc_groups inner_ttc_groups[] = {
+	[TTC_GROUPS_DEFAULT] = {
+		.num_groups = 3,
+		.group_size = {
+			BIT(3),
+			BIT(1),
+			BIT(0),
+		},
+	},
+	[TTC_GROUPS_USE_L4_TYPE] = {
+		.use_l4_type = true,
+		.num_groups = 4,
+		.group_size = {
+			MLX5_TTC_GROUP_TCPUDP_SIZE,
+			BIT(3) - MLX5_TTC_GROUP_TCPUDP_SIZE,
+			BIT(1),
+			BIT(0),
+		},
+	},
+};
+
 u8 mlx5_get_proto_by_tunnel_type(enum mlx5_tunnel_types tt)
 {
 	return ttc_tunnel_rules[tt].proto;
@@ -188,9 +238,29 @@ static u8 mlx5_etype_to_ipv(u16 ethertype)
 	return 0;
 }
 
+static void mlx5_fs_ttc_set_match_proto(void *headers_c, void *headers_v,
+					u8 proto, bool use_l4_type)
+{
+	int l4_type;
+
+	if (use_l4_type && (proto == IPPROTO_TCP || proto == IPPROTO_UDP)) {
+		if (proto == IPPROTO_TCP)
+			l4_type = MLX5_PACKET_L4_TYPE_TCP;
+		else
+			l4_type = MLX5_PACKET_L4_TYPE_UDP;
+
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, l4_type);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, l4_type, l4_type);
+	} else {
+		MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ip_protocol);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, proto);
+	}
+}
+
 static struct mlx5_flow_handle *
 mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft,
-		       struct mlx5_flow_destination *dest, u16 etype, u8 proto)
+		       struct mlx5_flow_destination *dest, u16 etype, u8 proto,
+		       bool use_l4_type)
 {
 	int match_ipv_outer =
 		MLX5_CAP_FLOWTABLE_NIC_RX(dev,
@@ -207,8 +277,13 @@ mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft,
 
 	if (proto) {
 		spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol);
-		MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, proto);
+		mlx5_fs_ttc_set_match_proto(MLX5_ADDR_OF(fte_match_param,
+							 spec->match_criteria,
+							 outer_headers),
+					    MLX5_ADDR_OF(fte_match_param,
+							 spec->match_value,
+							 outer_headers),
+					    proto, use_l4_type);
 	}
 
 	ipv = mlx5_etype_to_ipv(etype);
@@ -234,7 +309,8 @@ mlx5_generate_ttc_rule(struct mlx5_core_dev *dev, struct mlx5_flow_table *ft,
 
 static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev,
 					 struct ttc_params *params,
-					 struct mlx5_ttc_table *ttc)
+					 struct mlx5_ttc_table *ttc,
+					 bool use_l4_type)
 {
 	struct mlx5_flow_handle **trules;
 	struct mlx5_ttc_rule *rules;
@@ -251,7 +327,8 @@ static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev,
 			continue;
 		rule->rule = mlx5_generate_ttc_rule(dev, ft, &params->dests[tt],
 						    ttc_rules[tt].etype,
-						    ttc_rules[tt].proto);
+						    ttc_rules[tt].proto,
+						    use_l4_type);
 		if (IS_ERR(rule->rule)) {
 			err = PTR_ERR(rule->rule);
 			rule->rule = NULL;
@@ -273,7 +350,8 @@ static int mlx5_generate_ttc_table_rules(struct mlx5_core_dev *dev,
 		trules[tt] = mlx5_generate_ttc_rule(dev, ft,
 						    &params->tunnel_dests[tt],
 						    ttc_tunnel_rules[tt].etype,
-						    ttc_tunnel_rules[tt].proto);
+						    ttc_tunnel_rules[tt].proto,
+						    use_l4_type);
 		if (IS_ERR(trules[tt])) {
 			err = PTR_ERR(trules[tt]);
 			trules[tt] = NULL;
@@ -289,7 +367,8 @@ del_rules:
 }
 
 static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
-					bool use_ipv)
+					bool use_ipv,
+					const struct mlx5_fs_ttc_groups *groups)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	int ix = 0;
@@ -297,7 +376,7 @@ static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
 	int err;
 	u8 *mc;
 
-	ttc->g = kcalloc(MLX5_TTC_NUM_GROUPS, sizeof(*ttc->g), GFP_KERNEL);
+	ttc->g = kcalloc(groups->num_groups, sizeof(*ttc->g), GFP_KERNEL);
 	if (!ttc->g)
 		return -ENOMEM;
 	in = kvzalloc(inlen, GFP_KERNEL);
@@ -307,16 +386,31 @@ static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
 		return -ENOMEM;
 	}
 
-	/* L4 Group */
 	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
-	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol);
 	if (use_ipv)
 		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_version);
 	else
 		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ethertype);
 	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+
+	/* TCP UDP group */
+	if (groups->use_l4_type) {
+		MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.l4_type);
+		MLX5_SET_CFG(in, start_flow_index, ix);
+		ix += groups->group_size[ttc->num_groups];
+		MLX5_SET_CFG(in, end_flow_index, ix - 1);
+		ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+		if (IS_ERR(ttc->g[ttc->num_groups]))
+			goto err;
+		ttc->num_groups++;
+
+		MLX5_SET(fte_match_param, mc, outer_headers.l4_type, 0);
+	}
+
+	/* L4 Group */
+	MLX5_SET_TO_ONES(fte_match_param, mc, outer_headers.ip_protocol);
 	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_TTC_GROUP1_SIZE;
+	ix += groups->group_size[ttc->num_groups];
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
 	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
 	if (IS_ERR(ttc->g[ttc->num_groups]))
@@ -326,7 +420,7 @@ static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
 	/* L3 Group */
 	MLX5_SET(fte_match_param, mc, outer_headers.ip_protocol, 0);
 	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_TTC_GROUP2_SIZE;
+	ix += groups->group_size[ttc->num_groups];
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
 	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
 	if (IS_ERR(ttc->g[ttc->num_groups]))
@@ -336,7 +430,7 @@ static int mlx5_create_ttc_table_groups(struct mlx5_ttc_table *ttc,
 	/* Any Group */
 	memset(in, 0, inlen);
 	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_TTC_GROUP3_SIZE;
+	ix += groups->group_size[ttc->num_groups];
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
 	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
 	if (IS_ERR(ttc->g[ttc->num_groups]))
@@ -358,7 +452,7 @@ static struct mlx5_flow_handle *
 mlx5_generate_inner_ttc_rule(struct mlx5_core_dev *dev,
 			     struct mlx5_flow_table *ft,
 			     struct mlx5_flow_destination *dest,
-			     u16 etype, u8 proto)
+			     u16 etype, u8 proto, bool use_l4_type)
 {
 	MLX5_DECLARE_FLOW_ACT(flow_act);
 	struct mlx5_flow_handle *rule;
@@ -379,8 +473,13 @@ mlx5_generate_inner_ttc_rule(struct mlx5_core_dev *dev,
 
 	if (proto) {
 		spec->match_criteria_enable = MLX5_MATCH_INNER_HEADERS;
-		MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, inner_headers.ip_protocol);
-		MLX5_SET(fte_match_param, spec->match_value, inner_headers.ip_protocol, proto);
+		mlx5_fs_ttc_set_match_proto(MLX5_ADDR_OF(fte_match_param,
+							 spec->match_criteria,
+							 inner_headers),
+					    MLX5_ADDR_OF(fte_match_param,
+							 spec->match_value,
+							 inner_headers),
+					    proto, use_l4_type);
 	}
 
 	rule = mlx5_add_flow_rules(ft, spec, &flow_act, dest, 1);
@@ -395,7 +494,8 @@ mlx5_generate_inner_ttc_rule(struct mlx5_core_dev *dev,
 
 static int mlx5_generate_inner_ttc_table_rules(struct mlx5_core_dev *dev,
 					       struct ttc_params *params,
-					       struct mlx5_ttc_table *ttc)
+					       struct mlx5_ttc_table *ttc,
+					       bool use_l4_type)
 {
 	struct mlx5_ttc_rule *rules;
 	struct mlx5_flow_table *ft;
@@ -413,7 +513,8 @@ static int mlx5_generate_inner_ttc_table_rules(struct mlx5_core_dev *dev,
 		rule->rule = mlx5_generate_inner_ttc_rule(dev, ft,
 							  &params->dests[tt],
 							  ttc_rules[tt].etype,
-							  ttc_rules[tt].proto);
+							  ttc_rules[tt].proto,
+							  use_l4_type);
 		if (IS_ERR(rule->rule)) {
 			err = PTR_ERR(rule->rule);
 			rule->rule = NULL;
@@ -430,7 +531,8 @@ del_rules:
 	return err;
 }
 
-static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
+static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc,
+					      const struct mlx5_fs_ttc_groups *groups)
 {
 	int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
 	int ix = 0;
@@ -438,8 +540,7 @@ static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
 	int err;
 	u8 *mc;
 
-	ttc->g = kcalloc(MLX5_INNER_TTC_NUM_GROUPS, sizeof(*ttc->g),
-			 GFP_KERNEL);
+	ttc->g = kcalloc(groups->num_groups, sizeof(*ttc->g), GFP_KERNEL);
 	if (!ttc->g)
 		return -ENOMEM;
 	in = kvzalloc(inlen, GFP_KERNEL);
@@ -449,13 +550,28 @@ static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
 		return -ENOMEM;
 	}
 
-	/* L4 Group */
 	mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
-	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol);
 	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_version);
 	MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_INNER_HEADERS);
+
+	/* TCP UDP group */
+	if (groups->use_l4_type) {
+		MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.l4_type);
+		MLX5_SET_CFG(in, start_flow_index, ix);
+		ix += groups->group_size[ttc->num_groups];
+		MLX5_SET_CFG(in, end_flow_index, ix - 1);
+		ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
+		if (IS_ERR(ttc->g[ttc->num_groups]))
+			goto err;
+		ttc->num_groups++;
+
+		MLX5_SET(fte_match_param, mc, inner_headers.l4_type, 0);
+	}
+
+	/* L4 Group */
+	MLX5_SET_TO_ONES(fte_match_param, mc, inner_headers.ip_protocol);
 	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_INNER_TTC_GROUP1_SIZE;
+	ix += groups->group_size[ttc->num_groups];
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
 	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
 	if (IS_ERR(ttc->g[ttc->num_groups]))
@@ -465,7 +581,7 @@ static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
 	/* L3 Group */
 	MLX5_SET(fte_match_param, mc, inner_headers.ip_protocol, 0);
 	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_INNER_TTC_GROUP2_SIZE;
+	ix += groups->group_size[ttc->num_groups];
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
 	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
 	if (IS_ERR(ttc->g[ttc->num_groups]))
@@ -475,7 +591,7 @@ static int mlx5_create_inner_ttc_table_groups(struct mlx5_ttc_table *ttc)
 	/* Any Group */
 	memset(in, 0, inlen);
 	MLX5_SET_CFG(in, start_flow_index, ix);
-	ix += MLX5_INNER_TTC_GROUP3_SIZE;
+	ix += groups->group_size[ttc->num_groups];
 	MLX5_SET_CFG(in, end_flow_index, ix - 1);
 	ttc->g[ttc->num_groups] = mlx5_create_flow_group(ttc->t, in);
 	if (IS_ERR(ttc->g[ttc->num_groups]))
@@ -496,27 +612,47 @@ err:
 struct mlx5_ttc_table *mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev,
 						   struct ttc_params *params)
 {
+	const struct mlx5_fs_ttc_groups *groups;
+	struct mlx5_flow_namespace *ns;
 	struct mlx5_ttc_table *ttc;
+	bool use_l4_type;
 	int err;
 
 	ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL);
 	if (!ttc)
 		return ERR_PTR(-ENOMEM);
 
+	switch (params->ns_type) {
+	case MLX5_FLOW_NAMESPACE_PORT_SEL:
+		use_l4_type = MLX5_CAP_GEN_2(dev, pcc_ifa2) &&
+			MLX5_CAP_PORT_SELECTION_FT_FIELD_SUPPORT_2(dev, inner_l4_type);
+		break;
+	case MLX5_FLOW_NAMESPACE_KERNEL:
+		use_l4_type = MLX5_CAP_GEN_2(dev, pcc_ifa2) &&
+			MLX5_CAP_NIC_RX_FT_FIELD_SUPPORT_2(dev, inner_l4_type);
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	ns = mlx5_get_flow_namespace(dev, params->ns_type);
+	groups = use_l4_type ? &inner_ttc_groups[TTC_GROUPS_USE_L4_TYPE] :
+			       &inner_ttc_groups[TTC_GROUPS_DEFAULT];
+
 	WARN_ON_ONCE(params->ft_attr.max_fte);
-	params->ft_attr.max_fte = MLX5_INNER_TTC_TABLE_SIZE;
-	ttc->t = mlx5_create_flow_table(params->ns, &params->ft_attr);
+	params->ft_attr.max_fte = mlx5_fs_ttc_table_size(groups);
+	ttc->t = mlx5_create_flow_table(ns, &params->ft_attr);
 	if (IS_ERR(ttc->t)) {
 		err = PTR_ERR(ttc->t);
 		kvfree(ttc);
 		return ERR_PTR(err);
 	}
 
-	err = mlx5_create_inner_ttc_table_groups(ttc);
+	err = mlx5_create_inner_ttc_table_groups(ttc, groups);
 	if (err)
 		goto destroy_ft;
 
-	err = mlx5_generate_inner_ttc_table_rules(dev, params, ttc);
+	err = mlx5_generate_inner_ttc_table_rules(dev, params, ttc, use_l4_type);
 	if (err)
 		goto destroy_ft;
 
@@ -549,27 +685,47 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev,
 	bool match_ipv_outer =
 		MLX5_CAP_FLOWTABLE_NIC_RX(dev,
 					  ft_field_support.outer_ip_version);
+	const struct mlx5_fs_ttc_groups *groups;
+	struct mlx5_flow_namespace *ns;
 	struct mlx5_ttc_table *ttc;
+	bool use_l4_type;
 	int err;
 
 	ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL);
 	if (!ttc)
 		return ERR_PTR(-ENOMEM);
 
+	switch (params->ns_type) {
+	case MLX5_FLOW_NAMESPACE_PORT_SEL:
+		use_l4_type = MLX5_CAP_GEN_2(dev, pcc_ifa2) &&
+			MLX5_CAP_PORT_SELECTION_FT_FIELD_SUPPORT_2(dev, outer_l4_type);
+		break;
+	case MLX5_FLOW_NAMESPACE_KERNEL:
+		use_l4_type = MLX5_CAP_GEN_2(dev, pcc_ifa2) &&
+			MLX5_CAP_NIC_RX_FT_FIELD_SUPPORT_2(dev, outer_l4_type);
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	ns = mlx5_get_flow_namespace(dev, params->ns_type);
+	groups = use_l4_type ? &ttc_groups[TTC_GROUPS_USE_L4_TYPE] :
+			       &ttc_groups[TTC_GROUPS_DEFAULT];
+
 	WARN_ON_ONCE(params->ft_attr.max_fte);
-	params->ft_attr.max_fte = MLX5_TTC_TABLE_SIZE;
-	ttc->t = mlx5_create_flow_table(params->ns, &params->ft_attr);
+	params->ft_attr.max_fte = mlx5_fs_ttc_table_size(groups);
+	ttc->t = mlx5_create_flow_table(ns, &params->ft_attr);
 	if (IS_ERR(ttc->t)) {
 		err = PTR_ERR(ttc->t);
 		kvfree(ttc);
 		return ERR_PTR(err);
 	}
 
-	err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer);
+	err = mlx5_create_ttc_table_groups(ttc, match_ipv_outer, groups);
 	if (err)
 		goto destroy_ft;
 
-	err = mlx5_generate_ttc_table_rules(dev, params, ttc);
+	err = mlx5_generate_ttc_table_rules(dev, params, ttc, use_l4_type);
 	if (err)
 		goto destroy_ft;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h
index 85fef0cd1c07..92eea6bea310 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.h
@@ -40,7 +40,7 @@ struct mlx5_ttc_rule {
 struct mlx5_ttc_table;
 
 struct ttc_params {
-	struct mlx5_flow_namespace *ns;
+	enum mlx5_flow_namespace_type ns_type;
 	struct mlx5_flow_table_attr ft_attr;
 	struct mlx5_flow_destination dests[MLX5_NUM_TT];
 	DECLARE_BITMAP(ignore_dests, MLX5_NUM_TT);
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index da61be87a12e..d7bb31d9a446 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -1336,6 +1336,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2(mdev, cap) \
 	MLX5_CAP_ESW_FLOWTABLE(mdev, ft_field_support_2_esw_fdb.cap)
 
+#define MLX5_CAP_NIC_RX_FT_FIELD_SUPPORT_2(mdev, cap) \
+		MLX5_CAP_FLOWTABLE(mdev, ft_field_support_2_nic_receive.cap)
+
 #define MLX5_CAP_ESW(mdev, cap) \
 	MLX5_GET(e_switch_cap, \
 		 mdev->caps.hca[MLX5_CAP_ESWITCH]->cur, cap)
@@ -1359,6 +1362,9 @@ enum mlx5_qcam_feature_groups {
 #define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \
 	MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap)
 
+#define MLX5_CAP_PORT_SELECTION_FT_FIELD_SUPPORT_2(mdev, cap) \
+	MLX5_CAP_PORT_SELECTION(mdev, ft_field_support_2_port_selection.cap)
+
 #define MLX5_CAP_ODP(mdev, cap)\
 	MLX5_GET(odp_cap, mdev->caps.hca[MLX5_CAP_ODP]->cur, cap)
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index c940b329a475..40f6fa138e27 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -416,7 +416,10 @@ struct mlx5_ifc_flow_table_fields_supported_bits {
 
 /* Table 2170 - Flow Table Fields Supported 2 Format */
 struct mlx5_ifc_flow_table_fields_supported_2_bits {
-	u8         reserved_at_0[0xe];
+	u8         reserved_at_0[0x2];
+	u8         inner_l4_type[0x1];
+	u8         outer_l4_type[0x1];
+	u8         reserved_at_4[0xa];
 	u8         bth_opcode[0x1];
 	u8         reserved_at_f[0x1];
 	u8         tunnel_header_0_1[0x1];
@@ -525,6 +528,12 @@ union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits {
 	u8         reserved_at_0[0x80];
 };
 
+enum {
+	MLX5_PACKET_L4_TYPE_NONE,
+	MLX5_PACKET_L4_TYPE_TCP,
+	MLX5_PACKET_L4_TYPE_UDP,
+};
+
 struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 	u8         smac_47_16[0x20];
 
@@ -550,7 +559,8 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits {
 	u8         tcp_sport[0x10];
 	u8         tcp_dport[0x10];
 
-	u8         reserved_at_c0[0x10];
+	u8         l4_type[0x2];
+	u8         reserved_at_c2[0xe];
 	u8         ipv4_ihl[0x4];
 	u8         reserved_at_c4[0x4];
 
@@ -846,7 +856,11 @@ struct mlx5_ifc_flow_table_nic_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
 
-	u8         reserved_at_e00[0x700];
+	u8         reserved_at_e00[0x600];
+
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive;
+
+	u8         reserved_at_1480[0x80];
 
 	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_rdma;
 
@@ -876,7 +890,9 @@ struct mlx5_ifc_port_selection_cap_bits {
 
 	struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_port_selection;
 
-	u8         reserved_at_400[0x7c00];
+	struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_port_selection;
+
+	u8         reserved_at_480[0x7b80];
 };
 
 enum {
@@ -2004,7 +2020,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits {
 	u8	   reserved_at_3a0[0x10];
 	u8	   max_rqt_vhca_id[0x10];
 
-	u8	   reserved_at_3c0[0x440];
+	u8	   reserved_at_3c0[0x20];
+
+	u8	   reserved_at_3e0[0x10];
+	u8	   pcc_ifa2[0x1];
+	u8	   reserved_at_3f1[0xf];
+
+	u8	   reserved_at_400[0x400];
 };
 
 enum mlx5_ifc_flow_destination_type {
-- 
cgit v1.2.3


From c788d79cfa6b68b1068b25e9cdff25d6035ed191 Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Tue, 2 Apr 2024 16:30:42 +0300
Subject: net/mlx5: Skip pages EQ creation for non-page supplier function

Page events are not issued by device on the function if
page_request_disable is set, so no need to create pages EQ.

Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240402133043.56322-11-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c | 9 ++++++++-
 include/linux/mlx5/mlx5_ifc.h                | 4 +++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 40a6cb052a2d..5693986ae656 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -688,6 +688,12 @@ static int create_async_eqs(struct mlx5_core_dev *dev)
 	if (err)
 		goto err2;
 
+	/* Skip page eq creation when the device does not request for page requests */
+	if (MLX5_CAP_GEN(dev, page_request_disable)) {
+		mlx5_core_dbg(dev, "Skip page EQ creation\n");
+		return 0;
+	}
+
 	param = (struct mlx5_eq_param) {
 		.irq = table->ctrl_irq,
 		.nent = /* TODO: sriov max_vf + */ 1,
@@ -716,7 +722,8 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 
-	cleanup_async_eq(dev, &table->pages_eq, "pages");
+	if (!MLX5_CAP_GEN(dev, page_request_disable))
+		cleanup_async_eq(dev, &table->pages_eq, "pages");
 	cleanup_async_eq(dev, &table->async_eq, "async");
 	mlx5_cmd_allowed_opcode(dev, MLX5_CMD_OP_DESTROY_EQ);
 	mlx5_cmd_use_polling(dev);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 40f6fa138e27..cc159d8563d1 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1485,7 +1485,9 @@ enum {
 };
 
 struct mlx5_ifc_cmd_hca_cap_bits {
-	u8         reserved_at_0[0x10];
+	u8         reserved_at_0[0x6];
+	u8         page_request_disable[0x1];
+	u8         reserved_at_7[0x9];
 	u8         shared_object_to_user_object_allowed[0x1];
 	u8         reserved_at_13[0xe];
 	u8         vhca_resource_manager[0x1];
-- 
cgit v1.2.3


From a835b784ba1a4514434f619fad2a0f9f1bf72d62 Mon Sep 17 00:00:00 2001
From: Mitul Golani <mitulkumar.ajitkumar.golani@intel.com>
Date: Fri, 22 Mar 2024 08:41:49 +0530
Subject: drm/dp: Add support to indicate if sink supports AS SDP

Add an API that indicates support for Adaptive Sync SDP in
the sink, which can be utilized by the rest of the DP programming.

--v1:
- Format commit message properly.

Signed-off-by: Mitul Golani <mitulkumar.ajitkumar.golani@intel.com>
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Signed-off-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240322031157.3823909-2-mitulkumar.ajitkumar.golani@intel.com
---
 drivers/gpu/drm/display/drm_dp_helper.c | 25 +++++++++++++++++++++++++
 include/drm/display/drm_dp_helper.h     |  1 +
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_helper.c b/drivers/gpu/drm/display/drm_dp_helper.c
index f5d4be897866..5d7c6bab8e33 100644
--- a/drivers/gpu/drm/display/drm_dp_helper.c
+++ b/drivers/gpu/drm/display/drm_dp_helper.c
@@ -2948,6 +2948,31 @@ void drm_dp_vsc_sdp_log(struct drm_printer *p, const struct drm_dp_vsc_sdp *vsc)
 }
 EXPORT_SYMBOL(drm_dp_vsc_sdp_log);
 
+/**
+ * drm_dp_as_sdp_supported() - check if adaptive sync sdp is supported
+ * @aux: DisplayPort AUX channel
+ * @dpcd: DisplayPort configuration data
+ *
+ * Returns true if adaptive sync sdp is supported, else returns false
+ */
+bool drm_dp_as_sdp_supported(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE])
+{
+	u8 rx_feature;
+
+	if (dpcd[DP_DPCD_REV] < DP_DPCD_REV_13)
+		return false;
+
+	if (drm_dp_dpcd_readb(aux, DP_DPRX_FEATURE_ENUMERATION_LIST_CONT_1,
+			      &rx_feature) != 1) {
+		drm_dbg_dp(aux->drm_dev,
+			   "Failed to read DP_DPRX_FEATURE_ENUMERATION_LIST_CONT_1\n");
+		return false;
+	}
+
+	return (rx_feature & DP_ADAPTIVE_SYNC_SDP_SUPPORTED);
+}
+EXPORT_SYMBOL(drm_dp_as_sdp_supported);
+
 /**
  * drm_dp_vsc_sdp_supported() - check if vsc sdp is supported
  * @aux: DisplayPort AUX channel
diff --git a/include/drm/display/drm_dp_helper.h b/include/drm/display/drm_dp_helper.h
index a62fcd051d4d..7df19acdc790 100644
--- a/include/drm/display/drm_dp_helper.h
+++ b/include/drm/display/drm_dp_helper.h
@@ -101,6 +101,7 @@ struct drm_dp_vsc_sdp {
 void drm_dp_vsc_sdp_log(struct drm_printer *p, const struct drm_dp_vsc_sdp *vsc);
 
 bool drm_dp_vsc_sdp_supported(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE]);
+bool drm_dp_as_sdp_supported(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE]);
 
 int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE]);
 
-- 
cgit v1.2.3


From 0bbb8f594e33286cf10a9c722a68a32160fe5a7f Mon Sep 17 00:00:00 2001
From: Mitul Golani <mitulkumar.ajitkumar.golani@intel.com>
Date: Fri, 22 Mar 2024 08:41:50 +0530
Subject: drm/dp: Add Adaptive Sync SDP logging

Add structure representing Adaptive Sync Secondary Data Packet (AS SDP).
Also, add Adaptive Sync SDP logging in drm_dp_helper.c to facilitate
debugging.

--v2:
- Update logging. [Jani, Ankit]
- Use 'as_sdp' instead of 'async' [Ankit]
- Correct define placeholders to where they are actually used. [Jani]
- Update members in 'as_sdp' structure to make it uniform. [Jani]

--v3:
- Added changes to dri-devel mailing list. No code changes.

--v4:
- Instead of directly using operation mode, use an enum to accommodate
all operation modes (Ankit).

--v5:
Nit-pick changes to commit message.

--v6:
- Add correct place holder and name change for AS_SDP_OP_MODE.
- Separate i915 changes from drm changes.
- Remove extra lines.

--v7:
- Add drm/dp in subject line (Ankit)

Signed-off-by: Mitul Golani <mitulkumar.ajitkumar.golani@intel.com>
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Signed-off-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240322031157.3823909-3-mitulkumar.ajitkumar.golani@intel.com
---
 drivers/gpu/drm/display/drm_dp_helper.c | 12 ++++++++++++
 include/drm/display/drm_dp.h            | 11 +++++++++++
 include/drm/display/drm_dp_helper.h     | 29 +++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_helper.c b/drivers/gpu/drm/display/drm_dp_helper.c
index 5d7c6bab8e33..d67ebe433447 100644
--- a/drivers/gpu/drm/display/drm_dp_helper.c
+++ b/drivers/gpu/drm/display/drm_dp_helper.c
@@ -2948,6 +2948,18 @@ void drm_dp_vsc_sdp_log(struct drm_printer *p, const struct drm_dp_vsc_sdp *vsc)
 }
 EXPORT_SYMBOL(drm_dp_vsc_sdp_log);
 
+void drm_dp_as_sdp_log(struct drm_printer *p, const struct drm_dp_as_sdp *as_sdp)
+{
+	drm_printf(p, "DP SDP: AS_SDP, revision %u, length %u\n",
+		   as_sdp->revision, as_sdp->length);
+	drm_printf(p, "    vtotal: %d\n", as_sdp->vtotal);
+	drm_printf(p, "    target_rr: %d\n", as_sdp->target_rr);
+	drm_printf(p, "    duration_incr_ms: %d\n", as_sdp->duration_incr_ms);
+	drm_printf(p, "    duration_decr_ms: %d\n", as_sdp->duration_decr_ms);
+	drm_printf(p, "    operation_mode: %d\n", as_sdp->mode);
+}
+EXPORT_SYMBOL(drm_dp_as_sdp_log);
+
 /**
  * drm_dp_as_sdp_supported() - check if adaptive sync sdp is supported
  * @aux: DisplayPort AUX channel
diff --git a/include/drm/display/drm_dp.h b/include/drm/display/drm_dp.h
index 4891bd916d26..0b032faa8cf2 100644
--- a/include/drm/display/drm_dp.h
+++ b/include/drm/display/drm_dp.h
@@ -1150,6 +1150,8 @@
 
 #define DP_DPRX_FEATURE_ENUMERATION_LIST_CONT_1         0x2214 /* 2.0 E11 */
 # define DP_ADAPTIVE_SYNC_SDP_SUPPORTED    (1 << 0)
+# define DP_ADAPTIVE_SYNC_SDP_OPERATION_MODE		GENMASK(1, 0)
+# define DP_ADAPTIVE_SYNC_SDP_LENGTH				GENMASK(5, 0)
 # define DP_AS_SDP_FIRST_HALF_LINE_OR_3840_PIXEL_CYCLE_WINDOW_NOT_SUPPORTED (1 << 1)
 # define DP_VSC_EXT_SDP_FRAMEWORK_VERSION_1_SUPPORTED  (1 << 4)
 
@@ -1639,10 +1641,12 @@ enum drm_dp_phy {
 #define DP_SDP_AUDIO_COPYMANAGEMENT	0x05 /* DP 1.2 */
 #define DP_SDP_ISRC			0x06 /* DP 1.2 */
 #define DP_SDP_VSC			0x07 /* DP 1.2 */
+#define DP_SDP_ADAPTIVE_SYNC		0x22 /* DP 1.4 */
 #define DP_SDP_CAMERA_GENERIC(i)	(0x08 + (i)) /* 0-7, DP 1.3 */
 #define DP_SDP_PPS			0x10 /* DP 1.4 */
 #define DP_SDP_VSC_EXT_VESA		0x20 /* DP 1.4 */
 #define DP_SDP_VSC_EXT_CEA		0x21 /* DP 1.4 */
+
 /* 0x80+ CEA-861 infoframe types */
 
 #define DP_SDP_AUDIO_INFOFRAME_HB2	0x1b
@@ -1798,4 +1802,11 @@ enum dp_content_type {
 	DP_CONTENT_TYPE_GAME = 0x04,
 };
 
+enum operation_mode {
+	DP_AS_SDP_AVT_DYNAMIC_VTOTAL = 0x00,
+	DP_AS_SDP_AVT_FIXED_VTOTAL = 0x01,
+	DP_AS_SDP_FAVT_TRR_NOT_REACHED = 0x02,
+	DP_AS_SDP_FAVT_TRR_REACHED = 0x03
+};
+
 #endif /* _DRM_DP_H_ */
diff --git a/include/drm/display/drm_dp_helper.h b/include/drm/display/drm_dp_helper.h
index 7df19acdc790..10147ae96326 100644
--- a/include/drm/display/drm_dp_helper.h
+++ b/include/drm/display/drm_dp_helper.h
@@ -98,6 +98,35 @@ struct drm_dp_vsc_sdp {
 	enum dp_content_type content_type;
 };
 
+/**
+ * struct drm_dp_as_sdp - drm DP Adaptive Sync SDP
+ *
+ * This structure represents a DP AS SDP of drm
+ * It is based on DP 2.1 spec [Table 2-126:  Adaptive-Sync SDP Header Bytes] and
+ * [Table 2-127: Adaptive-Sync SDP Payload for DB0 through DB8]
+ *
+ * @sdp_type: Secondary-data packet type
+ * @revision: Revision Number
+ * @length: Number of valid data bytes
+ * @vtotal: Minimum Vertical Vtotal
+ * @target_rr: Target Refresh
+ * @duration_incr_ms: Successive frame duration increase
+ * @duration_decr_ms: Successive frame duration decrease
+ * @operation_mode: Adaptive Sync Operation Mode
+ */
+struct drm_dp_as_sdp {
+	unsigned char sdp_type;
+	unsigned char revision;
+	unsigned char length;
+	int vtotal;
+	int target_rr;
+	int duration_incr_ms;
+	int duration_decr_ms;
+	enum operation_mode mode;
+};
+
+void drm_dp_as_sdp_log(struct drm_printer *p,
+		       const struct drm_dp_as_sdp *as_sdp);
 void drm_dp_vsc_sdp_log(struct drm_printer *p, const struct drm_dp_vsc_sdp *vsc);
 
 bool drm_dp_vsc_sdp_supported(struct drm_dp_aux *aux, const u8 dpcd[DP_RECEIVER_CAP_SIZE]);
-- 
cgit v1.2.3


From d2f277e26f521ccf6fb438463b41dba6123caabe Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:45 +0200
Subject: fsnotify: rename fsnotify_{get,put}_sb_connectors()

Instead of counting the number of connectors in an sb, we would like
to count the number of watched objects per priority group.

As a start, create an accessor fsnotify_sb_watched_objects() to
s_fsnotify_connectors and rename the fsnotify_{get,put}_sb_connectors()
helpers to fsnotify_{get,put}_sb_watchers() to better describes the
counter.

Increment the counter at the end of fsnotify_attach_connector_to_object()
if connector was attached instead of decrementing it on race to connect.

This is fine, because fsnotify_delete_sb() cannot be running in parallel
to fsnotify_attach_connector_to_object() which requires a reference to
a filesystem object.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-2-amir73il@gmail.com>
---
 fs/notify/fsnotify.c             |  4 +--
 fs/notify/mark.c                 | 67 ++++++++++++++++++++++------------------
 include/linux/fsnotify.h         |  2 +-
 include/linux/fsnotify_backend.h |  5 +++
 4 files changed, 45 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2fc105a72a8f..503e7c75e777 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -92,8 +92,8 @@ void fsnotify_sb_delete(struct super_block *sb)
 	fsnotify_unmount_inodes(sb);
 	fsnotify_clear_marks_by_sb(sb);
 	/* Wait for outstanding object references from connectors */
-	wait_var_event(&sb->s_fsnotify_connectors,
-		       !atomic_long_read(&sb->s_fsnotify_connectors));
+	wait_var_event(fsnotify_sb_watched_objects(sb),
+		       !atomic_long_read(fsnotify_sb_watched_objects(sb)));
 }
 
 /*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d6944ff86ffa..3bcce2671bd5 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -116,10 +116,43 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
 	return *fsnotify_conn_mask_p(conn);
 }
 
+static void fsnotify_get_sb_watched_objects(struct super_block *sb)
+{
+	atomic_long_inc(fsnotify_sb_watched_objects(sb));
+}
+
+static void fsnotify_put_sb_watched_objects(struct super_block *sb)
+{
+	if (atomic_long_dec_and_test(fsnotify_sb_watched_objects(sb)))
+		wake_up_var(fsnotify_sb_watched_objects(sb));
+}
+
 static void fsnotify_get_inode_ref(struct inode *inode)
 {
 	ihold(inode);
-	atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
+	fsnotify_get_sb_watched_objects(inode->i_sb);
+}
+
+static void fsnotify_put_inode_ref(struct inode *inode)
+{
+	fsnotify_put_sb_watched_objects(inode->i_sb);
+	iput(inode);
+}
+
+static void fsnotify_get_sb_watchers(struct fsnotify_mark_connector *conn)
+{
+	struct super_block *sb = fsnotify_connector_sb(conn);
+
+	if (sb)
+		fsnotify_get_sb_watched_objects(sb);
+}
+
+static void fsnotify_put_sb_watchers(struct fsnotify_mark_connector *conn)
+{
+	struct super_block *sb = fsnotify_connector_sb(conn);
+
+	if (sb)
+		fsnotify_put_sb_watched_objects(sb);
 }
 
 /*
@@ -213,31 +246,6 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
 	}
 }
 
-static void fsnotify_put_inode_ref(struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-
-	iput(inode);
-	if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
-		wake_up_var(&sb->s_fsnotify_connectors);
-}
-
-static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
-{
-	struct super_block *sb = fsnotify_connector_sb(conn);
-
-	if (sb)
-		atomic_long_inc(&sb->s_fsnotify_connectors);
-}
-
-static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
-{
-	struct super_block *sb = fsnotify_connector_sb(conn);
-
-	if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
-		wake_up_var(&sb->s_fsnotify_connectors);
-}
-
 static void *fsnotify_detach_connector_from_object(
 					struct fsnotify_mark_connector *conn,
 					unsigned int *type)
@@ -261,7 +269,7 @@ static void *fsnotify_detach_connector_from_object(
 		fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
 	}
 
-	fsnotify_put_sb_connectors(conn);
+	fsnotify_put_sb_watchers(conn);
 	rcu_assign_pointer(*(conn->obj), NULL);
 	conn->obj = NULL;
 	conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
@@ -549,8 +557,6 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 	conn->flags = 0;
 	conn->type = obj_type;
 	conn->obj = connp;
-	conn->flags = 0;
-	fsnotify_get_sb_connectors(conn);
 
 	/*
 	 * cmpxchg() provides the barrier so that readers of *connp can see
@@ -558,10 +564,11 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 	 */
 	if (cmpxchg(connp, NULL, conn)) {
 		/* Someone else created list structure for us */
-		fsnotify_put_sb_connectors(conn);
 		kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+		return 0;
 	}
 
+	fsnotify_get_sb_watchers(conn);
 	return 0;
 }
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 1a9de119a0f7..e470bb67c9a3 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -20,7 +20,7 @@
 /* Are there any inode/mount/sb objects that are being watched at all? */
 static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
 {
-	return atomic_long_read(&sb->s_fsnotify_connectors);
+	return atomic_long_read(fsnotify_sb_watched_objects(sb));
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 8f40c349b228..d4e3bc55d174 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -483,6 +483,11 @@ struct fsnotify_mark_connector {
 	struct hlist_head list;
 };
 
+static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
+{
+	return &sb->s_fsnotify_connectors;
+}
+
 /*
  * A mark is simply an object attached to an in core inode which allows an
  * fsnotify listener to indicate they are either no longer interested in events
-- 
cgit v1.2.3


From 230d97d39ee2eb9030309f04f98615aaeb420dac Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:47 +0200
Subject: fsnotify: create a wrapper fsnotify_find_inode_mark()

In preparation to passing an object pointer to fsnotify_find_mark(), add
a wrapper fsnotify_find_inode_mark() and use it where possible.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-4-amir73il@gmail.com>
---
 fs/nfsd/filecache.c              | 4 ++--
 fs/notify/dnotify/dnotify.c      | 4 ++--
 fs/notify/inotify/inotify_user.c | 2 +-
 include/linux/fsnotify_backend.h | 7 +++++++
 kernel/audit_tree.c              | 2 +-
 kernel/audit_watch.c             | 2 +-
 6 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ddd3e0d9cfa6..ad9083ca144b 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -159,8 +159,8 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
 
 	do {
 		fsnotify_group_lock(nfsd_file_fsnotify_group);
-		mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
-					  nfsd_file_fsnotify_group);
+		mark = fsnotify_find_inode_mark(inode,
+						nfsd_file_fsnotify_group);
 		if (mark) {
 			nfm = nfsd_file_mark_get(container_of(mark,
 						 struct nfsd_file_mark,
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 3464fa7e8538..f3669403fabf 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -162,7 +162,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+	fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
 	if (!fsn_mark)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -326,7 +326,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	fsnotify_group_lock(dnotify_group);
 
 	/* add the new_fsn_mark or find an old one. */
-	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+	fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
 	if (fsn_mark) {
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 85d8fdd55329..4ffc30606e0b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -544,7 +544,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 	int create = (arg & IN_MASK_CREATE);
 	int ret;
 
-	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+	fsn_mark = fsnotify_find_inode_mark(inode, group);
 	if (!fsn_mark)
 		return -ENOENT;
 	else if (create) {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index d4e3bc55d174..992b57a7e95f 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -789,6 +789,13 @@ static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
 					FSNOTIFY_OBJ_TYPE_INODE, add_flags);
 }
 
+static inline struct fsnotify_mark *fsnotify_find_inode_mark(
+						struct inode *inode,
+						struct fsnotify_group *group)
+{
+	return fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+}
+
 /* given a group and a mark, flag mark to be freed when all references are dropped */
 extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 				  struct fsnotify_group *group);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 1b07e6f12a07..f2f38903b2fe 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -463,7 +463,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	int n;
 
 	fsnotify_group_lock(audit_tree_group);
-	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_tree_group);
+	mark = fsnotify_find_inode_mark(inode, audit_tree_group);
 	if (!mark)
 		return create_chunk(inode, tree);
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 7a98cd176a12..7f358740e958 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -90,7 +90,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode)
 	struct audit_parent *parent = NULL;
 	struct fsnotify_mark *entry;
 
-	entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group);
+	entry = fsnotify_find_inode_mark(inode, audit_watch_group);
 	if (entry)
 		parent = container_of(entry, struct audit_parent, mark);
 
-- 
cgit v1.2.3


From 687c217c6aa2c24e6ddf98cc7f86a5b986e5918d Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:49 +0200
Subject: fsnotify: pass object pointer and type to fsnotify mark helpers

Instead of passing fsnotify_connp_t, pass the pointer to the marked
object.

Store the object pointer in the connector and move the definition of
fsnotify_connp_t to internal fsnotify subsystem API, so it is no longer
used by fsnotify backends.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-6-amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify_user.c | 95 ++++++++------------------------------
 fs/notify/fsnotify.h               | 23 ++++-----
 fs/notify/mark.c                   | 28 ++++++-----
 include/linux/fsnotify_backend.h   | 33 +++++--------
 4 files changed, 59 insertions(+), 120 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 04a0b31f325d..925015ee7fbd 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1076,7 +1076,7 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 }
 
 static int fanotify_remove_mark(struct fsnotify_group *group,
-				fsnotify_connp_t *connp, __u32 mask,
+				void *obj, unsigned int obj_type, __u32 mask,
 				unsigned int flags, __u32 umask)
 {
 	struct fsnotify_mark *fsn_mark = NULL;
@@ -1084,7 +1084,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
 	int destroy_mark;
 
 	fsnotify_group_lock(group);
-	fsn_mark = fsnotify_find_mark(connp, group);
+	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
 	if (!fsn_mark) {
 		fsnotify_group_unlock(group);
 		return -ENOENT;
@@ -1105,30 +1105,6 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
 	return 0;
 }
 
-static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
-					 struct vfsmount *mnt, __u32 mask,
-					 unsigned int flags, __u32 umask)
-{
-	return fanotify_remove_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
-				    mask, flags, umask);
-}
-
-static int fanotify_remove_sb_mark(struct fsnotify_group *group,
-				   struct super_block *sb, __u32 mask,
-				   unsigned int flags, __u32 umask)
-{
-	return fanotify_remove_mark(group, &sb->s_fsnotify_marks, mask,
-				    flags, umask);
-}
-
-static int fanotify_remove_inode_mark(struct fsnotify_group *group,
-				      struct inode *inode, __u32 mask,
-				      unsigned int flags, __u32 umask)
-{
-	return fanotify_remove_mark(group, &inode->i_fsnotify_marks, mask,
-				    flags, umask);
-}
-
 static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
 				       unsigned int fan_flags)
 {
@@ -1249,7 +1225,7 @@ static int fanotify_set_mark_fsid(struct fsnotify_group *group,
 }
 
 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
-						   fsnotify_connp_t *connp,
+						   void *obj,
 						   unsigned int obj_type,
 						   unsigned int fan_flags,
 						   struct fan_fsid *fsid)
@@ -1288,7 +1264,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
 	}
 
-	ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
+	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
 	if (ret)
 		goto out_put_mark;
 
@@ -1344,7 +1320,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
 }
 
 static int fanotify_add_mark(struct fsnotify_group *group,
-			     fsnotify_connp_t *connp, unsigned int obj_type,
+			     void *obj, unsigned int obj_type,
 			     __u32 mask, unsigned int fan_flags,
 			     struct fan_fsid *fsid)
 {
@@ -1353,9 +1329,9 @@ static int fanotify_add_mark(struct fsnotify_group *group,
 	int ret = 0;
 
 	fsnotify_group_lock(group);
-	fsn_mark = fsnotify_find_mark(connp, group);
+	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
 	if (!fsn_mark) {
-		fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
+		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
 						 fan_flags, fsid);
 		if (IS_ERR(fsn_mark)) {
 			fsnotify_group_unlock(group);
@@ -1392,30 +1368,6 @@ out:
 	return ret;
 }
 
-static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
-				      struct vfsmount *mnt, __u32 mask,
-				      unsigned int flags, struct fan_fsid *fsid)
-{
-	return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
-}
-
-static int fanotify_add_sb_mark(struct fsnotify_group *group,
-				struct super_block *sb, __u32 mask,
-				unsigned int flags, struct fan_fsid *fsid)
-{
-	return fanotify_add_mark(group, &sb->s_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
-}
-
-static int fanotify_add_inode_mark(struct fsnotify_group *group,
-				   struct inode *inode, __u32 mask,
-				   unsigned int flags, struct fan_fsid *fsid)
-{
-	return fanotify_add_mark(group, &inode->i_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_INODE, mask, flags, fsid);
-}
-
 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
 {
 	struct fanotify_event *oevent;
@@ -1738,6 +1690,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
 	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
 	unsigned int obj_type, fid_mode;
+	void *obj;
 	u32 umask = 0;
 	int ret;
 
@@ -1896,10 +1849,16 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	}
 
 	/* inode held in place by reference to path; group by fget on fd */
-	if (mark_type == FAN_MARK_INODE)
+	if (mark_type == FAN_MARK_INODE) {
 		inode = path.dentry->d_inode;
-	else
+		obj = inode;
+	} else {
 		mnt = path.mnt;
+		if (mark_type == FAN_MARK_MOUNT)
+			obj = mnt;
+		else
+			obj = mnt->mnt_sb;
+	}
 
 	/*
 	 * If some other task has this inode open for write we should not add
@@ -1935,26 +1894,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	/* create/update an inode mark */
 	switch (mark_cmd) {
 	case FAN_MARK_ADD:
-		if (mark_type == FAN_MARK_MOUNT)
-			ret = fanotify_add_vfsmount_mark(group, mnt, mask,
-							 flags, fsid);
-		else if (mark_type == FAN_MARK_FILESYSTEM)
-			ret = fanotify_add_sb_mark(group, mnt->mnt_sb, mask,
-						   flags, fsid);
-		else
-			ret = fanotify_add_inode_mark(group, inode, mask,
-						      flags, fsid);
+		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
+					fsid);
 		break;
 	case FAN_MARK_REMOVE:
-		if (mark_type == FAN_MARK_MOUNT)
-			ret = fanotify_remove_vfsmount_mark(group, mnt, mask,
-							    flags, umask);
-		else if (mark_type == FAN_MARK_FILESYSTEM)
-			ret = fanotify_remove_sb_mark(group, mnt->mnt_sb, mask,
-						      flags, umask);
-		else
-			ret = fanotify_remove_inode_mark(group, inode, mask,
-							 flags, umask);
+		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
+					   umask);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 6d9951aac449..b1fec7cd8ee4 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -9,22 +9,28 @@
 
 #include "../mount.h"
 
+/*
+ * fsnotify_connp_t is what we embed in objects which connector can be attached
+ * to.
+ */
+typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
+
 static inline struct inode *fsnotify_conn_inode(
 				struct fsnotify_mark_connector *conn)
 {
-	return container_of(conn->obj, struct inode, i_fsnotify_marks);
+	return conn->obj;
 }
 
 static inline struct mount *fsnotify_conn_mount(
 				struct fsnotify_mark_connector *conn)
 {
-	return container_of(conn->obj, struct mount, mnt_fsnotify_marks);
+	return real_mount(conn->obj);
 }
 
 static inline struct super_block *fsnotify_conn_sb(
 				struct fsnotify_mark_connector *conn)
 {
-	return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+	return conn->obj;
 }
 
 static inline struct super_block *fsnotify_object_sb(void *obj,
@@ -45,16 +51,7 @@ static inline struct super_block *fsnotify_object_sb(void *obj,
 static inline struct super_block *fsnotify_connector_sb(
 				struct fsnotify_mark_connector *conn)
 {
-	switch (conn->type) {
-	case FSNOTIFY_OBJ_TYPE_INODE:
-		return fsnotify_conn_inode(conn)->i_sb;
-	case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
-		return fsnotify_conn_mount(conn)->mnt.mnt_sb;
-	case FSNOTIFY_OBJ_TYPE_SB:
-		return fsnotify_conn_sb(conn);
-	default:
-		return NULL;
-	}
+	return fsnotify_object_sb(conn->obj, conn->type);
 }
 
 /* destroy all events sitting in this groups notification queue */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d9977af25832..1f611e4eab53 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -265,6 +265,7 @@ static void *fsnotify_detach_connector_from_object(
 					struct fsnotify_mark_connector *conn,
 					unsigned int *type)
 {
+	fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
 	struct inode *inode = NULL;
 
 	*type = conn->type;
@@ -285,7 +286,7 @@ static void *fsnotify_detach_connector_from_object(
 	}
 
 	fsnotify_put_sb_watchers(conn);
-	rcu_assign_pointer(*(conn->obj), NULL);
+	rcu_assign_pointer(*connp, NULL);
 	conn->obj = NULL;
 	conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
 
@@ -560,7 +561,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
 }
 
 static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
-					       unsigned int obj_type)
+					       void *obj, unsigned int obj_type)
 {
 	struct fsnotify_mark_connector *conn;
 
@@ -571,7 +572,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 	INIT_HLIST_HEAD(&conn->list);
 	conn->flags = 0;
 	conn->type = obj_type;
-	conn->obj = connp;
+	conn->obj = obj;
 
 	/*
 	 * cmpxchg() provides the barrier so that readers of *connp can see
@@ -620,24 +621,25 @@ out:
  * to which group and for which inodes. These marks are ordered according to
  * priority, highest number first, and then by the group's location in memory.
  */
-static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
-				  fsnotify_connp_t *connp,
+static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
 				  unsigned int obj_type, int add_flags)
 {
 	struct fsnotify_mark *lmark, *last = NULL;
 	struct fsnotify_mark_connector *conn;
+	fsnotify_connp_t *connp;
 	int cmp;
 	int err = 0;
 
 	if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
 		return -EINVAL;
 
+	connp = fsnotify_object_connp(obj, obj_type);
 restart:
 	spin_lock(&mark->lock);
 	conn = fsnotify_grab_connector(connp);
 	if (!conn) {
 		spin_unlock(&mark->lock);
-		err = fsnotify_attach_connector_to_object(connp, obj_type);
+		err = fsnotify_attach_connector_to_object(connp, obj, obj_type);
 		if (err)
 			return err;
 		goto restart;
@@ -689,7 +691,7 @@ out_err:
  * event types should be delivered to which group.
  */
 int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
-			     fsnotify_connp_t *connp, unsigned int obj_type,
+			     void *obj, unsigned int obj_type,
 			     int add_flags)
 {
 	struct fsnotify_group *group = mark->group;
@@ -710,7 +712,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 	fsnotify_get_mark(mark); /* for g_list */
 	spin_unlock(&mark->lock);
 
-	ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags);
+	ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags);
 	if (ret)
 		goto err;
 
@@ -728,14 +730,14 @@ err:
 	return ret;
 }
 
-int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
+int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
 		      unsigned int obj_type, int add_flags)
 {
 	int ret;
 	struct fsnotify_group *group = mark->group;
 
 	fsnotify_group_lock(group);
-	ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags);
+	ret = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags);
 	fsnotify_group_unlock(group);
 	return ret;
 }
@@ -745,12 +747,16 @@ EXPORT_SYMBOL_GPL(fsnotify_add_mark);
  * Given a list of marks, find the mark associated with given group. If found
  * take a reference to that mark and return it, else return NULL.
  */
-struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
+struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
 					 struct fsnotify_group *group)
 {
+	fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type);
 	struct fsnotify_mark_connector *conn;
 	struct fsnotify_mark *mark;
 
+	if (!connp)
+		return NULL;
+
 	conn = fsnotify_grab_connector(connp);
 	if (!conn)
 		return NULL;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 992b57a7e95f..face68fcf850 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -456,13 +456,6 @@ FSNOTIFY_ITER_FUNCS(sb, SB)
 	     type < FSNOTIFY_ITER_TYPE_COUNT; \
 	     type++)
 
-/*
- * fsnotify_connp_t is what we embed in objects which connector can be attached
- * to. fsnotify_connp_t * is how we refer from connector back to object.
- */
-struct fsnotify_mark_connector;
-typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;
-
 /*
  * Inode/vfsmount/sb point to this structure which tracks all marks attached to
  * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
@@ -476,7 +469,7 @@ struct fsnotify_mark_connector {
 	unsigned short flags;	/* flags [lock] */
 	union {
 		/* Object pointer [lock] */
-		fsnotify_connp_t *obj;
+		void *obj;
 		/* Used listing heads to free after srcu period expires */
 		struct fsnotify_mark_connector *destroy_next;
 	};
@@ -763,37 +756,35 @@ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
 extern void fsnotify_init_mark(struct fsnotify_mark *mark,
 			       struct fsnotify_group *group);
 /* Find mark belonging to given group in the list of marks */
-extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
-						struct fsnotify_group *group);
+struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
+					 struct fsnotify_group *group);
 /* attach the mark to the object */
-extern int fsnotify_add_mark(struct fsnotify_mark *mark,
-			     fsnotify_connp_t *connp, unsigned int obj_type,
-			     int add_flags);
-extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
-				    fsnotify_connp_t *connp,
-				    unsigned int obj_type, int add_flags);
+int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
+		      unsigned int obj_type, int add_flags);
+int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj,
+			     unsigned int obj_type, int add_flags);
 
 /* attach the mark to the inode */
 static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
 					  struct inode *inode,
 					  int add_flags)
 {
-	return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
-				 FSNOTIFY_OBJ_TYPE_INODE, add_flags);
+	return fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE,
+				 add_flags);
 }
 static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
 						 struct inode *inode,
 						 int add_flags)
 {
-	return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
-					FSNOTIFY_OBJ_TYPE_INODE, add_flags);
+	return fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE,
+					add_flags);
 }
 
 static inline struct fsnotify_mark *fsnotify_find_inode_mark(
 						struct inode *inode,
 						struct fsnotify_group *group)
 {
-	return fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+	return fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group);
 }
 
 /* given a group and a mark, flag mark to be freed when all references are dropped */
-- 
cgit v1.2.3


From c9d4603b054f9a63c07c7012040af4c80adb2c60 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:50 +0200
Subject: fsnotify: create helper fsnotify_update_sb_watchers()

We would like to count watched objects by priority group, so we will need
to update the watched object counter after adding/removing marks.

Create a helper fsnotify_update_sb_watchers() and call it after
attaching/detaching a mark, instead of fsnotify_{get,put}_sb_watchers()
only after attaching/detaching a connector.

Soon, we will use this helper to count watched objects by the highest
watching priority group.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-7-amir73il@gmail.com>
---
 fs/notify/mark.c                 | 36 ++++++++++++++++++++++--------------
 include/linux/fsnotify_backend.h |  1 +
 2 files changed, 23 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 1f611e4eab53..647c49c20467 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -154,20 +154,23 @@ static void fsnotify_put_inode_ref(struct inode *inode)
 	iput(inode);
 }
 
-static void fsnotify_get_sb_watchers(struct fsnotify_mark_connector *conn)
+/*
+ * Grab or drop watched objects reference depending on whether the connector
+ * is attached and has any marks attached.
+ */
+static void fsnotify_update_sb_watchers(struct super_block *sb,
+					struct fsnotify_mark_connector *conn)
 {
-	struct super_block *sb = fsnotify_connector_sb(conn);
+	bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
+	bool has_marks = conn->obj && !hlist_empty(&conn->list);
 
-	if (sb)
+	if (has_marks && !is_watched) {
+		conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
 		fsnotify_get_sb_watched_objects(sb);
-}
-
-static void fsnotify_put_sb_watchers(struct fsnotify_mark_connector *conn)
-{
-	struct super_block *sb = fsnotify_connector_sb(conn);
-
-	if (sb)
+	} else if (!has_marks && is_watched) {
+		conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
 		fsnotify_put_sb_watched_objects(sb);
+	}
 }
 
 /*
@@ -266,6 +269,7 @@ static void *fsnotify_detach_connector_from_object(
 					unsigned int *type)
 {
 	fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
+	struct super_block *sb = fsnotify_connector_sb(conn);
 	struct inode *inode = NULL;
 
 	*type = conn->type;
@@ -285,10 +289,10 @@ static void *fsnotify_detach_connector_from_object(
 		fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
 	}
 
-	fsnotify_put_sb_watchers(conn);
 	rcu_assign_pointer(*connp, NULL);
 	conn->obj = NULL;
 	conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
+	fsnotify_update_sb_watchers(sb, conn);
 
 	return inode;
 }
@@ -340,6 +344,11 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 		objp = fsnotify_detach_connector_from_object(conn, &type);
 		free_conn = true;
 	} else {
+		struct super_block *sb = fsnotify_connector_sb(conn);
+
+		/* Update watched objects after detaching mark */
+		if (sb)
+			fsnotify_update_sb_watchers(sb, conn);
 		objp = __fsnotify_recalc_mask(conn);
 		type = conn->type;
 	}
@@ -581,10 +590,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 	if (cmpxchg(connp, NULL, conn)) {
 		/* Someone else created list structure for us */
 		kmem_cache_free(fsnotify_mark_connector_cachep, conn);
-		return 0;
 	}
-
-	fsnotify_get_sb_watchers(conn);
 	return 0;
 }
 
@@ -624,6 +630,7 @@ out:
 static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
 				  unsigned int obj_type, int add_flags)
 {
+	struct super_block *sb = fsnotify_object_sb(obj, obj_type);
 	struct fsnotify_mark *lmark, *last = NULL;
 	struct fsnotify_mark_connector *conn;
 	fsnotify_connp_t *connp;
@@ -673,6 +680,7 @@ restart:
 	/* mark should be the last entry.  last is the current last entry */
 	hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
 added:
+	fsnotify_update_sb_watchers(sb, conn);
 	/*
 	 * Since connector is attached to object using cmpxchg() we are
 	 * guaranteed that connector initialization is fully visible by anyone
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index face68fcf850..83004d9e07a3 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -465,6 +465,7 @@ FSNOTIFY_ITER_FUNCS(sb, SB)
 struct fsnotify_mark_connector {
 	spinlock_t lock;
 	unsigned short type;	/* Type of object [lock] */
+#define FSNOTIFY_CONN_FLAG_IS_WATCHED	0x01
 #define FSNOTIFY_CONN_FLAG_HAS_IREF	0x02
 	unsigned short flags;	/* flags [lock] */
 	union {
-- 
cgit v1.2.3


From 07a3b8d0bf726a1e49b050bbc6bd72f031e505fe Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:51 +0200
Subject: fsnotify: lazy attach fsnotify_sb_info state to sb

Define a container struct fsnotify_sb_info to hold per-sb state,
including the reference to sb marks connector.

Allocate the fsnotify_sb_info state before attaching connector to any
object on the sb and free it only when killing sb.

This state is going to be used for storing per priority watched objects
counters.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-8-amir73il@gmail.com>
---
 fs/notify/fsnotify.c             | 16 +++++++++++++---
 fs/notify/fsnotify.h             |  9 ++++++++-
 fs/notify/mark.c                 | 32 +++++++++++++++++++++++++++++++-
 include/linux/fs.h               |  8 ++++----
 include/linux/fsnotify_backend.h | 17 +++++++++++++++++
 5 files changed, 73 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 503e7c75e777..fb3f36bc6ea9 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -89,11 +89,18 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
 
 void fsnotify_sb_delete(struct super_block *sb)
 {
+	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+	/* Were any marks ever added to any object on this sb? */
+	if (!sbinfo)
+		return;
+
 	fsnotify_unmount_inodes(sb);
 	fsnotify_clear_marks_by_sb(sb);
 	/* Wait for outstanding object references from connectors */
 	wait_var_event(fsnotify_sb_watched_objects(sb),
 		       !atomic_long_read(fsnotify_sb_watched_objects(sb)));
+	kfree(sbinfo);
 }
 
 /*
@@ -489,6 +496,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
 {
 	const struct path *path = fsnotify_data_path(data, data_type);
 	struct super_block *sb = fsnotify_data_sb(data, data_type);
+	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
 	struct fsnotify_iter_info iter_info = {};
 	struct mount *mnt = NULL;
 	struct inode *inode2 = NULL;
@@ -525,7 +533,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
 	 * SRCU because we have no references to any objects and do not
 	 * need SRCU to keep them "alive".
 	 */
-	if (!sb->s_fsnotify_marks &&
+	if ((!sbinfo || !sbinfo->sb_marks) &&
 	    (!mnt || !mnt->mnt_fsnotify_marks) &&
 	    (!inode || !inode->i_fsnotify_marks) &&
 	    (!inode2 || !inode2->i_fsnotify_marks))
@@ -552,8 +560,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
 
 	iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
 
-	iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
-		fsnotify_first_mark(&sb->s_fsnotify_marks);
+	if (sbinfo) {
+		iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
+			fsnotify_first_mark(&sbinfo->sb_marks);
+	}
 	if (mnt) {
 		iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
 			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index b1fec7cd8ee4..2d059f789ee3 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -54,6 +54,13 @@ static inline struct super_block *fsnotify_connector_sb(
 	return fsnotify_object_sb(conn->obj, conn->type);
 }
 
+static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb)
+{
+	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+	return sbinfo ? &sbinfo->sb_marks : NULL;
+}
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
@@ -79,7 +86,7 @@ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
 /* run the list of all marks associated with sb and destroy them */
 static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
 {
-	fsnotify_destroy_marks(&sb->s_fsnotify_marks);
+	fsnotify_destroy_marks(fsnotify_sb_marks(sb));
 }
 
 /*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 647c49c20467..b2f5d8c9cce1 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -106,7 +106,7 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj,
 	case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
 		return &real_mount(obj)->mnt_fsnotify_marks;
 	case FSNOTIFY_OBJ_TYPE_SB:
-		return &((struct super_block *)obj)->s_fsnotify_marks;
+		return fsnotify_sb_marks(obj);
 	default:
 		return NULL;
 	}
@@ -569,6 +569,26 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
 	return -1;
 }
 
+static int fsnotify_attach_info_to_sb(struct super_block *sb)
+{
+	struct fsnotify_sb_info *sbinfo;
+
+	/* sb info is freed on fsnotify_sb_delete() */
+	sbinfo = kzalloc(sizeof(*sbinfo), GFP_KERNEL);
+	if (!sbinfo)
+		return -ENOMEM;
+
+	/*
+	 * cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
+	 * will observe an initialized structure
+	 */
+	if (cmpxchg(&sb->s_fsnotify_info, NULL, sbinfo)) {
+		/* Someone else created sbinfo for us */
+		kfree(sbinfo);
+	}
+	return 0;
+}
+
 static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 					       void *obj, unsigned int obj_type)
 {
@@ -640,6 +660,16 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
 	if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
 		return -EINVAL;
 
+	/*
+	 * Attach the sb info before attaching a connector to any object on sb.
+	 * The sb info will remain attached as long as sb lives.
+	 */
+	if (!fsnotify_sb_info(sb)) {
+		err = fsnotify_attach_info_to_sb(sb);
+		if (err)
+			return err;
+	}
+
 	connp = fsnotify_object_connp(obj, obj_type);
 restart:
 	spin_lock(&mark->lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 00fc429b0af0..7f40b592f711 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -73,6 +73,8 @@ struct fscrypt_inode_info;
 struct fscrypt_operations;
 struct fsverity_info;
 struct fsverity_operations;
+struct fsnotify_mark_connector;
+struct fsnotify_sb_info;
 struct fs_context;
 struct fs_parameter_spec;
 struct fileattr;
@@ -620,8 +622,6 @@ is_uncached_acl(struct posix_acl *acl)
 #define IOP_XATTR	0x0008
 #define IOP_DEFAULT_READLINK	0x0010
 
-struct fsnotify_mark_connector;
-
 /*
  * Keep mostly read-only and often accessed (especially for
  * the RCU path lookup and 'stat' data) fields at the beginning
@@ -1249,7 +1249,7 @@ struct super_block {
 
 	/*
 	 * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
-	 * s_fsnotify_marks together for cache efficiency. They are frequently
+	 * s_fsnotify_info together for cache efficiency. They are frequently
 	 * accessed and rarely modified.
 	 */
 	void			*s_fs_info;	/* Filesystem private info */
@@ -1261,7 +1261,7 @@ struct super_block {
 	time64_t		   s_time_max;
 #ifdef CONFIG_FSNOTIFY
 	__u32			s_fsnotify_mask;
-	struct fsnotify_mark_connector __rcu	*s_fsnotify_marks;
+	struct fsnotify_sb_info	*s_fsnotify_info;
 #endif
 
 	/*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 83004d9e07a3..c9f2b2f6b493 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -477,6 +477,23 @@ struct fsnotify_mark_connector {
 	struct hlist_head list;
 };
 
+/*
+ * Container for per-sb fsnotify state (sb marks and more).
+ * Attached lazily on first marked object on the sb and freed when killing sb.
+ */
+struct fsnotify_sb_info {
+	struct fsnotify_mark_connector __rcu *sb_marks;
+};
+
+static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
+{
+#ifdef CONFIG_FSNOTIFY
+	return READ_ONCE(sb->s_fsnotify_info);
+#else
+	return NULL;
+#endif
+}
+
 static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
 {
 	return &sb->s_fsnotify_connectors;
-- 
cgit v1.2.3


From cb5d4f48c10445c97a22af0bd8b9cf0ed6cc8036 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:52 +0200
Subject: fsnotify: move s_fsnotify_connectors into fsnotify_sb_info

Move the s_fsnotify_connectors counter into the per-sb fsnotify state.

Suggested-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-9-amir73il@gmail.com>
---
 include/linux/fs.h               | 6 ------
 include/linux/fsnotify.h         | 8 +++++++-
 include/linux/fsnotify_backend.h | 7 ++++++-
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7f40b592f711..c36c2f8fdbe3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1302,12 +1302,6 @@ struct super_block {
 	/* Number of inodes with nlink == 0 but still referenced */
 	atomic_long_t s_remove_count;
 
-	/*
-	 * Number of inode/mount/sb objects that are being watched, note that
-	 * inodes objects are currently double-accounted.
-	 */
-	atomic_long_t s_fsnotify_connectors;
-
 	/* Read-only state of the superblock is being changed */
 	int s_readonly_remount;
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index e470bb67c9a3..48dc65702415 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -20,7 +20,13 @@
 /* Are there any inode/mount/sb objects that are being watched at all? */
 static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
 {
-	return atomic_long_read(fsnotify_sb_watched_objects(sb));
+	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+
+	/* Were any marks ever added to any object on this sb? */
+	if (!sbinfo)
+		return false;
+
+	return atomic_long_read(&sbinfo->watched_objects);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c9f2b2f6b493..ec592aeadfa3 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -483,6 +483,11 @@ struct fsnotify_mark_connector {
  */
 struct fsnotify_sb_info {
 	struct fsnotify_mark_connector __rcu *sb_marks;
+	/*
+	 * Number of inode/mount/sb objects that are being watched in this sb.
+	 * Note that inodes objects are currently double-accounted.
+	 */
+	atomic_long_t watched_objects;
 };
 
 static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
@@ -496,7 +501,7 @@ static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
 
 static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
 {
-	return &sb->s_fsnotify_connectors;
+	return &fsnotify_sb_info(sb)->watched_objects;
 }
 
 /*
-- 
cgit v1.2.3


From 477cf917dd02853ba78a73cdeb6548889e5f8cd7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:53 +0200
Subject: fsnotify: use an enum for group priority constants

And use meaningfull names for the constants.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-10-amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify_user.c | 11 +++++------
 include/linux/fsnotify_backend.h   | 20 ++++++++++++--------
 2 files changed, 17 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 925015ee7fbd..483a6a1255fb 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1516,13 +1516,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
 	switch (class) {
 	case FAN_CLASS_NOTIF:
-		group->priority = FS_PRIO_0;
+		group->priority = FSNOTIFY_PRIO_NORMAL;
 		break;
 	case FAN_CLASS_CONTENT:
-		group->priority = FS_PRIO_1;
+		group->priority = FSNOTIFY_PRIO_CONTENT;
 		break;
 	case FAN_CLASS_PRE_CONTENT:
-		group->priority = FS_PRIO_2;
+		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
 		break;
 	default:
 		fd = -EINVAL;
@@ -1774,12 +1774,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		goto fput_and_out;
 
 	/*
-	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
-	 * allowed to set permissions events.
+	 * Permission events require minimum priority FAN_CLASS_CONTENT.
 	 */
 	ret = -EINVAL;
 	if (mask & FANOTIFY_PERM_EVENTS &&
-	    group->priority == FS_PRIO_0)
+	    group->priority < FSNOTIFY_PRIO_CONTENT)
 		goto fput_and_out;
 
 	if (mask & FAN_FS_ERROR &&
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index ec592aeadfa3..fc38587d8564 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -176,6 +176,17 @@ struct fsnotify_event {
 	struct list_head list;
 };
 
+/*
+ * fsnotify group priorities.
+ * Events are sent in order from highest priority to lowest priority.
+ */
+enum fsnotify_group_prio {
+	FSNOTIFY_PRIO_NORMAL = 0,	/* normal notifiers, no permissions */
+	FSNOTIFY_PRIO_CONTENT,		/* fanotify permission events */
+	FSNOTIFY_PRIO_PRE_CONTENT,	/* fanotify pre-content events */
+	__FSNOTIFY_PRIO_NUM
+};
+
 /*
  * A group is a "thing" that wants to receive notification about filesystem
  * events.  The mask holds the subset of event types this group cares about.
@@ -201,14 +212,7 @@ struct fsnotify_group {
 	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
 	unsigned int q_len;			/* events on the queue */
 	unsigned int max_events;		/* maximum events allowed on the list */
-	/*
-	 * Valid fsnotify group priorities.  Events are send in order from highest
-	 * priority to lowest priority.  We default to the lowest priority.
-	 */
-	#define FS_PRIO_0	0 /* normal notifiers, no permissions */
-	#define FS_PRIO_1	1 /* fanotify content based access control */
-	#define FS_PRIO_2	2 /* fanotify pre-content access */
-	unsigned int priority;
+	enum fsnotify_group_prio priority;	/* priority for sending events */
 	bool shutdown;		/* group is being shut down, don't queue more events */
 
 #define FSNOTIFY_GROUP_USER	0x01 /* user allocated group */
-- 
cgit v1.2.3


From a5e57b4d370c6d320e5bfb0c919fe00aee29e039 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Sun, 17 Mar 2024 20:41:54 +0200
Subject: fsnotify: optimize the case of no permission event watchers

Commit e43de7f0862b ("fsnotify: optimize the case of no marks of any type")
optimized the case where there are no fsnotify watchers on any of the
filesystem's objects.

It is quite common for a system to have a single local filesystem and
it is quite common for the system to have some inotify watches on some
config files or directories, so the optimization of no marks at all is
often not in effect.

Permission event watchers, which require high priority group are more
rare, so optimizing the case of no marks og high priority groups can
improve performance for more systems, especially for performance
sensitive io workloads.

Count per-sb watched objects by high priority groups and use that the
optimize out the call to __fsnotify_parent() and fsnotify() in fsnotify
permission hooks.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240317184154.1200192-11-amir73il@gmail.com>
---
 fs/notify/fsnotify.c             |  3 +++
 fs/notify/mark.c                 | 30 +++++++++++++++++++++++++++---
 include/linux/fsnotify.h         | 19 ++++++++++++++++---
 include/linux/fsnotify_backend.h | 11 ++++++++---
 4 files changed, 54 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index fb3f36bc6ea9..2ae965ef37e8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -100,6 +100,9 @@ void fsnotify_sb_delete(struct super_block *sb)
 	/* Wait for outstanding object references from connectors */
 	wait_var_event(fsnotify_sb_watched_objects(sb),
 		       !atomic_long_read(fsnotify_sb_watched_objects(sb)));
+	WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
+	WARN_ON(fsnotify_sb_has_priority_watchers(sb,
+						  FSNOTIFY_PRIO_PRE_CONTENT));
 	kfree(sbinfo);
 }
 
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index b2f5d8c9cce1..c3eefa70633c 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -161,13 +161,36 @@ static void fsnotify_put_inode_ref(struct inode *inode)
 static void fsnotify_update_sb_watchers(struct super_block *sb,
 					struct fsnotify_mark_connector *conn)
 {
+	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
 	bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
-	bool has_marks = conn->obj && !hlist_empty(&conn->list);
+	struct fsnotify_mark *first_mark = NULL;
+	unsigned int highest_prio = 0;
 
-	if (has_marks && !is_watched) {
+	if (conn->obj)
+		first_mark = hlist_entry_safe(conn->list.first,
+					      struct fsnotify_mark, obj_list);
+	if (first_mark)
+		highest_prio = first_mark->group->priority;
+	if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM))
+		highest_prio = 0;
+
+	/*
+	 * If the highest priority of group watching this object is prio,
+	 * then watched object has a reference on counters [0..prio].
+	 * Update priority >= 1 watched objects counters.
+	 */
+	for (unsigned int p = conn->prio + 1; p <= highest_prio; p++)
+		atomic_long_inc(&sbinfo->watched_objects[p]);
+	for (unsigned int p = conn->prio; p > highest_prio; p--)
+		atomic_long_dec(&sbinfo->watched_objects[p]);
+	conn->prio = highest_prio;
+
+	/* Update priority >= 0 (a.k.a total) watched objects counter */
+	BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0);
+	if (first_mark && !is_watched) {
 		conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
 		fsnotify_get_sb_watched_objects(sb);
-	} else if (!has_marks && is_watched) {
+	} else if (!first_mark && is_watched) {
 		conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
 		fsnotify_put_sb_watched_objects(sb);
 	}
@@ -600,6 +623,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
 	spin_lock_init(&conn->lock);
 	INIT_HLIST_HEAD(&conn->list);
 	conn->flags = 0;
+	conn->prio = 0;
 	conn->type = obj_type;
 	conn->obj = obj;
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 48dc65702415..4da80e92f804 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -17,8 +17,9 @@
 #include <linux/slab.h>
 #include <linux/bug.h>
 
-/* Are there any inode/mount/sb objects that are being watched at all? */
-static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
+/* Are there any inode/mount/sb objects watched with priority prio or above? */
+static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
+						     int prio)
 {
 	struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
 
@@ -26,7 +27,13 @@ static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
 	if (!sbinfo)
 		return false;
 
-	return atomic_long_read(&sbinfo->watched_objects);
+	return atomic_long_read(&sbinfo->watched_objects[prio]);
+}
+
+/* Are there any inode/mount/sb objects that are being watched at all? */
+static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
+{
+	return fsnotify_sb_has_priority_watchers(sb, 0);
 }
 
 /*
@@ -109,6 +116,12 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 		return 0;
 
 	path = &file->f_path;
+	/* Permission events require group prio >= FSNOTIFY_PRIO_CONTENT */
+	if (mask & ALL_FSNOTIFY_PERM_EVENTS &&
+	    !fsnotify_sb_has_priority_watchers(path->dentry->d_sb,
+					       FSNOTIFY_PRIO_CONTENT))
+		return 0;
+
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index fc38587d8564..7f1ab8264e41 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -468,7 +468,8 @@ FSNOTIFY_ITER_FUNCS(sb, SB)
  */
 struct fsnotify_mark_connector {
 	spinlock_t lock;
-	unsigned short type;	/* Type of object [lock] */
+	unsigned char type;	/* Type of object [lock] */
+	unsigned char prio;	/* Highest priority group */
 #define FSNOTIFY_CONN_FLAG_IS_WATCHED	0x01
 #define FSNOTIFY_CONN_FLAG_HAS_IREF	0x02
 	unsigned short flags;	/* flags [lock] */
@@ -490,8 +491,12 @@ struct fsnotify_sb_info {
 	/*
 	 * Number of inode/mount/sb objects that are being watched in this sb.
 	 * Note that inodes objects are currently double-accounted.
+	 *
+	 * The value in watched_objects[prio] is the number of objects that are
+	 * watched by groups of priority >= prio, so watched_objects[0] is the
+	 * total number of watched objects in this sb.
 	 */
-	atomic_long_t watched_objects;
+	atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM];
 };
 
 static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
@@ -505,7 +510,7 @@ static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
 
 static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
 {
-	return &fsnotify_sb_info(sb)->watched_objects;
+	return &fsnotify_sb_info(sb)->watched_objects[0];
 }
 
 /*
-- 
cgit v1.2.3


From 251ea652050e715886b272f831cf5016150a60c8 Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Wed, 3 Apr 2024 11:16:26 +0200
Subject: ASoC: Intel: Skylake: Remove soc-topology ABI v4 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only known users are Chromebook configurations. Starting from
kernel v5.4, all of them are making use of soc-topology ABI v5.

Cc: Curtis Malainey <cujomalainey@chromium.org>
Cc: Łukasz Majczak <lmajczak@google.com>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://msgid.link/r/20240403091629.647267-2-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/skl-tplg-interface.h |  74 --------------
 sound/soc/intel/skylake/skl-topology.c  | 169 --------------------------------
 2 files changed, 243 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/skl-tplg-interface.h b/include/uapi/sound/skl-tplg-interface.h
index 4bf9c4f9add8..940c4269322b 100644
--- a/include/uapi/sound/skl-tplg-interface.h
+++ b/include/uapi/sound/skl-tplg-interface.h
@@ -165,78 +165,4 @@ enum skl_tuple_type {
 	SKL_TYPE_DATA
 };
 
-/* v4 configuration data */
-
-struct skl_dfw_v4_module_pin {
-	__u16 module_id;
-	__u16 instance_id;
-} __packed;
-
-struct skl_dfw_v4_module_fmt {
-	__u32 channels;
-	__u32 freq;
-	__u32 bit_depth;
-	__u32 valid_bit_depth;
-	__u32 ch_cfg;
-	__u32 interleaving_style;
-	__u32 sample_type;
-	__u32 ch_map;
-} __packed;
-
-struct skl_dfw_v4_module_caps {
-	__u32 set_params:2;
-	__u32 rsvd:30;
-	__u32 param_id;
-	__u32 caps_size;
-	__u32 caps[HDA_SST_CFG_MAX];
-} __packed;
-
-struct skl_dfw_v4_pipe {
-	__u8 pipe_id;
-	__u8 pipe_priority;
-	__u16 conn_type:4;
-	__u16 rsvd:4;
-	__u16 memory_pages:8;
-} __packed;
-
-struct skl_dfw_v4_module {
-	char uuid[SKL_UUID_STR_SZ];
-
-	__u16 module_id;
-	__u16 instance_id;
-	__u32 max_mcps;
-	__u32 mem_pages;
-	__u32 obs;
-	__u32 ibs;
-	__u32 vbus_id;
-
-	__u32 max_in_queue:8;
-	__u32 max_out_queue:8;
-	__u32 time_slot:8;
-	__u32 core_id:4;
-	__u32 rsvd1:4;
-
-	__u32 module_type:8;
-	__u32 conn_type:4;
-	__u32 dev_type:4;
-	__u32 hw_conn_type:4;
-	__u32 rsvd2:12;
-
-	__u32 params_fixup:8;
-	__u32 converter:8;
-	__u32 input_pin_type:1;
-	__u32 output_pin_type:1;
-	__u32 is_dynamic_in_pin:1;
-	__u32 is_dynamic_out_pin:1;
-	__u32 is_loadable:1;
-	__u32 rsvd3:11;
-
-	struct skl_dfw_v4_pipe pipe;
-	struct skl_dfw_v4_module_fmt in_fmt[MAX_IN_QUEUE];
-	struct skl_dfw_v4_module_fmt out_fmt[MAX_OUT_QUEUE];
-	struct skl_dfw_v4_module_pin in_pin[MAX_IN_QUEUE];
-	struct skl_dfw_v4_module_pin out_pin[MAX_OUT_QUEUE];
-	struct skl_dfw_v4_module_caps caps;
-} __packed;
-
 #endif
diff --git a/sound/soc/intel/skylake/skl-topology.c b/sound/soc/intel/skylake/skl-topology.c
index 96cfebded072..e27f0fc3d897 100644
--- a/sound/soc/intel/skylake/skl-topology.c
+++ b/sound/soc/intel/skylake/skl-topology.c
@@ -2682,168 +2682,6 @@ static int skl_tplg_get_desc_blocks(struct device *dev,
 	return -EINVAL;
 }
 
-/* Functions to parse private data from configuration file format v4 */
-
-/*
- * Add pipeline from topology binary into driver pipeline list
- *
- * If already added we return that instance
- * Otherwise we create a new instance and add into driver list
- */
-static int skl_tplg_add_pipe_v4(struct device *dev,
-			struct skl_module_cfg *mconfig, struct skl_dev *skl,
-			struct skl_dfw_v4_pipe *dfw_pipe)
-{
-	struct skl_pipeline *ppl;
-	struct skl_pipe *pipe;
-	struct skl_pipe_params *params;
-
-	list_for_each_entry(ppl, &skl->ppl_list, node) {
-		if (ppl->pipe->ppl_id == dfw_pipe->pipe_id) {
-			mconfig->pipe = ppl->pipe;
-			return 0;
-		}
-	}
-
-	ppl = devm_kzalloc(dev, sizeof(*ppl), GFP_KERNEL);
-	if (!ppl)
-		return -ENOMEM;
-
-	pipe = devm_kzalloc(dev, sizeof(*pipe), GFP_KERNEL);
-	if (!pipe)
-		return -ENOMEM;
-
-	params = devm_kzalloc(dev, sizeof(*params), GFP_KERNEL);
-	if (!params)
-		return -ENOMEM;
-
-	pipe->ppl_id = dfw_pipe->pipe_id;
-	pipe->memory_pages = dfw_pipe->memory_pages;
-	pipe->pipe_priority = dfw_pipe->pipe_priority;
-	pipe->conn_type = dfw_pipe->conn_type;
-	pipe->state = SKL_PIPE_INVALID;
-	pipe->p_params = params;
-	INIT_LIST_HEAD(&pipe->w_list);
-
-	ppl->pipe = pipe;
-	list_add(&ppl->node, &skl->ppl_list);
-
-	mconfig->pipe = pipe;
-
-	return 0;
-}
-
-static void skl_fill_module_pin_info_v4(struct skl_dfw_v4_module_pin *dfw_pin,
-					struct skl_module_pin *m_pin,
-					bool is_dynamic, int max_pin)
-{
-	int i;
-
-	for (i = 0; i < max_pin; i++) {
-		m_pin[i].id.module_id = dfw_pin[i].module_id;
-		m_pin[i].id.instance_id = dfw_pin[i].instance_id;
-		m_pin[i].in_use = false;
-		m_pin[i].is_dynamic = is_dynamic;
-		m_pin[i].pin_state = SKL_PIN_UNBIND;
-	}
-}
-
-static void skl_tplg_fill_fmt_v4(struct skl_module_pin_fmt *dst_fmt,
-				 struct skl_dfw_v4_module_fmt *src_fmt,
-				 int pins)
-{
-	int i;
-
-	for (i = 0; i < pins; i++) {
-		dst_fmt[i].fmt.channels  = src_fmt[i].channels;
-		dst_fmt[i].fmt.s_freq = src_fmt[i].freq;
-		dst_fmt[i].fmt.bit_depth = src_fmt[i].bit_depth;
-		dst_fmt[i].fmt.valid_bit_depth = src_fmt[i].valid_bit_depth;
-		dst_fmt[i].fmt.ch_cfg = src_fmt[i].ch_cfg;
-		dst_fmt[i].fmt.ch_map = src_fmt[i].ch_map;
-		dst_fmt[i].fmt.interleaving_style =
-						src_fmt[i].interleaving_style;
-		dst_fmt[i].fmt.sample_type = src_fmt[i].sample_type;
-	}
-}
-
-static int skl_tplg_get_pvt_data_v4(struct snd_soc_tplg_dapm_widget *tplg_w,
-				    struct skl_dev *skl, struct device *dev,
-				    struct skl_module_cfg *mconfig)
-{
-	struct skl_dfw_v4_module *dfw =
-				(struct skl_dfw_v4_module *)tplg_w->priv.data;
-	int ret;
-	int idx = mconfig->fmt_cfg_idx;
-
-	dev_dbg(dev, "Parsing Skylake v4 widget topology data\n");
-
-	ret = guid_parse(dfw->uuid, (guid_t *)mconfig->guid);
-	if (ret)
-		return ret;
-	mconfig->id.module_id = -1;
-	mconfig->id.instance_id = dfw->instance_id;
-	mconfig->module->resources[0].cpc = dfw->max_mcps / 1000;
-	mconfig->module->resources[0].ibs = dfw->ibs;
-	mconfig->module->resources[0].obs = dfw->obs;
-	mconfig->core_id = dfw->core_id;
-	mconfig->module->max_input_pins = dfw->max_in_queue;
-	mconfig->module->max_output_pins = dfw->max_out_queue;
-	mconfig->module->loadable = dfw->is_loadable;
-	skl_tplg_fill_fmt_v4(mconfig->module->formats[0].inputs, dfw->in_fmt,
-			     MAX_IN_QUEUE);
-	skl_tplg_fill_fmt_v4(mconfig->module->formats[0].outputs, dfw->out_fmt,
-			     MAX_OUT_QUEUE);
-
-	mconfig->params_fixup = dfw->params_fixup;
-	mconfig->converter = dfw->converter;
-	mconfig->m_type = dfw->module_type;
-	mconfig->vbus_id = dfw->vbus_id;
-	mconfig->module->resources[0].is_pages = dfw->mem_pages;
-
-	ret = skl_tplg_add_pipe_v4(dev, mconfig, skl, &dfw->pipe);
-	if (ret)
-		return ret;
-
-	mconfig->dev_type = dfw->dev_type;
-	mconfig->hw_conn_type = dfw->hw_conn_type;
-	mconfig->time_slot = dfw->time_slot;
-	mconfig->formats_config[idx].caps_size = dfw->caps.caps_size;
-
-	mconfig->m_in_pin = devm_kcalloc(dev,
-				MAX_IN_QUEUE, sizeof(*mconfig->m_in_pin),
-				GFP_KERNEL);
-	if (!mconfig->m_in_pin)
-		return -ENOMEM;
-
-	mconfig->m_out_pin = devm_kcalloc(dev,
-				MAX_OUT_QUEUE, sizeof(*mconfig->m_out_pin),
-				GFP_KERNEL);
-	if (!mconfig->m_out_pin)
-		return -ENOMEM;
-
-	skl_fill_module_pin_info_v4(dfw->in_pin, mconfig->m_in_pin,
-				    dfw->is_dynamic_in_pin,
-				    mconfig->module->max_input_pins);
-	skl_fill_module_pin_info_v4(dfw->out_pin, mconfig->m_out_pin,
-				    dfw->is_dynamic_out_pin,
-				    mconfig->module->max_output_pins);
-
-	if (mconfig->formats_config[idx].caps_size) {
-		mconfig->formats_config[idx].set_params = dfw->caps.set_params;
-		mconfig->formats_config[idx].param_id = dfw->caps.param_id;
-		mconfig->formats_config[idx].caps =
-		devm_kzalloc(dev, mconfig->formats_config[idx].caps_size,
-			     GFP_KERNEL);
-		if (!mconfig->formats_config[idx].caps)
-			return -ENOMEM;
-		memcpy(mconfig->formats_config[idx].caps, dfw->caps.caps,
-		       dfw->caps.caps_size);
-	}
-
-	return 0;
-}
-
 static int skl_tplg_get_caps_data(struct device *dev, char *data,
 				  struct skl_module_cfg *mconfig)
 {
@@ -2877,13 +2715,6 @@ static int skl_tplg_get_pvt_data(struct snd_soc_tplg_dapm_widget *tplg_w,
 	char *data;
 	int ret;
 
-	/*
-	 * v4 configuration files have a valid UUID at the start of
-	 * the widget's private data.
-	 */
-	if (uuid_is_valid((char *)tplg_w->priv.data))
-		return skl_tplg_get_pvt_data_v4(tplg_w, skl, dev, mconfig);
-
 	/* Read the NUM_DATA_BLOCKS descriptor */
 	array = (struct snd_soc_tplg_vendor_array *)tplg_w->priv.data;
 	ret = skl_tplg_get_desc_blocks(dev, array);
-- 
cgit v1.2.3


From 4ba509bf3988f36b536b49b9e7022a6358f87f56 Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Wed, 3 Apr 2024 11:16:29 +0200
Subject: ASoC: topology: Remove obsolete ABI v4 structs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There are no users of soc-topology ABI v4 since kernel v5.4 and no
kernel code makes use of them.

Cc: Curtis Malainey <cujomalainey@chromium.org>
Cc: Łukasz Majczak <lmajczak@google.com>
Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://msgid.link/r/20240403091629.647267-5-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/sound/asoc.h | 56 -----------------------------------------------
 1 file changed, 56 deletions(-)

(limited to 'include')

diff --git a/include/uapi/sound/asoc.h b/include/uapi/sound/asoc.h
index 10851bca7174..99333cbd3114 100644
--- a/include/uapi/sound/asoc.h
+++ b/include/uapi/sound/asoc.h
@@ -576,60 +576,4 @@ struct snd_soc_tplg_dai {
 	struct snd_soc_tplg_private priv;
 } __attribute__((packed));
 
-/*
- * Old version of ABI structs, supported for backward compatibility.
- */
-
-/* Manifest v4 */
-struct snd_soc_tplg_manifest_v4 {
-	__le32 size;		/* in bytes of this structure */
-	__le32 control_elems;	/* number of control elements */
-	__le32 widget_elems;	/* number of widget elements */
-	__le32 graph_elems;	/* number of graph elements */
-	__le32 pcm_elems;	/* number of PCM elements */
-	__le32 dai_link_elems;	/* number of DAI link elements */
-	struct snd_soc_tplg_private priv;
-} __packed;
-
-/* Stream Capabilities v4 */
-struct snd_soc_tplg_stream_caps_v4 {
-	__le32 size;		/* in bytes of this structure */
-	char name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
-	__le64 formats;	/* supported formats SNDRV_PCM_FMTBIT_* */
-	__le32 rates;		/* supported rates SNDRV_PCM_RATE_* */
-	__le32 rate_min;	/* min rate */
-	__le32 rate_max;	/* max rate */
-	__le32 channels_min;	/* min channels */
-	__le32 channels_max;	/* max channels */
-	__le32 periods_min;	/* min number of periods */
-	__le32 periods_max;	/* max number of periods */
-	__le32 period_size_min;	/* min period size bytes */
-	__le32 period_size_max;	/* max period size bytes */
-	__le32 buffer_size_min;	/* min buffer size bytes */
-	__le32 buffer_size_max;	/* max buffer size bytes */
-} __packed;
-
-/* PCM v4 */
-struct snd_soc_tplg_pcm_v4 {
-	__le32 size;		/* in bytes of this structure */
-	char pcm_name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
-	char dai_name[SNDRV_CTL_ELEM_ID_NAME_MAXLEN];
-	__le32 pcm_id;		/* unique ID - used to match with DAI link */
-	__le32 dai_id;		/* unique ID - used to match */
-	__le32 playback;	/* supports playback mode */
-	__le32 capture;		/* supports capture mode */
-	__le32 compress;	/* 1 = compressed; 0 = PCM */
-	struct snd_soc_tplg_stream stream[SND_SOC_TPLG_STREAM_CONFIG_MAX]; /* for DAI link */
-	__le32 num_streams;	/* number of streams */
-	struct snd_soc_tplg_stream_caps_v4 caps[2]; /* playback and capture for DAI */
-} __packed;
-
-/* Physical link config v4 */
-struct snd_soc_tplg_link_config_v4 {
-	__le32 size;            /* in bytes of this structure */
-	__le32 id;              /* unique ID - used to match */
-	struct snd_soc_tplg_stream stream[SND_SOC_TPLG_STREAM_CONFIG_MAX]; /* supported configs playback and captrure */
-	__le32 num_streams;     /* number of streams */
-} __packed;
-
 #endif
-- 
cgit v1.2.3


From 32666d9cb3ddcf0041b6377cbab68f4c2d7c4171 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Mar 2024 14:32:57 +0200
Subject: ACPI: bus: Make container_of() no-op where it makes sense

Move list head node to be the first member in a few data structures
in order to make container_of() no-op at compile time. On x86_64 with
a custom (default + a few dozens of drivers enabled) configuration:

  add/remove: 0/0 grow/shrink: 5/12 up/down: 21/-124 (-103)
  ...
  Total: Before=39924675, After=39924572, chg -0.00%

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpi_bus.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 5de954e2b18a..e4d8d48bba1f 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -124,8 +124,8 @@ static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile(
 }
 
 struct acpi_scan_handler {
-	const struct acpi_device_id *ids;
 	struct list_head list_node;
+	const struct acpi_device_id *ids;
 	bool (*match)(const char *idstr, const struct acpi_device_id **matchid);
 	int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id);
 	void (*detach)(struct acpi_device *dev);
@@ -269,6 +269,7 @@ struct acpi_device_power_flags {
 };
 
 struct acpi_device_power_state {
+	struct list_head resources;	/* Power resources referenced */
 	struct {
 		u8 valid:1;
 		u8 explicit_set:1;	/* _PSx present? */
@@ -276,7 +277,6 @@ struct acpi_device_power_state {
 	} flags;
 	int power;		/* % Power (compared to D0) */
 	int latency;		/* Dx->D0 time (microseconds) */
-	struct list_head resources;	/* Power resources referenced */
 };
 
 struct acpi_device_power {
@@ -342,16 +342,16 @@ struct acpi_device_wakeup {
 };
 
 struct acpi_device_physical_node {
-	unsigned int node_id;
 	struct list_head node;
 	struct device *dev;
+	unsigned int node_id;
 	bool put_online:1;
 };
 
 struct acpi_device_properties {
+	struct list_head list;
 	const guid_t *guid;
 	union acpi_object *properties;
-	struct list_head list;
 	void **bufs;
 };
 
@@ -488,12 +488,12 @@ struct acpi_device {
 
 /* Non-device subnode */
 struct acpi_data_node {
+	struct list_head sibling;
 	const char *name;
 	acpi_handle handle;
 	struct fwnode_handle fwnode;
 	struct fwnode_handle *parent;
 	struct acpi_device_data data;
-	struct list_head sibling;
 	struct kobject kobj;
 	struct completion kobj_done;
 };
-- 
cgit v1.2.3


From 336153053293a090400cfdd4ae724b6b8a4beb98 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Mar 2024 14:32:58 +0200
Subject: ACPI: bus: Don't use "proxy" headers

Update header inclusions to follow the IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpi_bus.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index e4d8d48bba1f..a6ced88a08f3 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -9,8 +9,13 @@
 #ifndef __ACPI_BUS_H__
 #define __ACPI_BUS_H__
 
+#include <linux/completion.h>
+#include <linux/container_of.h>
 #include <linux/device.h>
+#include <linux/kobject.h>
+#include <linux/mutex.h>
 #include <linux/property.h>
+#include <linux/types.h>
 
 struct acpi_handle_list {
 	u32 count;
-- 
cgit v1.2.3


From f5c519fc3628915c2eccd65f86d7ee2ce5cc8645 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 25 Mar 2024 14:33:03 +0200
Subject: ACPI: scan: Introduce typedef:s for struct acpi_hotplug_context
 members

Follow the struct acpi_device_ops approach and introduce typedef:s
for the members. It makes code less verbose and more particular on
what parameters we take or types we use.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/dock.c     | 48 +++++++++++++++++-------------------------------
 drivers/acpi/scan.c     |  5 ++---
 include/acpi/acpi_bus.h | 13 ++++++++-----
 3 files changed, 27 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/dock.c b/drivers/acpi/dock.c
index a7c00ef78086..34affbda295e 100644
--- a/drivers/acpi/dock.c
+++ b/drivers/acpi/dock.c
@@ -88,43 +88,29 @@ static void dock_hotplug_event(struct dock_dependent_device *dd, u32 event,
 			       enum dock_callback_type cb_type)
 {
 	struct acpi_device *adev = dd->adev;
+	acpi_hp_fixup fixup = NULL;
+	acpi_hp_uevent uevent = NULL;
+	acpi_hp_notify notify = NULL;
 
 	acpi_lock_hp_context();
 
-	if (!adev->hp)
-		goto out;
-
-	if (cb_type == DOCK_CALL_FIXUP) {
-		void (*fixup)(struct acpi_device *);
-
-		fixup = adev->hp->fixup;
-		if (fixup) {
-			acpi_unlock_hp_context();
-			fixup(adev);
-			return;
-		}
-	} else if (cb_type == DOCK_CALL_UEVENT) {
-		void (*uevent)(struct acpi_device *, u32);
-
-		uevent = adev->hp->uevent;
-		if (uevent) {
-			acpi_unlock_hp_context();
-			uevent(adev, event);
-			return;
-		}
-	} else {
-		int (*notify)(struct acpi_device *, u32);
-
-		notify = adev->hp->notify;
-		if (notify) {
-			acpi_unlock_hp_context();
-			notify(adev, event);
-			return;
-		}
+	if (adev->hp) {
+		if (cb_type == DOCK_CALL_FIXUP)
+			fixup = adev->hp->fixup;
+		else if (cb_type == DOCK_CALL_UEVENT)
+			uevent = adev->hp->uevent;
+		else
+			notify = adev->hp->notify;
 	}
 
- out:
 	acpi_unlock_hp_context();
+
+	if (fixup)
+		fixup(adev);
+	else if (uevent)
+		uevent(adev, event);
+	else if (notify)
+		notify(adev, event);
 }
 
 static struct dock_station *find_dock_station(acpi_handle handle)
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index b9a33364e553..f06d92b1183f 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -73,8 +73,7 @@ void acpi_unlock_hp_context(void)
 
 void acpi_initialize_hp_context(struct acpi_device *adev,
 				struct acpi_hotplug_context *hp,
-				int (*notify)(struct acpi_device *, u32),
-				void (*uevent)(struct acpi_device *, u32))
+				acpi_hp_notify notify, acpi_hp_uevent uevent)
 {
 	acpi_lock_hp_context();
 	hp->notify = notify;
@@ -428,7 +427,7 @@ void acpi_device_hotplug(struct acpi_device *adev, u32 src)
 	} else if (adev->flags.hotplug_notify) {
 		error = acpi_generic_hotplug_event(adev, src);
 	} else {
-		int (*notify)(struct acpi_device *, u32);
+		acpi_hp_notify notify;
 
 		acpi_lock_hp_context();
 		notify = adev->hp ? adev->hp->notify : NULL;
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index a6ced88a08f3..6f8cc4cc61be 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -144,11 +144,15 @@ struct acpi_scan_handler {
  * --------------------
  */
 
+typedef int (*acpi_hp_notify) (struct acpi_device *, u32);
+typedef void (*acpi_hp_uevent) (struct acpi_device *, u32);
+typedef void (*acpi_hp_fixup) (struct acpi_device *);
+
 struct acpi_hotplug_context {
 	struct acpi_device *self;
-	int (*notify)(struct acpi_device *, u32);
-	void (*uevent)(struct acpi_device *, u32);
-	void (*fixup)(struct acpi_device *);
+	acpi_hp_notify notify;
+	acpi_hp_uevent uevent;
+	acpi_hp_fixup fixup;
 };
 
 /*
@@ -583,8 +587,7 @@ static inline void acpi_set_hp_context(struct acpi_device *adev,
 
 void acpi_initialize_hp_context(struct acpi_device *adev,
 				struct acpi_hotplug_context *hp,
-				int (*notify)(struct acpi_device *, u32),
-				void (*uevent)(struct acpi_device *, u32));
+				acpi_hp_notify notify, acpi_hp_uevent uevent);
 
 /* acpi_device.dev.bus == &acpi_bus_type */
 extern const struct bus_type acpi_bus_type;
-- 
cgit v1.2.3


From f91717007217d975aa975ddabd91ae1a107b9bff Mon Sep 17 00:00:00 2001
From: Anton Protopopov <aspsk@isovalent.com>
Date: Wed, 3 Apr 2024 14:33:03 +0200
Subject: bpf: Pack struct bpf_fib_lookup

The struct bpf_fib_lookup is supposed to be of size 64. A recent commit
59b418c7063d ("bpf: Add a check for struct bpf_fib_lookup size") added
a static assertion to check this property so that future changes to the
structure will not accidentally break this assumption.

As it immediately turned out, on some 32-bit arm systems, when AEABI=n,
the total size of the structure was equal to 68, see [1]. This happened
because the bpf_fib_lookup structure contains a union of two 16-bit
fields:

    union {
            __u16 tot_len;
            __u16 mtu_result;
    };

which was supposed to compile to a 16-bit-aligned 16-bit field. On the
aforementioned setups it was instead both aligned and padded to 32-bits.

Declare this inner union as __attribute__((packed, aligned(2))) such
that it always is of size 2 and is aligned to 16 bits.

  [1] https://lore.kernel.org/all/CA+G9fYtsoP51f-oP_Sp5MOq-Ffv8La2RztNpwvE6+R1VtFiLrw@mail.gmail.com/#t

Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
Fixes: e1850ea9bd9e ("bpf: bpf_fib_lookup return MTU value as output when looked up")
Signed-off-by: Anton Protopopov <aspsk@isovalent.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20240403123303.1452184-1-aspsk@isovalent.com
---
 include/uapi/linux/bpf.h       | 2 +-
 tools/include/uapi/linux/bpf.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 79c548276b6b..6fe9f11c8abe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7157,7 +7157,7 @@ struct bpf_fib_lookup {
 
 		/* output: MTU value */
 		__u16	mtu_result;
-	};
+	} __attribute__((packed, aligned(2)));
 	/* input: L3 device index for lookup
 	 * output: device index from FIB lookup
 	 */
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 79c548276b6b..6fe9f11c8abe 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -7157,7 +7157,7 @@ struct bpf_fib_lookup {
 
 		/* output: MTU value */
 		__u16	mtu_result;
-	};
+	} __attribute__((packed, aligned(2)));
 	/* input: L3 device index for lookup
 	 * output: device index from FIB lookup
 	 */
-- 
cgit v1.2.3


From 59401c3c08e1a306e29a8d6c826685e2c5c6c794 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 26 Mar 2024 09:01:18 +0000
Subject: soundwire: remove unused sdw_bus_conf structure

This is redundant with sdw_bus_params, and was never used.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20240326090122.1051806-4-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 66f814b63a43..e5d0aa58e7f6 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -542,21 +542,6 @@ enum sdw_reg_bank {
 	SDW_BANK1,
 };
 
-/**
- * struct sdw_bus_conf: Bus configuration
- *
- * @clk_freq: Clock frequency, in Hz
- * @num_rows: Number of rows in frame
- * @num_cols: Number of columns in frame
- * @bank: Next register bank
- */
-struct sdw_bus_conf {
-	unsigned int clk_freq;
-	unsigned int num_rows;
-	unsigned int num_cols;
-	unsigned int bank;
-};
-
 /**
  * struct sdw_prepare_ch: Prepare/De-prepare Data Port channel
  *
-- 
cgit v1.2.3


From bc13cf3f6e63dd708ccd160a28e6bb696af7e9f6 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 26 Mar 2024 09:01:20 +0000
Subject: soundwire: clarify maximum allowed address

The existing code sets the maximum address at 0x80000000, which is not
completely accurate. The last 2 Gbytes are indeed reserved, but so are
the 896 Mbytes just before. The maximum address which can be used with
paging or BRA is 0x47FFFFFF per Table 131 of the SoundWire 1.2.1
specification.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20240326090122.1051806-6-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw_registers.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h
index 138bec908c40..658b10fa5b20 100644
--- a/include/linux/soundwire/sdw_registers.h
+++ b/include/linux/soundwire/sdw_registers.h
@@ -13,7 +13,7 @@
 
 #define SDW_REG_NO_PAGE				0x00008000
 #define SDW_REG_OPTIONAL_PAGE			0x00010000
-#define SDW_REG_MAX				0x80000000
+#define SDW_REG_MAX				0x48000000
 
 #define SDW_DPN_SIZE				0x100
 #define SDW_BANK1_OFFSET			0x10
-- 
cgit v1.2.3


From d0a69cd0369a390cc1c100e52e78a273695a170c Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 26 Mar 2024 09:20:26 +0000
Subject: soundwire: intel: add more values for SYNCPRD

Starting with MeteorLake, the input to the SoundWire IP can be 24.576
MHz (aka Audio Cardinal Clock) or 96 MHz (Audio PLL).

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20240326092030.1062802-4-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw_intel.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index 00bb22d96ae5..fa40b85d5019 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -34,8 +34,10 @@
 /* SYNC */
 #define SDW_SHIM_SYNC			0xC
 
-#define SDW_SHIM_SYNC_SYNCPRD_VAL_24	(24000 / SDW_CADENCE_GSYNC_KHZ - 1)
-#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4	(38400 / SDW_CADENCE_GSYNC_KHZ - 1)
+#define SDW_SHIM_SYNC_SYNCPRD_VAL_24		(24000 / SDW_CADENCE_GSYNC_KHZ - 1)
+#define SDW_SHIM_SYNC_SYNCPRD_VAL_24_576	(24576 / SDW_CADENCE_GSYNC_KHZ - 1)
+#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4		(38400 / SDW_CADENCE_GSYNC_KHZ - 1)
+#define SDW_SHIM_SYNC_SYNCPRD_VAL_96		(96000 / SDW_CADENCE_GSYNC_KHZ - 1)
 #define SDW_SHIM_SYNC_SYNCPRD		GENMASK(14, 0)
 #define SDW_SHIM_SYNC_SYNCCPU		BIT(15)
 #define SDW_SHIM_SYNC_CMDSYNC_MASK	GENMASK(19, 16)
-- 
cgit v1.2.3


From 09ee49e3de6bcecc57028682c673d180ec2d436b Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Tue, 26 Mar 2024 09:20:27 +0000
Subject: soundwire: intel: add support for MeteorLake additional clocks

In the MeteorLake hardware, the SoundWire link clock can be selected
from the Xtal, audio cardinal clock (24.576 MHz) or the 96 MHz audio
PLL.

This patches add the clock selection in a backwards-compatible manner,
using the ACPI firmware as the source of information and checking its
compatibility with hardware capabilities.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20240326092030.1062802-5-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/intel.c           | 43 +++++++++++++++++++++++++++++++------
 include/linux/soundwire/sdw_intel.h |  5 +++++
 2 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c
index 839268175079..01e1a0f3ec39 100644
--- a/drivers/soundwire/intel.c
+++ b/drivers/soundwire/intel.c
@@ -345,8 +345,10 @@ static int intel_link_power_up(struct sdw_intel *sdw)
 	u32 spa_mask, cpa_mask;
 	u32 link_control;
 	int ret = 0;
+	u32 clock_source;
 	u32 syncprd;
 	u32 sync_reg;
+	bool lcap_mlcs;
 
 	mutex_lock(sdw->link_res->shim_lock);
 
@@ -358,12 +360,35 @@ static int intel_link_power_up(struct sdw_intel *sdw)
 	 * is only dependent on the oscillator clock provided to
 	 * the IP, so adjust based on _DSD properties reported in DSDT
 	 * tables. The values reported are based on either 24MHz
-	 * (CNL/CML) or 38.4 MHz (ICL/TGL+).
+	 * (CNL/CML) or 38.4 MHz (ICL/TGL+). On MeteorLake additional
+	 * frequencies are available with the MLCS clock source selection.
 	 */
-	if (prop->mclk_freq % 6000000)
-		syncprd = SDW_SHIM_SYNC_SYNCPRD_VAL_38_4;
-	else
-		syncprd = SDW_SHIM_SYNC_SYNCPRD_VAL_24;
+	lcap_mlcs = intel_readl(shim, SDW_SHIM_LCAP) & SDW_SHIM_LCAP_MLCS_MASK;
+
+	if (prop->mclk_freq % 6000000) {
+		if (prop->mclk_freq % 2400000) {
+			if (lcap_mlcs) {
+				syncprd = SDW_SHIM_SYNC_SYNCPRD_VAL_24_576;
+				clock_source = SDW_SHIM_MLCS_CARDINAL_CLK;
+			} else {
+				dev_err(sdw->cdns.dev, "%s: invalid clock configuration, mclk %d lcap_mlcs %d\n",
+					__func__, prop->mclk_freq, lcap_mlcs);
+				ret = -EINVAL;
+				goto out;
+			}
+		} else {
+			syncprd = SDW_SHIM_SYNC_SYNCPRD_VAL_38_4;
+			clock_source = SDW_SHIM_MLCS_XTAL_CLK;
+		}
+	} else {
+		if (lcap_mlcs) {
+			syncprd = SDW_SHIM_SYNC_SYNCPRD_VAL_96;
+			clock_source = SDW_SHIM_MLCS_AUDIO_PLL_CLK;
+		} else {
+			syncprd = SDW_SHIM_SYNC_SYNCPRD_VAL_24;
+			clock_source = SDW_SHIM_MLCS_XTAL_CLK;
+		}
+	}
 
 	if (!*shim_mask) {
 		dev_dbg(sdw->cdns.dev, "powering up all links\n");
@@ -403,6 +428,13 @@ static int intel_link_power_up(struct sdw_intel *sdw)
 				"Failed to set SHIM_SYNC: %d\n", ret);
 			goto out;
 		}
+
+		/* update link clock if needed */
+		if (lcap_mlcs) {
+			link_control = intel_readl(shim, SDW_SHIM_LCTL);
+			u32p_replace_bits(&link_control, clock_source, SDW_SHIM_LCTL_MLCS_MASK);
+			intel_writel(shim, SDW_SHIM_LCTL, link_control);
+		}
 	}
 
 	*shim_mask |= BIT(link_id);
@@ -1087,4 +1119,3 @@ const struct sdw_intel_hw_ops sdw_intel_cnl_hw_ops = {
 	.sync_check_cmdsync_unlocked = intel_check_cmdsync_unlocked,
 };
 EXPORT_SYMBOL_NS(sdw_intel_cnl_hw_ops, SOUNDWIRE_INTEL);
-
diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h
index fa40b85d5019..8e78417156e3 100644
--- a/include/linux/soundwire/sdw_intel.h
+++ b/include/linux/soundwire/sdw_intel.h
@@ -22,6 +22,7 @@
 /* LCAP */
 #define SDW_SHIM_LCAP			0x0
 #define SDW_SHIM_LCAP_LCOUNT_MASK	GENMASK(2, 0)
+#define SDW_SHIM_LCAP_MLCS_MASK		BIT(8)
 
 /* LCTL */
 #define SDW_SHIM_LCTL			0x4
@@ -30,6 +31,10 @@
 #define SDW_SHIM_LCTL_SPA_MASK		GENMASK(3, 0)
 #define SDW_SHIM_LCTL_CPA		BIT(8)
 #define SDW_SHIM_LCTL_CPA_MASK		GENMASK(11, 8)
+#define SDW_SHIM_LCTL_MLCS_MASK		GENMASK(29, 27)
+#define SDW_SHIM_MLCS_XTAL_CLK		0x0
+#define SDW_SHIM_MLCS_CARDINAL_CLK	0x1
+#define SDW_SHIM_MLCS_AUDIO_PLL_CLK	0x2
 
 /* SYNC */
 #define SDW_SHIM_SYNC			0xC
-- 
cgit v1.2.3


From 68d6f4f3fbd9b1baae53e7cf33fb3362b5a21494 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Mon, 25 Mar 2024 19:34:01 -0600
Subject: fs: Annotate struct file_handle with __counted_by() and use
 struct_size()

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

While there, use struct_size() helper, instead of the open-coded
version.

[brauner@kernel.org: contains a fix by Edward for an OOB access]
Reported-by: syzbot+4139435cb1b34cf759c2@syzkaller.appspotmail.com
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
Link: https://lore.kernel.org/r/tencent_A7845DD769577306D813742365E976E3A205@qq.com
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/ZgImCXTdGDTeBvSS@neat
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fhandle.c       | 6 +++---
 include/linux/fs.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/fhandle.c b/fs/fhandle.c
index 57a12614addf..8a7f86c2139a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
 		return -EINVAL;
 
-	handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+	handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
 			 GFP_KERNEL);
 	if (!handle)
 		return -ENOMEM;
@@ -71,7 +71,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	/* copy the mount id */
 	if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) ||
 	    copy_to_user(ufh, handle,
-			 sizeof(struct file_handle) + handle_bytes))
+			 struct_size(handle, f_handle, handle_bytes)))
 		retval = -EFAULT;
 	kfree(handle);
 	return retval;
@@ -192,7 +192,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
 		retval = -EINVAL;
 		goto out_err;
 	}
-	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+	handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
 			 GFP_KERNEL);
 	if (!handle) {
 		retval = -ENOMEM;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 00fc429b0af0..914245085597 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1033,7 +1033,7 @@ struct file_handle {
 	__u32 handle_bytes;
 	int handle_type;
 	/* file identifier */
-	unsigned char f_handle[];
+	unsigned char f_handle[] __counted_by(handle_bytes);
 };
 
 static inline struct file *get_file(struct file *f)
-- 
cgit v1.2.3


From 5c9668cfc6d7c7de30fa57fa0a6c0ddc93462969 Mon Sep 17 00:00:00 2001
From: Gatien Chevallier <gatien.chevallier@foss.st.com>
Date: Fri, 5 Jan 2024 14:03:56 +0100
Subject: firewall: introduce stm32_firewall framework

Introduce a STM32 firewall framework that offers to firewall consumers
different firewall services such as the ability to check their access
rights against their firewall controller(s).

The STM32 firewall framework offers a generic API for STM32 firewall
controllers that is defined in their drivers to best fit the
specificity of each firewall.

There are various types of firewalls:
-Peripheral firewalls that filter accesses to peripherals
-Memory firewalls that filter accesses to memories or memory regions
-No type for undefined type of firewall

Signed-off-by: Gatien Chevallier <gatien.chevallier@foss.st.com>
Signed-off-by: Alexandre Torgue <alexandre.torgue@foss.st.com>
---
 MAINTAINERS                               |   5 +
 arch/arm/mach-stm32/Kconfig               |   1 +
 arch/arm64/Kconfig.platforms              |   1 +
 drivers/bus/Kconfig                       |  10 +
 drivers/bus/Makefile                      |   1 +
 drivers/bus/stm32_firewall.c              | 294 ++++++++++++++++++++++++++++++
 drivers/bus/stm32_firewall.h              |  83 +++++++++
 include/linux/bus/stm32_firewall_device.h | 142 +++++++++++++++
 8 files changed, 537 insertions(+)
 create mode 100644 drivers/bus/stm32_firewall.c
 create mode 100644 drivers/bus/stm32_firewall.h
 create mode 100644 include/linux/bus/stm32_firewall_device.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index aa3b947fb080..b896bd80d361 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -20811,6 +20811,11 @@ T:	git git://linuxtv.org/media_tree.git
 F:	Documentation/devicetree/bindings/media/i2c/st,st-mipid02.yaml
 F:	drivers/media/i2c/st-mipid02.c
 
+ST STM32 FIREWALL
+M:	Gatien Chevallier <gatien.chevallier@foss.st.com>
+S:	Maintained
+F:	drivers/bus/stm32_firewall.c
+
 ST STM32 I2C/SMBUS DRIVER
 M:	Pierre-Yves MORDRET <pierre-yves.mordret@foss.st.com>
 M:	Alain Volmat <alain.volmat@foss.st.com>
diff --git a/arch/arm/mach-stm32/Kconfig b/arch/arm/mach-stm32/Kconfig
index 98145031586f..ae21a9f78f9c 100644
--- a/arch/arm/mach-stm32/Kconfig
+++ b/arch/arm/mach-stm32/Kconfig
@@ -12,6 +12,7 @@ menuconfig ARCH_STM32
 	select PINCTRL
 	select RESET_CONTROLLER
 	select STM32_EXTI
+	select STM32_FIREWALL
 	help
 	  Support for STMicroelectronics STM32 processors.
 
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 24335565bad5..13ed0a483617 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -305,6 +305,7 @@ config ARCH_STM32
 	select ARM_SMC_MBOX
 	select ARM_SCMI_PROTOCOL
 	select COMMON_CLK_SCMI
+	select STM32_FIREWALL
 	help
 	  This enables support for ARMv8 based STMicroelectronics
 	  STM32 family, including:
diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
index d5e7fa9173a1..64cd2ee03aa3 100644
--- a/drivers/bus/Kconfig
+++ b/drivers/bus/Kconfig
@@ -163,6 +163,16 @@ config QCOM_SSC_BLOCK_BUS
 	  i2c/spi/uart controllers, a hexagon core, and a clock controller
 	  which provides clocks for the above.
 
+config STM32_FIREWALL
+	bool "STM32 Firewall framework"
+	depends on (ARCH_STM32 || COMPILE_TEST) && OF
+	select OF_DYNAMIC
+	help
+	  Say y to enable STM32 firewall framework and its services. Firewall
+	  controllers will be able to register to the framework. Access for
+	  hardware resources linked to a firewall controller can be requested
+	  through this STM32 framework.
+
 config SUN50I_DE2_BUS
 	bool "Allwinner A64 DE2 Bus Driver"
 	  default ARM64
diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile
index d90eed189a65..fc0511450ec2 100644
--- a/drivers/bus/Makefile
+++ b/drivers/bus/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_OMAP_INTERCONNECT)	+= omap_l3_smx.o omap_l3_noc.o
 obj-$(CONFIG_OMAP_OCP2SCP)	+= omap-ocp2scp.o
 obj-$(CONFIG_QCOM_EBI2)		+= qcom-ebi2.o
 obj-$(CONFIG_QCOM_SSC_BLOCK_BUS)	+= qcom-ssc-block-bus.o
+obj-$(CONFIG_STM32_FIREWALL)	+= stm32_firewall.o
 obj-$(CONFIG_SUN50I_DE2_BUS)	+= sun50i-de2.o
 obj-$(CONFIG_SUNXI_RSB)		+= sunxi-rsb.o
 obj-$(CONFIG_OF)		+= simple-pm-bus.o
diff --git a/drivers/bus/stm32_firewall.c b/drivers/bus/stm32_firewall.c
new file mode 100644
index 000000000000..decb79449047
--- /dev/null
+++ b/drivers/bus/stm32_firewall.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023, STMicroelectronics - All Rights Reserved
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/bus/stm32_firewall_device.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+
+#include "stm32_firewall.h"
+
+/* Corresponds to STM32_FIREWALL_MAX_EXTRA_ARGS + firewall ID */
+#define STM32_FIREWALL_MAX_ARGS		(STM32_FIREWALL_MAX_EXTRA_ARGS + 1)
+
+static LIST_HEAD(firewall_controller_list);
+static DEFINE_MUTEX(firewall_controller_list_lock);
+
+/* Firewall device API */
+
+int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall,
+				unsigned int nb_firewall)
+{
+	struct stm32_firewall_controller *ctrl;
+	struct of_phandle_iterator it;
+	unsigned int i, j = 0;
+	int err;
+
+	if (!firewall || !nb_firewall)
+		return -EINVAL;
+
+	/* Parse property with phandle parsed out */
+	of_for_each_phandle(&it, err, np, "access-controllers", "#access-controller-cells", 0) {
+		struct of_phandle_args provider_args;
+		struct device_node *provider = it.node;
+		const char *fw_entry;
+		bool match = false;
+
+		if (err) {
+			pr_err("Unable to get access-controllers property for node %s\n, err: %d",
+			       np->full_name, err);
+			of_node_put(provider);
+			return err;
+		}
+
+		if (j > nb_firewall) {
+			pr_err("Too many firewall controllers");
+			of_node_put(provider);
+			return -EINVAL;
+		}
+
+		provider_args.args_count = of_phandle_iterator_args(&it, provider_args.args,
+								    STM32_FIREWALL_MAX_ARGS);
+
+		/* Check if the parsed phandle corresponds to a registered firewall controller */
+		mutex_lock(&firewall_controller_list_lock);
+		list_for_each_entry(ctrl, &firewall_controller_list, entry) {
+			if (ctrl->dev->of_node->phandle == it.phandle) {
+				match = true;
+				firewall[j].firewall_ctrl = ctrl;
+				break;
+			}
+		}
+		mutex_unlock(&firewall_controller_list_lock);
+
+		if (!match) {
+			firewall[j].firewall_ctrl = NULL;
+			pr_err("No firewall controller registered for %s\n", np->full_name);
+			of_node_put(provider);
+			return -ENODEV;
+		}
+
+		err = of_property_read_string_index(np, "access-controller-names", j, &fw_entry);
+		if (err == 0)
+			firewall[j].entry = fw_entry;
+
+		/* Handle the case when there are no arguments given along with the phandle */
+		if (provider_args.args_count < 0 ||
+		    provider_args.args_count > STM32_FIREWALL_MAX_ARGS) {
+			of_node_put(provider);
+			return -EINVAL;
+		} else if (provider_args.args_count == 0) {
+			firewall[j].extra_args_size = 0;
+			firewall[j].firewall_id = U32_MAX;
+			j++;
+			continue;
+		}
+
+		/* The firewall ID is always the first argument */
+		firewall[j].firewall_id = provider_args.args[0];
+
+		/* Extra args start at the second argument */
+		for (i = 0; i < provider_args.args_count - 1; i++)
+			firewall[j].extra_args[i] = provider_args.args[i + 1];
+
+		/* Remove the firewall ID arg that is not an extra argument */
+		firewall[j].extra_args_size = provider_args.args_count - 1;
+
+		j++;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_get_firewall);
+
+int stm32_firewall_grant_access(struct stm32_firewall *firewall)
+{
+	struct stm32_firewall_controller *firewall_controller;
+
+	if (!firewall || firewall->firewall_id == U32_MAX)
+		return -EINVAL;
+
+	firewall_controller = firewall->firewall_ctrl;
+
+	if (!firewall_controller)
+		return -ENODEV;
+
+	return firewall_controller->grant_access(firewall_controller, firewall->firewall_id);
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_grant_access);
+
+int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id)
+{
+	struct stm32_firewall_controller *firewall_controller;
+
+	if (!firewall || subsystem_id == U32_MAX || firewall->firewall_id == U32_MAX)
+		return -EINVAL;
+
+	firewall_controller = firewall->firewall_ctrl;
+
+	if (!firewall_controller)
+		return -ENODEV;
+
+	return firewall_controller->grant_access(firewall_controller, subsystem_id);
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_grant_access_by_id);
+
+void stm32_firewall_release_access(struct stm32_firewall *firewall)
+{
+	struct stm32_firewall_controller *firewall_controller;
+
+	if (!firewall || firewall->firewall_id == U32_MAX) {
+		pr_debug("Incorrect arguments when releasing a firewall access\n");
+		return;
+	}
+
+	firewall_controller = firewall->firewall_ctrl;
+
+	if (!firewall_controller) {
+		pr_debug("No firewall controller to release\n");
+		return;
+	}
+
+	firewall_controller->release_access(firewall_controller, firewall->firewall_id);
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_release_access);
+
+void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id)
+{
+	struct stm32_firewall_controller *firewall_controller;
+
+	if (!firewall || subsystem_id == U32_MAX || firewall->firewall_id == U32_MAX) {
+		pr_debug("Incorrect arguments when releasing a firewall access");
+		return;
+	}
+
+	firewall_controller = firewall->firewall_ctrl;
+
+	if (!firewall_controller) {
+		pr_debug("No firewall controller to release");
+		return;
+	}
+
+	firewall_controller->release_access(firewall_controller, subsystem_id);
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_release_access_by_id);
+
+/* Firewall controller API */
+
+int stm32_firewall_controller_register(struct stm32_firewall_controller *firewall_controller)
+{
+	struct stm32_firewall_controller *ctrl;
+
+	if (!firewall_controller)
+		return -ENODEV;
+
+	pr_info("Registering %s firewall controller\n", firewall_controller->name);
+
+	mutex_lock(&firewall_controller_list_lock);
+	list_for_each_entry(ctrl, &firewall_controller_list, entry) {
+		if (ctrl == firewall_controller) {
+			pr_debug("%s firewall controller already registered\n",
+				 firewall_controller->name);
+			mutex_unlock(&firewall_controller_list_lock);
+			return 0;
+		}
+	}
+	list_add_tail(&firewall_controller->entry, &firewall_controller_list);
+	mutex_unlock(&firewall_controller_list_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_controller_register);
+
+void stm32_firewall_controller_unregister(struct stm32_firewall_controller *firewall_controller)
+{
+	struct stm32_firewall_controller *ctrl;
+	bool controller_removed = false;
+
+	if (!firewall_controller) {
+		pr_debug("Null reference while unregistering firewall controller\n");
+		return;
+	}
+
+	mutex_lock(&firewall_controller_list_lock);
+	list_for_each_entry(ctrl, &firewall_controller_list, entry) {
+		if (ctrl == firewall_controller) {
+			controller_removed = true;
+			list_del_init(&ctrl->entry);
+			break;
+		}
+	}
+	mutex_unlock(&firewall_controller_list_lock);
+
+	if (!controller_removed)
+		pr_debug("There was no firewall controller named %s to unregister\n",
+			 firewall_controller->name);
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_controller_unregister);
+
+int stm32_firewall_populate_bus(struct stm32_firewall_controller *firewall_controller)
+{
+	struct stm32_firewall *firewalls;
+	struct device_node *child;
+	struct device *parent;
+	unsigned int i;
+	int len;
+	int err;
+
+	parent = firewall_controller->dev;
+
+	dev_dbg(parent, "Populating %s system bus\n", dev_name(firewall_controller->dev));
+
+	for_each_available_child_of_node(dev_of_node(parent), child) {
+		/* The access-controllers property is mandatory for firewall bus devices */
+		len = of_count_phandle_with_args(child, "access-controllers",
+						 "#access-controller-cells");
+		if (len <= 0) {
+			of_node_put(child);
+			return -EINVAL;
+		}
+
+		firewalls = kcalloc(len, sizeof(*firewalls), GFP_KERNEL);
+		if (!firewalls) {
+			of_node_put(child);
+			return -ENOMEM;
+		}
+
+		err = stm32_firewall_get_firewall(child, firewalls, (unsigned int)len);
+		if (err) {
+			kfree(firewalls);
+			of_node_put(child);
+			return err;
+		}
+
+		for (i = 0; i < len; i++) {
+			if (firewall_controller->grant_access(firewall_controller,
+							      firewalls[i].firewall_id)) {
+				/*
+				 * Peripheral access not allowed or not defined.
+				 * Mark the node as populated so platform bus won't probe it
+				 */
+				of_detach_node(child);
+				dev_err(parent, "%s: Device driver will not be probed\n",
+					child->full_name);
+			}
+		}
+
+		kfree(firewalls);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(stm32_firewall_populate_bus);
diff --git a/drivers/bus/stm32_firewall.h b/drivers/bus/stm32_firewall.h
new file mode 100644
index 000000000000..e5fac85fe346
--- /dev/null
+++ b/drivers/bus/stm32_firewall.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023, STMicroelectronics - All Rights Reserved
+ */
+
+#ifndef _STM32_FIREWALL_H
+#define _STM32_FIREWALL_H
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+
+/**
+ * STM32_PERIPHERAL_FIREWALL:		This type of firewall protects peripherals
+ * STM32_MEMORY_FIREWALL:		This type of firewall protects memories/subsets of memory
+ *					zones
+ * STM32_NOTYPE_FIREWALL:		Undefined firewall type
+ */
+
+#define STM32_PERIPHERAL_FIREWALL	BIT(1)
+#define STM32_MEMORY_FIREWALL		BIT(2)
+#define STM32_NOTYPE_FIREWALL		BIT(3)
+
+/**
+ * struct stm32_firewall_controller - Information on firewall controller supplying services
+ *
+ * @name:			Name of the firewall controller
+ * @dev:			Device reference of the firewall controller
+ * @mmio:			Base address of the firewall controller
+ * @entry:			List entry of the firewall controller list
+ * @type:			Type of firewall
+ * @max_entries:		Number of entries covered by the firewall
+ * @grant_access:		Callback used to grant access for a device access against a
+ *				firewall controller
+ * @release_access:		Callback used to release resources taken by a device when access was
+ *				granted
+ * @grant_memory_range_access:	Callback used to grant access for a device to a given memory region
+ */
+struct stm32_firewall_controller {
+	const char *name;
+	struct device *dev;
+	void __iomem *mmio;
+	struct list_head entry;
+	unsigned int type;
+	unsigned int max_entries;
+
+	int (*grant_access)(struct stm32_firewall_controller *ctrl, u32 id);
+	void (*release_access)(struct stm32_firewall_controller *ctrl, u32 id);
+	int (*grant_memory_range_access)(struct stm32_firewall_controller *ctrl, phys_addr_t paddr,
+					 size_t size);
+};
+
+/**
+ * stm32_firewall_controller_register - Register a firewall controller to the STM32 firewall
+ *					framework
+ * @firewall_controller:	Firewall controller to register
+ *
+ * Returns 0 in case of success or -ENODEV if no controller was given.
+ */
+int stm32_firewall_controller_register(struct stm32_firewall_controller *firewall_controller);
+
+/**
+ * stm32_firewall_controller_unregister - Unregister a firewall controller from the STM32
+ *					  firewall framework
+ * @firewall_controller:	Firewall controller to unregister
+ */
+void stm32_firewall_controller_unregister(struct stm32_firewall_controller *firewall_controller);
+
+/**
+ * stm32_firewall_populate_bus - Populate device tree nodes that have a correct firewall
+ *				 configuration. This is used at boot-time only, as a sanity check
+ *				 between device tree and firewalls hardware configurations to
+ *				 prevent a kernel crash when a device driver is not granted access
+ *
+ * @firewall_controller:	Firewall controller which nodes will be populated or not
+ *
+ * Returns 0 in case of success or appropriate errno code if error occurred.
+ */
+int stm32_firewall_populate_bus(struct stm32_firewall_controller *firewall_controller);
+
+#endif /* _STM32_FIREWALL_H */
diff --git a/include/linux/bus/stm32_firewall_device.h b/include/linux/bus/stm32_firewall_device.h
new file mode 100644
index 000000000000..18e0a2fc3816
--- /dev/null
+++ b/include/linux/bus/stm32_firewall_device.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023, STMicroelectronics - All Rights Reserved
+ */
+
+#ifndef STM32_FIREWALL_DEVICE_H
+#define STM32_FIREWALL_DEVICE_H
+
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/types.h>
+
+#define STM32_FIREWALL_MAX_EXTRA_ARGS		5
+
+/* Opaque reference to stm32_firewall_controller */
+struct stm32_firewall_controller;
+
+/**
+ * struct stm32_firewall - Information on a device's firewall. Each device can have more than one
+ *			   firewall.
+ *
+ * @firewall_ctrl:		Pointer referencing a firewall controller of the device. It is
+ *				opaque so a device cannot manipulate the controller's ops or access
+ *				the controller's data
+ * @extra_args:			Extra arguments that are implementation dependent
+ * @entry:			Name of the firewall entry
+ * @extra_args_size:		Number of extra arguments
+ * @firewall_id:		Firewall ID associated the device for this firewall controller
+ */
+struct stm32_firewall {
+	struct stm32_firewall_controller *firewall_ctrl;
+	u32 extra_args[STM32_FIREWALL_MAX_EXTRA_ARGS];
+	const char *entry;
+	size_t extra_args_size;
+	u32 firewall_id;
+};
+
+#if IS_ENABLED(CONFIG_STM32_FIREWALL)
+/**
+ * stm32_firewall_get_firewall - Get the firewall(s) associated to given device.
+ *				 The firewall controller reference is always the first argument
+ *				 of each of the access-controller property entries.
+ *				 The firewall ID is always the second argument of each of the
+ *				 access-controller  property entries.
+ *				 If there's no argument linked to the phandle, then the firewall ID
+ *				 field is set to U32_MAX, which is an invalid ID.
+ *
+ * @np:				Device node to parse
+ * @firewall:			Array of firewall references
+ * @nb_firewall:		Number of firewall references to get. Must be at least 1.
+ *
+ * Returns 0 on success, -ENODEV if there's no match with a firewall controller or appropriate errno
+ * code if error occurred.
+ */
+int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall,
+				unsigned int nb_firewall);
+
+/**
+ * stm32_firewall_grant_access - Request firewall access rights and grant access.
+ *
+ * @firewall:			Firewall reference containing the ID to check against its firewall
+ *				controller
+ *
+ * Returns 0 if access is granted, -EACCES if access is denied, -ENODEV if firewall is null or
+ * appropriate errno code if error occurred
+ */
+int stm32_firewall_grant_access(struct stm32_firewall *firewall);
+
+/**
+ * stm32_firewall_release_access - Release access granted from a call to
+ *				   stm32_firewall_grant_access().
+ *
+ * @firewall:			Firewall reference containing the ID to check against its firewall
+ *				controller
+ */
+void stm32_firewall_release_access(struct stm32_firewall *firewall);
+
+/**
+ * stm32_firewall_grant_access_by_id - Request firewall access rights of a given device
+ *				       based on a specific firewall ID
+ *
+ * Warnings:
+ * There is no way to ensure that the given ID will correspond to the firewall referenced in the
+ * device node if the ID did not come from stm32_firewall_get_firewall(). In that case, this
+ * function must be used with caution.
+ * This function should be used for subsystem resources that do not have the same firewall ID
+ * as their parent.
+ * U32_MAX is an invalid ID.
+ *
+ * @firewall:			Firewall reference containing the firewall controller
+ * @subsystem_id:		Firewall ID of the subsystem resource
+ *
+ * Returns 0 if access is granted, -EACCES if access is denied, -ENODEV if firewall is null or
+ * appropriate errno code if error occurred
+ */
+int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id);
+
+/**
+ * stm32_firewall_release_access_by_id - Release access granted from a call to
+ *					 stm32_firewall_grant_access_by_id().
+ *
+ * Warnings:
+ * There is no way to ensure that the given ID will correspond to the firewall referenced in the
+ * device node if the ID did not come from stm32_firewall_get_firewall(). In that case, this
+ * function must be used with caution.
+ * This function should be used for subsystem resources that do not have the same firewall ID
+ * as their parent.
+ * U32_MAX is an invalid ID.
+ *
+ * @firewall:			Firewall reference containing the firewall controller
+ * @subsystem_id:		Firewall ID of the subsystem resource
+ */
+void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id);
+
+#else /* CONFIG_STM32_FIREWALL */
+
+int stm32_firewall_get_firewall(struct device_node *np, struct stm32_firewall *firewall,
+				unsigned int nb_firewall);
+{
+	return -ENODEV;
+}
+
+int stm32_firewall_grant_access(struct stm32_firewall *firewall)
+{
+	return -ENODEV;
+}
+
+void stm32_firewall_release_access(struct stm32_firewall *firewall)
+{
+}
+
+int stm32_firewall_grant_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id)
+{
+	return -ENODEV;
+}
+
+void stm32_firewall_release_access_by_id(struct stm32_firewall *firewall, u32 subsystem_id)
+{
+}
+
+#endif /* CONFIG_STM32_FIREWALL */
+#endif /* STM32_FIREWALL_DEVICE_H */
-- 
cgit v1.2.3


From 72bea132f3680ee51e7ed2cee62892b6f5121909 Mon Sep 17 00:00:00 2001
From: Neil Armstrong <neil.armstrong@linaro.org>
Date: Fri, 22 Mar 2024 10:42:38 +0100
Subject: dt-bindings: phy: qcom,sc8280xp-qmp-pcie-phy: document PHY AUX clock
 on SM8[456]50 SoCs

The PCIe Gen4x2 PHY found in the SM8[456]50 SoCs have a second clock named
"PHY_AUX_CLK" which is an input of the Global Clock Controller (GCC) which
is muxed & gated then returned to the PHY as an input.

Document the clock IDs to select the PIPE clock or the AUX clock,
also enforce a second clock-output-names and a #clock-cells value of 1
for the PCIe Gen4x2 PHY found in the SM8[456]50 SoCs.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Neil Armstrong <neil.armstrong@linaro.org>
Link: https://lore.kernel.org/r/20240322-topic-sm8x50-upstream-pcie-1-phy-aux-clk-v2-1-3ec0a966d52f@linaro.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 .../bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml   | 27 +++++++++++++++++++---
 include/dt-bindings/phy/phy-qcom-qmp.h             |  4 ++++
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
index ba966a78a128..14ac341b1577 100644
--- a/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
+++ b/Documentation/devicetree/bindings/phy/qcom,sc8280xp-qmp-pcie-phy.yaml
@@ -88,11 +88,11 @@ properties:
           - description: offset of PCIe 4-lane configuration register
           - description: offset of configuration bit for this PHY
 
-  "#clock-cells":
-    const: 0
+  "#clock-cells": true
 
   clock-output-names:
-    maxItems: 1
+    minItems: 1
+    maxItems: 2
 
   "#phy-cells":
     const: 0
@@ -213,6 +213,27 @@ allOf:
         reset-names:
           maxItems: 1
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - qcom,sm8450-qmp-gen4x2-pcie-phy
+              - qcom,sm8550-qmp-gen4x2-pcie-phy
+              - qcom,sm8650-qmp-gen4x2-pcie-phy
+    then:
+      properties:
+        clock-output-names:
+          minItems: 2
+        "#clock-cells":
+          const: 1
+    else:
+      properties:
+        clock-output-names:
+          maxItems: 1
+        "#clock-cells":
+          const: 0
+
 examples:
   - |
     #include <dt-bindings/clock/qcom,gcc-sc8280xp.h>
diff --git a/include/dt-bindings/phy/phy-qcom-qmp.h b/include/dt-bindings/phy/phy-qcom-qmp.h
index 4edec4c5b224..6b43ea9e0051 100644
--- a/include/dt-bindings/phy/phy-qcom-qmp.h
+++ b/include/dt-bindings/phy/phy-qcom-qmp.h
@@ -17,4 +17,8 @@
 #define QMP_USB43DP_USB3_PHY		0
 #define QMP_USB43DP_DP_PHY		1
 
+/* QMP PCIE PHYs */
+#define QMP_PCIE_PIPE_CLK		0
+#define QMP_PCIE_PHY_AUX_CLK		1
+
 #endif /* _DT_BINDINGS_PHY_QMP */
-- 
cgit v1.2.3


From 0a525621b7e5b49202b19d8f75382c6778fdd0c1 Mon Sep 17 00:00:00 2001
From: Philo Lu <lulie@linux.alibaba.com>
Date: Fri, 5 Apr 2024 10:55:34 +0800
Subject: bpf: store both map ptr and state in bpf_insn_aux_data

Currently, bpf_insn_aux_data->map_ptr_state is used to store either
map_ptr or its poison state (i.e., BPF_MAP_PTR_POISON). Thus
BPF_MAP_PTR_POISON must be checked before reading map_ptr. In certain
cases, we may need valid map_ptr even in case of poison state.
This will be explained in next patch with bpf_for_each_map_elem()
helper.

This patch changes map_ptr_state into a new struct including both map
pointer and its state (poison/unpriv). It's in the same union with
struct bpf_loop_inline_state, so there is no extra memory overhead.
Besides, macros BPF_MAP_PTR_UNPRIV/BPF_MAP_PTR_POISON/BPF_MAP_PTR are no
longer needed.

This patch does not change any existing functionality.

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240405025536.18113-2-lulie@linux.alibaba.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  9 ++++++++-
 kernel/bpf/verifier.c        | 36 ++++++++++++++++--------------------
 2 files changed, 24 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7cb1b75eee38..36d19cd32eb5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -502,6 +502,13 @@ struct bpf_loop_inline_state {
 	u32 callback_subprogno; /* valid when fit_for_inline is true */
 };
 
+/* pointer and state for maps */
+struct bpf_map_ptr_state {
+	struct bpf_map *map_ptr;
+	bool poison;
+	bool unpriv;
+};
+
 /* Possible states for alu_state member. */
 #define BPF_ALU_SANITIZE_SRC		(1U << 0)
 #define BPF_ALU_SANITIZE_DST		(1U << 1)
@@ -514,7 +521,7 @@ struct bpf_loop_inline_state {
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
-		unsigned long map_ptr_state;	/* pointer/poison value for maps */
+		struct bpf_map_ptr_state map_ptr_state;
 		s32 call_imm;			/* saved imm field of call insn */
 		u32 alu_limit;			/* limit for add/sub register with pointer */
 		struct {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4353cb09c35b..62fedb1f3312 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -190,11 +190,6 @@ struct bpf_verifier_stack_elem {
 #define BPF_MAP_KEY_POISON	(1ULL << 63)
 #define BPF_MAP_KEY_SEEN	(1ULL << 62)
 
-#define BPF_MAP_PTR_UNPRIV	1UL
-#define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\
-					  POISON_POINTER_DELTA))
-#define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
-
 #define BPF_GLOBAL_PERCPU_MA_MAX_SIZE  512
 
 static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
@@ -209,21 +204,22 @@ static bool is_trusted_reg(const struct bpf_reg_state *reg);
 
 static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
 {
-	return BPF_MAP_PTR(aux->map_ptr_state) == BPF_MAP_PTR_POISON;
+	return aux->map_ptr_state.poison;
 }
 
 static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
 {
-	return aux->map_ptr_state & BPF_MAP_PTR_UNPRIV;
+	return aux->map_ptr_state.unpriv;
 }
 
 static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
-			      const struct bpf_map *map, bool unpriv)
+			      struct bpf_map *map,
+			      bool unpriv, bool poison)
 {
-	BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
 	unpriv |= bpf_map_ptr_unpriv(aux);
-	aux->map_ptr_state = (unsigned long)map |
-			     (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+	aux->map_ptr_state.unpriv = unpriv;
+	aux->map_ptr_state.poison = poison;
+	aux->map_ptr_state.map_ptr = map;
 }
 
 static bool bpf_map_key_poisoned(const struct bpf_insn_aux_data *aux)
@@ -9660,7 +9656,7 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 		return -EINVAL;
 	}
 
-	map = BPF_MAP_PTR(insn_aux->map_ptr_state);
+	map = insn_aux->map_ptr_state.map_ptr;
 	if (!map->ops->map_set_for_each_callback_args ||
 	    !map->ops->map_for_each_callback) {
 		verbose(env, "callback function not allowed for map\n");
@@ -10019,12 +10015,12 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
 		return -EACCES;
 	}
 
-	if (!BPF_MAP_PTR(aux->map_ptr_state))
+	if (!aux->map_ptr_state.map_ptr)
+		bpf_map_ptr_store(aux, meta->map_ptr,
+				  !meta->map_ptr->bypass_spec_v1, false);
+	else if (aux->map_ptr_state.map_ptr != meta->map_ptr)
 		bpf_map_ptr_store(aux, meta->map_ptr,
-				  !meta->map_ptr->bypass_spec_v1);
-	else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr)
-		bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
-				  !meta->map_ptr->bypass_spec_v1);
+				  !meta->map_ptr->bypass_spec_v1, true);
 	return 0;
 }
 
@@ -19840,7 +19836,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			    !bpf_map_ptr_unpriv(aux)) {
 				struct bpf_jit_poke_descriptor desc = {
 					.reason = BPF_POKE_REASON_TAIL_CALL,
-					.tail_call.map = BPF_MAP_PTR(aux->map_ptr_state),
+					.tail_call.map = aux->map_ptr_state.map_ptr,
 					.tail_call.key = bpf_map_key_immediate(aux),
 					.insn_idx = i + delta,
 				};
@@ -19869,7 +19865,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 
-			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+			map_ptr = aux->map_ptr_state.map_ptr;
 			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
 						  map_ptr->max_entries, 2);
 			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -19977,7 +19973,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			if (bpf_map_ptr_poisoned(aux))
 				goto patch_call_imm;
 
-			map_ptr = BPF_MAP_PTR(aux->map_ptr_state);
+			map_ptr = aux->map_ptr_state.map_ptr;
 			ops = map_ptr->ops;
 			if (insn->imm == BPF_FUNC_map_lookup_elem &&
 			    ops->map_gen_lookup) {
-- 
cgit v1.2.3


From 65a09ba2693681da53099a0f41b7fb6d5ebbead5 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Thu, 28 Mar 2024 21:45:45 +0100
Subject: scsi: core: Store owner from modules with scsi_register_driver()

Modules registering driver with scsi_driver_register() might forget to set
.owner field.  The field is used by some of other kernel parts for
reference counting (try_module_get()), so it is expected that drivers will
set it.

Solve the problem by moving this task away from the drivers to the core
scsi code, just like we did for platform_driver in commit 9447057eaff8
("platform_device: use a macro instead of platform_driver_register").

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240328-b4-module-owner-scsi-v1-1-c86cb4f6e91c@linaro.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_sysfs.c  | 5 +++--
 include/scsi/scsi_driver.h | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 775df00021e4..b5aae4e8ae33 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -1609,13 +1609,14 @@ restart:
 }
 EXPORT_SYMBOL(scsi_remove_target);
 
-int scsi_register_driver(struct device_driver *drv)
+int __scsi_register_driver(struct device_driver *drv, struct module *owner)
 {
 	drv->bus = &scsi_bus_type;
+	drv->owner = owner;
 
 	return driver_register(drv);
 }
-EXPORT_SYMBOL(scsi_register_driver);
+EXPORT_SYMBOL(__scsi_register_driver);
 
 int scsi_register_interface(struct class_interface *intf)
 {
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index 4ce1988b2ba0..5c6724322112 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -22,7 +22,9 @@ struct scsi_driver {
 #define to_scsi_driver(drv) \
 	container_of((drv), struct scsi_driver, gendrv)
 
-extern int scsi_register_driver(struct device_driver *);
+#define scsi_register_driver(drv) \
+	__scsi_register_driver(drv, THIS_MODULE)
+int __scsi_register_driver(struct device_driver *, struct module *);
 #define scsi_unregister_driver(drv) \
 	driver_unregister(drv);
 
-- 
cgit v1.2.3


From 4aafb8ab2a626ac714931ec7894cf16df948b359 Mon Sep 17 00:00:00 2001
From: Cosmin Ratiu <cratiu@nvidia.com>
Date: Thu, 4 Apr 2024 20:33:54 +0300
Subject: net/mlx5e: Support FEC settings for 100G/lane modes

This consists of:
1. Expose the 100G/lane capability bit in the PCAM reg.
2. Expose the per link mode FEC capability masks in the PPLM reg.
3. Set the overrides according to ethtool parameters.
FEC for new modes is set if and only if the PCAM 100G/lane capability is
advertised and the capability mask for a given link mode reports that it
can accept the requested FEC mode.

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Gal Pressman <gal@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240404173357.123307-3-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/port.c | 36 +++++++++++++++++++++--
 include/linux/mlx5/mlx5_ifc.h                     | 20 +++++++++++--
 2 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
index b4681a93807d..b4efc780e297 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port.c
@@ -292,10 +292,15 @@ enum mlx5e_fec_supported_link_mode {
 	MLX5E_FEC_SUPPORTED_LINK_MODE_100G_2X,
 	MLX5E_FEC_SUPPORTED_LINK_MODE_200G_4X,
 	MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X,
+	MLX5E_FEC_SUPPORTED_LINK_MODE_100G_1X,
+	MLX5E_FEC_SUPPORTED_LINK_MODE_200G_2X,
+	MLX5E_FEC_SUPPORTED_LINK_MODE_400G_4X,
+	MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X,
 	MLX5E_MAX_FEC_SUPPORTED_LINK_MODE,
 };
 
 #define MLX5E_FEC_FIRST_50G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_50G_1X
+#define MLX5E_FEC_FIRST_100G_PER_LANE_MODE MLX5E_FEC_SUPPORTED_LINK_MODE_100G_1X
 
 #define MLX5E_FEC_OVERRIDE_ADMIN_POLICY(buf, policy, write, link)			\
 	do {										\
@@ -313,7 +318,10 @@ static bool mlx5e_is_fec_supported_link_mode(struct mlx5_core_dev *dev,
 					     enum mlx5e_fec_supported_link_mode link_mode)
 {
 	return link_mode < MLX5E_FEC_FIRST_50G_PER_LANE_MODE ||
-	       MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm);
+	       (link_mode < MLX5E_FEC_FIRST_100G_PER_LANE_MODE &&
+		MLX5_CAP_PCAM_FEATURE(dev, fec_50G_per_lane_in_pplm)) ||
+	       (link_mode >= MLX5E_FEC_FIRST_100G_PER_LANE_MODE &&
+		MLX5_CAP_PCAM_FEATURE(dev, fec_100G_per_lane_in_pplm));
 }
 
 /* get/set FEC admin field for a given speed */
@@ -348,6 +356,18 @@ static int mlx5e_fec_admin_field(u32 *pplm, u16 *fec_policy, bool write,
 	case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X:
 		MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 400g_8x);
 		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_100G_1X:
+		MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 100g_1x);
+		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_2X:
+		MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 200g_2x);
+		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_4X:
+		MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 400g_4x);
+		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X:
+		MLX5E_FEC_OVERRIDE_ADMIN_POLICY(pplm, *fec_policy, write, 800g_8x);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -389,6 +409,18 @@ static int mlx5e_get_fec_cap_field(u32 *pplm, u16 *fec_cap,
 	case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_8X:
 		*fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 400g_8x);
 		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_100G_1X:
+		*fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 100g_1x);
+		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_200G_2X:
+		*fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 200g_2x);
+		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_400G_4X:
+		*fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 400g_4x);
+		break;
+	case MLX5E_FEC_SUPPORTED_LINK_MODE_800G_8X:
+		*fec_cap = MLX5E_GET_FEC_OVERRIDE_CAP(pplm, 800g_8x);
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -501,7 +533,7 @@ int mlx5e_set_fec_mode(struct mlx5_core_dev *dev, u16 fec_policy)
 		/* RS fec in ethtool is mapped to MLX5E_FEC_RS_528_514
 		 * to link modes up to 25G per lane and to
 		 * MLX5E_FEC_RS_544_514 in the new link modes based on
-		 * 50 G per lane
+		 * 50G or 100G per lane
 		 */
 		if (conf_fec == (1 << MLX5E_FEC_RS_528_514) &&
 		    i >= MLX5E_FEC_FIRST_50G_PER_LANE_MODE)
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index cc159d8563d1..35ffc9b9f241 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -9817,7 +9817,21 @@ struct mlx5_ifc_pplm_reg_bits {
 	u8         fec_override_admin_100g_2x[0x10];
 	u8         fec_override_admin_50g_1x[0x10];
 
-	u8         reserved_at_140[0x140];
+	u8         fec_override_cap_800g_8x[0x10];
+	u8         fec_override_cap_400g_4x[0x10];
+
+	u8         fec_override_cap_200g_2x[0x10];
+	u8         fec_override_cap_100g_1x[0x10];
+
+	u8         reserved_at_180[0xa0];
+
+	u8         fec_override_admin_800g_8x[0x10];
+	u8         fec_override_admin_400g_4x[0x10];
+
+	u8         fec_override_admin_200g_2x[0x10];
+	u8         fec_override_admin_100g_1x[0x10];
+
+	u8         reserved_at_260[0x20];
 };
 
 struct mlx5_ifc_ppcnt_reg_bits {
@@ -10189,7 +10203,9 @@ struct mlx5_ifc_mtutc_reg_bits {
 };
 
 struct mlx5_ifc_pcam_enhanced_features_bits {
-	u8         reserved_at_0[0x68];
+	u8         reserved_at_0[0x48];
+	u8         fec_100G_per_lane_in_pplm[0x1];
+	u8         reserved_at_49[0x1f];
 	u8         fec_50G_per_lane_in_pplm[0x1];
 	u8         reserved_at_69[0x4];
 	u8         rx_icrc_encapsulated_counter[0x1];
-- 
cgit v1.2.3


From 386f4a737964702e4364376285afd61315ae3c28 Mon Sep 17 00:00:00 2001
From: Justin Stitt <justinstitt@google.com>
Date: Mon, 1 Apr 2024 23:48:52 +0000
Subject: trace: events: cleanup deprecated strncpy uses

strncpy() is deprecated for use on NUL-terminated destination strings
[1] and as such we should prefer more robust and less ambiguous string
interfaces.

For 2 out of 3 of these changes we can simply swap in strscpy() as it
guarantess NUL-termination which is needed for the following trace
print.

trace_rpcgss_context() should use memcpy as its format specifier %.*s
allows for the length to be specifier (__entry->len). Due to this,
acceptor does not technically need to be NUL-terminated. Moreover,
swapping in strscpy() and keeping everything else the same could result
in truncation of the source string by one byte. To remedy this, we could
use `len + 1` but I am unsure of the size of the destination buffer so a
simple memcpy should suffice.
|	TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s",
|		__entry->window_size, __entry->expiry, __entry->now,
|		__entry->timeout, __entry->len, __get_str(acceptor))

I suspect acceptor not to naturally be a NUL-terminated string due to
the presence of some stringify methods.
|	.crstringify_acceptor	= gss_stringify_acceptor,

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1]
Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2]
Link: https://github.com/KSPP/linux/issues/90
Signed-off-by: Justin Stitt <justinstitt@google.com>
Acked-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://lore.kernel.org/r/20240401-strncpy-include-trace-events-mdio-h-v1-1-9cb5a4cda116@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/trace/events/mdio.h   | 2 +-
 include/trace/events/rpcgss.h | 2 +-
 include/trace/events/sock.h   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/mdio.h b/include/trace/events/mdio.h
index 0f241cbe00ab..285b3e4f83ba 100644
--- a/include/trace/events/mdio.h
+++ b/include/trace/events/mdio.h
@@ -25,7 +25,7 @@ TRACE_EVENT_CONDITION(mdio_access,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->busid, bus->id, MII_BUS_ID_SIZE);
+		strscpy(__entry->busid, bus->id, MII_BUS_ID_SIZE);
 		__entry->read = read;
 		__entry->addr = addr;
 		__entry->regnum = regnum;
diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h
index ba2d96a1bc2f..274c297f1b15 100644
--- a/include/trace/events/rpcgss.h
+++ b/include/trace/events/rpcgss.h
@@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context,
 		__entry->timeout = timeout;
 		__entry->window_size = window_size;
 		__entry->len = len;
-		strncpy(__get_str(acceptor), data, len);
+		memcpy(__get_str(acceptor), data, len);
 	),
 
 	TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s",
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 0d1c5ce4e6a6..3836de435d9d 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -110,7 +110,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
 	),
 
 	TP_fast_assign(
-		strncpy(__entry->name, prot->name, 32);
+		strscpy(__entry->name, prot->name, 32);
 		__entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
 		__entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
 		__entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
-- 
cgit v1.2.3


From 0e9c127729be19adcdf2f5cc1f3450a4322fdf3a Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Wed, 3 Apr 2024 14:28:39 -0700
Subject: ethtool: add interface to read Tx hardware timestamping statistics

Multiple network devices that support hardware timestamping appear to have
common behavior with regards to timestamp handling. Implement common Tx
hardware timestamping statistics in a tx_stats struct_group. Common Rx
hardware timestamping statistics can subsequently be implemented in a
rx_stats struct_group for ethtool_ts_stats.

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Link: https://lore.kernel.org/r/20240403212931.128541-2-rrameshbabu@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml     | 17 +++++++++
 Documentation/networking/ethtool-netlink.rst |  9 +++++
 include/linux/ethtool.h                      | 27 ++++++++++++++-
 include/uapi/linux/ethtool_netlink.h         | 14 ++++++++
 net/ethtool/tsinfo.c                         | 52 +++++++++++++++++++++++++++-
 5 files changed, 117 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index d0e4a47e0f21..b76916394ec9 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -565,6 +565,18 @@ attribute-sets:
       -
         name: tx-lpi-timer
         type: u32
+  -
+    name: ts-stat
+    attributes:
+      -
+        name: tx-pkts
+        type: uint
+      -
+        name: tx-lost
+        type: uint
+      -
+        name: tx-err
+        type: uint
   -
     name: tsinfo
     attributes:
@@ -587,6 +599,10 @@ attribute-sets:
       -
         name: phc-index
         type: u32
+      -
+        name: stats
+        type: nest
+        nested-attributes: ts-stat
   -
     name: cable-result
     attributes:
@@ -1394,6 +1410,7 @@ operations:
             - tx-types
             - rx-filters
             - phc-index
+            - stats
       dump: *tsinfo-get-op
     -
       name: cable-test-act
diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index d583d9abf2f8..08d330b0f50f 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1237,12 +1237,21 @@ Kernel response contents:
   ``ETHTOOL_A_TSINFO_TX_TYPES``          bitset  supported Tx types
   ``ETHTOOL_A_TSINFO_RX_FILTERS``        bitset  supported Rx filters
   ``ETHTOOL_A_TSINFO_PHC_INDEX``         u32     PTP hw clock index
+  ``ETHTOOL_A_TSINFO_STATS``             nested  HW timestamping statistics
   =====================================  ======  ==========================
 
 ``ETHTOOL_A_TSINFO_PHC_INDEX`` is absent if there is no associated PHC (there
 is no special value for this case). The bitset attributes are omitted if they
 would be empty (no bit set).
 
+Additional hardware timestamping statistics response contents:
+
+  =====================================  ======  ===================================
+  ``ETHTOOL_A_TS_STAT_TX_PKTS``          u64     Packets with Tx HW timestamps
+  ``ETHTOOL_A_TS_STAT_TX_LOST``          u64     Tx HW timestamp not arrived count
+  ``ETHTOOL_A_TS_STAT_TX_ERR``           u64     HW error request Tx timestamp count
+  =====================================  ======  ===================================
+
 CABLE_TEST
 ==========
 
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9901e563f706..6fd9107d3cc0 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -480,6 +480,26 @@ struct ethtool_rmon_stats {
 	);
 };
 
+/**
+ * struct ethtool_ts_stats - HW timestamping statistics
+ * @pkts: Number of packets successfully timestamped by the hardware.
+ * @lost: Number of hardware timestamping requests where the timestamping
+ *	information from the hardware never arrived for submission with
+ *	the skb.
+ * @err: Number of arbitrary timestamp generation error events that the
+ *	hardware encountered, exclusive of @lost statistics. Cases such
+ *	as resource exhaustion, unavailability, firmware errors, and
+ *	detected illogical timestamp values not submitted with the skb
+ *	are inclusive to this counter.
+ */
+struct ethtool_ts_stats {
+	struct_group(tx_stats,
+		u64 pkts;
+		u64 lost;
+		u64 err;
+	);
+};
+
 #define ETH_MODULE_EEPROM_PAGE_LEN	128
 #define ETH_MODULE_MAX_I2C_ADDRESS	0x7f
 
@@ -755,7 +775,10 @@ struct ethtool_rxfh_param {
  * @get_ts_info: Get the time stamping and PTP hardware clock capabilities.
  *	It may be called with RCU, or rtnl or reference on the device.
  *	Drivers supporting transmit time stamps in software should set this to
- *	ethtool_op_get_ts_info().
+ *	ethtool_op_get_ts_info(). Drivers must not zero statistics which they
+ *	don't report. The stats	structure is initialized to ETHTOOL_STAT_NOT_SET
+ *	indicating driver does not report statistics.
+ * @get_ts_stats: Query the device hardware timestamping statistics.
  * @get_module_info: Get the size and type of the eeprom contained within
  *	a plug-in module.
  * @get_module_eeprom: Get the eeprom information from the plug-in module
@@ -898,6 +921,8 @@ struct ethtool_ops {
 				 struct ethtool_dump *, void *);
 	int	(*set_dump)(struct net_device *, struct ethtool_dump *);
 	int	(*get_ts_info)(struct net_device *, struct ethtool_ts_info *);
+	void	(*get_ts_stats)(struct net_device *dev,
+				struct ethtool_ts_stats *ts_stats);
 	int     (*get_module_info)(struct net_device *,
 				   struct ethtool_modinfo *);
 	int     (*get_module_eeprom)(struct net_device *,
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index accbb1a231df..708272026d80 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -478,12 +478,26 @@ enum {
 	ETHTOOL_A_TSINFO_TX_TYPES,			/* bitset */
 	ETHTOOL_A_TSINFO_RX_FILTERS,			/* bitset */
 	ETHTOOL_A_TSINFO_PHC_INDEX,			/* u32 */
+	ETHTOOL_A_TSINFO_STATS,				/* nest - _A_TSINFO_STAT */
 
 	/* add new constants above here */
 	__ETHTOOL_A_TSINFO_CNT,
 	ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_TS_STAT_UNSPEC,
+
+	ETHTOOL_A_TS_STAT_TX_PKTS,			/* u64 */
+	ETHTOOL_A_TS_STAT_TX_LOST,			/* u64 */
+	ETHTOOL_A_TS_STAT_TX_ERR,			/* u64 */
+
+	/* add new constants above here */
+	__ETHTOOL_A_TS_STAT_CNT,
+	ETHTOOL_A_TS_STAT_MAX = (__ETHTOOL_A_TS_STAT_CNT - 1)
+
+};
+
 /* PHC VCLOCKS */
 
 enum {
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
index 9daed0aab162..be2755c8d8fd 100644
--- a/net/ethtool/tsinfo.c
+++ b/net/ethtool/tsinfo.c
@@ -13,14 +13,18 @@ struct tsinfo_req_info {
 struct tsinfo_reply_data {
 	struct ethnl_reply_data		base;
 	struct ethtool_ts_info		ts_info;
+	struct ethtool_ts_stats		stats;
 };
 
 #define TSINFO_REPDATA(__reply_base) \
 	container_of(__reply_base, struct tsinfo_reply_data, base)
 
+#define ETHTOOL_TS_STAT_CNT \
+	(__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1))
+
 const struct nla_policy ethnl_tsinfo_get_policy[] = {
 	[ETHTOOL_A_TSINFO_HEADER]		=
-		NLA_POLICY_NESTED(ethnl_header_policy),
+		NLA_POLICY_NESTED(ethnl_header_policy_stats),
 };
 
 static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
@@ -34,6 +38,12 @@ static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
 	ret = ethnl_ops_begin(dev);
 	if (ret < 0)
 		return ret;
+	if (req_base->flags & ETHTOOL_FLAG_STATS &&
+	    dev->ethtool_ops->get_ts_stats) {
+		ethtool_stats_init((u64 *)&data->stats,
+				   sizeof(data->stats) / sizeof(u64));
+		dev->ethtool_ops->get_ts_stats(dev, &data->stats);
+	}
 	ret = __ethtool_get_ts_info(dev, &data->ts_info);
 	ethnl_ops_complete(dev);
 
@@ -79,10 +89,47 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
 	}
 	if (ts_info->phc_index >= 0)
 		len += nla_total_size(sizeof(u32));	/* _TSINFO_PHC_INDEX */
+	if (req_base->flags & ETHTOOL_FLAG_STATS)
+		len += nla_total_size(0) + /* _TSINFO_STATS */
+		       nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT;
 
 	return len;
 }
 
+static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
+{
+	if (val == ETHTOOL_STAT_NOT_SET)
+		return 0;
+	if (nla_put_uint(skb, attrtype, val))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int tsinfo_put_stats(struct sk_buff *skb,
+			    const struct ethtool_ts_stats *stats)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (tsinfo_put_stat(skb, stats->tx_stats.pkts,
+			    ETHTOOL_A_TS_STAT_TX_PKTS) ||
+	    tsinfo_put_stat(skb, stats->tx_stats.lost,
+			    ETHTOOL_A_TS_STAT_TX_LOST) ||
+	    tsinfo_put_stat(skb, stats->tx_stats.err,
+			    ETHTOOL_A_TS_STAT_TX_ERR))
+		goto err_cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
 static int tsinfo_fill_reply(struct sk_buff *skb,
 			     const struct ethnl_req_info *req_base,
 			     const struct ethnl_reply_data *reply_base)
@@ -119,6 +166,9 @@ static int tsinfo_fill_reply(struct sk_buff *skb,
 	if (ts_info->phc_index >= 0 &&
 	    nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index))
 		return -EMSGSIZE;
+	if (req_base->flags & ETHTOOL_FLAG_STATS &&
+	    tsinfo_put_stats(skb, &data->stats))
+		return -EMSGSIZE;
 
 	return 0;
 }
-- 
cgit v1.2.3


From ff8877b04ef282b2bdb16c9dccc2e42216a34f62 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 5 Apr 2024 22:22:38 -0700
Subject: netlink: specs: ethtool: define header-flags as an enum

Recent changes added header flags to the spec.
Use an enum instead of defines for more seamless codegen.

[Jakub: drop the already applied parts and rewrite message]

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Link: https://lore.kernel.org/r/20240403212931.128541-6-rrameshbabu@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/ethtool.yaml |  1 -
 include/uapi/linux/ethtool_netlink.h     | 11 +++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml
index b76916394ec9..87ae7b397984 100644
--- a/Documentation/netlink/specs/ethtool.yaml
+++ b/Documentation/netlink/specs/ethtool.yaml
@@ -18,7 +18,6 @@ definitions:
     entries: []
   -
     name: header-flags
-    enum-name:
     type: flags
     entries: [ compact-bitsets, omit-reply, stats ]
 
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 708272026d80..e06a34cded78 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -117,12 +117,11 @@ enum {
 
 /* request header */
 
-/* use compact bitsets in reply */
-#define ETHTOOL_FLAG_COMPACT_BITSETS	(1 << 0)
-/* provide optional reply for SET or ACT requests */
-#define ETHTOOL_FLAG_OMIT_REPLY	(1 << 1)
-/* request statistics, if supported by the driver */
-#define ETHTOOL_FLAG_STATS		(1 << 2)
+enum ethtool_header_flags {
+	ETHTOOL_FLAG_COMPACT_BITSETS	= 1 << 0,	/* use compact bitsets in reply */
+	ETHTOOL_FLAG_OMIT_REPLY		= 1 << 1,	/* provide optional reply for SET or ACT requests */
+	ETHTOOL_FLAG_STATS		= 1 << 2,	/* request statistics, if supported by the driver */
+};
 
 #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \
 			  ETHTOOL_FLAG_OMIT_REPLY | \
-- 
cgit v1.2.3


From 9f06f87fef689d28588cde8c7ebb00a67da34026 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 3 Apr 2024 13:21:39 -0700
Subject: net: skbuff: generalize the skb->decrypted bit

The ->decrypted bit can be reused for other crypto protocols.
Remove the direct dependency on TLS, add helpers to clean up
the ifdefs leaking out everywhere.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 ++++++++++++---
 include/net/sock.h     |  4 +---
 net/Kconfig            |  3 +++
 net/core/sock.c        |  5 ++---
 net/ipv4/tcp_input.c   | 12 +++---------
 net/ipv4/tcp_ipv4.c    |  4 +---
 net/ipv4/tcp_offload.c |  4 +---
 net/tls/Kconfig        |  1 +
 8 files changed, 24 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 03ea36a82cdd..7dfb906d92f7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -992,7 +992,7 @@ struct sk_buff {
 #ifdef CONFIG_NETFILTER_SKIP_EGRESS
 	__u8			nf_skip_egress:1;
 #endif
-#ifdef CONFIG_TLS_DEVICE
+#ifdef CONFIG_SKB_DECRYPTED
 	__u8			decrypted:1;
 #endif
 	__u8			slow_gro:1;
@@ -1615,17 +1615,26 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
 static inline int skb_cmp_decrypted(const struct sk_buff *skb1,
 				    const struct sk_buff *skb2)
 {
-#ifdef CONFIG_TLS_DEVICE
+#ifdef CONFIG_SKB_DECRYPTED
 	return skb2->decrypted - skb1->decrypted;
 #else
 	return 0;
 #endif
 }
 
+static inline bool skb_is_decrypted(const struct sk_buff *skb)
+{
+#ifdef CONFIG_SKB_DECRYPTED
+	return skb->decrypted;
+#else
+	return false;
+#endif
+}
+
 static inline void skb_copy_decrypted(struct sk_buff *to,
 				      const struct sk_buff *from)
 {
-#ifdef CONFIG_TLS_DEVICE
+#ifdef CONFIG_SKB_DECRYPTED
 	to->decrypted = from->decrypted;
 #endif
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 2253eefe2848..a495330c5c49 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2835,12 +2835,10 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
 
 	if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) {
 		skb = sk->sk_validate_xmit_skb(sk, dev, skb);
-#ifdef CONFIG_TLS_DEVICE
-	} else if (unlikely(skb->decrypted)) {
+	} else if (unlikely(skb_is_decrypted(skb))) {
 		pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
 		kfree_skb(skb);
 		skb = NULL;
-#endif
 	}
 #endif
 
diff --git a/net/Kconfig b/net/Kconfig
index 3e57ccf0da27..d5ab791f7afa 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -60,6 +60,9 @@ config NET_XGRESS
 config NET_REDIRECT
 	bool
 
+config SKB_DECRYPTED
+	bool
+
 config SKB_EXTENSIONS
 	bool
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 5ed411231fc7..fe9195186c13 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2526,13 +2526,12 @@ EXPORT_SYMBOL(skb_set_owner_w);
 
 static bool can_skb_orphan_partial(const struct sk_buff *skb)
 {
-#ifdef CONFIG_TLS_DEVICE
 	/* Drivers depend on in-order delivery for crypto offload,
 	 * partial orphan breaks out-of-order-OK logic.
 	 */
-	if (skb->decrypted)
+	if (skb_is_decrypted(skb))
 		return false;
-#endif
+
 	return (skb->destructor == sock_wfree ||
 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 8d44ab5671ea..1f28a2561795 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4805,10 +4805,8 @@ static bool tcp_try_coalesce(struct sock *sk,
 	if (!mptcp_skb_can_collapse(to, from))
 		return false;
 
-#ifdef CONFIG_TLS_DEVICE
-	if (from->decrypted != to->decrypted)
+	if (skb_cmp_decrypted(from, to))
 		return false;
-#endif
 
 	if (!skb_try_coalesce(to, from, fragstolen, &delta))
 		return false;
@@ -5377,9 +5375,7 @@ restart:
 			break;
 
 		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
-#ifdef CONFIG_TLS_DEVICE
-		nskb->decrypted = skb->decrypted;
-#endif
+		skb_copy_decrypted(nskb, skb);
 		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
 		if (list)
 			__skb_queue_before(list, skb, nskb);
@@ -5409,10 +5405,8 @@ restart:
 				    !mptcp_skb_can_collapse(nskb, skb) ||
 				    (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
 					goto end;
-#ifdef CONFIG_TLS_DEVICE
-				if (skb->decrypted != nskb->decrypted)
+				if (skb_cmp_decrypted(skb, nskb))
 					goto end;
-#endif
 			}
 		}
 	}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 56b75efcfd12..52963c3bb8ca 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2044,10 +2044,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
 	      TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
 	    ((TCP_SKB_CB(tail)->tcp_flags ^
 	      TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
-#ifdef CONFIG_TLS_DEVICE
-	    tail->decrypted != skb->decrypted ||
-#endif
 	    !mptcp_skb_can_collapse(tail, skb) ||
+	    skb_cmp_decrypted(tail, skb) ||
 	    thtail->doff != th->doff ||
 	    memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
 		goto no_coalesce;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index ebe4722bb020..fab0973f995b 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -265,9 +265,7 @@ found:
 		flush |= (len - 1) >= mss;
 
 	flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
-#ifdef CONFIG_TLS_DEVICE
-	flush |= p->decrypted ^ skb->decrypted;
-#endif
+	flush |= skb_cmp_decrypted(p, skb);
 
 	if (flush || skb_gro_receive(p, skb)) {
 		mss = 1;
diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index 0cdc1f7b6b08..ce8d56a19187 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -20,6 +20,7 @@ config TLS
 config TLS_DEVICE
 	bool "Transport Layer Security HW offload"
 	depends on TLS
+	select SKB_DECRYPTED
 	select SOCK_VALIDATE_XMIT
 	select SOCK_RX_QUEUE_MAPPING
 	default n
-- 
cgit v1.2.3


From 8e69b3459ca1ed4f6f7bd0b0a11962ddb3e7d34a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Wed, 3 Apr 2024 13:22:59 -0700
Subject: netlink: add nlmsg_consume() and use it in devlink compat

devlink_compat_running_version() sticks out when running
netdevsim tests and watching dropped skbs. Add nlmsg_consume()
for cases were we want to free a netlink skb but it is expected,
rather than a drop. af_netlink code uses consume_skb() directly,
which is fine, but some may prefer the symmetry of nlmsg_new() /
nlmsg_consume().

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/netlink.h | 14 ++++++++++++--
 net/devlink/dev.c     |  2 +-
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/netlink.h b/include/net/netlink.h
index 1d2bbcc50212..61cef3bd2d31 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -41,7 +41,8 @@
  *   nlmsg_get_pos()			return current position in message
  *   nlmsg_trim()			trim part of message
  *   nlmsg_cancel()			cancel message construction
- *   nlmsg_free()			free a netlink message
+ *   nlmsg_consume()			free a netlink message (expected)
+ *   nlmsg_free()			free a netlink message (drop)
  *
  * Message Sending:
  *   nlmsg_multicast()			multicast message to several groups
@@ -1082,7 +1083,7 @@ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
 }
 
 /**
- * nlmsg_free - free a netlink message
+ * nlmsg_free - drop a netlink message
  * @skb: socket buffer of netlink message
  */
 static inline void nlmsg_free(struct sk_buff *skb)
@@ -1090,6 +1091,15 @@ static inline void nlmsg_free(struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
+/**
+ * nlmsg_consume - free a netlink message
+ * @skb: socket buffer of netlink message
+ */
+static inline void nlmsg_consume(struct sk_buff *skb)
+{
+	consume_skb(skb);
+}
+
 /**
  * nlmsg_multicast_filtered - multicast a netlink message with filter function
  * @sk: netlink socket to spread messages to
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index c609deb42e88..13c73f50da3d 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -1214,7 +1214,7 @@ static void __devlink_compat_running_version(struct devlink *devlink,
 		}
 	}
 free_msg:
-	nlmsg_free(msg);
+	nlmsg_consume(msg);
 }
 
 void devlink_compat_running_version(struct devlink *devlink,
-- 
cgit v1.2.3


From 6916e461e7933d3d003441291c543938f2ccb371 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 4 Apr 2024 11:29:51 +0200
Subject: net: phy: Introduce ethernet link topology representation

Link topologies containing multiple network PHYs attached to the same
net_device can be found when using a PHY as a media converter for use
with an SFP connector, on which an SFP transceiver containing a PHY can
be used.

With the current model, the transceiver's PHY can't be used for
operations such as cable testing, timestamping, macsec offload, etc.

The reason being that most of the logic for these configuration, coming
from either ethtool netlink or ioctls tend to use netdev->phydev, which
in multi-phy systems will reference the PHY closest to the MAC.

Introduce a numbering scheme allowing to enumerate PHY devices that
belong to any netdev, which can in turn allow userspace to take more
precise decisions with regard to each PHY's configuration.

The numbering is maintained per-netdev, in a phy_device_list.
The numbering works similarly to a netdevice's ifindex, with
identifiers that are only recycled once INT_MAX has been reached.

This prevents races that could occur between PHY listing and SFP
transceiver removal/insertion.

The identifiers are assigned at phy_attach time, as the numbering
depends on the netdevice the phy is attached to. The PHY index can be
re-used for PHYs that are persistent.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                            |   2 +
 drivers/net/phy/Makefile               |   2 +-
 drivers/net/phy/phy_device.c           |   7 +++
 drivers/net/phy/phy_link_topology.c    | 105 +++++++++++++++++++++++++++++++++
 include/linux/netdevice.h              |   4 +-
 include/linux/phy.h                    |   4 ++
 include/linux/phy_link_topology.h      |  72 ++++++++++++++++++++++
 include/linux/phy_link_topology_core.h |  25 ++++++++
 include/uapi/linux/ethtool.h           |  16 +++++
 net/core/dev.c                         |   9 +++
 10 files changed, 244 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/phy/phy_link_topology.c
 create mode 100644 include/linux/phy_link_topology.h
 create mode 100644 include/linux/phy_link_topology_core.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index 046598b471b7..4745ea94d463 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8015,6 +8015,8 @@ F:	include/linux/mii.h
 F:	include/linux/of_net.h
 F:	include/linux/phy.h
 F:	include/linux/phy_fixed.h
+F:	include/linux/phy_link_topology.h
+F:	include/linux/phy_link_topology_core.h
 F:	include/linux/phylib_stubs.h
 F:	include/linux/platform_data/mdio-bcm-unimac.h
 F:	include/linux/platform_data/mdio-gpio.h
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 202ed7f450da..1d8be374915f 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Linux PHY drivers
 
 libphy-y			:= phy.o phy-c45.o phy-core.o phy_device.o \
-				   linkmode.o
+				   linkmode.o phy_link_topology.o
 mdio-bus-y			+= mdio_bus.o mdio_device.o
 
 ifdef CONFIG_MDIO_DEVICE
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 6c6ec9475709..452fc8b3406d 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -29,6 +29,7 @@
 #include <linux/phy.h>
 #include <linux/phylib_stubs.h>
 #include <linux/phy_led_triggers.h>
+#include <linux/phy_link_topology.h>
 #include <linux/pse-pd/pse.h>
 #include <linux/property.h>
 #include <linux/rtnetlink.h>
@@ -1511,6 +1512,11 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 		if (phydev->sfp_bus_attached)
 			dev->sfp_bus = phydev->sfp_bus;
+
+		err = phy_link_topo_add_phy(dev->link_topo, phydev,
+					    PHY_UPSTREAM_MAC, dev);
+		if (err)
+			goto error;
 	}
 
 	/* Some Ethernet drivers try to connect to a PHY device before
@@ -1938,6 +1944,7 @@ void phy_detach(struct phy_device *phydev)
 	if (dev) {
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
+		phy_link_topo_del_phy(dev->link_topo, phydev);
 	}
 	phydev->phylink = NULL;
 
diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c
new file mode 100644
index 000000000000..985941c5c558
--- /dev/null
+++ b/drivers/net/phy/phy_link_topology.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Infrastructure to handle all PHY devices connected to a given netdev,
+ * either directly or indirectly attached.
+ *
+ * Copyright (c) 2023 Maxime Chevallier<maxime.chevallier@bootlin.com>
+ */
+
+#include <linux/phy_link_topology.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/rtnetlink.h>
+#include <linux/xarray.h>
+
+struct phy_link_topology *phy_link_topo_create(struct net_device *dev)
+{
+	struct phy_link_topology *topo;
+
+	topo = kzalloc(sizeof(*topo), GFP_KERNEL);
+	if (!topo)
+		return ERR_PTR(-ENOMEM);
+
+	xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1);
+	topo->next_phy_index = 1;
+
+	return topo;
+}
+
+void phy_link_topo_destroy(struct phy_link_topology *topo)
+{
+	if (!topo)
+		return;
+
+	xa_destroy(&topo->phys);
+	kfree(topo);
+}
+
+int phy_link_topo_add_phy(struct phy_link_topology *topo,
+			  struct phy_device *phy,
+			  enum phy_upstream upt, void *upstream)
+{
+	struct phy_device_node *pdn;
+	int ret;
+
+	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
+	if (!pdn)
+		return -ENOMEM;
+
+	pdn->phy = phy;
+	switch (upt) {
+	case PHY_UPSTREAM_MAC:
+		pdn->upstream.netdev = (struct net_device *)upstream;
+		if (phy_on_sfp(phy))
+			pdn->parent_sfp_bus = pdn->upstream.netdev->sfp_bus;
+		break;
+	case PHY_UPSTREAM_PHY:
+		pdn->upstream.phydev = (struct phy_device *)upstream;
+		if (phy_on_sfp(phy))
+			pdn->parent_sfp_bus = pdn->upstream.phydev->sfp_bus;
+		break;
+	default:
+		ret = -EINVAL;
+		goto err;
+	}
+	pdn->upstream_type = upt;
+
+	/* Attempt to re-use a previously allocated phy_index */
+	if (phy->phyindex) {
+		ret = xa_insert(&topo->phys, phy->phyindex, pdn, GFP_KERNEL);
+
+		/* Errors could be either -ENOMEM or -EBUSY. If the phy has an
+		 * index, and there's another entry at the same index, this is
+		 * unexpected and we still error-out
+		 */
+		if (ret)
+			goto err;
+		return 0;
+	}
+
+	ret = xa_alloc_cyclic(&topo->phys, &phy->phyindex, pdn, xa_limit_32b,
+			      &topo->next_phy_index, GFP_KERNEL);
+	if (ret)
+		goto err;
+
+	return 0;
+
+err:
+	kfree(pdn);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(phy_link_topo_add_phy);
+
+void phy_link_topo_del_phy(struct phy_link_topology *topo,
+			   struct phy_device *phy)
+{
+	struct phy_device_node *pdn = xa_erase(&topo->phys, phy->phyindex);
+
+	/* We delete the PHY from the topology, however we don't re-set the
+	 * phy->phyindex field. If the PHY isn't gone, we can re-assign it the
+	 * same index next time it's added back to the topology
+	 */
+
+	kfree(pdn);
+}
+EXPORT_SYMBOL_GPL(phy_link_topo_del_phy);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0c198620ac93..d45f330d083d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,7 +40,6 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
-
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
@@ -52,6 +51,7 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
+#include <linux/phy_link_topology_core.h>
 
 struct netpoll_info;
 struct device;
@@ -1974,6 +1974,7 @@ enum netdev_reg_state {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
+ *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2364,6 +2365,7 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
+	struct phy_link_topology	*link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e6e83304558e..8c848c79b1fd 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -550,6 +550,9 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
+ * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
+ *	      from userspace, similar to ifindex. A zero index means the PHY
+ *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -650,6 +653,7 @@ struct phy_device {
 
 	struct device_link *devlink;
 
+	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
new file mode 100644
index 000000000000..6b79feb607e7
--- /dev/null
+++ b/include/linux/phy_link_topology.h
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * PHY device list allow maintaining a list of PHY devices that are
+ * part of a netdevice's link topology. PHYs can for example be chained,
+ * as is the case when using a PHY that exposes an SFP module, on which an
+ * SFP transceiver that embeds a PHY is connected.
+ *
+ * This list can then be used by userspace to leverage individual PHY
+ * capabilities.
+ */
+#ifndef __PHY_LINK_TOPOLOGY_H
+#define __PHY_LINK_TOPOLOGY_H
+
+#include <linux/ethtool.h>
+#include <linux/phy_link_topology_core.h>
+
+struct xarray;
+struct phy_device;
+struct net_device;
+struct sfp_bus;
+
+struct phy_device_node {
+	enum phy_upstream upstream_type;
+
+	union {
+		struct net_device	*netdev;
+		struct phy_device	*phydev;
+	} upstream;
+
+	struct sfp_bus *parent_sfp_bus;
+
+	struct phy_device *phy;
+};
+
+struct phy_link_topology {
+	struct xarray phys;
+	u32 next_phy_index;
+};
+
+static inline struct phy_device *
+phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex)
+{
+	struct phy_device_node *pdn = xa_load(&topo->phys, phyindex);
+
+	if (pdn)
+		return pdn->phy;
+
+	return NULL;
+}
+
+#if IS_REACHABLE(CONFIG_PHYLIB)
+int phy_link_topo_add_phy(struct phy_link_topology *topo,
+			  struct phy_device *phy,
+			  enum phy_upstream upt, void *upstream);
+
+void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy);
+
+#else
+static inline int phy_link_topo_add_phy(struct phy_link_topology *topo,
+					struct phy_device *phy,
+					enum phy_upstream upt, void *upstream)
+{
+	return 0;
+}
+
+static inline void phy_link_topo_del_phy(struct phy_link_topology *topo,
+					 struct phy_device *phy)
+{
+}
+#endif
+
+#endif /* __PHY_LINK_TOPOLOGY_H */
diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h
new file mode 100644
index 000000000000..0a6479055745
--- /dev/null
+++ b/include/linux/phy_link_topology_core.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __PHY_LINK_TOPOLOGY_CORE_H
+#define __PHY_LINK_TOPOLOGY_CORE_H
+
+struct phy_link_topology;
+
+#if IS_REACHABLE(CONFIG_PHYLIB)
+
+struct phy_link_topology *phy_link_topo_create(struct net_device *dev);
+void phy_link_topo_destroy(struct phy_link_topology *topo);
+
+#else
+
+static inline struct phy_link_topology *phy_link_topo_create(struct net_device *dev)
+{
+	return NULL;
+}
+
+static inline void phy_link_topo_destroy(struct phy_link_topology *topo)
+{
+}
+
+#endif
+
+#endif /* __PHY_LINK_TOPOLOGY_CORE_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 11fc18988bc2..95c2f09f0d0a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2268,4 +2268,20 @@ struct ethtool_link_settings {
 	 * __u32 map_lp_advertising[link_mode_masks_nwords];
 	 */
 };
+
+/**
+ * enum phy_upstream - Represents the upstream component a given PHY device
+ * is connected to, as in what is on the other end of the MII bus. Most PHYs
+ * will be attached to an Ethernet MAC controller, but in some cases, there's
+ * an intermediate PHY used as a media-converter, which will driver another
+ * MII interface as its output.
+ * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port,
+ *		      or ethernet controller)
+ * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter)
+ */
+enum phy_upstream {
+	PHY_UPSTREAM_MAC,
+	PHY_UPSTREAM_PHY,
+};
+
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 92f5bddbc2de..854a3a28a8d8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -158,6 +158,7 @@
 #include <net/page_pool/types.h>
 #include <net/page_pool/helpers.h>
 #include <net/rps.h>
+#include <linux/phy_link_topology_core.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -10962,6 +10963,12 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #ifdef CONFIG_NET_SCHED
 	hash_init(dev->qdisc_hash);
 #endif
+	dev->link_topo = phy_link_topo_create(dev);
+	if (IS_ERR(dev->link_topo)) {
+		dev->link_topo = NULL;
+		goto free_all;
+	}
+
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
@@ -11050,6 +11057,8 @@ void free_netdev(struct net_device *dev)
 	free_percpu(dev->xdp_bulkq);
 	dev->xdp_bulkq = NULL;
 
+	phy_link_topo_destroy(dev->link_topo);
+
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED) {
 		netdev_freemem(dev);
-- 
cgit v1.2.3


From 0ec5ed6c130e3906ba4ec82d740444a21504fbbf Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 4 Apr 2024 11:29:52 +0200
Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY

Pass the phy_device as a parameter to the sfp upstream .disconnect_phy
operation. This is preparatory work to help track phy devices across
a net_device's link.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phylink.c | 3 ++-
 drivers/net/phy/sfp-bus.c | 4 ++--
 include/linux/sfp.h       | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index 84a97088dfc6..0e692a3bcf1a 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -3408,7 +3408,8 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy)
 	return ret;
 }
 
-static void phylink_sfp_disconnect_phy(void *upstream)
+static void phylink_sfp_disconnect_phy(void *upstream,
+				       struct phy_device *phydev)
 {
 	phylink_disconnect_phy(upstream);
 }
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index db39dec7f247..e05013aeecc3 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -486,7 +486,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus)
 			bus->socket_ops->stop(bus->sfp);
 		bus->socket_ops->detach(bus->sfp);
 		if (bus->phydev && ops && ops->disconnect_phy)
-			ops->disconnect_phy(bus->upstream);
+			ops->disconnect_phy(bus->upstream, bus->phydev);
 	}
 	bus->registered = false;
 }
@@ -742,7 +742,7 @@ void sfp_remove_phy(struct sfp_bus *bus)
 	const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus);
 
 	if (ops && ops->disconnect_phy)
-		ops->disconnect_phy(bus->upstream);
+		ops->disconnect_phy(bus->upstream, bus->phydev);
 	bus->phydev = NULL;
 }
 EXPORT_SYMBOL_GPL(sfp_remove_phy);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 9346cd44814d..0573e53b0c11 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -544,7 +544,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv);
+	void (*disconnect_phy)(void *priv, struct phy_device *);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
-- 
cgit v1.2.3


From e75e4e074c4475a3a6145593ecf2dcaf3995fa50 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 4 Apr 2024 11:29:53 +0200
Subject: net: phy: add helpers to handle sfp phy connect/disconnect

There are a few PHY drivers that can handle SFP modules through their
sfp_upstream_ops. Introduce Phylib helpers to keep track of connected
SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the
upstream PHY's netdev's namespace.

By doing so, these SFP PHYs can be enumerated and exposed to users,
which will be able to use their capabilities.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/marvell-88x2222.c |  2 ++
 drivers/net/phy/marvell.c         |  2 ++
 drivers/net/phy/marvell10g.c      |  2 ++
 drivers/net/phy/phy_device.c      | 48 +++++++++++++++++++++++++++++++++++++++
 drivers/net/phy/qcom/at803x.c     |  2 ++
 drivers/net/phy/qcom/qca807x.c    |  2 ++
 include/linux/phy.h               |  2 ++
 7 files changed, 60 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c
index b88398e6872b..0b777cdd7078 100644
--- a/drivers/net/phy/marvell-88x2222.c
+++ b/drivers/net/phy/marvell-88x2222.c
@@ -553,6 +553,8 @@ static const struct sfp_upstream_ops sfp_phy_ops = {
 	.link_down = mv2222_sfp_link_down,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int mv2222_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 0003276452f6..860dc4001d41 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -3550,6 +3550,8 @@ static const struct sfp_upstream_ops m88e1510_sfp_ops = {
 	.module_remove = m88e1510_sfp_remove,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int m88e1510_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index ad43e280930c..6642eb642d4b 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -503,6 +503,8 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 static const struct sfp_upstream_ops mv3310_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 	.module_insert = mv3310_sfp_insert,
 };
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 452fc8b3406d..616bd7ba46cb 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -277,6 +277,14 @@ static void phy_mdio_device_remove(struct mdio_device *mdiodev)
 
 static struct phy_driver genphy_driver;
 
+static struct phy_link_topology *phy_get_link_topology(struct phy_device *phydev)
+{
+	if (phydev->attached_dev)
+		return phydev->attached_dev->link_topo;
+
+	return NULL;
+}
+
 static LIST_HEAD(phy_fixup_list);
 static DEFINE_MUTEX(phy_fixup_lock);
 
@@ -1370,6 +1378,46 @@ phy_standalone_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(phy_standalone);
 
+/**
+ * phy_sfp_connect_phy - Connect the SFP module's PHY to the upstream PHY
+ * @upstream: pointer to the upstream phy device
+ * @phy: pointer to the SFP module's phy device
+ *
+ * This helper allows keeping track of PHY devices on the link. It adds the
+ * SFP module's phy to the phy namespace of the upstream phy
+ */
+int phy_sfp_connect_phy(void *upstream, struct phy_device *phy)
+{
+	struct phy_device *phydev = upstream;
+	struct phy_link_topology *topo = phy_get_link_topology(phydev);
+
+	if (topo)
+		return phy_link_topo_add_phy(topo, phy, PHY_UPSTREAM_PHY, phydev);
+
+	return 0;
+}
+EXPORT_SYMBOL(phy_sfp_connect_phy);
+
+/**
+ * phy_sfp_disconnect_phy - Disconnect the SFP module's PHY from the upstream PHY
+ * @upstream: pointer to the upstream phy device
+ * @phy: pointer to the SFP module's phy device
+ *
+ * This helper allows keeping track of PHY devices on the link. It removes the
+ * SFP module's phy to the phy namespace of the upstream phy. As the module phy
+ * will be destroyed, re-inserting the same module will add a new phy with a
+ * new index.
+ */
+void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy)
+{
+	struct phy_device *phydev = upstream;
+	struct phy_link_topology *topo = phy_get_link_topology(phydev);
+
+	if (topo)
+		phy_link_topo_del_phy(topo, phy);
+}
+EXPORT_SYMBOL(phy_sfp_disconnect_phy);
+
 /**
  * phy_sfp_attach - attach the SFP bus to the PHY upstream network device
  * @upstream: pointer to the phy device
diff --git a/drivers/net/phy/qcom/at803x.c b/drivers/net/phy/qcom/at803x.c
index c8f83e5f78ab..105602581a03 100644
--- a/drivers/net/phy/qcom/at803x.c
+++ b/drivers/net/phy/qcom/at803x.c
@@ -770,6 +770,8 @@ static const struct sfp_upstream_ops at8031_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
 	.module_insert = at8031_sfp_insert,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int at8031_parse_dt(struct phy_device *phydev)
diff --git a/drivers/net/phy/qcom/qca807x.c b/drivers/net/phy/qcom/qca807x.c
index 672c6929119a..5eb0ab1cb70e 100644
--- a/drivers/net/phy/qcom/qca807x.c
+++ b/drivers/net/phy/qcom/qca807x.c
@@ -699,6 +699,8 @@ static const struct sfp_upstream_ops qca807x_sfp_ops = {
 	.detach = phy_sfp_detach,
 	.module_insert = qca807x_sfp_insert,
 	.module_remove = qca807x_sfp_remove,
+	.connect_phy = phy_sfp_connect_phy,
+	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int qca807x_probe(struct phy_device *phydev)
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 8c848c79b1fd..3ddfe7fe781a 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1758,6 +1758,8 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
+int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
+void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
-- 
cgit v1.2.3


From fdd353965b52de7580795824de7010a73ff98f30 Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 4 Apr 2024 11:29:54 +0200
Subject: net: sfp: Add helper to return the SFP bus name

Knowing the bus name is helpful when we want to expose the link topology
to userspace, add a helper to return the SFP bus name.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/sfp-bus.c | 11 +++++++++++
 include/linux/sfp.h       |  6 ++++++
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index e05013aeecc3..413021619afe 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -859,3 +859,14 @@ void sfp_unregister_socket(struct sfp_bus *bus)
 	sfp_bus_put(bus);
 }
 EXPORT_SYMBOL_GPL(sfp_unregister_socket);
+
+const char *sfp_get_name(struct sfp_bus *bus)
+{
+	ASSERT_RTNL();
+
+	if (bus->sfp_dev)
+		return dev_name(bus->sfp_dev);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(sfp_get_name);
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 0573e53b0c11..55c0ab17c9e2 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -570,6 +570,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
+const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -648,6 +649,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
+
+static inline const char *sfp_get_name(struct sfp_bus *bus)
+{
+	return NULL;
+}
 #endif
 
 #endif
-- 
cgit v1.2.3


From 841942bc62122c59f654f799c2d00ce3fda93efa Mon Sep 17 00:00:00 2001
From: Maxime Chevallier <maxime.chevallier@bootlin.com>
Date: Thu, 4 Apr 2024 11:29:55 +0200
Subject: net: ethtool: Allow passing a phy index for some commands

Some netlink commands are target towards ethernet PHYs, to control some
of their features. As there's several such commands, add the ability to
pass a PHY index in the ethnl request, which will populate the generic
ethnl_req_info with the relevant phydev when the command targets a PHY.

Signed-off-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ethtool-netlink.rst |  7 ++++
 include/uapi/linux/ethtool_netlink.h         |  1 +
 net/ethtool/netlink.c                        | 48 ++++++++++++++++++++++++++--
 net/ethtool/netlink.h                        |  5 +++
 4 files changed, 59 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 08d330b0f50f..5dc42f7ce429 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -57,6 +57,7 @@ Structure of this header is
   ``ETHTOOL_A_HEADER_DEV_INDEX``  u32     device ifindex
   ``ETHTOOL_A_HEADER_DEV_NAME``   string  device name
   ``ETHTOOL_A_HEADER_FLAGS``      u32     flags common for all requests
+  ``ETHTOOL_A_HEADER_PHY_INDEX``  u32     phy device index
   ==============================  ======  =============================
 
 ``ETHTOOL_A_HEADER_DEV_INDEX`` and ``ETHTOOL_A_HEADER_DEV_NAME`` identify the
@@ -81,6 +82,12 @@ the behaviour is backward compatible, i.e. requests from old clients not aware
 of the flag should be interpreted the way the client expects. A client must
 not set flags it does not understand.
 
+``ETHTOOL_A_HEADER_PHY_INDEX`` identifies the Ethernet PHY the message relates to.
+As there are numerous commands that are related to PHY configuration, and because
+there may be more than one PHY on the link, the PHY index can be passed in the
+request for the commands that needs it. It is, however, not mandatory, and if it
+is not passed for commands that target a PHY, the net_device.phydev pointer
+is used.
 
 Bit sets
 ========
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index e06a34cded78..23e225f00fb0 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -132,6 +132,7 @@ enum {
 	ETHTOOL_A_HEADER_DEV_INDEX,		/* u32 */
 	ETHTOOL_A_HEADER_DEV_NAME,		/* string */
 	ETHTOOL_A_HEADER_FLAGS,			/* u32 - ETHTOOL_FLAG_* */
+	ETHTOOL_A_HEADER_PHY_INDEX,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_HEADER_CNT,
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index bd04f28d5cf4..563e94e0cbd8 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -4,6 +4,7 @@
 #include <linux/ethtool_netlink.h>
 #include <linux/pm_runtime.h>
 #include "netlink.h"
+#include <linux/phy_link_topology.h>
 
 static struct genl_family ethtool_genl_family;
 
@@ -30,6 +31,24 @@ const struct nla_policy ethnl_header_policy_stats[] = {
 							  ETHTOOL_FLAGS_STATS),
 };
 
+const struct nla_policy ethnl_header_policy_phy[] = {
+	[ETHTOOL_A_HEADER_DEV_INDEX]	= { .type = NLA_U32 },
+	[ETHTOOL_A_HEADER_DEV_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = ALTIFNAMSIZ - 1 },
+	[ETHTOOL_A_HEADER_FLAGS]	= NLA_POLICY_MASK(NLA_U32,
+							  ETHTOOL_FLAGS_BASIC),
+	[ETHTOOL_A_HEADER_PHY_INDEX]		= NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+const struct nla_policy ethnl_header_policy_phy_stats[] = {
+	[ETHTOOL_A_HEADER_DEV_INDEX]	= { .type = NLA_U32 },
+	[ETHTOOL_A_HEADER_DEV_NAME]	= { .type = NLA_NUL_STRING,
+					    .len = ALTIFNAMSIZ - 1 },
+	[ETHTOOL_A_HEADER_FLAGS]	= NLA_POLICY_MASK(NLA_U32,
+							  ETHTOOL_FLAGS_STATS),
+	[ETHTOOL_A_HEADER_PHY_INDEX]		= NLA_POLICY_MIN(NLA_U32, 1),
+};
+
 int ethnl_ops_begin(struct net_device *dev)
 {
 	int ret;
@@ -89,8 +108,9 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 			       const struct nlattr *header, struct net *net,
 			       struct netlink_ext_ack *extack, bool require_dev)
 {
-	struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)];
+	struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)];
 	const struct nlattr *devname_attr;
+	struct phy_device *phydev = NULL;
 	struct net_device *dev = NULL;
 	u32 flags = 0;
 	int ret;
@@ -104,7 +124,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 	/* No validation here, command policy should have a nested policy set
 	 * for the header, therefore validation should have already been done.
 	 */
-	ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy) - 1, header,
+	ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header,
 			       NULL, extack);
 	if (ret < 0)
 		return ret;
@@ -145,6 +165,30 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 		return -EINVAL;
 	}
 
+	if (dev) {
+		if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
+			struct nlattr *phy_id;
+
+			phy_id = tb[ETHTOOL_A_HEADER_PHY_INDEX];
+			phydev = phy_link_topo_get_phy(dev->link_topo,
+						       nla_get_u32(phy_id));
+			if (!phydev) {
+				NL_SET_BAD_ATTR(extack, phy_id);
+				return -ENODEV;
+			}
+		} else {
+			/* If we need a PHY but no phy index is specified, fallback
+			 * to dev->phydev
+			 */
+			phydev = dev->phydev;
+		}
+	} else if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
+		NL_SET_ERR_MSG_ATTR(extack, header,
+				    "can't target a PHY without a netdev");
+		return -EINVAL;
+	}
+
+	req_info->phydev = phydev;
 	req_info->dev = dev;
 	req_info->flags = flags;
 	return 0;
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 9a333a8d04c1..d57a890b5d9e 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -250,6 +250,7 @@ static inline unsigned int ethnl_reply_header_size(void)
  * @dev:   network device the request is for (may be null)
  * @dev_tracker: refcount tracker for @dev reference
  * @flags: request flags common for all request types
+ * @phydev: phy_device connected to @dev this request is for (may be null)
  *
  * This is a common base for request specific structures holding data from
  * parsed userspace request. These always embed struct ethnl_req_info at
@@ -259,6 +260,7 @@ struct ethnl_req_info {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
 	u32			flags;
+	struct phy_device	*phydev;
 };
 
 static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info)
@@ -395,9 +397,12 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
 extern const struct ethnl_request_ops ethnl_mm_request_ops;
+extern const struct ethnl_request_ops ethnl_phy_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1];
+extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1];
 extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
 extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
 extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
-- 
cgit v1.2.3


From 3f3e0dfc83d586fe9204936fccae771754a9dbc2 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 6 Apr 2024 08:48:15 +0200
Subject: ALSA: emux: prune unused parameter from snd_soundfont_load_guspatch()

The `client` parameter was not used, so eliminate it from the call
chain.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Message-ID: <20240406064830.1029573-3-oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/soundfont.h     | 2 +-
 sound/synth/emux/emux_hwdep.c | 3 +--
 sound/synth/emux/emux_oss.c   | 3 +--
 sound/synth/emux/soundfont.c  | 7 +++----
 4 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/sound/soundfont.h b/include/sound/soundfont.h
index e445688a4f4f..98ed98d89d6d 100644
--- a/include/sound/soundfont.h
+++ b/include/sound/soundfont.h
@@ -89,7 +89,7 @@ struct snd_sf_list {
 int snd_soundfont_load(struct snd_sf_list *sflist, const void __user *data,
 		       long count, int client);
 int snd_soundfont_load_guspatch(struct snd_sf_list *sflist, const char __user *data,
-				long count, int client);
+				long count);
 int snd_soundfont_close_check(struct snd_sf_list *sflist, int client);
 
 struct snd_sf_list *snd_sf_new(struct snd_sf_callback *callback,
diff --git a/sound/synth/emux/emux_hwdep.c b/sound/synth/emux/emux_hwdep.c
index 81719bfb8ed7..fd8f978cde1c 100644
--- a/sound/synth/emux/emux_hwdep.c
+++ b/sound/synth/emux/emux_hwdep.c
@@ -27,8 +27,7 @@ snd_emux_hwdep_load_patch(struct snd_emux *emu, void __user *arg)
 
 	if (patch.key == GUS_PATCH)
 		return snd_soundfont_load_guspatch(emu->sflist, arg,
-						   patch.len + sizeof(patch),
-						   TMP_CLIENT_ID);
+						   patch.len + sizeof(patch));
 
 	if (patch.type >= SNDRV_SFNT_LOAD_INFO &&
 	    patch.type <= SNDRV_SFNT_PROBE_DATA) {
diff --git a/sound/synth/emux/emux_oss.c b/sound/synth/emux/emux_oss.c
index d8d32671f703..04df46b269d3 100644
--- a/sound/synth/emux/emux_oss.c
+++ b/sound/synth/emux/emux_oss.c
@@ -205,8 +205,7 @@ snd_emux_load_patch_seq_oss(struct snd_seq_oss_arg *arg, int format,
 		return -ENXIO;
 
 	if (format == GUS_PATCH)
-		rc = snd_soundfont_load_guspatch(emu->sflist, buf, count,
-						 SF_CLIENT_NO(p->chset.port));
+		rc = snd_soundfont_load_guspatch(emu->sflist, buf, count);
 	else if (format == SNDRV_OSS_SOUNDFONT_PATCH) {
 		struct soundfont_patch_info patch;
 		if (count < (int)sizeof(patch))
diff --git a/sound/synth/emux/soundfont.c b/sound/synth/emux/soundfont.c
index 16f00097cb95..e1e47518ac92 100644
--- a/sound/synth/emux/soundfont.c
+++ b/sound/synth/emux/soundfont.c
@@ -941,8 +941,7 @@ int snd_sf_vol_table[128] = {
 
 /* load GUS patch */
 static int
-load_guspatch(struct snd_sf_list *sflist, const char __user *data,
-	      long count, int client)
+load_guspatch(struct snd_sf_list *sflist, const char __user *data, long count)
 {
 	struct patch_info patch;
 	struct snd_soundfont *sf;
@@ -1122,11 +1121,11 @@ load_guspatch(struct snd_sf_list *sflist, const char __user *data,
 /* load GUS patch */
 int
 snd_soundfont_load_guspatch(struct snd_sf_list *sflist, const char __user *data,
-			    long count, int client)
+			    long count)
 {
 	int rc;
 	lock_preset(sflist);
-	rc = load_guspatch(sflist, data, count, client);
+	rc = load_guspatch(sflist, data, count);
 	unlock_preset(sflist);
 	return rc;
 }
-- 
cgit v1.2.3


From 38fc804a776ea66ca8ac8113022e445c587f5e01 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 6 Apr 2024 08:48:22 +0200
Subject: ALSA: emu10k1: fix sample signedness issues in wavetable loader

The hardware supports S16LE and U8 samples, while U16LE and S8 (which
the driver implicitly claims to support) require sign flipping.

Note that this matters only for the GUS patch loader, as the implemented
SoundFont v2.01 spec is limited to S16LE.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Message-ID: <20240406064830.1029573-10-oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h           |  4 ++--
 sound/pci/emu10k1/emu10k1_patch.c | 30 +++++++++++-------------
 sound/pci/emu10k1/memory.c        | 49 ++++++++++++++++++++++++++++++++-------
 3 files changed, 55 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 1af9e6819392..9e3bd4f81460 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1882,8 +1882,8 @@ int snd_emu10k1_alloc_pages_maybe_wider(struct snd_emu10k1 *emu, size_t size,
 					struct snd_dma_buffer *dmab);
 struct snd_util_memblk *snd_emu10k1_synth_alloc(struct snd_emu10k1 *emu, unsigned int size);
 int snd_emu10k1_synth_free(struct snd_emu10k1 *emu, struct snd_util_memblk *blk);
-int snd_emu10k1_synth_bzero(struct snd_emu10k1 *emu, struct snd_util_memblk *blk, int offset, int size);
-int snd_emu10k1_synth_copy_from_user(struct snd_emu10k1 *emu, struct snd_util_memblk *blk, int offset, const char __user *data, int size);
+int snd_emu10k1_synth_memset(struct snd_emu10k1 *emu, struct snd_util_memblk *blk, int offset, int size, u8 value);
+int snd_emu10k1_synth_copy_from_user(struct snd_emu10k1 *emu, struct snd_util_memblk *blk, int offset, const char __user *data, int size, u32 xor);
 int snd_emu10k1_memblk_map(struct snd_emu10k1 *emu, struct snd_emu10k1_memblk *blk);
 
 /* voice allocation */
diff --git a/sound/pci/emu10k1/emu10k1_patch.c b/sound/pci/emu10k1/emu10k1_patch.c
index 55bb60d31fe4..eb3d1ef8a33a 100644
--- a/sound/pci/emu10k1/emu10k1_patch.c
+++ b/sound/pci/emu10k1/emu10k1_patch.c
@@ -26,6 +26,8 @@ snd_emu10k1_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp,
 		       struct snd_util_memhdr *hdr,
 		       const void __user *data, long count)
 {
+	u8 fill;
+	u32 xor;
 	int offset;
 	int truesize, size, blocksize;
 	unsigned int start_addr;
@@ -41,6 +43,14 @@ snd_emu10k1_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp,
 		       sp->v.sample);
 	}
 
+	if (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS) {
+		fill = 0x80;
+		xor = (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_UNSIGNED) ? 0 : 0x80808080;
+	} else {
+		fill = 0;
+		xor = (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_UNSIGNED) ? 0x80008000 : 0;
+	}
+
 	/* compute true data size to be loaded */
 	truesize = sp->v.size + BLANK_HEAD_SIZE;
 	if (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_NO_BLANK)
@@ -65,14 +75,14 @@ snd_emu10k1_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp,
 	size = BLANK_HEAD_SIZE;
 	if (! (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS))
 		size *= 2;
-	snd_emu10k1_synth_bzero(emu, sp->block, offset, size);
+	snd_emu10k1_synth_memset(emu, sp->block, offset, size, fill);
 	offset += size;
 
 	/* copy provided samples */
 	size = sp->v.size;
 	if (! (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS))
 		size *= 2;
-	if (snd_emu10k1_synth_copy_from_user(emu, sp->block, offset, data, size)) {
+	if (snd_emu10k1_synth_copy_from_user(emu, sp->block, offset, data, size, xor)) {
 		snd_emu10k1_synth_free(emu, sp->block);
 		sp->block = NULL;
 		return -EFAULT;
@@ -81,7 +91,7 @@ snd_emu10k1_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp,
 
 	/* clear rest of samples (if any) */
 	if (offset < blocksize)
-		snd_emu10k1_synth_bzero(emu, sp->block, offset, blocksize - offset);
+		snd_emu10k1_synth_memset(emu, sp->block, offset, blocksize - offset, fill);
 
 	if (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_NO_BLANK) {
 		/* if no blank loop is attached in the sample, add it */
@@ -91,20 +101,6 @@ snd_emu10k1_sample_new(struct snd_emux *rec, struct snd_sf_sample *sp,
 		}
 	}
 
-#if 0 /* not supported yet */
-	if (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_UNSIGNED) {
-		/* unsigned -> signed */
-		if (! (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS)) {
-			unsigned short *wblock = (unsigned short*)block;
-			for (i = 0; i < truesize; i++)
-				wblock[i] ^= 0x8000;
-		} else {
-			for (i = 0; i < truesize; i++)
-				block[i] ^= 0x80;
-		}
-	}
-#endif
-
 	/* recalculate offset */
 	start_addr = BLANK_HEAD_SIZE * 2;
 	if (! (sp->v.mode_flags & SNDRV_SFNT_SAMPLE_8BITS))
diff --git a/sound/pci/emu10k1/memory.c b/sound/pci/emu10k1/memory.c
index fc9444404151..d29711777161 100644
--- a/sound/pci/emu10k1/memory.c
+++ b/sound/pci/emu10k1/memory.c
@@ -565,10 +565,10 @@ static inline void *offset_ptr(struct snd_emu10k1 *emu, int page, int offset)
 }
 
 /*
- * bzero(blk + offset, size)
+ * memset(blk + offset, value, size)
  */
-int snd_emu10k1_synth_bzero(struct snd_emu10k1 *emu, struct snd_util_memblk *blk,
-			    int offset, int size)
+int snd_emu10k1_synth_memset(struct snd_emu10k1 *emu, struct snd_util_memblk *blk,
+			     int offset, int size, u8 value)
 {
 	int page, nextofs, end_offset, temp, temp1;
 	void *ptr;
@@ -588,20 +588,47 @@ int snd_emu10k1_synth_bzero(struct snd_emu10k1 *emu, struct snd_util_memblk *blk
 			temp = temp1;
 		ptr = offset_ptr(emu, page + p->first_page, offset);
 		if (ptr)
-			memset(ptr, 0, temp);
+			memset(ptr, value, temp);
 		offset = nextofs;
 		page++;
 	} while (offset < end_offset);
 	return 0;
 }
 
-EXPORT_SYMBOL(snd_emu10k1_synth_bzero);
+EXPORT_SYMBOL(snd_emu10k1_synth_memset);
+
+// Note that the value is assumed to be suitably repetitive.
+static void xor_range(void *ptr, int size, u32 value)
+{
+	if ((long)ptr & 1) {
+		*(u8 *)ptr ^= (u8)value;
+		ptr++;
+		size--;
+	}
+	if (size > 1 && ((long)ptr & 2)) {
+		*(u16 *)ptr ^= (u16)value;
+		ptr += 2;
+		size -= 2;
+	}
+	while (size > 3) {
+		*(u32 *)ptr ^= value;
+		ptr += 4;
+		size -= 4;
+	}
+	if (size > 1) {
+		*(u16 *)ptr ^= (u16)value;
+		ptr += 2;
+		size -= 2;
+	}
+	if (size > 0)
+		*(u8 *)ptr ^= (u8)value;
+}
 
 /*
- * copy_from_user(blk + offset, data, size)
+ * copy_from_user(blk + offset, data, size) ^ xor
  */
 int snd_emu10k1_synth_copy_from_user(struct snd_emu10k1 *emu, struct snd_util_memblk *blk,
-				     int offset, const char __user *data, int size)
+				     int offset, const char __user *data, int size, u32 xor)
 {
 	int page, nextofs, end_offset, temp, temp1;
 	void *ptr;
@@ -620,8 +647,12 @@ int snd_emu10k1_synth_copy_from_user(struct snd_emu10k1 *emu, struct snd_util_me
 		if (temp1 < temp)
 			temp = temp1;
 		ptr = offset_ptr(emu, page + p->first_page, offset);
-		if (ptr && copy_from_user(ptr, data, temp))
-			return -EFAULT;
+		if (ptr) {
+			if (copy_from_user(ptr, data, temp))
+				return -EFAULT;
+			if (xor)
+				xor_range(ptr, temp, xor);
+		}
 		offset = nextofs;
 		data += temp;
 		page++;
-- 
cgit v1.2.3


From 65db949667b0b74f4534e96f762aff0e6687dc51 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sat, 6 Apr 2024 08:48:27 +0200
Subject: ALSA: emu10k1: improve cache behavior documentation

Resulting from more reverse engineering in the course of debugging.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Message-ID: <20240406064830.1029573-15-oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/emu10k1.h | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 9e3bd4f81460..12c7dc760724 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -598,17 +598,25 @@ SUB_REG(PEFE, FILTERAMOUNT,	0x000000ff)	/* Filter envlope amount				*/
 // In stereo mode, the two channels' caches are concatenated into one,
 // and hold the interleaved frames.
 // The cache holds 64 frames, so the upper half is not used in 8-bit mode.
-// All registers mentioned below count in frames.
-// The cache is a ring buffer; CCR_READADDRESS operates modulo 64.
-// The cache is filled from (CCCA_CURRADDR - CCR_CACHEINVALIDSIZE)
-// into (CCR_READADDRESS - CCR_CACHEINVALIDSIZE).
+// All registers mentioned below count in frames. Shortcuts:
+//   CA = CCCA_CURRADDR, CRA = CCR_READADDRESS,
+//   CLA = CCR_CACHELOOPADDRHI:CLP_CACHELOOPADDR,
+//   CIS = CCR_CACHEINVALIDSIZE, LIS = CCR_LOOPINVALSIZE,
+//   CLF = CCR_CACHELOOPFLAG, LF = CCR_LOOPFLAG
+// The cache is a ring buffer; CRA operates modulo 64.
+// The cache is filled from (CA - CIS) into (CRA - CIS).
 // The engine has a fetch threshold of 32 bytes, so it tries to keep
-// CCR_CACHEINVALIDSIZE below 8 (16-bit stereo), 16 (16-bit mono,
-// 8-bit stereo), or 32 (8-bit mono). The actual transfers are pretty
-// unpredictable, especially if several voices are running.
-// Frames are consumed at CCR_READADDRESS, which is incremented afterwards,
-// along with CCCA_CURRADDR and CCR_CACHEINVALIDSIZE. This implies that the
-// actual playback position always lags CCCA_CURRADDR by exactly 64 frames.
+// CIS below 8 (16-bit stereo), 16 (16-bit mono, 8-bit stereo), or
+// 32 (8-bit mono). The actual transfers are pretty unpredictable,
+// especially if several voices are running.
+// Frames are consumed at CRA, which is incremented afterwards,
+// along with CA and CIS. This implies that the actual playback
+// position always lags CA by exactly 64 frames.
+// When CA reaches DSL_LOOPENDADDR, LF is set for one frame's time.
+// LF's rising edge causes the current values of CA and CIS to be
+// copied into CLA and LIS, resp., and CLF to be set.
+// If CLF is set, the first LIS of the CIS frames are instead
+// filled from (CLA - LIS), and CLF is subsequently reset.
 #define CD0			0x20		/* Cache data registers 0 .. 0x1f			*/
 
 #define PTB			0x40		/* Page table base register				*/
-- 
cgit v1.2.3


From e82051193a171f393d2a165a7ce18d8a2e2b4837 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Apr 2024 02:42:36 -0400
Subject: new helper: copy_to_iter_full()

... and convert copy_linear_skb() to using that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/uio.h | 10 ++++++++++
 include/net/udp.h   |  9 +--------
 2 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 00cebe2b70de..7020adedfa08 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -205,6 +205,16 @@ size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
 	return 0;
 }
 
+static __always_inline __must_check
+bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
+{
+	size_t copied = copy_to_iter(addr, bytes, i);
+	if (likely(copied == bytes))
+		return true;
+	iov_iter_revert(i, copied);
+	return false;
+}
+
 static __always_inline __must_check
 bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
 {
diff --git a/include/net/udp.h b/include/net/udp.h
index 488a6d2babcc..c4e05b14b648 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -379,14 +379,7 @@ static inline bool udp_skb_is_linear(struct sk_buff *skb)
 static inline int copy_linear_skb(struct sk_buff *skb, int len, int off,
 				  struct iov_iter *to)
 {
-	int n;
-
-	n = copy_to_iter(skb->data + off, len, to);
-	if (n == len)
-		return 0;
-
-	iov_iter_revert(to, n);
-	return -EFAULT;
+	return copy_to_iter_full(skb->data + off, len, to) ? 0 : -EFAULT;
 }
 
 /*
-- 
cgit v1.2.3


From 210a03c9d51aa0e6e6f06980116e3256da8d4c48 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 28 Mar 2024 13:27:24 +0100
Subject: fs: claw back a few FMODE_* bits

There's a bunch of flags that are purely based on what the file
operations support while also never being conditionally set or unset.
IOW, they're not subject to change for individual files. Imho, such
flags don't need to live in f_mode they might as well live in the fops
structs itself. And the fops struct already has that lonely
mmap_supported_flags member. We might as well turn that into a generic
fop_flags member and move a few flags from FMODE_* space into FOP_*
space. That gets us four FMODE_* bits back and the ability for new
static flags that are about file ops to not have to live in FMODE_*
space but in their own FOP_* space. It's not the most beautiful thing
ever but it gets the job done. Yes, there'll be an additional pointer
chase but hopefully that won't matter for these flags.

I suspect there's a few more we can move into there and that we can also
redirect a bunch of new flag suggestions that follow this pattern into
the fop_flags field instead of f_mode.

Link: https://lore.kernel.org/r/20240328-gewendet-spargel-aa60a030ef74@brauner
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c         |  2 +-
 block/fops.c         |  1 +
 drivers/dax/device.c |  2 +-
 fs/btrfs/file.c      |  4 ++--
 fs/ext4/file.c       |  6 +++---
 fs/f2fs/file.c       |  3 ++-
 fs/read_write.c      |  2 +-
 fs/xfs/xfs_file.c    |  8 +++++---
 include/linux/fs.h   | 22 ++++++++++++----------
 io_uring/io_uring.c  |  2 +-
 io_uring/rw.c        |  9 +++++----
 mm/mmap.c            |  4 +++-
 12 files changed, 37 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 7a5f611c3d2e..1322dfe32c5d 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -904,7 +904,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
 		disk_unblock_events(disk);
 
 	bdev_file->f_flags |= O_LARGEFILE;
-	bdev_file->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
+	bdev_file->f_mode |= FMODE_CAN_ODIRECT;
 	if (bdev_nowait(bdev))
 		bdev_file->f_mode |= FMODE_NOWAIT;
 	bdev_file->f_mapping = bdev->bd_inode->i_mapping;
diff --git a/block/fops.c b/block/fops.c
index 679d9b752fe8..af6c244314af 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -863,6 +863,7 @@ const struct file_operations def_blk_fops = {
 	.splice_read	= filemap_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= blkdev_fallocate,
+	.fop_flags	= FOP_BUFFER_RASYNC,
 };
 
 static __init int blkdev_init(void)
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 93ebedc5ec8c..c24ef4d3cf31 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -377,7 +377,7 @@ static const struct file_operations dax_fops = {
 	.release = dax_release,
 	.get_unmapped_area = dax_get_unmapped_area,
 	.mmap = dax_mmap,
-	.mmap_supported_flags = MAP_SYNC,
+	.fop_flags = FOP_MMAP_SYNC,
 };
 
 static void dev_dax_cdev_del(void *cdev)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f9d76072398d..1640c46f2153 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3719,8 +3719,7 @@ static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 
-	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
-		        FMODE_CAN_ODIRECT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
 
 	ret = fsverity_file_open(inode, filp);
 	if (ret)
@@ -3850,6 +3849,7 @@ const struct file_operations btrfs_file_operations = {
 	.compat_ioctl	= btrfs_compat_ioctl,
 #endif
 	.remap_file_range = btrfs_remap_file_range,
+	.fop_flags	= FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
 };
 
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 54d6ff22585c..28c51b0cc4db 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -885,8 +885,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
 			return ret;
 	}
 
-	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
-			FMODE_DIO_PARALLEL_WRITE;
+	filp->f_mode |= FMODE_NOWAIT;
 	return dquot_file_open(inode, filp);
 }
 
@@ -938,7 +937,6 @@ const struct file_operations ext4_file_operations = {
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
 	.mmap		= ext4_file_mmap,
-	.mmap_supported_flags = MAP_SYNC,
 	.open		= ext4_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
@@ -946,6 +944,8 @@ const struct file_operations ext4_file_operations = {
 	.splice_read	= ext4_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fallocate	= ext4_fallocate,
+	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
+			  FOP_DIO_PARALLEL_WRITE,
 };
 
 const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1761ad125f97..2b65e09822d4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -569,7 +569,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 	if (err)
 		return err;
 
-	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
+	filp->f_mode |= FMODE_NOWAIT;
 	filp->f_mode |= FMODE_CAN_ODIRECT;
 
 	return dquot_file_open(inode, filp);
@@ -5045,4 +5045,5 @@ const struct file_operations f2fs_file_operations = {
 	.splice_read	= f2fs_file_splice_read,
 	.splice_write	= iter_file_splice_write,
 	.fadvise	= f2fs_file_fadvise,
+	.fop_flags	= FOP_BUFFER_RASYNC,
 };
diff --git a/fs/read_write.c b/fs/read_write.c
index d4c036e82b6c..2115d1f40bd5 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1685,7 +1685,7 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
 
 	if ((iocb->ki_flags & IOCB_NOWAIT) &&
 	    !((iocb->ki_flags & IOCB_DIRECT) ||
-	      (file->f_mode & FMODE_BUF_WASYNC)))
+	      (file->f_op->fop_flags & FOP_BUFFER_WASYNC)))
 		return -EINVAL;
 
 	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 632653e00906..147439ad3581 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1230,8 +1230,7 @@ xfs_file_open(
 {
 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
 		return -EIO;
-	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
-			FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT;
+	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
 	return generic_file_open(inode, file);
 }
 
@@ -1490,7 +1489,6 @@ const struct file_operations xfs_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.mmap		= xfs_file_mmap,
-	.mmap_supported_flags = MAP_SYNC,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,
@@ -1498,6 +1496,8 @@ const struct file_operations xfs_file_operations = {
 	.fallocate	= xfs_file_fallocate,
 	.fadvise	= xfs_file_fadvise,
 	.remap_file_range = xfs_file_remap_range,
+	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC |
+			  FOP_DIO_PARALLEL_WRITE,
 };
 
 const struct file_operations xfs_dir_file_operations = {
@@ -1510,4 +1510,6 @@ const struct file_operations xfs_dir_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.fsync		= xfs_dir_fsync,
+	.fop_flags	= FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC |
+			  FOP_DIO_PARALLEL_WRITE,
 };
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 914245085597..53cddf0131ca 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -163,9 +163,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 
 #define	FMODE_NOREUSE		((__force fmode_t)0x800000)
 
-/* File supports non-exclusive O_DIRECT writes from multiple threads */
-#define FMODE_DIO_PARALLEL_WRITE	((__force fmode_t)0x1000000)
-
 /* File is embedded in backing_file object */
 #define FMODE_BACKING		((__force fmode_t)0x2000000)
 
@@ -181,12 +178,6 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
-/* File supports async buffered reads */
-#define FMODE_BUF_RASYNC	((__force fmode_t)0x40000000)
-
-/* File supports async nowait buffered writes */
-#define FMODE_BUF_WASYNC	((__force fmode_t)0x80000000)
-
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
@@ -2001,8 +1992,11 @@ struct iov_iter;
 struct io_uring_cmd;
 struct offset_ctx;
 
+typedef unsigned int __bitwise fop_flags_t;
+
 struct file_operations {
 	struct module *owner;
+	fop_flags_t fop_flags;
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
@@ -2015,7 +2009,6 @@ struct file_operations {
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
-	unsigned long mmap_supported_flags;
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);
@@ -2046,6 +2039,15 @@ struct file_operations {
 				unsigned int poll_flags);
 } __randomize_layout;
 
+/* Supports async buffered reads */
+#define FOP_BUFFER_RASYNC	((__force fop_flags_t)(1 << 0))
+/* Supports async buffered writes */
+#define FOP_BUFFER_WASYNC	((__force fop_flags_t)(1 << 1))
+/* Supports synchronous page faults for mappings */
+#define FOP_MMAP_SYNC		((__force fop_flags_t)(1 << 2))
+/* Supports non-exclusive O_DIRECT writes from multiple threads */
+#define FOP_DIO_PARALLEL_WRITE	((__force fop_flags_t)(1 << 3))
+
 /* Wrap a directory iterator that needs exclusive inode access */
 int wrap_directory_iterator(struct file *, struct dir_context *,
 			    int (*) (struct file *, struct dir_context *));
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 5d4b448fdc50..d73c9ad2d2f8 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -471,7 +471,7 @@ static void io_prep_async_work(struct io_kiocb *req)
 
 		/* don't serialize this request if the fs doesn't need it */
 		if (should_hash && (req->file->f_flags & O_DIRECT) &&
-		    (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE))
+		    (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
 			should_hash = false;
 		if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
 			io_wq_hash_work(&req->work, file_inode(req->file));
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0585ebcc9773..d9dfde1142a1 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -683,7 +683,8 @@ static bool io_rw_should_retry(struct io_kiocb *req)
 	 * just use poll if we can, and don't attempt if the fs doesn't
 	 * support callback based unlocks
 	 */
-	if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+	if (io_file_can_poll(req) ||
+	    !(req->file->f_op->fop_flags & FOP_BUFFER_RASYNC))
 		return false;
 
 	wait->wait.func = io_async_buf_func;
@@ -1022,10 +1023,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		if (unlikely(!io_file_supports_nowait(req)))
 			goto copy_iov;
 
-		/* File path supports NOWAIT for non-direct_IO only for block devices. */
+		/* Check if we can support NOWAIT. */
 		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
-			!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
-			(req->flags & REQ_F_ISREG))
+		    !(req->file->f_op->fop_flags & FOP_BUFFER_WASYNC) &&
+		    (req->flags & REQ_F_ISREG))
 			goto copy_iov;
 
 		kiocb->ki_flags |= IOCB_NOWAIT;
diff --git a/mm/mmap.c b/mm/mmap.c
index 6dbda99a47da..3490af70f259 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1294,7 +1294,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 		if (!file_mmap_ok(file, inode, pgoff, len))
 			return -EOVERFLOW;
 
-		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
+		flags_mask = LEGACY_MAP_MASK;
+		if (file->f_op->fop_flags & FOP_MMAP_SYNC)
+			flags_mask |= MAP_SYNC;
 
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
-- 
cgit v1.2.3


From 0a4f544d83990e4b7326020a802d64956fa366de Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 7 Apr 2024 15:16:55 +0200
Subject: fs: use bit shifts for FMODE_* flags

Make it easier to see what bits are still available.

Link: https://lore.kernel.org/r/20240406061604.GA538574@ZenIV
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 57 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 53cddf0131ca..2ba98930c4c4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -110,21 +110,24 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
  */
 
 /* file is open for reading */
-#define FMODE_READ		((__force fmode_t)0x1)
+#define FMODE_READ		((__force fmode_t)(1 << 0))
 /* file is open for writing */
-#define FMODE_WRITE		((__force fmode_t)0x2)
+#define FMODE_WRITE		((__force fmode_t)(1 << 1))
 /* file is seekable */
-#define FMODE_LSEEK		((__force fmode_t)0x4)
+#define FMODE_LSEEK		((__force fmode_t)(1 << 2))
 /* file can be accessed using pread */
-#define FMODE_PREAD		((__force fmode_t)0x8)
+#define FMODE_PREAD		((__force fmode_t)(1 << 3))
 /* file can be accessed using pwrite */
-#define FMODE_PWRITE		((__force fmode_t)0x10)
+#define FMODE_PWRITE		((__force fmode_t)(1 << 4))
 /* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC		((__force fmode_t)0x20)
+#define FMODE_EXEC		((__force fmode_t)(1 << 5))
+
+/* FMODE_* bits 6 to 8 */
+
 /* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH         ((__force fmode_t)0x200)
+#define FMODE_32BITHASH         ((__force fmode_t)(1 << 9))
 /* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH         ((__force fmode_t)0x400)
+#define FMODE_64BITHASH         ((__force fmode_t)(1 << 10))
 
 /*
  * Don't update ctime and mtime.
@@ -132,51 +135,53 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
  * Currently a special hack for the XFS open_by_handle ioctl, but we'll
  * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
  */
-#define FMODE_NOCMTIME		((__force fmode_t)0x800)
+#define FMODE_NOCMTIME		((__force fmode_t)(1 << 11))
 
 /* Expect random access pattern */
-#define FMODE_RANDOM		((__force fmode_t)0x1000)
+#define FMODE_RANDOM		((__force fmode_t)(1 << 12))
 
 /* File is huge (eg. /dev/mem): treat loff_t as unsigned */
-#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
+#define FMODE_UNSIGNED_OFFSET	((__force fmode_t)(1 << 13))
 
 /* File is opened with O_PATH; almost nothing can be done with it */
-#define FMODE_PATH		((__force fmode_t)0x4000)
+#define FMODE_PATH		((__force fmode_t)(1 << 14))
 
 /* File needs atomic accesses to f_pos */
-#define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
+#define FMODE_ATOMIC_POS	((__force fmode_t)(1 << 15))
 /* Write access to underlying fs */
-#define FMODE_WRITER		((__force fmode_t)0x10000)
+#define FMODE_WRITER		((__force fmode_t)(1 << 16))
 /* Has read method(s) */
-#define FMODE_CAN_READ          ((__force fmode_t)0x20000)
+#define FMODE_CAN_READ          ((__force fmode_t)(1 << 17))
 /* Has write method(s) */
-#define FMODE_CAN_WRITE         ((__force fmode_t)0x40000)
+#define FMODE_CAN_WRITE         ((__force fmode_t)(1 << 18))
 
-#define FMODE_OPENED		((__force fmode_t)0x80000)
-#define FMODE_CREATED		((__force fmode_t)0x100000)
+#define FMODE_OPENED		((__force fmode_t)(1 << 19))
+#define FMODE_CREATED		((__force fmode_t)(1 << 20))
 
 /* File is stream-like */
-#define FMODE_STREAM		((__force fmode_t)0x200000)
+#define FMODE_STREAM		((__force fmode_t)(1 << 21))
 
 /* File supports DIRECT IO */
-#define	FMODE_CAN_ODIRECT	((__force fmode_t)0x400000)
+#define	FMODE_CAN_ODIRECT	((__force fmode_t)(1 << 22))
+
+#define	FMODE_NOREUSE		((__force fmode_t)(1 << 23))
 
-#define	FMODE_NOREUSE		((__force fmode_t)0x800000)
+/* FMODE_* bit 24 */
 
 /* File is embedded in backing_file object */
-#define FMODE_BACKING		((__force fmode_t)0x2000000)
+#define FMODE_BACKING		((__force fmode_t)(1 << 25))
 
 /* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY		((__force fmode_t)0x4000000)
+#define FMODE_NONOTIFY		((__force fmode_t)(1 << 26))
 
 /* File is capable of returning -EAGAIN if I/O will block */
-#define FMODE_NOWAIT		((__force fmode_t)0x8000000)
+#define FMODE_NOWAIT		((__force fmode_t)(1 << 27))
 
 /* File represents mount that needs unmounting */
-#define FMODE_NEED_UNMOUNT	((__force fmode_t)0x10000000)
+#define FMODE_NEED_UNMOUNT	((__force fmode_t)(1 << 28))
 
 /* File does not contribute to nr_files count */
-#define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
+#define FMODE_NOACCOUNT		((__force fmode_t)(1 << 29))
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
-- 
cgit v1.2.3


From 6648e613226e18897231ab5e42ffc29e63fa3365 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 4 Apr 2024 10:10:01 +0800
Subject: bpf, skmsg: Fix NULL pointer dereference in
 sk_psock_skb_ingress_enqueue

Fix NULL pointer data-races in sk_psock_skb_ingress_enqueue() which
syzbot reported [1].

[1]
BUG: KCSAN: data-race in sk_psock_drop / sk_psock_skb_ingress_enqueue

write to 0xffff88814b3278b8 of 8 bytes by task 10724 on cpu 1:
 sk_psock_stop_verdict net/core/skmsg.c:1257 [inline]
 sk_psock_drop+0x13e/0x1f0 net/core/skmsg.c:843
 sk_psock_put include/linux/skmsg.h:459 [inline]
 sock_map_close+0x1a7/0x260 net/core/sock_map.c:1648
 unix_release+0x4b/0x80 net/unix/af_unix.c:1048
 __sock_release net/socket.c:659 [inline]
 sock_close+0x68/0x150 net/socket.c:1421
 __fput+0x2c1/0x660 fs/file_table.c:422
 __fput_sync+0x44/0x60 fs/file_table.c:507
 __do_sys_close fs/open.c:1556 [inline]
 __se_sys_close+0x101/0x1b0 fs/open.c:1541
 __x64_sys_close+0x1f/0x30 fs/open.c:1541
 do_syscall_64+0xd3/0x1d0
 entry_SYSCALL_64_after_hwframe+0x6d/0x75

read to 0xffff88814b3278b8 of 8 bytes by task 10713 on cpu 0:
 sk_psock_data_ready include/linux/skmsg.h:464 [inline]
 sk_psock_skb_ingress_enqueue+0x32d/0x390 net/core/skmsg.c:555
 sk_psock_skb_ingress_self+0x185/0x1e0 net/core/skmsg.c:606
 sk_psock_verdict_apply net/core/skmsg.c:1008 [inline]
 sk_psock_verdict_recv+0x3e4/0x4a0 net/core/skmsg.c:1202
 unix_read_skb net/unix/af_unix.c:2546 [inline]
 unix_stream_read_skb+0x9e/0xf0 net/unix/af_unix.c:2682
 sk_psock_verdict_data_ready+0x77/0x220 net/core/skmsg.c:1223
 unix_stream_sendmsg+0x527/0x860 net/unix/af_unix.c:2339
 sock_sendmsg_nosec net/socket.c:730 [inline]
 __sock_sendmsg+0x140/0x180 net/socket.c:745
 ____sys_sendmsg+0x312/0x410 net/socket.c:2584
 ___sys_sendmsg net/socket.c:2638 [inline]
 __sys_sendmsg+0x1e9/0x280 net/socket.c:2667
 __do_sys_sendmsg net/socket.c:2676 [inline]
 __se_sys_sendmsg net/socket.c:2674 [inline]
 __x64_sys_sendmsg+0x46/0x50 net/socket.c:2674
 do_syscall_64+0xd3/0x1d0
 entry_SYSCALL_64_after_hwframe+0x6d/0x75

value changed: 0xffffffff83d7feb0 -> 0x0000000000000000

Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 10713 Comm: syz-executor.4 Tainted: G        W          6.8.0-syzkaller-08951-gfe46a7dd189e #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/29/2024

Prior to this, commit 4cd12c6065df ("bpf, sockmap: Fix NULL pointer
dereference in sk_psock_verdict_data_ready()") fixed one NULL pointer
similarly due to no protection of saved_data_ready. Here is another
different caller causing the same issue because of the same reason. So
we should protect it with sk_callback_lock read lock because the writer
side in the sk_psock_drop() uses "write_lock_bh(&sk->sk_callback_lock);".

To avoid errors that could happen in future, I move those two pairs of
lock into the sk_psock_data_ready(), which is suggested by John Fastabend.

Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Reported-by: syzbot+aa8c8ec2538929f18f2d@syzkaller.appspotmail.com
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Closes: https://syzkaller.appspot.com/bug?extid=aa8c8ec2538929f18f2d
Link: https://lore.kernel.org/all/20240329134037.92124-1-kerneljasonxing@gmail.com
Link: https://lore.kernel.org/bpf/20240404021001.94815-1-kerneljasonxing@gmail.com
---
 include/linux/skmsg.h | 2 ++
 net/core/skmsg.c      | 5 +----
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e65ec3fd2799..a509caf823d6 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -461,10 +461,12 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
 
 static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
 {
+	read_lock_bh(&sk->sk_callback_lock);
 	if (psock->saved_data_ready)
 		psock->saved_data_ready(sk);
 	else
 		sk->sk_data_ready(sk);
+	read_unlock_bh(&sk->sk_callback_lock);
 }
 
 static inline void psock_set_prog(struct bpf_prog **pprog,
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4d75ef9d24bf..fd20aae30be2 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -1226,11 +1226,8 @@ static void sk_psock_verdict_data_ready(struct sock *sk)
 
 		rcu_read_lock();
 		psock = sk_psock(sk);
-		if (psock) {
-			read_lock_bh(&sk->sk_callback_lock);
+		if (psock)
 			sk_psock_data_ready(sk, psock);
-			read_unlock_bh(&sk->sk_callback_lock);
-		}
 		rcu_read_unlock();
 	}
 }
-- 
cgit v1.2.3


From d2c3a7eb1afada8cfb5fde41489913ea5733a319 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 5 Apr 2024 10:29:26 +0000
Subject: tcp: more struct tcp_sock adjustments

tp->recvmsg_inq is used from tcp recvmsg() thus should
be in tcp_sock_read_rx group.

tp->tcp_clock_cache and tp->tcp_mstamp are written
both in rx and tx paths, thus are better placed
in tcp_sock_write_txrx group.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h | 6 +++---
 net/ipv4/tcp.c      | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 55399ee2a57e..6a5e08b937b3 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -244,6 +244,7 @@ struct tcp_sock {
 	/* OOO segments go in this rbtree. Socket lock must be held. */
 	struct rb_root	out_of_order_queue;
 	u32	snd_ssthresh;	/* Slow start size threshold		*/
+	u8	recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */
 	__cacheline_group_end(tcp_sock_read_rx);
 
 	/* TX read-write hotpath cache lines */
@@ -266,8 +267,6 @@ struct tcp_sock {
 	u32	mdev_us;	/* medium deviation			*/
 	u32	rtt_seq;	/* sequence number to update rttvar	*/
 	u64	tcp_wstamp_ns;	/* departure time for next sent data packet */
-	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
-	u64	tcp_mstamp;	/* most recent packet received/sent */
 	struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */
 	struct sk_buff *highest_sack;   /* skb just after the highest
 					 * skb with SACKed bit set
@@ -284,6 +283,8 @@ struct tcp_sock {
  *	0x5?10 << 16 + snd_wnd in net byte order
  */
 	__be32	pred_flags;
+	u64	tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */
+	u64	tcp_mstamp;	/* most recent packet received/sent */
 	u32	rcv_nxt;	/* What we want to receive next		*/
 	u32	snd_nxt;	/* Next sequence we send		*/
 	u32	snd_una;	/* First byte we want an ack for	*/
@@ -370,7 +371,6 @@ struct tcp_sock {
 		tlp_retrans:1,	/* TLP is a retransmission */
 		unused:5;
 	u8	thin_lto    : 1,/* Use linear timeouts for thin streams */
-		recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */
 		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
 		fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
 		fastopen_client_fail:2, /* reason why fastopen failed */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 92ee60492314..664c8ecb076b 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4648,16 +4648,16 @@ static void __init tcp_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache);
-	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
-	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 105);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
 
 	/* TXRX read-write hotpath cache lines */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
@@ -4670,7 +4670,7 @@ static void __init tcp_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
-	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92);
 
 	/* RX read-write hotpath cache lines */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
-- 
cgit v1.2.3


From ec20b283009346adc66d5a460b1f8fb5adafbcfe Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 5 Apr 2024 22:05:00 +0200
Subject: ipv4: Set scope explicitly in ip_route_output().

Add a "scope" parameter to ip_route_output() so that callers don't have
to override the tos parameter with the RTO_ONLINK flag if they want a
local scope.

This will allow converting flowi4_tos to dscp_t in the future, thus
allowing static analysers to flag invalid interactions between
"tos" (the DSCP bits) and ECN.

Only three users ask for local scope (bonding, arp and atm). The others
continue to use RT_SCOPE_UNIVERSE. While there, add a comment to warn
users about the limitations of ip_route_output().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Acked-by: Leon Romanovsky <leonro@nvidia.com> # infiniband
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/hw/irdma/cm.c        | 3 ++-
 drivers/infiniband/hw/qedr/qedr_iw_cm.c | 3 ++-
 drivers/net/bonding/bond_main.c         | 4 ++--
 drivers/net/ethernet/broadcom/cnic.c    | 3 ++-
 include/net/route.h                     | 9 ++++++++-
 net/atm/clip.c                          | 2 +-
 net/bridge/br_netfilter_hooks.c         | 3 ++-
 net/ipv4/arp.c                          | 9 ++++++---
 net/ipv4/igmp.c                         | 3 ++-
 net/mpls/af_mpls.c                      | 2 +-
 10 files changed, 28 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c
index 1ee7a4e0d8d8..36bb7e5ce638 100644
--- a/drivers/infiniband/hw/irdma/cm.c
+++ b/drivers/infiniband/hw/irdma/cm.c
@@ -1985,7 +1985,8 @@ static int irdma_addr_resolve_neigh(struct irdma_device *iwdev, u32 src_ip,
 	__be32 dst_ipaddr = htonl(dst_ip);
 	__be32 src_ipaddr = htonl(src_ip);
 
-	rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0);
+	rt = ip_route_output(&init_net, dst_ipaddr, src_ipaddr, 0, 0,
+			     RT_SCOPE_UNIVERSE);
 	if (IS_ERR(rt)) {
 		ibdev_dbg(&iwdev->ibdev, "CM: ip_route_output fail\n");
 		return -EINVAL;
diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
index a51fc6854984..259303b9907c 100644
--- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
+++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
@@ -447,7 +447,8 @@ qedr_addr4_resolve(struct qedr_dev *dev,
 	struct rtable *rt = NULL;
 	int rc = 0;
 
-	rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0);
+	rt = ip_route_output(&init_net, dst_ip, src_ip, 0, 0,
+			     RT_SCOPE_UNIVERSE);
 	if (IS_ERR(rt)) {
 		DP_ERR(dev, "ip_route_output returned error\n");
 		return -EINVAL;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2c5ed0a7cb18..c9f0415f780a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3014,8 +3014,8 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
 		tags = NULL;
 
 		/* Find out through which dev should the packet go */
-		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
-				     RTO_ONLINK, 0);
+		rt = ip_route_output(dev_net(bond->dev), targets[i], 0, 0, 0,
+				     RT_SCOPE_LINK);
 		if (IS_ERR(rt)) {
 			/* there's no route to target - try to send arp
 			 * probe to generate any traffic (arp_validate=0)
diff --git a/drivers/net/ethernet/broadcom/cnic.c b/drivers/net/ethernet/broadcom/cnic.c
index 3d63177e7e52..c2b4188a1ef1 100644
--- a/drivers/net/ethernet/broadcom/cnic.c
+++ b/drivers/net/ethernet/broadcom/cnic.c
@@ -3682,7 +3682,8 @@ static int cnic_get_v4_route(struct sockaddr_in *dst_addr,
 #if defined(CONFIG_INET)
 	struct rtable *rt;
 
-	rt = ip_route_output(&init_net, dst_addr->sin_addr.s_addr, 0, 0, 0);
+	rt = ip_route_output(&init_net, dst_addr->sin_addr.s_addr, 0, 0, 0,
+			     RT_SCOPE_UNIVERSE);
 	if (!IS_ERR(rt)) {
 		*dst = &rt->dst;
 		return 0;
diff --git a/include/net/route.h b/include/net/route.h
index d4a0147942f1..315a8acee6c6 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -141,15 +141,22 @@ static inline struct rtable *ip_route_output_key(struct net *net, struct flowi4
 	return ip_route_output_flow(net, flp, NULL);
 }
 
+/* Simplistic IPv4 route lookup function.
+ * This is only suitable for some particular use cases: since the flowi4
+ * structure is only partially set, it may bypass some fib-rules.
+ */
 static inline struct rtable *ip_route_output(struct net *net, __be32 daddr,
-					     __be32 saddr, u8 tos, int oif)
+					     __be32 saddr, u8 tos, int oif,
+					     __u8 scope)
 {
 	struct flowi4 fl4 = {
 		.flowi4_oif = oif,
 		.flowi4_tos = tos,
+		.flowi4_scope = scope,
 		.daddr = daddr,
 		.saddr = saddr,
 	};
+
 	return ip_route_output_key(net, &fl4);
 }
 
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 294cb9efe3d3..362e8d25a79e 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -463,7 +463,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
 		unlink_clip_vcc(clip_vcc);
 		return 0;
 	}
-	rt = ip_route_output(&init_net, ip, 0, 1, 0);
+	rt = ip_route_output(&init_net, ip, 0, 0, 0, RT_SCOPE_LINK);
 	if (IS_ERR(rt))
 		return PTR_ERR(rt);
 	neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 35e10c5a766d..4242447be322 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -399,7 +399,8 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
 				goto free_skb;
 
 			rt = ip_route_output(net, iph->daddr, 0,
-					     RT_TOS(iph->tos), 0);
+					     RT_TOS(iph->tos), 0,
+					     RT_SCOPE_UNIVERSE);
 			if (!IS_ERR(rt)) {
 				/* - Bridged-and-DNAT'ed traffic doesn't
 				 *   require ip_forwarding. */
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 0d0d725b46ad..ab82ca104496 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -456,7 +456,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
 	/*unsigned long now; */
 	struct net *net = dev_net(dev);
 
-	rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev));
+	rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev),
+			     RT_SCOPE_UNIVERSE);
 	if (IS_ERR(rt))
 		return 1;
 	if (rt->dst.dev != dev) {
@@ -1056,7 +1057,8 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (!dev) {
-		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		struct rtable *rt = ip_route_output(net, ip, 0, 0, 0,
+						    RT_SCOPE_LINK);
 
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
@@ -1188,7 +1190,8 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (!dev) {
-		struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
+		struct rtable *rt = ip_route_output(net, ip, 0, 0, 0,
+						    RT_SCOPE_LINK);
 		if (IS_ERR(rt))
 			return PTR_ERR(rt);
 		dev = rt->dst.dev;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 717e97a389a8..9bf09de6a2e7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1842,7 +1842,8 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 	if (!dev) {
 		struct rtable *rt = ip_route_output(net,
 						    imr->imr_multiaddr.s_addr,
-						    0, 0, 0);
+						    0, 0, 0,
+						    RT_SCOPE_UNIVERSE);
 		if (!IS_ERR(rt)) {
 			dev = rt->dst.dev;
 			ip_rt_put(rt);
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 6dab883a08dd..1303acb9cdd2 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -594,7 +594,7 @@ static struct net_device *inet_fib_lookup_dev(struct net *net,
 	struct in_addr daddr;
 
 	memcpy(&daddr, addr, sizeof(struct in_addr));
-	rt = ip_route_output(net, daddr.s_addr, 0, 0, 0);
+	rt = ip_route_output(net, daddr.s_addr, 0, 0, 0, RT_SCOPE_UNIVERSE);
 	if (IS_ERR(rt))
 		return ERR_CAST(rt);
 
-- 
cgit v1.2.3


From 5e5e51422cd189bc1b627f619f0f99324e6e4de9 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 25 Mar 2024 08:40:08 +0200
Subject: math64: Tidy up mul_u64_u32_shr()

Put together declaration and initialization of local variables.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240325064023.2997-5-adrian.hunter@intel.com
---
 include/linux/math64.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/math64.h b/include/linux/math64.h
index bf74478926d4..fd13622b2056 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -179,16 +179,12 @@ static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
 #ifndef mul_u64_u32_shr
 static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
 {
-	u32 ah, al;
+	u32 ah = a >> 32, al = a;
 	u64 ret;
 
-	al = a;
-	ah = a >> 32;
-
 	ret = mul_u32_u32(al, mul) >> shift;
 	if (ah)
 		ret += mul_u32_u32(ah, mul) << (32 - shift);
-
 	return ret;
 }
 #endif /* mul_u64_u32_shr */
-- 
cgit v1.2.3


From 1beb35ec615f676d49d68b6dc23c7418ba8ff145 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 25 Mar 2024 08:40:09 +0200
Subject: vdso, math64: Provide mul_u64_u32_add_u64_shr()

Provide mul_u64_u32_add_u64_shr() which is a calculation that will be used
by timekeeping and VDSO.

Place #include <vdso/math64.h> after #include <asm/div64.h> to allow
architecture-specific overrides, at least for the kernel.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240325064023.2997-6-adrian.hunter@intel.com
---
 include/linux/math64.h |  2 +-
 include/vdso/math64.h  | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/math64.h b/include/linux/math64.h
index fd13622b2056..d34def7f9a8c 100644
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -4,8 +4,8 @@
 
 #include <linux/types.h>
 #include <linux/math.h>
-#include <vdso/math64.h>
 #include <asm/div64.h>
+#include <vdso/math64.h>
 
 #if BITS_PER_LONG == 64
 
diff --git a/include/vdso/math64.h b/include/vdso/math64.h
index 7da703ee5561..22ae212f8b28 100644
--- a/include/vdso/math64.h
+++ b/include/vdso/math64.h
@@ -21,4 +21,42 @@ __iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
 	return ret;
 }
 
+#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
+
+#ifndef mul_u64_u32_add_u64_shr
+static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
+{
+	return (u64)((((unsigned __int128)a * mul) + b) >> shift);
+}
+#endif /* mul_u64_u32_add_u64_shr */
+
+#else
+
+#ifndef mul_u64_u32_add_u64_shr
+#ifndef mul_u32_u32
+static inline u64 mul_u32_u32(u32 a, u32 b)
+{
+	return (u64)a * b;
+}
+#define mul_u32_u32 mul_u32_u32
+#endif
+static __always_inline u64 mul_u64_u32_add_u64_shr(u64 a, u32 mul, u64 b, unsigned int shift)
+{
+	u32 ah = a >> 32, al = a;
+	bool ovf;
+	u64 ret;
+
+	ovf = __builtin_add_overflow(mul_u32_u32(al, mul), b, &ret);
+	ret >>= shift;
+	if (ovf && shift)
+		ret += 1ULL << (64 - shift);
+	if (ah)
+		ret += mul_u32_u32(ah, mul) << (32 - shift);
+
+	return ret;
+}
+#endif /* mul_u64_u32_add_u64_shr */
+
+#endif
+
 #endif /* __VDSO_MATH64_H */
-- 
cgit v1.2.3


From d2e58ab5cda2a225c406ac10d0a8b960bc5a39b6 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Mon, 25 Mar 2024 08:40:10 +0200
Subject: vdso: Add vdso_data:: Max_cycles

Add vdso_data::max_cycles in preparation to use it to detect potential
multiplication overflow.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240325064023.2997-7-adrian.hunter@intel.com
---
 include/vdso/datapage.h | 4 ++++
 kernel/time/vsyscall.c  | 6 ++++++
 2 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index c71ddb6d4691..d04d394db064 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -61,6 +61,7 @@ struct vdso_timestamp {
  * @seq:		timebase sequence counter
  * @clock_mode:		clock mode
  * @cycle_last:		timebase at clocksource init
+ * @max_cycles:		maximum cycles which won't overflow 64bit multiplication
  * @mask:		clocksource mask
  * @mult:		clocksource multiplier
  * @shift:		clocksource shift
@@ -92,6 +93,9 @@ struct vdso_data {
 
 	s32			clock_mode;
 	u64			cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+	u64			max_cycles;
+#endif
 	u64			mask;
 	u32			mult;
 	u32			shift;
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index f0d5062d9cbc..9193d6133e5d 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -22,10 +22,16 @@ static inline void update_vdso_data(struct vdso_data *vdata,
 	u64 nsec, sec;
 
 	vdata[CS_HRES_COARSE].cycle_last	= tk->tkr_mono.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+	vdata[CS_HRES_COARSE].max_cycles	= tk->tkr_mono.clock->max_cycles;
+#endif
 	vdata[CS_HRES_COARSE].mask		= tk->tkr_mono.mask;
 	vdata[CS_HRES_COARSE].mult		= tk->tkr_mono.mult;
 	vdata[CS_HRES_COARSE].shift		= tk->tkr_mono.shift;
 	vdata[CS_RAW].cycle_last		= tk->tkr_raw.cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+	vdata[CS_RAW].max_cycles		= tk->tkr_raw.clock->max_cycles;
+#endif
 	vdata[CS_RAW].mask			= tk->tkr_raw.mask;
 	vdata[CS_RAW].mult			= tk->tkr_raw.mult;
 	vdata[CS_RAW].shift			= tk->tkr_raw.shift;
-- 
cgit v1.2.3


From 70ee853eec5693fefd8348a2b049d9cb83362e58 Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 8 Apr 2024 11:18:00 +0100
Subject: regmap: Add regmap_read_bypassed()

Add a regmap_read_bypassed() to allow reads from the hardware registers
while the regmap is in cache-only mode.

A typical use for this is to keep the cache in cache-only mode until
the hardware has reached a valid state, but one or more status registers
must be polled to determine when this state is reached.

For example, firmware download on the cs35l56 can take several seconds if
there are multiple amps sharing limited bus bandwidth. This is too long
to block in probe() so it is done as a background task. The device must
be soft-reset to reboot the firmware and during this time the registers are
not accessible, so the cache should be in cache-only. But the driver must
poll a register to detect when reboot has completed.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Fixes: 8a731fd37f8b ("ASoC: cs35l56: Move utility functions to shared file")
Link: https://msgid.link/r/20240408101803.43183-2-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h       |  8 ++++++++
 2 files changed, 45 insertions(+)

(limited to 'include')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index 5cb425f6f02d..0a34dd3c4f38 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -2838,6 +2838,43 @@ int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val)
 }
 EXPORT_SYMBOL_GPL(regmap_read);
 
+/**
+ * regmap_read_bypassed() - Read a value from a single register direct
+ *			    from the device, bypassing the cache
+ *
+ * @map: Register map to read from
+ * @reg: Register to be read from
+ * @val: Pointer to store read value
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_read_bypassed(struct regmap *map, unsigned int reg, unsigned int *val)
+{
+	int ret;
+	bool bypass, cache_only;
+
+	if (!IS_ALIGNED(reg, map->reg_stride))
+		return -EINVAL;
+
+	map->lock(map->lock_arg);
+
+	bypass = map->cache_bypass;
+	cache_only = map->cache_only;
+	map->cache_bypass = true;
+	map->cache_only = false;
+
+	ret = _regmap_read(map, reg, val);
+
+	map->cache_bypass = bypass;
+	map->cache_only = cache_only;
+
+	map->unlock(map->lock_arg);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_read_bypassed);
+
 /**
  * regmap_raw_read() - Read raw data from the device
  *
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index b743241cfb7c..d470303b1bbb 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -1230,6 +1230,7 @@ int regmap_multi_reg_write_bypassed(struct regmap *map,
 int regmap_raw_write_async(struct regmap *map, unsigned int reg,
 			   const void *val, size_t val_len);
 int regmap_read(struct regmap *map, unsigned int reg, unsigned int *val);
+int regmap_read_bypassed(struct regmap *map, unsigned int reg, unsigned int *val);
 int regmap_raw_read(struct regmap *map, unsigned int reg,
 		    void *val, size_t val_len);
 int regmap_noinc_read(struct regmap *map, unsigned int reg,
@@ -1739,6 +1740,13 @@ static inline int regmap_read(struct regmap *map, unsigned int reg,
 	return -EINVAL;
 }
 
+static inline int regmap_read_bypassed(struct regmap *map, unsigned int reg,
+				       unsigned int *val)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline int regmap_raw_read(struct regmap *map, unsigned int reg,
 				  void *val, size_t val_len)
 {
-- 
cgit v1.2.3


From dfd2ffb373999630a14d7ff614440f1c2fcc704c Mon Sep 17 00:00:00 2001
From: Richard Fitzgerald <rf@opensource.cirrus.com>
Date: Mon, 8 Apr 2024 11:18:03 +0100
Subject: ASoC: cs35l56: Prevent overwriting firmware ASP config

Only populate the ASP1 config registers in the regmap cache if the
ASP DAI is used. This prevents regcache_sync() from overwriting
these registers with their defaults when the firmware owns
control of these registers.

On a SoundWire system the ASP could be owned by the firmware to
share reference audio with the firmware on other cs35l56. Or it
can be used as a normal codec-codec interface owned by the driver.
The driver must not overwrite the registers if the firmware has
control of them.

The original implementation for this in commit 07f7d6e7a124
("ASoC: cs35l56: Fix for initializing ASP1 mixer registers") was
to still provide defaults for these registers, assuming that if
they were never reconfigured from defaults then regcache_sync()
would not write them out because they are not dirty. Unfortunately
regcache_sync() is not that smart. If the chip has not reset (so
the driver has not called regcache_mark_dirty()) a regcache_sync()
could write out registers that are not dirty.

To avoid accidental overwriting of the ASP registers, they are
removed from the table of defaults and instead are populated with
defaults only if one of the ASP DAI configuration functions is
called. So if the DAI has never been configured, the firmware is
assumed to have ownership of these registers, and the regmap cache
will not contain any entries for them.

Signed-off-by: Richard Fitzgerald <rf@opensource.cirrus.com>
Fixes: 07f7d6e7a124 ("ASoC: cs35l56: Fix for initializing ASP1 mixer registers")
Link: https://msgid.link/r/20240408101803.43183-5-rf@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/cs35l56.h           |  2 ++
 sound/soc/codecs/cs35l56-shared.c | 63 ++++++++++++++++++++++++++-------------
 sound/soc/codecs/cs35l56.c        | 24 ++++++++++++++-
 3 files changed, 67 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h
index e0629699b563..1a3c6f66f620 100644
--- a/include/sound/cs35l56.h
+++ b/include/sound/cs35l56.h
@@ -267,6 +267,7 @@ struct cs35l56_base {
 	bool fw_patched;
 	bool secured;
 	bool can_hibernate;
+	bool fw_owns_asp1;
 	bool cal_data_valid;
 	s8 cal_index;
 	struct cirrus_amp_cal_data cal_data;
@@ -283,6 +284,7 @@ extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC];
 extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC];
 
 int cs35l56_set_patch(struct cs35l56_base *cs35l56_base);
+int cs35l56_init_asp1_regs_for_driver_control(struct cs35l56_base *cs35l56_base);
 int cs35l56_force_sync_asp1_registers_from_cache(struct cs35l56_base *cs35l56_base);
 int cs35l56_mbox_send(struct cs35l56_base *cs35l56_base, unsigned int command);
 int cs35l56_firmware_shutdown(struct cs35l56_base *cs35l56_base);
diff --git a/sound/soc/codecs/cs35l56-shared.c b/sound/soc/codecs/cs35l56-shared.c
index a83317db75ed..ec1d95e57061 100644
--- a/sound/soc/codecs/cs35l56-shared.c
+++ b/sound/soc/codecs/cs35l56-shared.c
@@ -40,16 +40,11 @@ EXPORT_SYMBOL_NS_GPL(cs35l56_set_patch, SND_SOC_CS35L56_SHARED);
 static const struct reg_default cs35l56_reg_defaults[] = {
 	/* no defaults for OTP_MEM - first read populates cache */
 
-	{ CS35L56_ASP1_ENABLES1,		0x00000000 },
-	{ CS35L56_ASP1_CONTROL1,		0x00000028 },
-	{ CS35L56_ASP1_CONTROL2,		0x18180200 },
-	{ CS35L56_ASP1_CONTROL3,		0x00000002 },
-	{ CS35L56_ASP1_FRAME_CONTROL1,		0x03020100 },
-	{ CS35L56_ASP1_FRAME_CONTROL5,		0x00020100 },
-	{ CS35L56_ASP1_DATA_CONTROL1,		0x00000018 },
-	{ CS35L56_ASP1_DATA_CONTROL5,		0x00000018 },
-
-	/* no defaults for ASP1TX mixer */
+	/*
+	 * No defaults for ASP1 control or ASP1TX mixer. See
+	 * cs35l56_populate_asp1_register_defaults() and
+	 * cs35l56_sync_asp1_mixer_widgets_with_firmware().
+	 */
 
 	{ CS35L56_SWIRE_DP3_CH1_INPUT,		0x00000018 },
 	{ CS35L56_SWIRE_DP3_CH2_INPUT,		0x00000019 },
@@ -210,6 +205,36 @@ static bool cs35l56_volatile_reg(struct device *dev, unsigned int reg)
 	}
 }
 
+static const struct reg_sequence cs35l56_asp1_defaults[] = {
+	REG_SEQ0(CS35L56_ASP1_ENABLES1,		0x00000000),
+	REG_SEQ0(CS35L56_ASP1_CONTROL1,		0x00000028),
+	REG_SEQ0(CS35L56_ASP1_CONTROL2,		0x18180200),
+	REG_SEQ0(CS35L56_ASP1_CONTROL3,		0x00000002),
+	REG_SEQ0(CS35L56_ASP1_FRAME_CONTROL1,	0x03020100),
+	REG_SEQ0(CS35L56_ASP1_FRAME_CONTROL5,	0x00020100),
+	REG_SEQ0(CS35L56_ASP1_DATA_CONTROL1,	0x00000018),
+	REG_SEQ0(CS35L56_ASP1_DATA_CONTROL5,	0x00000018),
+};
+
+/*
+ * The firmware can have control of the ASP so we don't provide regmap
+ * with defaults for these registers, to prevent a regcache_sync() from
+ * overwriting the firmware settings. But if the machine driver hooks up
+ * the ASP it means the driver is taking control of the ASP, so then the
+ * registers are populated with the defaults.
+ */
+int cs35l56_init_asp1_regs_for_driver_control(struct cs35l56_base *cs35l56_base)
+{
+	if (!cs35l56_base->fw_owns_asp1)
+		return 0;
+
+	cs35l56_base->fw_owns_asp1 = false;
+
+	return regmap_multi_reg_write(cs35l56_base->regmap, cs35l56_asp1_defaults,
+				      ARRAY_SIZE(cs35l56_asp1_defaults));
+}
+EXPORT_SYMBOL_NS_GPL(cs35l56_init_asp1_regs_for_driver_control, SND_SOC_CS35L56_SHARED);
+
 /*
  * The firmware boot sequence can overwrite the ASP1 config registers so that
  * they don't match regmap's view of their values. Rewrite the values from the
@@ -217,19 +242,15 @@ static bool cs35l56_volatile_reg(struct device *dev, unsigned int reg)
  */
 int cs35l56_force_sync_asp1_registers_from_cache(struct cs35l56_base *cs35l56_base)
 {
-	struct reg_sequence asp1_regs[] = {
-		{ .reg = CS35L56_ASP1_ENABLES1 },
-		{ .reg = CS35L56_ASP1_CONTROL1 },
-		{ .reg = CS35L56_ASP1_CONTROL2 },
-		{ .reg = CS35L56_ASP1_CONTROL3 },
-		{ .reg = CS35L56_ASP1_FRAME_CONTROL1 },
-		{ .reg = CS35L56_ASP1_FRAME_CONTROL5 },
-		{ .reg = CS35L56_ASP1_DATA_CONTROL1 },
-		{ .reg = CS35L56_ASP1_DATA_CONTROL5 },
-	};
+	struct reg_sequence asp1_regs[ARRAY_SIZE(cs35l56_asp1_defaults)];
 	int i, ret;
 
-	/* Read values from regmap cache into a write sequence */
+	if (cs35l56_base->fw_owns_asp1)
+		return 0;
+
+	memcpy(asp1_regs, cs35l56_asp1_defaults, sizeof(asp1_regs));
+
+	/* Read current values from regmap cache into the write sequence */
 	for (i = 0; i < ARRAY_SIZE(asp1_regs); ++i) {
 		ret = regmap_read(cs35l56_base->regmap, asp1_regs[i].reg, &asp1_regs[i].def);
 		if (ret)
diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c
index 5a4e0e479414..6331b8c6136e 100644
--- a/sound/soc/codecs/cs35l56.c
+++ b/sound/soc/codecs/cs35l56.c
@@ -454,9 +454,14 @@ static int cs35l56_asp_dai_set_fmt(struct snd_soc_dai *codec_dai, unsigned int f
 {
 	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(codec_dai->component);
 	unsigned int val;
+	int ret;
 
 	dev_dbg(cs35l56->base.dev, "%s: %#x\n", __func__, fmt);
 
+	ret = cs35l56_init_asp1_regs_for_driver_control(&cs35l56->base);
+	if (ret)
+		return ret;
+
 	switch (fmt & SND_SOC_DAIFMT_CLOCK_PROVIDER_MASK) {
 	case SND_SOC_DAIFMT_CBC_CFC:
 		break;
@@ -530,6 +535,11 @@ static int cs35l56_asp_dai_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx
 					unsigned int rx_mask, int slots, int slot_width)
 {
 	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
+	int ret;
+
+	ret = cs35l56_init_asp1_regs_for_driver_control(&cs35l56->base);
+	if (ret)
+		return ret;
 
 	if ((slots == 0) || (slot_width == 0)) {
 		dev_dbg(cs35l56->base.dev, "tdm config cleared\n");
@@ -578,6 +588,11 @@ static int cs35l56_asp_dai_hw_params(struct snd_pcm_substream *substream,
 	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
 	unsigned int rate = params_rate(params);
 	u8 asp_width, asp_wl;
+	int ret;
+
+	ret = cs35l56_init_asp1_regs_for_driver_control(&cs35l56->base);
+	if (ret)
+		return ret;
 
 	asp_wl = params_width(params);
 	if (cs35l56->asp_slot_width)
@@ -634,7 +649,11 @@ static int cs35l56_asp_dai_set_sysclk(struct snd_soc_dai *dai,
 				      int clk_id, unsigned int freq, int dir)
 {
 	struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(dai->component);
-	int freq_id;
+	int freq_id, ret;
+
+	ret = cs35l56_init_asp1_regs_for_driver_control(&cs35l56->base);
+	if (ret)
+		return ret;
 
 	if (freq == 0) {
 		cs35l56->sysclk_set = false;
@@ -1403,6 +1422,9 @@ int cs35l56_common_probe(struct cs35l56_private *cs35l56)
 	cs35l56->base.cal_index = -1;
 	cs35l56->speaker_id = -ENOENT;
 
+	/* Assume that the firmware owns ASP1 until we know different */
+	cs35l56->base.fw_owns_asp1 = true;
+
 	dev_set_drvdata(cs35l56->base.dev, cs35l56);
 
 	cs35l56_fill_supply_names(cs35l56->supplies);
-- 
cgit v1.2.3


From 5af3e3876d567fb79a355bec1cb48e432d69b4fb Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Sat, 6 Apr 2024 04:05:37 +0300
Subject: devlink: Support setting max_io_eqs

Many devices send event notifications for the IO queues,
such as tx and rx queues, through event queues.

Enable a privileged owner, such as a hypervisor PF, to set the number
of IO event queues for the VF and SF during the provisioning stage.

example:
Get maximum IO event queues of the VF device::

  $ devlink port show pci/0000:06:00.0/2
  pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
      function:
          hw_addr 00:00:00:00:00:00 ipsec_packet disabled max_io_eqs 10

Set maximum IO event queues of the VF device::

  $ devlink port function set pci/0000:06:00.0/2 max_io_eqs 32

  $ devlink port show pci/0000:06:00.0/2
  pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
      function:
          hw_addr 00:00:00:00:00:00 ipsec_packet disabled max_io_eqs 32

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Parav Pandit <parav@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink/devlink-port.rst | 33 ++++++++++++++
 include/net/devlink.h                             | 14 ++++++
 include/uapi/linux/devlink.h                      |  1 +
 net/devlink/port.c                                | 53 +++++++++++++++++++++++
 4 files changed, 101 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst
index 562f46b41274..9d22d41a7cd1 100644
--- a/Documentation/networking/devlink/devlink-port.rst
+++ b/Documentation/networking/devlink/devlink-port.rst
@@ -134,6 +134,9 @@ Users may also set the IPsec crypto capability of the function using
 Users may also set the IPsec packet capability of the function using
 `devlink port function set ipsec_packet` command.
 
+Users may also set the maximum IO event queues of the function
+using `devlink port function set max_io_eqs` command.
+
 Function attributes
 ===================
 
@@ -295,6 +298,36 @@ policy is processed in software by the kernel.
         function:
             hw_addr 00:00:00:00:00:00 ipsec_packet enabled
 
+Maximum IO events queues setup
+------------------------------
+When user sets maximum number of IO event queues for a SF or
+a VF, such function driver is limited to consume only enforced
+number of IO event queues.
+
+IO event queues deliver events related to IO queues, including network
+device transmit and receive queues (txq and rxq) and RDMA Queue Pairs (QPs).
+For example, the number of netdevice channels and RDMA device completion
+vectors are derived from the function's IO event queues. Usually, the number
+of interrupt vectors consumed by the driver is limited by the number of IO
+event queues per device, as each of the IO event queues is connected to an
+interrupt vector.
+
+- Get maximum IO event queues of the VF device::
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 ipsec_packet disabled max_io_eqs 10
+
+- Set maximum IO event queues of the VF device::
+
+    $ devlink port function set pci/0000:06:00.0/2 max_io_eqs 32
+
+    $ devlink port show pci/0000:06:00.0/2
+    pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1
+        function:
+            hw_addr 00:00:00:00:00:00 ipsec_packet disabled max_io_eqs 32
+
 Subfunction
 ============
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index 9ac394bdfbe4..bb1af599d101 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1602,6 +1602,14 @@ void devlink_free(struct devlink *devlink);
  *			      capability. Should be used by device drivers to
  *			      enable/disable ipsec_packet capability of a
  *			      function managed by the devlink port.
+ * @port_fn_max_io_eqs_get: Callback used to get port function's maximum number
+ *			    of event queues. Should be used by device drivers to
+ *			    report the maximum event queues of a function
+ *			    managed by the devlink port.
+ * @port_fn_max_io_eqs_set: Callback used to set port function's maximum number
+ *			    of event queues. Should be used by device drivers to
+ *			    configure maximum number of event queues
+ *			    of a function managed by the devlink port.
  *
  * Note: Driver should return -EOPNOTSUPP if it doesn't support
  * port function (@port_fn_*) handling for a particular port.
@@ -1651,6 +1659,12 @@ struct devlink_port_ops {
 	int (*port_fn_ipsec_packet_set)(struct devlink_port *devlink_port,
 					bool enable,
 					struct netlink_ext_ack *extack);
+	int (*port_fn_max_io_eqs_get)(struct devlink_port *devlink_port,
+				      u32 *max_eqs,
+				      struct netlink_ext_ack *extack);
+	int (*port_fn_max_io_eqs_set)(struct devlink_port *devlink_port,
+				      u32 max_eqs,
+				      struct netlink_ext_ack *extack);
 };
 
 void devlink_port_init(struct devlink *devlink,
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index 2da0c7eb6710..9401aa343673 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -686,6 +686,7 @@ enum devlink_port_function_attr {
 	DEVLINK_PORT_FN_ATTR_OPSTATE,	/* u8 */
 	DEVLINK_PORT_FN_ATTR_CAPS,	/* bitfield32 */
 	DEVLINK_PORT_FN_ATTR_DEVLINK,	/* nested */
+	DEVLINK_PORT_FN_ATTR_MAX_IO_EQS,	/* u32 */
 
 	__DEVLINK_PORT_FUNCTION_ATTR_MAX,
 	DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1
diff --git a/net/devlink/port.c b/net/devlink/port.c
index 118d130d2afd..be9158b4453c 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -16,6 +16,7 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_
 				 DEVLINK_PORT_FN_STATE_ACTIVE),
 	[DEVLINK_PORT_FN_ATTR_CAPS] =
 		NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
+	[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 },
 };
 
 #define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port)				\
@@ -182,6 +183,30 @@ static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port,
 	return 0;
 }
 
+static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port,
+					   struct sk_buff *msg,
+					   struct netlink_ext_ack *extack,
+					   bool *msg_updated)
+{
+	u32 max_io_eqs;
+	int err;
+
+	if (!port->ops->port_fn_max_io_eqs_get)
+		return 0;
+
+	err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+	err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs);
+	if (err)
+		return err;
+	*msg_updated = true;
+	return 0;
+}
+
 int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
 {
 	if (devlink_nl_put_handle(msg, devlink_port->devlink))
@@ -409,6 +434,18 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
 	return 0;
 }
 
+static int
+devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port,
+			       const struct nlattr *attr,
+			       struct netlink_ext_ack *extack)
+{
+	u32 max_io_eqs;
+
+	max_io_eqs = nla_get_u32(attr);
+	return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port,
+							 max_io_eqs, extack);
+}
+
 static int
 devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
 				   struct netlink_ext_ack *extack)
@@ -428,6 +465,9 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por
 	if (err)
 		goto out;
 	err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated);
+	if (err)
+		goto out;
+	err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated);
 	if (err)
 		goto out;
 	err = devlink_rel_devlink_handle_put(msg, port->devlink,
@@ -726,6 +766,12 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port,
 			}
 		}
 	}
+	if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] &&
+	    !ops->port_fn_max_io_eqs_set) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS],
+				    "Function does not support max_io_eqs setting");
+		return -EOPNOTSUPP;
+	}
 	return 0;
 }
 
@@ -761,6 +807,13 @@ static int devlink_port_function_set(struct devlink_port *port,
 			return err;
 	}
 
+	attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS];
+	if (attr) {
+		err = devlink_port_fn_max_io_eqs_set(port, attr, extack);
+		if (err)
+			return err;
+	}
+
 	/* Keep this as the last function attribute set, so that when
 	 * multiple port function attributes are set along with state,
 	 * Those can be applied first before activating the state.
-- 
cgit v1.2.3


From daeeb032f42d066a49e07b7f6effc9f51b7a5479 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 2 Apr 2024 20:56:43 +0200
Subject: thermal: core: Move threshold out of struct thermal_trip

The threshold field in struct thermal_trip is only used internally by
the thermal core and it is better to prevent drivers from misusing it.
It also takes some space unnecessarily in the trip tables passed by
drivers to the core during thermal zone registration.

For this reason, introduce struct thermal_trip_desc as a wrapper around
struct thermal_trip, move the threshold field directly into it and make
the thermal core store struct thermal_trip_desc objects in the internal
thermal zone trip tables.  Adjust all of the code using trip tables in
the thermal core accordingly.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
---
 drivers/thermal/gov_fair_share.c      |  7 ++++--
 drivers/thermal/gov_power_allocator.c |  6 +++--
 drivers/thermal/thermal_core.c        | 46 ++++++++++++++++++++---------------
 drivers/thermal/thermal_core.h        |  7 ++++--
 drivers/thermal/thermal_debugfs.c     |  6 +++--
 drivers/thermal/thermal_helpers.c     |  8 +++---
 drivers/thermal/thermal_netlink.c     |  6 +++--
 drivers/thermal/thermal_sysfs.c       | 20 +++++++--------
 drivers/thermal/thermal_trip.c        | 15 ++++++------
 include/linux/thermal.h               |  9 ++++---
 10 files changed, 78 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
index 4da25a0009d7..6ef8cfde7749 100644
--- a/drivers/thermal/gov_fair_share.c
+++ b/drivers/thermal/gov_fair_share.c
@@ -17,10 +17,13 @@
 
 static int get_trip_level(struct thermal_zone_device *tz)
 {
-	const struct thermal_trip *trip, *level_trip = NULL;
+	const struct thermal_trip *level_trip = NULL;
+	const struct thermal_trip_desc *td;
 	int trip_level = -1;
 
-	for_each_trip(tz, trip) {
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
+
 		if (trip->temperature >= tz->temperature)
 			continue;
 
diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
index e25e48d76aa7..ac1d02193a1b 100644
--- a/drivers/thermal/gov_power_allocator.c
+++ b/drivers/thermal/gov_power_allocator.c
@@ -496,9 +496,11 @@ static void get_governor_trips(struct thermal_zone_device *tz,
 	const struct thermal_trip *first_passive = NULL;
 	const struct thermal_trip *last_passive = NULL;
 	const struct thermal_trip *last_active = NULL;
-	const struct thermal_trip *trip;
+	const struct thermal_trip_desc *td;
+
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
 
-	for_each_trip(tz, trip) {
 		switch (trip->type) {
 		case THERMAL_TRIP_PASSIVE:
 			if (!first_passive) {
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 34a31bc72023..fc6ff0a4faa1 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -361,17 +361,19 @@ static void handle_critical_trips(struct thermal_zone_device *tz,
 }
 
 static void handle_thermal_trip(struct thermal_zone_device *tz,
-				struct thermal_trip *trip)
+				struct thermal_trip_desc *td)
 {
+	const struct thermal_trip *trip = &td->trip;
+
 	if (trip->temperature == THERMAL_TEMP_INVALID)
 		return;
 
 	if (tz->last_temperature == THERMAL_TEMP_INVALID) {
 		/* Initialization. */
-		trip->threshold = trip->temperature;
-		if (tz->temperature >= trip->threshold)
-			trip->threshold -= trip->hysteresis;
-	} else if (tz->last_temperature < trip->threshold) {
+		td->threshold = trip->temperature;
+		if (tz->temperature >= td->threshold)
+			td->threshold -= trip->hysteresis;
+	} else if (tz->last_temperature < td->threshold) {
 		/*
 		 * The trip threshold is equal to the trip temperature, unless
 		 * the latter has changed in the meantime.  In either case,
@@ -382,9 +384,9 @@ static void handle_thermal_trip(struct thermal_zone_device *tz,
 		if (tz->temperature >= trip->temperature) {
 			thermal_notify_tz_trip_up(tz, trip);
 			thermal_debug_tz_trip_up(tz, trip);
-			trip->threshold = trip->temperature - trip->hysteresis;
+			td->threshold = trip->temperature - trip->hysteresis;
 		} else {
-			trip->threshold = trip->temperature;
+			td->threshold = trip->temperature;
 		}
 	} else {
 		/*
@@ -400,9 +402,9 @@ static void handle_thermal_trip(struct thermal_zone_device *tz,
 		if (tz->temperature < trip->temperature - trip->hysteresis) {
 			thermal_notify_tz_trip_down(tz, trip);
 			thermal_debug_tz_trip_down(tz, trip);
-			trip->threshold = trip->temperature;
+			td->threshold = trip->temperature;
 		} else {
-			trip->threshold = trip->temperature - trip->hysteresis;
+			td->threshold = trip->temperature - trip->hysteresis;
 		}
 	}
 
@@ -458,7 +460,7 @@ static void thermal_zone_device_init(struct thermal_zone_device *tz)
 void __thermal_zone_device_update(struct thermal_zone_device *tz,
 				  enum thermal_notify_event event)
 {
-	struct thermal_trip *trip;
+	struct thermal_trip_desc *td;
 
 	if (tz->suspended)
 		return;
@@ -472,8 +474,8 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
 
 	tz->notify_event = event;
 
-	for_each_trip(tz, trip)
-		handle_thermal_trip(tz, trip);
+	for_each_trip_desc(tz, td)
+		handle_thermal_trip(tz, td);
 
 	monitor_thermal_zone(tz);
 }
@@ -766,7 +768,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
 	if (trip_index < 0 || trip_index >= tz->num_trips)
 		return -EINVAL;
 
-	return thermal_bind_cdev_to_trip(tz, &tz->trips[trip_index], cdev,
+	return thermal_bind_cdev_to_trip(tz, &tz->trips[trip_index].trip, cdev,
 					 upper, lower, weight);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device);
@@ -825,7 +827,7 @@ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
 	if (trip_index < 0 || trip_index >= tz->num_trips)
 		return -EINVAL;
 
-	return thermal_unbind_cdev_from_trip(tz, &tz->trips[trip_index], cdev);
+	return thermal_unbind_cdev_from_trip(tz, &tz->trips[trip_index].trip, cdev);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_unbind_cooling_device);
 
@@ -1221,16 +1223,19 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms
 
 int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp)
 {
-	int i, ret = -EINVAL;
+	const struct thermal_trip_desc *td;
+	int ret = -EINVAL;
 
 	if (tz->ops.get_crit_temp)
 		return tz->ops.get_crit_temp(tz, temp);
 
 	mutex_lock(&tz->lock);
 
-	for (i = 0; i < tz->num_trips; i++) {
-		if (tz->trips[i].type == THERMAL_TRIP_CRITICAL) {
-			*temp = tz->trips[i].temperature;
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
+
+		if (trip->type == THERMAL_TRIP_CRITICAL) {
+			*temp = trip->temperature;
 			ret = 0;
 			break;
 		}
@@ -1274,7 +1279,9 @@ thermal_zone_device_register_with_trips(const char *type,
 					const struct thermal_zone_params *tzp,
 					int passive_delay, int polling_delay)
 {
+	const struct thermal_trip *trip = trips;
 	struct thermal_zone_device *tz;
+	struct thermal_trip_desc *td;
 	int id;
 	int result;
 	struct thermal_governor *governor;
@@ -1339,7 +1346,8 @@ thermal_zone_device_register_with_trips(const char *type,
 	tz->device.class = thermal_class;
 	tz->devdata = devdata;
 	tz->num_trips = num_trips;
-	memcpy(tz->trips, trips, num_trips * sizeof(*trips));
+	for_each_trip_desc(tz, td)
+		td->trip = *trip++;
 
 	thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay);
 	thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay);
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 0d8a42bb7ce8..4ad9c671bc13 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -120,8 +120,11 @@ void thermal_governor_update_tz(struct thermal_zone_device *tz,
 				enum thermal_notify_event reason);
 
 /* Helpers */
-#define for_each_trip(__tz, __trip)	\
-	for (__trip = __tz->trips; __trip - __tz->trips < __tz->num_trips; __trip++)
+#define for_each_trip_desc(__tz, __td)	\
+	for (__td = __tz->trips; __td - __tz->trips < __tz->num_trips; __td++)
+
+#define trip_to_trip_desc(__trip)	\
+	container_of(__trip, struct thermal_trip_desc, trip)
 
 void __thermal_zone_set_trips(struct thermal_zone_device *tz);
 int thermal_zone_trip_id(const struct thermal_zone_device *tz,
diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c
index c617e8b9f0dd..78c4cb30e91d 100644
--- a/drivers/thermal/thermal_debugfs.c
+++ b/drivers/thermal/thermal_debugfs.c
@@ -744,7 +744,7 @@ static void tze_seq_stop(struct seq_file *s, void *v)
 static int tze_seq_show(struct seq_file *s, void *v)
 {
 	struct thermal_zone_device *tz = s->private;
-	struct thermal_trip *trip;
+	struct thermal_trip_desc *td;
 	struct tz_episode *tze;
 	const char *type;
 	int trip_id;
@@ -757,7 +757,9 @@ static int tze_seq_show(struct seq_file *s, void *v)
 
 	seq_printf(s, "| trip |     type | temp(°mC) | hyst(°mC) |  duration  |  avg(°mC) |  min(°mC) |  max(°mC) |\n");
 
-	for_each_trip(tz, trip) {
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
+
 		/*
 		 * There is no possible mitigation happening at the
 		 * critical trip point, so the stats will be always
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index c5a057b59c42..d9f4e26ec125 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -50,7 +50,7 @@ get_thermal_instance(struct thermal_zone_device *tz,
 	mutex_lock(&tz->lock);
 	mutex_lock(&cdev->lock);
 
-	trip = &tz->trips[trip_index];
+	trip = &tz->trips[trip_index].trip;
 
 	list_for_each_entry(pos, &tz->thermal_instances, tz_node) {
 		if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
@@ -82,7 +82,7 @@ EXPORT_SYMBOL(get_thermal_instance);
  */
 int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 {
-	const struct thermal_trip *trip;
+	const struct thermal_trip_desc *td;
 	int crit_temp = INT_MAX;
 	int ret = -EINVAL;
 
@@ -91,7 +91,9 @@ int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 	ret = tz->ops.get_temp(tz, temp);
 
 	if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && tz->emul_temperature) {
-		for_each_trip(tz, trip) {
+		for_each_trip_desc(tz, td) {
+			const struct thermal_trip *trip = &td->trip;
+
 			if (trip->type == THERMAL_TRIP_CRITICAL) {
 				crit_temp = trip->temperature;
 				break;
diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c
index 76a231a29654..0ef6368da766 100644
--- a/drivers/thermal/thermal_netlink.c
+++ b/drivers/thermal/thermal_netlink.c
@@ -445,7 +445,7 @@ out_cancel_nest:
 static int thermal_genl_cmd_tz_get_trip(struct param *p)
 {
 	struct sk_buff *msg = p->msg;
-	const struct thermal_trip *trip;
+	const struct thermal_trip_desc *td;
 	struct thermal_zone_device *tz;
 	struct nlattr *start_trip;
 	int id;
@@ -465,7 +465,9 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p)
 
 	mutex_lock(&tz->lock);
 
-	for_each_trip(tz, trip) {
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
+
 		if (nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_ID,
 				thermal_zone_trip_id(tz, trip)) ||
 		    nla_put_u32(msg, THERMAL_GENL_ATTR_TZ_TRIP_TYPE, trip->type) ||
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index 5b533fa40437..88211ccdfbd6 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -88,7 +88,7 @@ trip_point_type_show(struct device *dev, struct device_attribute *attr,
 	if (sscanf(attr->attr.name, "trip_point_%d_type", &trip_id) != 1)
 		return -EINVAL;
 
-	switch (tz->trips[trip_id].type) {
+	switch (tz->trips[trip_id].trip.type) {
 	case THERMAL_TRIP_CRITICAL:
 		return sprintf(buf, "critical\n");
 	case THERMAL_TRIP_HOT:
@@ -120,7 +120,7 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 
 	mutex_lock(&tz->lock);
 
-	trip = &tz->trips[trip_id];
+	trip = &tz->trips[trip_id].trip;
 
 	if (temp != trip->temperature) {
 		if (tz->ops.set_trip_temp) {
@@ -150,7 +150,7 @@ trip_point_temp_show(struct device *dev, struct device_attribute *attr,
 	if (sscanf(attr->attr.name, "trip_point_%d_temp", &trip_id) != 1)
 		return -EINVAL;
 
-	return sprintf(buf, "%d\n", tz->trips[trip_id].temperature);
+	return sprintf(buf, "%d\n", tz->trips[trip_id].trip.temperature);
 }
 
 static ssize_t
@@ -171,7 +171,7 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 
 	mutex_lock(&tz->lock);
 
-	trip = &tz->trips[trip_id];
+	trip = &tz->trips[trip_id].trip;
 
 	if (hyst != trip->hysteresis) {
 		trip->hysteresis = hyst;
@@ -194,7 +194,7 @@ trip_point_hyst_show(struct device *dev, struct device_attribute *attr,
 	if (sscanf(attr->attr.name, "trip_point_%d_hyst", &trip_id) != 1)
 		return -EINVAL;
 
-	return sprintf(buf, "%d\n", tz->trips[trip_id].hysteresis);
+	return sprintf(buf, "%d\n", tz->trips[trip_id].trip.hysteresis);
 }
 
 static ssize_t
@@ -393,7 +393,7 @@ static const struct attribute_group *thermal_zone_attribute_groups[] = {
  */
 static int create_trip_attrs(struct thermal_zone_device *tz)
 {
-	const struct thermal_trip *trip;
+	const struct thermal_trip_desc *td;
 	struct attribute **attrs;
 
 	/* This function works only for zones with at least one trip */
@@ -429,8 +429,8 @@ static int create_trip_attrs(struct thermal_zone_device *tz)
 		return -ENOMEM;
 	}
 
-	for_each_trip(tz, trip) {
-		int indx = thermal_zone_trip_id(tz, trip);
+	for_each_trip_desc(tz, td) {
+		int indx = thermal_zone_trip_id(tz, &td->trip);
 
 		/* create trip type attribute */
 		snprintf(tz->trip_type_attrs[indx].name, THERMAL_NAME_LENGTH,
@@ -452,7 +452,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz)
 						tz->trip_temp_attrs[indx].name;
 		tz->trip_temp_attrs[indx].attr.attr.mode = S_IRUGO;
 		tz->trip_temp_attrs[indx].attr.show = trip_point_temp_show;
-		if (trip->flags & THERMAL_TRIP_FLAG_RW_TEMP) {
+		if (td->trip.flags & THERMAL_TRIP_FLAG_RW_TEMP) {
 			tz->trip_temp_attrs[indx].attr.attr.mode |= S_IWUSR;
 			tz->trip_temp_attrs[indx].attr.store =
 							trip_point_temp_store;
@@ -467,7 +467,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz)
 					tz->trip_hyst_attrs[indx].name;
 		tz->trip_hyst_attrs[indx].attr.attr.mode = S_IRUGO;
 		tz->trip_hyst_attrs[indx].attr.show = trip_point_hyst_show;
-		if (trip->flags & THERMAL_TRIP_FLAG_RW_HYST) {
+		if (td->trip.flags & THERMAL_TRIP_FLAG_RW_HYST) {
 			tz->trip_hyst_attrs[indx].attr.attr.mode |= S_IWUSR;
 			tz->trip_hyst_attrs[indx].attr.store =
 					trip_point_hyst_store;
diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
index 497abf0d47ca..7cf43b697272 100644
--- a/drivers/thermal/thermal_trip.c
+++ b/drivers/thermal/thermal_trip.c
@@ -13,11 +13,11 @@ int for_each_thermal_trip(struct thermal_zone_device *tz,
 			  int (*cb)(struct thermal_trip *, void *),
 			  void *data)
 {
-	struct thermal_trip *trip;
+	struct thermal_trip_desc *td;
 	int ret;
 
-	for_each_trip(tz, trip) {
-		ret = cb(trip, data);
+	for_each_trip_desc(tz, td) {
+		ret = cb(&td->trip, data);
 		if (ret)
 			return ret;
 	}
@@ -63,7 +63,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_get_num_trips);
  */
 void __thermal_zone_set_trips(struct thermal_zone_device *tz)
 {
-	const struct thermal_trip *trip;
+	const struct thermal_trip_desc *td;
 	int low = -INT_MAX, high = INT_MAX;
 	int ret;
 
@@ -72,7 +72,8 @@ void __thermal_zone_set_trips(struct thermal_zone_device *tz)
 	if (!tz->ops.set_trips)
 		return;
 
-	for_each_trip(tz, trip) {
+	for_each_trip_desc(tz, td) {
+		const struct thermal_trip *trip = &td->trip;
 		int trip_low;
 
 		trip_low = trip->temperature - trip->hysteresis;
@@ -110,7 +111,7 @@ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
 	if (!tz || trip_id < 0 || trip_id >= tz->num_trips || !trip)
 		return -EINVAL;
 
-	*trip = tz->trips[trip_id];
+	*trip = tz->trips[trip_id].trip;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(__thermal_zone_get_trip);
@@ -135,7 +136,7 @@ int thermal_zone_trip_id(const struct thermal_zone_device *tz,
 	 * Assume the trip to be located within the bounds of the thermal
 	 * zone's trips[] table.
 	 */
-	return trip - tz->trips;
+	return trip_to_trip_desc(trip) - tz->trips;
 }
 void thermal_zone_trip_updated(struct thermal_zone_device *tz,
 			       const struct thermal_trip *trip)
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index c33f50177f51..67bd13303349 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -61,7 +61,6 @@ enum thermal_notify_event {
  * struct thermal_trip - representation of a point in temperature domain
  * @temperature: temperature value in miliCelsius
  * @hysteresis: relative hysteresis in miliCelsius
- * @threshold: trip crossing notification threshold miliCelsius
  * @type: trip point type
  * @priv: pointer to driver data associated with this trip
  * @flags: flags representing binary properties of the trip
@@ -69,12 +68,16 @@ enum thermal_notify_event {
 struct thermal_trip {
 	int temperature;
 	int hysteresis;
-	int threshold;
 	enum thermal_trip_type type;
 	u8 flags;
 	void *priv;
 };
 
+struct thermal_trip_desc {
+	struct thermal_trip trip;
+	int threshold;
+};
+
 #define THERMAL_TRIP_FLAG_RW_TEMP	BIT(0)
 #define THERMAL_TRIP_FLAG_RW_HYST	BIT(1)
 
@@ -203,7 +206,7 @@ struct thermal_zone_device {
 #ifdef CONFIG_THERMAL_DEBUGFS
 	struct thermal_debugfs *debugfs;
 #endif
-	struct thermal_trip trips[] __counted_by(num_trips);
+	struct thermal_trip_desc trips[] __counted_by(num_trips);
 };
 
 /**
-- 
cgit v1.2.3


From b1ae92dcfa8ed7d5044c525c3a868fcb4d0f9c0e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Tue, 2 Apr 2024 20:57:57 +0200
Subject: thermal: core: Make struct thermal_zone_device definition internal

Move the definitions of struct thermal_trip_desc and struct
thermal_zone_device to an internal header file in the thermal core,
as they don't need to be accessible to any code other than the thermal
core and so they don't need to be present in a global header.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 drivers/thermal/thermal_core.h      | 85 ++++++++++++++++++++++++++++++++++++
 drivers/thermal/thermal_trace.h     |  2 +
 drivers/thermal/thermal_trace_ipa.h |  2 +
 include/linux/thermal.h             | 87 +------------------------------------
 4 files changed, 91 insertions(+), 85 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 4ad9c671bc13..5dd3be585e9e 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -15,6 +15,91 @@
 #include "thermal_netlink.h"
 #include "thermal_debugfs.h"
 
+struct thermal_trip_desc {
+	struct thermal_trip trip;
+	int threshold;
+};
+
+/**
+ * struct thermal_zone_device - structure for a thermal zone
+ * @id:		unique id number for each thermal zone
+ * @type:	the thermal zone device type
+ * @device:	&struct device for this thermal zone
+ * @removal:	removal completion
+ * @trip_temp_attrs:	attributes for trip points for sysfs: trip temperature
+ * @trip_type_attrs:	attributes for trip points for sysfs: trip type
+ * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
+ * @mode:		current mode of this thermal zone
+ * @devdata:	private pointer for device private data
+ * @num_trips:	number of trip points the thermal zone supports
+ * @passive_delay_jiffies: number of jiffies to wait between polls when
+ *			performing passive cooling.
+ * @polling_delay_jiffies: number of jiffies to wait between polls when
+ *			checking whether trip points have been crossed (0 for
+ *			interrupt driven systems)
+ * @temperature:	current temperature.  This is only for core code,
+ *			drivers should use thermal_zone_get_temp() to get the
+ *			current temperature
+ * @last_temperature:	previous temperature read
+ * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
+ * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
+ * @prev_low_trip:	the low current temperature if you've crossed a passive
+			trip point.
+ * @prev_high_trip:	the above current temperature if you've crossed a
+			passive trip point.
+ * @need_update:	if equals 1, thermal_zone_device_update needs to be invoked.
+ * @ops:	operations this &thermal_zone_device supports
+ * @tzp:	thermal zone parameters
+ * @governor:	pointer to the governor for this thermal zone
+ * @governor_data:	private pointer for governor data
+ * @thermal_instances:	list of &struct thermal_instance of this thermal zone
+ * @ida:	&struct ida to generate unique id for this zone's cooling
+ *		devices
+ * @lock:	lock to protect thermal_instances list
+ * @node:	node in thermal_tz_list (in thermal_core.c)
+ * @poll_queue:	delayed work for polling
+ * @notify_event: Last notification event
+ * @suspended: thermal zone suspend indicator
+ * @trips:	array of struct thermal_trip objects
+ */
+struct thermal_zone_device {
+	int id;
+	char type[THERMAL_NAME_LENGTH];
+	struct device device;
+	struct completion removal;
+	struct attribute_group trips_attribute_group;
+	struct thermal_attr *trip_temp_attrs;
+	struct thermal_attr *trip_type_attrs;
+	struct thermal_attr *trip_hyst_attrs;
+	enum thermal_device_mode mode;
+	void *devdata;
+	int num_trips;
+	unsigned long passive_delay_jiffies;
+	unsigned long polling_delay_jiffies;
+	int temperature;
+	int last_temperature;
+	int emul_temperature;
+	int passive;
+	int prev_low_trip;
+	int prev_high_trip;
+	atomic_t need_update;
+	struct thermal_zone_device_ops ops;
+	struct thermal_zone_params *tzp;
+	struct thermal_governor *governor;
+	void *governor_data;
+	struct list_head thermal_instances;
+	struct ida ida;
+	struct mutex lock;
+	struct list_head node;
+	struct delayed_work poll_queue;
+	enum thermal_notify_event notify_event;
+	bool suspended;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
+	struct thermal_trip_desc trips[] __counted_by(num_trips);
+};
+
 /* Default Thermal Governor */
 #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)
 #define DEFAULT_THERMAL_GOVERNOR       "step_wise"
diff --git a/drivers/thermal/thermal_trace.h b/drivers/thermal/thermal_trace.h
index 459c8ce6cf3b..88a962f560f2 100644
--- a/drivers/thermal/thermal_trace.h
+++ b/drivers/thermal/thermal_trace.h
@@ -9,6 +9,8 @@
 #include <linux/thermal.h>
 #include <linux/tracepoint.h>
 
+#include "thermal_core.h"
+
 TRACE_DEFINE_ENUM(THERMAL_TRIP_CRITICAL);
 TRACE_DEFINE_ENUM(THERMAL_TRIP_HOT);
 TRACE_DEFINE_ENUM(THERMAL_TRIP_PASSIVE);
diff --git a/drivers/thermal/thermal_trace_ipa.h b/drivers/thermal/thermal_trace_ipa.h
index b16b5dd863d9..a82821eebc88 100644
--- a/drivers/thermal/thermal_trace_ipa.h
+++ b/drivers/thermal/thermal_trace_ipa.h
@@ -7,6 +7,8 @@
 
 #include <linux/tracepoint.h>
 
+#include "thermal_core.h"
+
 TRACE_EVENT(thermal_power_allocator,
 	TP_PROTO(struct thermal_zone_device *tz, u32 total_req_power,
 		 u32 total_granted_power, int num_actors, u32 power_range,
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 67bd13303349..62b174528cfe 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -73,17 +73,14 @@ struct thermal_trip {
 	void *priv;
 };
 
-struct thermal_trip_desc {
-	struct thermal_trip trip;
-	int threshold;
-};
-
 #define THERMAL_TRIP_FLAG_RW_TEMP	BIT(0)
 #define THERMAL_TRIP_FLAG_RW_HYST	BIT(1)
 
 #define THERMAL_TRIP_FLAG_RW	(THERMAL_TRIP_FLAG_RW_TEMP | \
 				 THERMAL_TRIP_FLAG_RW_HYST)
 
+struct thermal_zone_device;
+
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
@@ -129,86 +126,6 @@ struct thermal_cooling_device {
 #endif
 };
 
-/**
- * struct thermal_zone_device - structure for a thermal zone
- * @id:		unique id number for each thermal zone
- * @type:	the thermal zone device type
- * @device:	&struct device for this thermal zone
- * @removal:	removal completion
- * @trip_temp_attrs:	attributes for trip points for sysfs: trip temperature
- * @trip_type_attrs:	attributes for trip points for sysfs: trip type
- * @trip_hyst_attrs:	attributes for trip points for sysfs: trip hysteresis
- * @mode:		current mode of this thermal zone
- * @devdata:	private pointer for device private data
- * @num_trips:	number of trip points the thermal zone supports
- * @passive_delay_jiffies: number of jiffies to wait between polls when
- *			performing passive cooling.
- * @polling_delay_jiffies: number of jiffies to wait between polls when
- *			checking whether trip points have been crossed (0 for
- *			interrupt driven systems)
- * @temperature:	current temperature.  This is only for core code,
- *			drivers should use thermal_zone_get_temp() to get the
- *			current temperature
- * @last_temperature:	previous temperature read
- * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
- * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
- * @prev_low_trip:	the low current temperature if you've crossed a passive
-			trip point.
- * @prev_high_trip:	the above current temperature if you've crossed a
-			passive trip point.
- * @need_update:	if equals 1, thermal_zone_device_update needs to be invoked.
- * @ops:	operations this &thermal_zone_device supports
- * @tzp:	thermal zone parameters
- * @governor:	pointer to the governor for this thermal zone
- * @governor_data:	private pointer for governor data
- * @thermal_instances:	list of &struct thermal_instance of this thermal zone
- * @ida:	&struct ida to generate unique id for this zone's cooling
- *		devices
- * @lock:	lock to protect thermal_instances list
- * @node:	node in thermal_tz_list (in thermal_core.c)
- * @poll_queue:	delayed work for polling
- * @notify_event: Last notification event
- * @suspended: thermal zone suspend indicator
- * @trips:	array of struct thermal_trip objects
- */
-struct thermal_zone_device {
-	int id;
-	char type[THERMAL_NAME_LENGTH];
-	struct device device;
-	struct completion removal;
-	struct attribute_group trips_attribute_group;
-	struct thermal_attr *trip_temp_attrs;
-	struct thermal_attr *trip_type_attrs;
-	struct thermal_attr *trip_hyst_attrs;
-	enum thermal_device_mode mode;
-	void *devdata;
-	int num_trips;
-	unsigned long passive_delay_jiffies;
-	unsigned long polling_delay_jiffies;
-	int temperature;
-	int last_temperature;
-	int emul_temperature;
-	int passive;
-	int prev_low_trip;
-	int prev_high_trip;
-	atomic_t need_update;
-	struct thermal_zone_device_ops ops;
-	struct thermal_zone_params *tzp;
-	struct thermal_governor *governor;
-	void *governor_data;
-	struct list_head thermal_instances;
-	struct ida ida;
-	struct mutex lock;
-	struct list_head node;
-	struct delayed_work poll_queue;
-	enum thermal_notify_event notify_event;
-	bool suspended;
-#ifdef CONFIG_THERMAL_DEBUGFS
-	struct thermal_debugfs *debugfs;
-#endif
-	struct thermal_trip_desc trips[] __counted_by(num_trips);
-};
-
 /**
  * struct thermal_governor - structure that holds thermal governor information
  * @name:	name of the governor
-- 
cgit v1.2.3


From 0444d574fbc3d3af0e426f7c2b72f5830269f096 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 4 Apr 2024 21:27:07 +0200
Subject: thermal: core: Relocate the struct thermal_governor definition

Notice that struct thermal_governor is only used by the thermal core
and so move its definition to thermal_core.h.

No functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
---
 drivers/thermal/thermal_core.h | 25 +++++++++++++++++++++++++
 include/linux/thermal.h        | 25 -------------------------
 2 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 9d3ef1e0645f..b461d9583834 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -22,6 +22,31 @@ struct thermal_trip_desc {
 	int threshold;
 };
 
+/**
+ * struct thermal_governor - structure that holds thermal governor information
+ * @name:	name of the governor
+ * @bind_to_tz: callback called when binding to a thermal zone.  If it
+ *		returns 0, the governor is bound to the thermal zone,
+ *		otherwise it fails.
+ * @unbind_from_tz:	callback called when a governor is unbound from a
+ *			thermal zone.
+ * @throttle:	callback called for every trip point even if temperature is
+ *		below the trip point temperature
+ * @update_tz:	callback called when thermal zone internals have changed, e.g.
+ *		thermal cooling instance was added/removed
+ * @governor_list:	node in thermal_governor_list (in thermal_core.c)
+ */
+struct thermal_governor {
+	const char *name;
+	int (*bind_to_tz)(struct thermal_zone_device *tz);
+	void (*unbind_from_tz)(struct thermal_zone_device *tz);
+	int (*throttle)(struct thermal_zone_device *tz,
+			const struct thermal_trip *trip);
+	void (*update_tz)(struct thermal_zone_device *tz,
+			  enum thermal_notify_event reason);
+	struct list_head	governor_list;
+};
+
 /**
  * struct thermal_zone_device - structure for a thermal zone
  * @id:		unique id number for each thermal zone
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 62b174528cfe..f1155c0439c4 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -126,31 +126,6 @@ struct thermal_cooling_device {
 #endif
 };
 
-/**
- * struct thermal_governor - structure that holds thermal governor information
- * @name:	name of the governor
- * @bind_to_tz: callback called when binding to a thermal zone.  If it
- *		returns 0, the governor is bound to the thermal zone,
- *		otherwise it fails.
- * @unbind_from_tz:	callback called when a governor is unbound from a
- *			thermal zone.
- * @throttle:	callback called for every trip point even if temperature is
- *		below the trip point temperature
- * @update_tz:	callback called when thermal zone internals have changed, e.g.
- *		thermal cooling instance was added/removed
- * @governor_list:	node in thermal_governor_list (in thermal_core.c)
- */
-struct thermal_governor {
-	const char *name;
-	int (*bind_to_tz)(struct thermal_zone_device *tz);
-	void (*unbind_from_tz)(struct thermal_zone_device *tz);
-	int (*throttle)(struct thermal_zone_device *tz,
-			const struct thermal_trip *trip);
-	void (*update_tz)(struct thermal_zone_device *tz,
-			  enum thermal_notify_event reason);
-	struct list_head	governor_list;
-};
-
 /* Structure to define Thermal Zone parameters */
 struct thermal_zone_params {
 	const char *governor_name;
-- 
cgit v1.2.3


From e3ac0f367d5806af09d2070bb7951af2f59d1f52 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 3 Apr 2024 16:49:04 +0100
Subject: OPP: OF: Export dev_opp_pm_calc_power() for usage from EM

There are device drivers which can modify voltage values for OPPs. It
could be due to the chip binning and those drivers have specific chip
knowledge about it. This adjustment can happen after Energy Model is
registered, thus EM can have stale data about power.

Export dev_opp_pm_calc_power() which can be used by Energy Model to
calculate new power with the new voltage for OPPs.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/opp/of.c       | 17 ++++++++++++-----
 include/linux/pm_opp.h |  8 ++++++++
 2 files changed, 20 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index f9f0b22bccbb..282eb5966fd0 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -1494,20 +1494,26 @@ _get_dt_power(struct device *dev, unsigned long *uW, unsigned long *kHz)
 	return 0;
 }
 
-/*
- * Callback function provided to the Energy Model framework upon registration.
+/**
+ * dev_pm_opp_calc_power() - Calculate power value for device with EM
+ * @dev		: Device for which an Energy Model has to be registered
+ * @uW		: New power value that is calculated
+ * @kHz		: Frequency for which the new power is calculated
+ *
  * This computes the power estimated by @dev at @kHz if it is the frequency
  * of an existing OPP, or at the frequency of the first OPP above @kHz otherwise
  * (see dev_pm_opp_find_freq_ceil()). This function updates @kHz to the ceiled
  * frequency and @uW to the associated power. The power is estimated as
  * P = C * V^2 * f with C being the device's capacitance and V and f
  * respectively the voltage and frequency of the OPP.
+ * It is also used as a callback function provided to the Energy Model
+ * framework upon registration.
  *
  * Returns -EINVAL if the power calculation failed because of missing
  * parameters, 0 otherwise.
  */
-static int __maybe_unused _get_power(struct device *dev, unsigned long *uW,
-				     unsigned long *kHz)
+int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW,
+			  unsigned long *kHz)
 {
 	struct dev_pm_opp *opp;
 	struct device_node *np;
@@ -1544,6 +1550,7 @@ static int __maybe_unused _get_power(struct device *dev, unsigned long *uW,
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(dev_pm_opp_calc_power);
 
 static bool _of_has_opp_microwatt_property(struct device *dev)
 {
@@ -1619,7 +1626,7 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus)
 		goto failed;
 	}
 
-	EM_SET_ACTIVE_POWER_CB(em_cb, _get_power);
+	EM_SET_ACTIVE_POWER_CB(em_cb, dev_pm_opp_calc_power);
 
 register_em:
 	ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus, true);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 065a47382302..dd7c8441af42 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -476,6 +476,8 @@ struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp);
 int of_get_required_opp_performance_state(struct device_node *np, int index);
 int dev_pm_opp_of_find_icc_paths(struct device *dev, struct opp_table *opp_table);
 int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus);
+int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW,
+			  unsigned long *kHz);
 static inline void dev_pm_opp_of_unregister_em(struct device *dev)
 {
 	em_dev_unregister_perf_domain(dev);
@@ -539,6 +541,12 @@ static inline void dev_pm_opp_of_unregister_em(struct device *dev)
 {
 }
 
+static inline int dev_pm_opp_calc_power(struct device *dev, unsigned long *uW,
+					unsigned long *kHz)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int of_get_required_opp_performance_state(struct device_node *np, int index)
 {
 	return -EOPNOTSUPP;
-- 
cgit v1.2.3


From cf61d53b026805e8222ca28ac2795611eb7fa547 Mon Sep 17 00:00:00 2001
From: Lukasz Luba <lukasz.luba@arm.com>
Date: Wed, 3 Apr 2024 16:49:06 +0100
Subject: PM: EM: Add em_dev_update_chip_binning()

Add a function which allows to modify easily the EM after the new voltage
information is available. The device drivers for the chip can adjust
the voltage values after setup. The voltage for the same frequency in OPP
can be different due to chip binning. The voltage impacts the power usage
and the EM power values can be updated to reflect that.

Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/energy_model.h |  5 +++++
 kernel/power/energy_model.c  | 48 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

(limited to 'include')

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 70cd7258cd29..1ff52020cf75 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -172,6 +172,7 @@ struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd);
 void em_table_free(struct em_perf_table __rcu *table);
 int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
 			 int nr_states);
+int em_dev_update_chip_binning(struct device *dev);
 
 /**
  * em_pd_get_efficient_state() - Get an efficient performance state from the EM
@@ -386,6 +387,10 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
 {
 	return -EINVAL;
 }
+static inline int em_dev_update_chip_binning(struct device *dev)
+{
+	return -EINVAL;
+}
 #endif
 
 #endif
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 6960dd7393b2..927cc55ba0b3 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -808,3 +808,51 @@ static void em_update_workfn(struct work_struct *work)
 {
 	em_check_capacity_update();
 }
+
+/**
+ * em_dev_update_chip_binning() - Update Energy Model after the new voltage
+ *				information is present in the OPPs.
+ * @dev		: Device for which the Energy Model has to be updated.
+ *
+ * This function allows to update easily the EM with new values available in
+ * the OPP framework and DT. It can be used after the chip has been properly
+ * verified by device drivers and the voltages adjusted for the 'chip binning'.
+ */
+int em_dev_update_chip_binning(struct device *dev)
+{
+	struct em_perf_table __rcu *em_table;
+	struct em_perf_domain *pd;
+	int i, ret;
+
+	if (IS_ERR_OR_NULL(dev))
+		return -EINVAL;
+
+	pd = em_pd_get(dev);
+	if (!pd) {
+		dev_warn(dev, "Couldn't find Energy Model\n");
+		return -EINVAL;
+	}
+
+	em_table = em_table_dup(pd);
+	if (!em_table) {
+		dev_warn(dev, "EM: allocation failed\n");
+		return -ENOMEM;
+	}
+
+	/* Update power values which might change due to new voltage in OPPs */
+	for (i = 0; i < pd->nr_perf_states; i++) {
+		unsigned long freq = em_table->state[i].frequency;
+		unsigned long power;
+
+		ret = dev_pm_opp_calc_power(dev, &power, &freq);
+		if (ret) {
+			em_table_free(em_table);
+			return ret;
+		}
+
+		em_table->state[i].power = power;
+	}
+
+	return em_recalc_and_update(dev, pd, em_table);
+}
+EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
-- 
cgit v1.2.3


From 48b9c4862bd303628b2e772591f2ef958e02a316 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Thu, 28 Mar 2024 20:49:11 +0100
Subject: ACPI: store owner from modules with acpi_bus_register_driver()

Modules registering driver with acpi_bus_register_driver() often forget to
set .owner field.  The field is used by some of other kernel parts for
reference counting (try_module_get()), so it is expected that drivers
will set it.

Solve the problem by moving this task away from the drivers to the core
ACPI bus code, just like we did for platform_driver in
commit 9447057eaff8 ("platform_device: use a macro instead of
platform_driver_register").

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/bus.c      | 9 +++++----
 include/acpi/acpi_bus.h | 7 ++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index a87b10eef77d..844c46447914 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -995,25 +995,26 @@ EXPORT_SYMBOL_GPL(acpi_driver_match_device);
    -------------------------------------------------------------------------- */
 
 /**
- * acpi_bus_register_driver - register a driver with the ACPI bus
+ * __acpi_bus_register_driver - register a driver with the ACPI bus
  * @driver: driver being registered
+ * @owner: owning module/driver
  *
  * Registers a driver with the ACPI bus.  Searches the namespace for all
  * devices that match the driver's criteria and binds.  Returns zero for
  * success or a negative error status for failure.
  */
-int acpi_bus_register_driver(struct acpi_driver *driver)
+int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner)
 {
 	if (acpi_disabled)
 		return -ENODEV;
 	driver->drv.name = driver->name;
 	driver->drv.bus = &acpi_bus_type;
-	driver->drv.owner = driver->owner;
+	driver->drv.owner = owner;
 
 	return driver_register(&driver->drv);
 }
 
-EXPORT_SYMBOL(acpi_bus_register_driver);
+EXPORT_SYMBOL(__acpi_bus_register_driver);
 
 /**
  * acpi_bus_unregister_driver - unregisters a driver with the ACPI bus
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 5de954e2b18a..7453be56f855 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -656,7 +656,12 @@ void acpi_scan_lock_release(void);
 void acpi_lock_hp_context(void);
 void acpi_unlock_hp_context(void);
 int acpi_scan_add_handler(struct acpi_scan_handler *handler);
-int acpi_bus_register_driver(struct acpi_driver *driver);
+/*
+ * use a macro to avoid include chaining to get THIS_MODULE
+ */
+#define acpi_bus_register_driver(drv) \
+	__acpi_bus_register_driver(drv, THIS_MODULE)
+int __acpi_bus_register_driver(struct acpi_driver *driver, struct module *owner);
 void acpi_bus_unregister_driver(struct acpi_driver *driver);
 int acpi_bus_scan(acpi_handle handle);
 void acpi_bus_trim(struct acpi_device *start);
-- 
cgit v1.2.3


From cc85f9c05bba23eb525497b42ac5b74801ccbd87 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Thu, 28 Mar 2024 20:49:29 +0100
Subject: ACPI: drop redundant owner from acpi_driver

Once all .owner is removed from all acpi_driver instances, drop it from
the structure.

Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpi_bus.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 7453be56f855..32aae3ee99ac 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -170,7 +170,6 @@ struct acpi_driver {
 	unsigned int flags;
 	struct acpi_device_ops ops;
 	struct device_driver drv;
-	struct module *owner;
 };
 
 /*
-- 
cgit v1.2.3


From f81d13df1aa8b02fa8c6ac4b396dcda92fdfdac0 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Thu, 4 Apr 2024 13:16:44 +1300
Subject: platform/x86: asus-wmi: add support for 2024 ROG Mini-LED
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Support the 2024 mini-led backlight and adjust the related functions
to select the relevant dev-id. Also add `available_mini_led_mode` to the
platform sysfs since the available mini-led levels can be different.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20240404001652.86207-2-luke@ljones.dev
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 Documentation/ABI/testing/sysfs-platform-asus-wmi |  8 ++
 drivers/platform/x86/asus-wmi.c                   | 96 ++++++++++++++++++++---
 include/linux/platform_data/x86/asus-wmi.h        |  1 +
 3 files changed, 95 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index 8a7e25bde085..ef1ac1a20a71 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -126,6 +126,14 @@ Description:
 		Change the mini-LED mode:
 			* 0 - Single-zone,
 			* 1 - Multi-zone
+			* 2 - Multi-zone strong (available on newer generation mini-led)
+
+What:		/sys/devices/platform/<platform>/available_mini_led_mode
+Date:		Apr 2024
+KernelVersion:	6.10
+Contact:	"Luke Jones" <luke@ljones.dev>
+Description:
+		List the available mini-led modes.
 
 What:		/sys/devices/platform/<platform>/ppt_pl1_spl
 Date:		Jun 2023
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index df4c103459da..e8fbd7f9d343 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -126,6 +126,17 @@ module_param(fnlock_default, bool, 0444);
 #define ASUS_SCREENPAD_BRIGHT_MAX 255
 #define ASUS_SCREENPAD_BRIGHT_DEFAULT 60
 
+#define ASUS_MINI_LED_MODE_MASK		0x03
+/* Standard modes for devices with only on/off */
+#define ASUS_MINI_LED_OFF		0x00
+#define ASUS_MINI_LED_ON		0x01
+/* New mode on some devices, define here to clarify remapping later */
+#define ASUS_MINI_LED_STRONG_MODE	0x02
+/* New modes for devices with 3 mini-led mode types */
+#define ASUS_MINI_LED_2024_WEAK		0x00
+#define ASUS_MINI_LED_2024_STRONG	0x01
+#define ASUS_MINI_LED_2024_OFF		0x02
+
 /* Controls the power state of the USB0 hub on ROG Ally which input is on */
 #define ASUS_USB0_PWR_EC0_CSEE "\\_SB.PCI0.SBRG.EC0.CSEE"
 /* 300ms so far seems to produce a reliable result on AC and battery */
@@ -288,7 +299,7 @@ struct asus_wmi {
 	bool battery_rsoc_available;
 
 	bool panel_overdrive_available;
-	bool mini_led_mode_available;
+	u32 mini_led_dev_id;
 
 	struct hotplug_slot hotplug_slot;
 	struct mutex hotplug_lock;
@@ -2108,13 +2119,33 @@ static ssize_t mini_led_mode_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
 	struct asus_wmi *asus = dev_get_drvdata(dev);
-	int result;
+	u32 value;
+	int err;
 
-	result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_MINI_LED_MODE);
-	if (result < 0)
-		return result;
+	err = asus_wmi_get_devstate(asus, asus->mini_led_dev_id, &value);
+	if (err < 0)
+		return err;
+	value = value & ASUS_MINI_LED_MODE_MASK;
 
-	return sysfs_emit(buf, "%d\n", result);
+	/*
+	 * Remap the mode values to match previous generation mini-led. The last gen
+	 * WMI 0 == off, while on this version WMI 2 ==off (flipped).
+	 */
+	if (asus->mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2) {
+		switch (value) {
+		case ASUS_MINI_LED_2024_WEAK:
+			value = ASUS_MINI_LED_ON;
+			break;
+		case ASUS_MINI_LED_2024_STRONG:
+			value = ASUS_MINI_LED_STRONG_MODE;
+			break;
+		case ASUS_MINI_LED_2024_OFF:
+			value = ASUS_MINI_LED_OFF;
+			break;
+		}
+	}
+
+	return sysfs_emit(buf, "%d\n", value);
 }
 
 static ssize_t mini_led_mode_store(struct device *dev,
@@ -2130,11 +2161,32 @@ static ssize_t mini_led_mode_store(struct device *dev,
 	if (result)
 		return result;
 
-	if (mode > 1)
+	if (asus->mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE &&
+	    mode > ASUS_MINI_LED_ON)
+		return -EINVAL;
+	if (asus->mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2 &&
+	    mode > ASUS_MINI_LED_STRONG_MODE)
 		return -EINVAL;
 
-	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_MINI_LED_MODE, mode, &result);
+	/*
+	 * Remap the mode values so expected behaviour is the same as the last
+	 * generation of mini-LED with 0 == off, 1 == on.
+	 */
+	if (asus->mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2) {
+		switch (mode) {
+		case ASUS_MINI_LED_OFF:
+			mode = ASUS_MINI_LED_2024_OFF;
+			break;
+		case ASUS_MINI_LED_ON:
+			mode = ASUS_MINI_LED_2024_WEAK;
+			break;
+		case ASUS_MINI_LED_STRONG_MODE:
+			mode = ASUS_MINI_LED_2024_STRONG;
+			break;
+		}
+	}
 
+	err = asus_wmi_set_devstate(asus->mini_led_dev_id, mode, &result);
 	if (err) {
 		pr_warn("Failed to set mini-LED: %d\n", err);
 		return err;
@@ -2151,6 +2203,23 @@ static ssize_t mini_led_mode_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(mini_led_mode);
 
+static ssize_t available_mini_led_mode_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+
+	switch (asus->mini_led_dev_id) {
+	case ASUS_WMI_DEVID_MINI_LED_MODE:
+		return sysfs_emit(buf, "0 1\n");
+	case ASUS_WMI_DEVID_MINI_LED_MODE2:
+		return sysfs_emit(buf, "0 1 2\n");
+	}
+
+	return sysfs_emit(buf, "0\n");
+}
+
+static DEVICE_ATTR_RO(available_mini_led_mode);
+
 /* Quirks *********************************************************************/
 
 static void asus_wmi_set_xusb2pr(struct asus_wmi *asus)
@@ -4139,6 +4208,7 @@ static struct attribute *platform_attributes[] = {
 	&dev_attr_nv_temp_target.attr,
 	&dev_attr_panel_od.attr,
 	&dev_attr_mini_led_mode.attr,
+	&dev_attr_available_mini_led_mode.attr,
 	NULL
 };
 
@@ -4191,7 +4261,9 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 	else if (attr == &dev_attr_panel_od.attr)
 		ok = asus->panel_overdrive_available;
 	else if (attr == &dev_attr_mini_led_mode.attr)
-		ok = asus->mini_led_mode_available;
+		ok = asus->mini_led_dev_id != 0;
+	else if (attr == &dev_attr_available_mini_led_mode.attr)
+		ok = asus->mini_led_dev_id != 0;
 
 	if (devid != -1)
 		ok = !(asus_wmi_get_devstate_simple(asus, devid) < 0);
@@ -4444,10 +4516,14 @@ static int asus_wmi_add(struct platform_device *pdev)
 	asus->nv_dyn_boost_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_NV_DYN_BOOST);
 	asus->nv_temp_tgt_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_NV_THERM_TARGET);
 	asus->panel_overdrive_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_PANEL_OD);
-	asus->mini_led_mode_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE);
 	asus->ally_mcu_usb_switch = acpi_has_method(NULL, ASUS_USB0_PWR_EC0_CSEE)
 						&& dmi_match(DMI_BOARD_NAME, "RC71L");
 
+	if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE))
+		asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE;
+	else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE2))
+		asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2;
+
 	err = fan_boost_mode_check_present(asus);
 	if (err)
 		goto fail_fan_boost_mode;
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index ab1c7deff118..9cadce10ad9a 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -71,6 +71,7 @@
 #define ASUS_WMI_DEVID_LID_FLIP		0x00060062
 #define ASUS_WMI_DEVID_LID_FLIP_ROG	0x00060077
 #define ASUS_WMI_DEVID_MINI_LED_MODE	0x0005001E
+#define ASUS_WMI_DEVID_MINI_LED_MODE2	0x0005002E
 
 /* Storage */
 #define ASUS_WMI_DEVID_CARDREADER	0x00080013
-- 
cgit v1.2.3


From eb3bac90549a226df204a57eac45bbe5d6b4a0cf Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Thu, 4 Apr 2024 13:16:45 +1300
Subject: platform/x86: asus-wmi: add support for Vivobook GPU MUX
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for the Vivobook dgpu MUX available on the ASUS Viviobook
and some of the other ranges (Zen).

This MUX functions exactly the same as the existing ROG MUX support so
the existing functionality now detects which MUX is available and uses
that for control.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20240404001652.86207-3-luke@ljones.dev
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/asus-wmi.c            | 22 +++++++++++++---------
 include/linux/platform_data/x86/asus-wmi.h |  1 +
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index e8fbd7f9d343..51e4df081046 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -270,7 +270,7 @@ struct asus_wmi {
 	bool egpu_enable_available;
 	bool egpu_connect_available;
 	bool dgpu_disable_available;
-	bool gpu_mux_mode_available;
+	u32 gpu_mux_dev;
 
 	/* Tunables provided by ASUS for gaming laptops */
 	bool ppt_pl2_sppt_available;
@@ -693,8 +693,8 @@ static ssize_t dgpu_disable_store(struct device *dev,
 	if (disable > 1)
 		return -EINVAL;
 
-	if (asus->gpu_mux_mode_available) {
-		result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_GPU_MUX);
+	if (asus->gpu_mux_dev) {
+		result = asus_wmi_get_devstate_simple(asus, asus->gpu_mux_dev);
 		if (result < 0)
 			/* An error here may signal greater failure of GPU handling */
 			return result;
@@ -759,8 +759,8 @@ static ssize_t egpu_enable_store(struct device *dev,
 		return err;
 	}
 
-	if (asus->gpu_mux_mode_available) {
-		result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_GPU_MUX);
+	if (asus->gpu_mux_dev) {
+		result = asus_wmi_get_devstate_simple(asus, asus->gpu_mux_dev);
 		if (result < 0) {
 			/* An error here may signal greater failure of GPU handling */
 			pr_warn("Failed to get gpu mux status: %d\n", result);
@@ -813,7 +813,7 @@ static ssize_t gpu_mux_mode_show(struct device *dev,
 	struct asus_wmi *asus = dev_get_drvdata(dev);
 	int result;
 
-	result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_GPU_MUX);
+	result = asus_wmi_get_devstate_simple(asus, asus->gpu_mux_dev);
 	if (result < 0)
 		return result;
 
@@ -859,7 +859,7 @@ static ssize_t gpu_mux_mode_store(struct device *dev,
 		}
 	}
 
-	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_GPU_MUX, optimus, &result);
+	err = asus_wmi_set_devstate(asus->gpu_mux_dev, optimus, &result);
 	if (err) {
 		dev_err(dev, "Failed to set GPU MUX mode: %d\n", err);
 		return err;
@@ -4239,7 +4239,7 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 	else if (attr == &dev_attr_dgpu_disable.attr)
 		ok = asus->dgpu_disable_available;
 	else if (attr == &dev_attr_gpu_mux_mode.attr)
-		ok = asus->gpu_mux_mode_available;
+		ok = asus->gpu_mux_dev != 0;
 	else if (attr == &dev_attr_fan_boost_mode.attr)
 		ok = asus->fan_boost_mode_available;
 	else if (attr == &dev_attr_throttle_thermal_policy.attr)
@@ -4505,7 +4505,6 @@ static int asus_wmi_add(struct platform_device *pdev)
 	asus->egpu_enable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU);
 	asus->egpu_connect_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU_CONNECTED);
 	asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU);
-	asus->gpu_mux_mode_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_GPU_MUX);
 	asus->kbd_rgb_mode_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE);
 	asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE);
 	asus->ppt_pl2_sppt_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_PPT_PL2_SPPT);
@@ -4524,6 +4523,11 @@ static int asus_wmi_add(struct platform_device *pdev)
 	else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_MINI_LED_MODE2))
 		asus->mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2;
 
+	if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_GPU_MUX))
+		asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX;
+	else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_GPU_MUX_VIVO))
+		asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX_VIVO;
+
 	err = fan_boost_mode_check_present(asus);
 	if (err)
 		goto fail_fan_boost_mode;
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 9cadce10ad9a..b48b024dd844 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -128,6 +128,7 @@
 
 /* gpu mux switch, 0 = dGPU, 1 = Optimus */
 #define ASUS_WMI_DEVID_GPU_MUX		0x00090016
+#define ASUS_WMI_DEVID_GPU_MUX_VIVO	0x00090026
 
 /* TUF laptop RGB modes/colours */
 #define ASUS_WMI_DEVID_TUF_RGB_MODE	0x00100056
-- 
cgit v1.2.3


From ae834a549ec1d4cf372ffba1af1c14148807af55 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Thu, 4 Apr 2024 13:16:46 +1300
Subject: platform/x86: asus-wmi: add support variant of TUF RGB
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds support for a second TUF RGB wmi call that some versions of the TUF
laptop come with. Also adjusts existing support to select whichever is
available.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20240404001652.86207-4-luke@ljones.dev
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/asus-wmi.c            | 13 +++++++++----
 include/linux/platform_data/x86/asus-wmi.h |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 51e4df081046..113a80ae0efe 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -281,7 +281,7 @@ struct asus_wmi {
 	bool nv_dyn_boost_available;
 	bool nv_temp_tgt_available;
 
-	bool kbd_rgb_mode_available;
+	u32 kbd_rgb_dev;
 	bool kbd_rgb_state_available;
 
 	bool throttle_thermal_policy_available;
@@ -881,6 +881,7 @@ static ssize_t kbd_rgb_mode_store(struct device *dev,
 				 struct device_attribute *attr,
 				 const char *buf, size_t count)
 {
+	struct asus_wmi *asus = dev_get_drvdata(dev);
 	u32 cmd, mode, r, g, b, speed;
 	int err;
 
@@ -917,7 +918,7 @@ static ssize_t kbd_rgb_mode_store(struct device *dev,
 		speed = 0xeb;
 	}
 
-	err = asus_wmi_evaluate_method3(ASUS_WMI_METHODID_DEVS, ASUS_WMI_DEVID_TUF_RGB_MODE,
+	err = asus_wmi_evaluate_method3(ASUS_WMI_METHODID_DEVS, asus->kbd_rgb_dev,
 			cmd | (mode << 8) | (r << 16) | (g << 24), b | (speed << 8), NULL);
 	if (err)
 		return err;
@@ -1560,7 +1561,7 @@ static int asus_wmi_led_init(struct asus_wmi *asus)
 {
 	int rv = 0, num_rgb_groups = 0, led_val;
 
-	if (asus->kbd_rgb_mode_available)
+	if (asus->kbd_rgb_dev)
 		kbd_rgb_mode_groups[num_rgb_groups++] = &kbd_rgb_mode_group;
 	if (asus->kbd_rgb_state_available)
 		kbd_rgb_mode_groups[num_rgb_groups++] = &kbd_rgb_state_group;
@@ -4505,7 +4506,6 @@ static int asus_wmi_add(struct platform_device *pdev)
 	asus->egpu_enable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU);
 	asus->egpu_connect_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_EGPU_CONNECTED);
 	asus->dgpu_disable_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_DGPU);
-	asus->kbd_rgb_mode_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE);
 	asus->kbd_rgb_state_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_STATE);
 	asus->ppt_pl2_sppt_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_PPT_PL2_SPPT);
 	asus->ppt_pl1_spl_available = asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_PPT_PL1_SPL);
@@ -4528,6 +4528,11 @@ static int asus_wmi_add(struct platform_device *pdev)
 	else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_GPU_MUX_VIVO))
 		asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX_VIVO;
 
+	if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE))
+		asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE;
+	else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE2))
+		asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE2;
+
 	err = fan_boost_mode_check_present(asus);
 	if (err)
 		goto fail_fan_boost_mode;
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index b48b024dd844..3e9a01467c67 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -132,6 +132,7 @@
 
 /* TUF laptop RGB modes/colours */
 #define ASUS_WMI_DEVID_TUF_RGB_MODE	0x00100056
+#define ASUS_WMI_DEVID_TUF_RGB_MODE2	0x0010005A
 
 /* TUF laptop RGB power/state */
 #define ASUS_WMI_DEVID_TUF_RGB_STATE	0x00100057
-- 
cgit v1.2.3


From e0ae0ecce4869f841ea7cb20fca1b2c865c13339 Mon Sep 17 00:00:00 2001
From: "Luke D. Jones" <luke@ljones.dev>
Date: Thu, 4 Apr 2024 13:16:47 +1300
Subject: platform/x86: asus-wmi: support toggling POST sound
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for toggling the BIOS POST sound on some ASUS laptops.

Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Luke D. Jones <luke@ljones.dev>
Link: https://lore.kernel.org/r/20240404001652.86207-5-luke@ljones.dev
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 Documentation/ABI/testing/sysfs-platform-asus-wmi |  9 ++++
 drivers/platform/x86/asus-wmi.c                   | 51 +++++++++++++++++++++++
 include/linux/platform_data/x86/asus-wmi.h        |  3 ++
 3 files changed, 63 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi
index ef1ac1a20a71..72933527d2e4 100644
--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi
+++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi
@@ -194,3 +194,12 @@ Contact:	"Luke Jones" <luke@ljones.dev>
 Description:
 		Set the target temperature limit of the Nvidia dGPU:
 			* min=75, max=87
+
+What:		/sys/devices/platform/<platform>/boot_sound
+Date:		Apr 2024
+KernelVersion:	6.10
+Contact:	"Luke Jones" <luke@ljones.dev>
+Description:
+		Set if the BIOS POST sound is played on boot.
+			* 0 - False,
+			* 1 - True
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 113a80ae0efe..1dbe473294f7 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -2115,6 +2115,54 @@ static ssize_t panel_od_store(struct device *dev,
 }
 static DEVICE_ATTR_RW(panel_od);
 
+/* Bootup sound ***************************************************************/
+
+static ssize_t boot_sound_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+	int result;
+
+	result = asus_wmi_get_devstate_simple(asus, ASUS_WMI_DEVID_BOOT_SOUND);
+	if (result < 0)
+		return result;
+
+	return sysfs_emit(buf, "%d\n", result);
+}
+
+static ssize_t boot_sound_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	int result, err;
+	u32 snd;
+
+	struct asus_wmi *asus = dev_get_drvdata(dev);
+
+	result = kstrtou32(buf, 10, &snd);
+	if (result)
+		return result;
+
+	if (snd > 1)
+		return -EINVAL;
+
+	err = asus_wmi_set_devstate(ASUS_WMI_DEVID_BOOT_SOUND, snd, &result);
+	if (err) {
+		pr_warn("Failed to set boot sound: %d\n", err);
+		return err;
+	}
+
+	if (result > 1) {
+		pr_warn("Failed to set panel boot sound (result): 0x%x\n", result);
+		return -EIO;
+	}
+
+	sysfs_notify(&asus->platform_device->dev.kobj, NULL, "boot_sound");
+
+	return count;
+}
+static DEVICE_ATTR_RW(boot_sound);
+
 /* Mini-LED mode **************************************************************/
 static ssize_t mini_led_mode_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
@@ -4207,6 +4255,7 @@ static struct attribute *platform_attributes[] = {
 	&dev_attr_ppt_platform_sppt.attr,
 	&dev_attr_nv_dynamic_boost.attr,
 	&dev_attr_nv_temp_target.attr,
+	&dev_attr_boot_sound.attr,
 	&dev_attr_panel_od.attr,
 	&dev_attr_mini_led_mode.attr,
 	&dev_attr_available_mini_led_mode.attr,
@@ -4259,6 +4308,8 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj,
 		ok = asus->nv_dyn_boost_available;
 	else if (attr == &dev_attr_nv_temp_target.attr)
 		ok = asus->nv_temp_tgt_available;
+	else if (attr == &dev_attr_boot_sound.attr)
+		devid = ASUS_WMI_DEVID_BOOT_SOUND;
 	else if (attr == &dev_attr_panel_od.attr)
 		ok = asus->panel_overdrive_available;
 	else if (attr == &dev_attr_mini_led_mode.attr)
diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h
index 3e9a01467c67..3eb5cd6773ad 100644
--- a/include/linux/platform_data/x86/asus-wmi.h
+++ b/include/linux/platform_data/x86/asus-wmi.h
@@ -137,6 +137,9 @@
 /* TUF laptop RGB power/state */
 #define ASUS_WMI_DEVID_TUF_RGB_STATE	0x00100057
 
+/* Bootup sound control */
+#define ASUS_WMI_DEVID_BOOT_SOUND	0x00130022
+
 /* DSTS masks */
 #define ASUS_WMI_DSTS_STATUS_BIT	0x00000001
 #define ASUS_WMI_DSTS_UNKNOWN_BIT	0x00000002
-- 
cgit v1.2.3


From 2125c0034c5dfd61171b494bd309bb7637bff6eb Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Thu, 4 Apr 2024 09:47:48 -0400
Subject: cgroup/cpuset: Make cpuset hotplug processing synchronous

Since commit 3a5a6d0c2b03("cpuset: don't nest cgroup_mutex inside
get_online_cpus()"), cpuset hotplug was done asynchronously via a work
function. This is to avoid recursive locking of cgroup_mutex.

Since then, the cgroup locking scheme has changed quite a bit. A
cpuset_mutex was introduced to protect cpuset specific operations.
The cpuset_mutex is then replaced by a cpuset_rwsem. With commit
d74b27d63a8b ("cgroup/cpuset: Change cpuset_rwsem and hotplug lock
order"), cpu_hotplug_lock is acquired before cpuset_rwsem. Later on,
cpuset_rwsem is reverted back to cpuset_mutex. All these locking changes
allow the hotplug code to call into cpuset core directly.

The following commits were also merged due to the asynchronous nature
of cpuset hotplug processing.

  - commit b22afcdf04c9 ("cpu/hotplug: Cure the cpusets trainwreck")
  - commit 50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume
    bugs")
  - commit 28b89b9e6f7b ("cpuset: handle race between CPU hotplug and
    cpuset_hotplug_work")

Clean up all these bandages by making cpuset hotplug
processing synchronous again with the exception that the call to
cgroup_transfer_tasks() to transfer tasks out of an empty cgroup v1
cpuset, if necessary, will still be done via a work function due to the
existing cgroup_mutex -> cpu_hotplug_lock dependency. It is possible
to reverse that dependency, but that will require updating a number of
different cgroup controllers. This special hotplug code path should be
rarely taken anyway.

As all the cpuset states will be updated by the end of the hotplug
operation, we can revert most the above commits except commit
50e76632339d ("sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs")
which is partially reverted.  Also removing some cpus_read_lock trylock
attempts in the cpuset partition code as they are no longer necessary
since the cpu_hotplug_lock is now held for the whole duration of the
cpuset hotplug code path.

Signed-off-by: Waiman Long <longman@redhat.com>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cpuset.h |   3 --
 kernel/cgroup/cpuset.c | 141 ++++++++++++++++++++-----------------------------
 kernel/cpu.c           |  48 -----------------
 kernel/power/process.c |   2 -
 4 files changed, 56 insertions(+), 138 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 0ce6ff0d9c9a..de4cf0ee96f7 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -70,7 +70,6 @@ extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
 extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void inc_dl_tasks_cs(struct task_struct *task);
 extern void dec_dl_tasks_cs(struct task_struct *task);
 extern void cpuset_lock(void);
@@ -185,8 +184,6 @@ static inline void cpuset_update_active_cpus(void)
 	partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void inc_dl_tasks_cs(struct task_struct *task) { }
 static inline void dec_dl_tasks_cs(struct task_struct *task) { }
 static inline void cpuset_lock(void) { }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4237c8748715..d8d3439eda4e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -201,6 +201,14 @@ struct cpuset {
 	struct list_head remote_sibling;
 };
 
+/*
+ * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
+ */
+struct cpuset_remove_tasks_struct {
+	struct work_struct work;
+	struct cpuset *cs;
+};
+
 /*
  * Exclusive CPUs distributed out to sub-partitions of top_cpuset
  */
@@ -449,12 +457,6 @@ static DEFINE_SPINLOCK(callback_lock);
 
 static struct workqueue_struct *cpuset_migrate_mm_wq;
 
-/*
- * CPU / memory hotplug is handled asynchronously.
- */
-static void cpuset_hotplug_workfn(struct work_struct *work);
-static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
-
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
 static inline void check_insane_mems_config(nodemask_t *nodes)
@@ -540,22 +542,10 @@ static void guarantee_online_cpus(struct task_struct *tsk,
 	rcu_read_lock();
 	cs = task_cs(tsk);
 
-	while (!cpumask_intersects(cs->effective_cpus, pmask)) {
+	while (!cpumask_intersects(cs->effective_cpus, pmask))
 		cs = parent_cs(cs);
-		if (unlikely(!cs)) {
-			/*
-			 * The top cpuset doesn't have any online cpu as a
-			 * consequence of a race between cpuset_hotplug_work
-			 * and cpu hotplug notifier.  But we know the top
-			 * cpuset's effective_cpus is on its way to be
-			 * identical to cpu_online_mask.
-			 */
-			goto out_unlock;
-		}
-	}
-	cpumask_and(pmask, pmask, cs->effective_cpus);
 
-out_unlock:
+	cpumask_and(pmask, pmask, cs->effective_cpus);
 	rcu_read_unlock();
 }
 
@@ -1217,7 +1207,7 @@ static void rebuild_sched_domains_locked(void)
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
 	 * passing doms with offlined cpu to partition_sched_domains().
-	 * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+	 * Anyways, cpuset_handle_hotplug() will rebuild sched domains.
 	 *
 	 * With no CPUs in any subpartitions, top_cpuset's effective CPUs
 	 * should be the same as the active CPUs, so checking only top_cpuset
@@ -1260,12 +1250,17 @@ static void rebuild_sched_domains_locked(void)
 }
 #endif /* CONFIG_SMP */
 
-void rebuild_sched_domains(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
-	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
 	mutex_unlock(&cpuset_mutex);
+}
+
+void rebuild_sched_domains(void)
+{
+	cpus_read_lock();
+	rebuild_sched_domains_cpuslocked();
 	cpus_read_unlock();
 }
 
@@ -2079,14 +2074,11 @@ write_error:
 
 	/*
 	 * For partcmd_update without newmask, it is being called from
-	 * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken.
-	 * Update the load balance flag and scheduling domain if
-	 * cpus_read_trylock() is successful.
+	 * cpuset_handle_hotplug(). Update the load balance flag and
+	 * scheduling domain accordingly.
 	 */
-	if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) {
+	if ((cmd == partcmd_update) && !newmask)
 		update_partition_sd_lb(cs, old_prs);
-		cpus_read_unlock();
-	}
 
 	notify_partition_change(cs, old_prs);
 	return 0;
@@ -3599,8 +3591,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 * proceeding, so that we don't end up keep removing tasks added
 	 * after execution capability is restored.
 	 *
-	 * cpuset_hotplug_work calls back into cgroup core via
-	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
+	 * cpuset_handle_hotplug may call back into cgroup core asynchronously
+	 * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
 	 * operation like this one can lead to a deadlock through kernfs
 	 * active_ref protection.  Let's break the protection.  Losing the
 	 * protection is okay as we check whether @cs is online after
@@ -3609,7 +3601,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	 */
 	css_get(&cs->css);
 	kernfs_break_active_protection(of->kn);
-	flush_work(&cpuset_hotplug_work);
 
 	cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
@@ -4354,6 +4345,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 	}
 }
 
+static void cpuset_migrate_tasks_workfn(struct work_struct *work)
+{
+	struct cpuset_remove_tasks_struct *s;
+
+	s = container_of(work, struct cpuset_remove_tasks_struct, work);
+	remove_tasks_in_empty_cpuset(s->cs);
+	css_put(&s->cs->css);
+	kfree(s);
+}
+
 static void
 hotplug_update_tasks_legacy(struct cpuset *cs,
 			    struct cpumask *new_cpus, nodemask_t *new_mems,
@@ -4383,12 +4384,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 	/*
 	 * Move tasks to the nearest ancestor with execution resources,
 	 * This is full cgroup operation which will also call back into
-	 * cpuset. Should be done outside any lock.
+	 * cpuset. Execute it asynchronously using workqueue.
 	 */
-	if (is_empty) {
-		mutex_unlock(&cpuset_mutex);
-		remove_tasks_in_empty_cpuset(cs);
-		mutex_lock(&cpuset_mutex);
+	if (is_empty && cs->css.cgroup->nr_populated_csets &&
+	    css_tryget_online(&cs->css)) {
+		struct cpuset_remove_tasks_struct *s;
+
+		s = kzalloc(sizeof(*s), GFP_KERNEL);
+		if (WARN_ON_ONCE(!s)) {
+			css_put(&cs->css);
+			return;
+		}
+
+		s->cs = cs;
+		INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
+		schedule_work(&s->work);
 	}
 }
 
@@ -4421,30 +4431,6 @@ void cpuset_force_rebuild(void)
 	force_rebuild = true;
 }
 
-/*
- * Attempt to acquire a cpus_read_lock while a hotplug operation may be in
- * progress.
- * Return: true if successful, false otherwise
- *
- * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock,
- * cpus_read_trylock() is used here to acquire the lock.
- */
-static bool cpuset_hotplug_cpus_read_trylock(void)
-{
-	int retries = 0;
-
-	while (!cpus_read_trylock()) {
-		/*
-		 * CPU hotplug still in progress. Retry 5 times
-		 * with a 10ms wait before bailing out.
-		 */
-		if (++retries > 5)
-			return false;
-		msleep(10);
-	}
-	return true;
-}
-
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
@@ -4493,13 +4479,11 @@ retry:
 		compute_partition_effective_cpumask(cs, &new_cpus);
 
 	if (remote && cpumask_empty(&new_cpus) &&
-	    partition_is_populated(cs, NULL) &&
-	    cpuset_hotplug_cpus_read_trylock()) {
+	    partition_is_populated(cs, NULL)) {
 		remote_partition_disable(cs, tmp);
 		compute_effective_cpumask(&new_cpus, cs, parent);
 		remote = false;
 		cpuset_force_rebuild();
-		cpus_read_unlock();
 	}
 
 	/*
@@ -4519,18 +4503,8 @@ retry:
 	else if (is_partition_valid(parent) && is_partition_invalid(cs))
 		partcmd = partcmd_update;
 
-	/*
-	 * cpus_read_lock needs to be held before calling
-	 * update_parent_effective_cpumask(). To avoid circular lock
-	 * dependency between cpuset_mutex and cpus_read_lock,
-	 * cpus_read_trylock() is used here to acquire the lock.
-	 */
 	if (partcmd >= 0) {
-		if (!cpuset_hotplug_cpus_read_trylock())
-			goto update_tasks;
-
 		update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
-		cpus_read_unlock();
 		if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
 			compute_partition_effective_cpumask(cs, &new_cpus);
 			cpuset_force_rebuild();
@@ -4558,8 +4532,7 @@ unlock:
 }
 
 /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
- * @work: unused
+ * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -4573,8 +4546,10 @@ unlock:
  *
  * Note that CPU offlining during suspend is ignored.  We don't modify
  * cpusets across suspend/resume cycles at all.
+ *
+ * CPU / memory hotplug is handled synchronously.
  */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_handle_hotplug(void)
 {
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
@@ -4585,6 +4560,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
 		ptmp = &tmp;
 
+	lockdep_assert_cpus_held();
 	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
@@ -4666,7 +4642,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	/* rebuild sched domains if cpus_allowed has changed */
 	if (cpus_updated || force_rebuild) {
 		force_rebuild = false;
-		rebuild_sched_domains();
+		rebuild_sched_domains_cpuslocked();
 	}
 
 	free_cpumasks(NULL, ptmp);
@@ -4679,12 +4655,7 @@ void cpuset_update_active_cpus(void)
 	 * inside cgroup synchronization.  Bounce actual hotplug processing
 	 * to a work item to avoid reverse locking order.
 	 */
-	schedule_work(&cpuset_hotplug_work);
-}
-
-void cpuset_wait_for_hotplug(void)
-{
-	flush_work(&cpuset_hotplug_work);
+	cpuset_handle_hotplug();
 }
 
 /*
@@ -4695,7 +4666,7 @@ void cpuset_wait_for_hotplug(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	schedule_work(&cpuset_hotplug_work);
+	cpuset_handle_hotplug();
 	return NOTIFY_OK;
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8f6affd051f7..49ce3f309688 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void)
 	kthread_unpark(this_cpu_read(cpuhp_state.thread));
 }
 
-/*
- *
- * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock
- * protected region.
- *
- * The operation is still serialized against concurrent CPU hotplug via
- * cpu_add_remove_lock, i.e. CPU map protection.  But it is _not_
- * serialized against other hotplug related activity like adding or
- * removing of state callbacks and state instances, which invoke either the
- * startup or the teardown callback of the affected state.
- *
- * This is required for subsystems which are unfixable vs. CPU hotplug and
- * evade lock inversion problems by scheduling work which has to be
- * completed _before_ cpu_up()/_cpu_down() returns.
- *
- * Don't even think about adding anything to this for any new code or even
- * drivers. It's only purpose is to keep existing lock order trainwrecks
- * working.
- *
- * For cpu_down() there might be valid reasons to finish cleanups which are
- * not required to be done under cpu_hotplug_lock, but that's a different
- * story and would be not invoked via this.
- */
-static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen)
-{
-	/*
-	 * cpusets delegate hotplug operations to a worker to "solve" the
-	 * lock order problems. Wait for the worker, but only if tasks are
-	 * _not_ frozen (suspend, hibernate) as that would wait forever.
-	 *
-	 * The wait is required because otherwise the hotplug operation
-	 * returns with inconsistent state, which could even be observed in
-	 * user space when a new CPU is brought up. The CPU plug uevent
-	 * would be delivered and user space reacting on it would fail to
-	 * move tasks to the newly plugged CPU up to the point where the
-	 * work has finished because up to that point the newly plugged CPU
-	 * is not assignable in cpusets/cgroups. On unplug that's not
-	 * necessarily a visible issue, but it is still inconsistent state,
-	 * which is the real problem which needs to be "fixed". This can't
-	 * prevent the transient state between scheduling the work and
-	 * returning from waiting for it.
-	 */
-	if (!tasks_frozen)
-		cpuset_wait_for_hotplug();
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 #ifndef arch_clear_mm_cpumask_cpu
 #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
@@ -1494,7 +1448,6 @@ out:
 	 */
 	lockup_detector_cleanup();
 	arch_smt_update();
-	cpu_up_down_serialize_trainwrecks(tasks_frozen);
 	return ret;
 }
 
@@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
 out:
 	cpus_write_unlock();
 	arch_smt_update();
-	cpu_up_down_serialize_trainwrecks(tasks_frozen);
 	return ret;
 }
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index cae81a87cc91..66ac067d9ae6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -194,8 +194,6 @@ void thaw_processes(void)
 	__usermodehelper_set_disable_depth(UMH_FREEZING);
 	thaw_workqueues();
 
-	cpuset_wait_for_hotplug();
-
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
 		/* No other threads should have PF_SUSPEND_TASK set */
-- 
cgit v1.2.3


From 51d3c07a27056cd91a7ed0b59fb67d0b9e073830 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Fri, 29 Mar 2024 13:27:17 -0700
Subject: wifi: mac80211: remove ieee80211_set_hw_80211_encap()

While fixing kernel-doc issues it was discovered that the
ieee80211_set_hw_80211_encap() prototype doesn't actually have an
implementation, so remove it.

Note the implementation was removed in commit 6aea26ce5a4c ("mac80211:
rework tx encapsulation offload API").

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://msgid.link/20240329-mac80211-kdoc-retval-v1-1-5e4d1ad6c250@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 12 ------------
 1 file changed, 12 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index f57c29de3a91..bed3a34e13b0 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -7483,18 +7483,6 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
 u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
 			      struct ieee80211_tx_info *info,
 			      int len);
-/**
- * ieee80211_set_hw_80211_encap - enable hardware encapsulation offloading.
- *
- * This function is used to notify mac80211 that a vif can be passed raw 802.3
- * frames. The driver needs to then handle the 802.11 encapsulation inside the
- * hardware or firmware.
- *
- * @vif: &struct ieee80211_vif pointer from the add_interface callback.
- * @enable: indicate if the feature should be turned on or off
- */
-bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable);
-
 /**
  * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template.
  * @hw: pointer obtained from ieee80211_alloc_hw().
-- 
cgit v1.2.3


From 0e9824e0d59b20f6df1e01da536db6e310aaadd8 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Fri, 29 Mar 2024 13:27:18 -0700
Subject: wifi: mac80211: Add missing return value documentation

kernel-doc is reporting some warnings, so fix them:

% scripts/kernel-doc -Wall -Werror -none include/net/mac80211.h
include/net/mac80211.h:2056: warning: No description found for return value of 'wdev_to_ieee80211_vif'
include/net/mac80211.h:2066: warning: No description found for return value of 'ieee80211_vif_to_wdev'
include/net/mac80211.h:5603: warning: No description found for return value of 'ieee80211_beacon_cntdwn_is_complete'
include/net/mac80211.h:5968: warning: No description found for return value of 'ieee80211_gtk_rekey_add'
include/net/mac80211.h:6350: warning: No description found for return value of 'ieee80211_find_sta_by_link_addrs'
include/net/mac80211.h:6478: warning: No description found for return value of 'ieee80211_txq_airtime_check'
include/net/mac80211.h:6981: warning: No description found for return value of 'rate_control_set_rates'
include/net/mac80211.h:7142: warning: No description found for return value of 'ieee80211_tx_prepare_skb'
include/net/mac80211.h:7156: warning: No description found for return value of 'ieee80211_parse_tx_radiotap'
include/net/mac80211.h:7277: warning: No description found for return value of 'ieee80211_tx_dequeue'
include/net/mac80211.h:7292: warning: No description found for return value of 'ieee80211_tx_dequeue_ni'
include/net/mac80211.h:7324: warning: No description found for return value of 'ieee80211_next_txq'
include/net/mac80211.h:7405: warning: No description found for return value of 'ieee80211_txq_may_transmit'
include/net/mac80211.h:7466: warning: No description found for return value of 'ieee80211_calc_rx_airtime'
include/net/mac80211.h:7480: warning: No description found for return value of 'ieee80211_calc_tx_airtime'
include/net/mac80211.h:7528: warning: No description found for return value of 'ieee80211_is_tx_data'
include/net/mac80211.h:7562: warning: No description found for return value of 'ieee80211_set_active_links'
17 warnings as Errors

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://msgid.link/20240329-mac80211-kdoc-retval-v1-2-5e4d1ad6c250@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index bed3a34e13b0..460387916c2e 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2051,7 +2051,7 @@ static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif)
  * This can be used by mac80211 drivers with direct cfg80211 APIs
  * (like the vendor commands) that get a wdev.
  *
- * Note that this function may return %NULL if the given wdev isn't
+ * Return: pointer to the wdev, or %NULL if the given wdev isn't
  * associated with a vif that the driver knows about (e.g. monitor
  * or AP_VLAN interfaces.)
  */
@@ -2064,6 +2064,8 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev);
  * This can be used by mac80211 drivers with direct cfg80211 APIs
  * (like the vendor commands) that needs to get the wdev for a vif.
  * This can also be useful to get the netdev associated to a vif.
+ *
+ * Return: pointer to the wdev
  */
 struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif);
 
@@ -5602,7 +5604,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id);
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @link_id: valid link_id during MLO or 0 for non-MLO
  *
- * This function returns whether the countdown reached zero.
+ * Return: %true if the countdown reached 1, %false otherwise
  */
 bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif,
 					 unsigned int link_id);
@@ -5947,8 +5949,8 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf);
  * key(s) will be available. These will be needed by mac80211 for proper
  * RX processing, so this function allows setting them.
  *
- * The function returns the newly allocated key structure, which will
- * have similar contents to the passed key configuration but point to
+ * Return: the newly allocated key structure, which will have
+ * similar contents to the passed key configuration but point to
  * mac80211-owned memory. In case of errors, the function returns an
  * ERR_PTR(), use IS_ERR() etc.
  *
@@ -6347,6 +6349,8 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw,
  *	may be %NULL if the link ID is not needed
  *
  * Obtain the STA by link address, must use RCU protection.
+ *
+ * Return: pointer to STA if found, otherwise %NULL.
  */
 struct ieee80211_sta *
 ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw,
@@ -6476,8 +6480,8 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid,
  * @hw: pointer obtained from ieee80211_alloc_hw()
  * @txq: pointer obtained from station or virtual interface
  *
- * Return true if the AQL's airtime limit has not been reached and the txq can
- * continue to send more packets to the device. Otherwise return false.
+ * Return: %true if the AQL's airtime limit has not been reached and the txq can
+ * continue to send more packets to the device. Otherwise return %false.
  */
 bool
 ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq);
@@ -6980,6 +6984,8 @@ bool rate_usable_index_exists(struct ieee80211_supported_band *sband,
  * @hw: pointer as obtained from ieee80211_alloc_hw()
  * @pubsta: &struct ieee80211_sta pointer to the target destination.
  * @rates: new tx rate set to be used for this station.
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 int rate_control_set_rates(struct ieee80211_hw *hw,
 			   struct ieee80211_sta *pubsta,
@@ -7140,6 +7146,8 @@ void ieee80211_report_wowlan_wakeup(struct ieee80211_vif *vif,
  * @band: the band to transmit on
  * @sta: optional pointer to get the station to send the frame to
  *
+ * Return: %true if the skb was prepared, %false otherwise
+ *
  * Note: must be called under RCU lock
  */
 bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
@@ -7156,6 +7164,8 @@ bool ieee80211_tx_prepare_skb(struct ieee80211_hw *hw,
  *
  * @skb: packet injected by userspace
  * @dev: the &struct device of this 802.11 device
+ *
+ * Return: %true if the radiotap header was parsed, %false otherwise
  */
 bool ieee80211_parse_tx_radiotap(struct sk_buff *skb,
 				 struct net_device *dev);
@@ -7265,7 +7275,7 @@ void ieee80211_unreserve_tid(struct ieee80211_sta *sta, u8 tid);
  * @txq: pointer obtained from station or virtual interface, or from
  *	ieee80211_next_txq()
  *
- * Returns the skb if successful, %NULL if no frame was available.
+ * Return: the skb if successful, %NULL if no frame was available.
  *
  * Note that this must be called in an rcu_read_lock() critical section,
  * which can only be released after the SKB was handled. Some pointers in
@@ -7291,6 +7301,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
  * @hw: pointer as obtained from ieee80211_alloc_hw()
  * @txq: pointer obtained from station or virtual interface, or from
  *	ieee80211_next_txq()
+ *
+ * Return: the skb if successful, %NULL if no frame was available.
  */
 static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw,
 						      struct ieee80211_txq *txq)
@@ -7322,7 +7334,7 @@ void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw,
  * @hw: pointer as obtained from ieee80211_alloc_hw()
  * @ac: AC number to return packets from.
  *
- * Returns the next txq if successful, %NULL if no queue is eligible. If a txq
+ * Return: the next txq if successful, %NULL if no queue is eligible. If a txq
  * is returned, it should be returned with ieee80211_return_txq() after the
  * driver has finished scheduling it.
  */
@@ -7405,6 +7417,8 @@ ieee80211_return_txq(struct ieee80211_hw *hw, struct ieee80211_txq *txq,
  *
  * @hw: pointer as obtained from ieee80211_alloc_hw()
  * @txq: pointer obtained from station or virtual interface
+ *
+ * Return: %true if transmission is allowed, %false otherwise
  */
 bool ieee80211_txq_may_transmit(struct ieee80211_hw *hw,
 				struct ieee80211_txq *txq);
@@ -7465,6 +7479,8 @@ void ieee80211_nan_func_match(struct ieee80211_vif *vif,
  * @status: &struct ieee80211_rx_status containing the transmission rate
  *          information.
  * @len: frame length in bytes
+ *
+ * Return: the airtime estimate
  */
 u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
 			      struct ieee80211_rx_status *status,
@@ -7479,6 +7495,8 @@ u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
  * @hw: pointer as obtained from ieee80211_alloc_hw()
  * @info: &struct ieee80211_tx_info of the frame.
  * @len: frame length in bytes
+ *
+ * Return: the airtime estimate
  */
 u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
 			      struct ieee80211_tx_info *info,
@@ -7528,6 +7546,8 @@ ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
  * hardware encapsulation enabled are data frames.
  *
  * @skb: the frame to be transmitted.
+ *
+ * Return: %true if @skb is a data frame, %false otherwise
  */
 static inline bool ieee80211_is_tx_data(struct sk_buff *skb)
 {
@@ -7563,6 +7583,8 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb)
  *  - change_sta_links(0x10) for each affected STA (the AP)
  *  - assign_vif_chanctx(link_id=4)
  *  - change_vif_links(0x10)
+ *
+ * Return: 0 on success. An error code otherwise.
  */
 int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links);
 
-- 
cgit v1.2.3


From a35b36e6ee5dcf323d846ec3881159e4af90c9b4 Mon Sep 17 00:00:00 2001
From: Michael-CY Lee <michael-cy.lee@mediatek.com>
Date: Tue, 26 Mar 2024 08:30:36 +0800
Subject: wifi: mac80211: extend IEEE80211_KEY_FLAG_GENERATE_MMIE to other
 ciphers

Extend the flag IEEE80211_KEY_FLAG_GENERATE_MMIE to BIP-CMAC-256,
BIP-GMAC-128 and BIP-GMAC-256 for the same reason and in the same
way that the flag was added originally in commit a0b4496a4368
("mac80211: add IEEE80211_KEY_FLAG_GENERATE_MMIE to ieee80211_key_flags").

Signed-off-by: Michael-CY Lee <michael-cy.lee@mediatek.com>
Link: https://msgid.link/20240326003036.15215-1-michael-cy.lee@mediatek.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/mac80211.h |  4 ++--
 net/mac80211/wpa.c     | 12 ++++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 460387916c2e..41f135a793ab 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2124,8 +2124,8 @@ static inline bool lockdep_vif_wiphy_mutex_held(struct ieee80211_vif *vif)
  *	@IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key.
  * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation.
  * @IEEE80211_KEY_FLAG_GENERATE_MMIE: This flag should be set by the driver
- *	for a AES_CMAC key to indicate that it requires sequence number
- *	generation only
+ *	for a AES_CMAC or a AES_GMAC key to indicate that it requires sequence
+ *	number generation only
  * @IEEE80211_KEY_FLAG_SPP_AMSDU: SPP A-MSDUs can be used with this key
  *	(set by mac80211 from the sta->spp_amsdu flag)
  */
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index e40529b8c5c9..047a33797020 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -895,7 +895,8 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx)
 
 	info = IEEE80211_SKB_CB(skb);
 
-	if (info->control.hw_key)
+	if (info->control.hw_key &&
+	    !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE))
 		return TX_CONTINUE;
 
 	if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
@@ -911,6 +912,9 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx)
 
 	bip_ipn_set64(mmie->sequence_number, pn64);
 
+	if (info->control.hw_key)
+		return TX_CONTINUE;
+
 	bip_aad(skb, aad);
 
 	/* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128)
@@ -1040,7 +1044,8 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx)
 
 	info = IEEE80211_SKB_CB(skb);
 
-	if (info->control.hw_key)
+	if (info->control.hw_key &&
+	    !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE))
 		return TX_CONTINUE;
 
 	if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie)))
@@ -1056,6 +1061,9 @@ ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx)
 
 	bip_ipn_set64(mmie->sequence_number, pn64);
 
+	if (info->control.hw_key)
+		return TX_CONTINUE;
+
 	bip_aad(skb, aad);
 
 	hdr = (struct ieee80211_hdr *)skb->data;
-- 
cgit v1.2.3


From 31ced035ecde7b24aef57c3c4b85bbc3283c81f2 Mon Sep 17 00:00:00 2001
From: Francois Dugast <francois.dugast@intel.com>
Date: Fri, 29 Mar 2024 12:44:03 +0000
Subject: drm/xe/uapi: Restore flags VM_BIND_FLAG_READONLY and
 VM_BIND_FLAG_IMMEDIATE

The commit 84a1ed5e6756 ("drm/xe/uapi: Remove unused flags") is partially
reverted. At the time, flags not used by user space were removed during
cleanup. Some flags now needed by the compute runtime are brought back in
this commit:
- DRM_XE_VM_BIND_FLAG_READONLY is used to write protect kernel ISA thus
preventing accidental overwrites.
- DRM_XE_VM_BIND_FLAG_IMMEDIATE is used to trigger mapping at the time of
binding in order to prevent faulting at execution time.

The changes in the compute runtime are ready and approved, see link below.

v2: Include a link to the PR in the commit message (Matthew Brost)

v3: Update kernel doc and improve commit message (Lucas De Marchi)

Cc: Mateusz Jablonski <mateusz.jablonski@intel.com>
Cc: Michal Mrozek <michal.mrozek@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://github.com/intel/compute-runtime/pull/717
Signed-off-by: Francois Dugast <francois.dugast@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240329124403.7-1-francois.dugast@intel.com
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/xe/xe_vm.c       | 13 +++++++++++--
 drivers/gpu/drm/xe/xe_vm_types.h |  4 ++++
 include/uapi/drm/xe_drm.h        |  8 ++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
index fff10e1717ff..66b70fd3d105 100644
--- a/drivers/gpu/drm/xe/xe_vm.c
+++ b/drivers/gpu/drm/xe/xe_vm.c
@@ -2213,6 +2213,10 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
 		struct xe_vma_op *op = gpuva_op_to_vma_op(__op);
 
 		if (__op->op == DRM_GPUVA_OP_MAP) {
+			op->map.immediate =
+				flags & DRM_XE_VM_BIND_FLAG_IMMEDIATE;
+			op->map.read_only =
+				flags & DRM_XE_VM_BIND_FLAG_READONLY;
 			op->map.is_null = flags & DRM_XE_VM_BIND_FLAG_NULL;
 			op->map.dumpable = flags & DRM_XE_VM_BIND_FLAG_DUMPABLE;
 			op->map.pat_index = pat_index;
@@ -2407,6 +2411,8 @@ static int vm_bind_ioctl_ops_parse(struct xe_vm *vm, struct xe_exec_queue *q,
 		switch (op->base.op) {
 		case DRM_GPUVA_OP_MAP:
 		{
+			flags |= op->map.read_only ?
+				VMA_CREATE_FLAG_READ_ONLY : 0;
 			flags |= op->map.is_null ?
 				VMA_CREATE_FLAG_IS_NULL : 0;
 			flags |= op->map.dumpable ?
@@ -2551,7 +2557,7 @@ static int op_execute(struct drm_exec *exec, struct xe_vm *vm,
 	case DRM_GPUVA_OP_MAP:
 		err = xe_vm_bind(vm, vma, op->q, xe_vma_bo(vma),
 				 op->syncs, op->num_syncs,
-				 !xe_vm_in_fault_mode(vm),
+				 op->map.immediate || !xe_vm_in_fault_mode(vm),
 				 op->flags & XE_VMA_OP_FIRST,
 				 op->flags & XE_VMA_OP_LAST);
 		break;
@@ -2826,7 +2832,10 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
 	return 0;
 }
 
-#define SUPPORTED_FLAGS	(DRM_XE_VM_BIND_FLAG_NULL | \
+#define SUPPORTED_FLAGS	\
+	(DRM_XE_VM_BIND_FLAG_READONLY | \
+	 DRM_XE_VM_BIND_FLAG_IMMEDIATE | \
+	 DRM_XE_VM_BIND_FLAG_NULL | \
 	 DRM_XE_VM_BIND_FLAG_DUMPABLE)
 #define XE_64K_PAGE_MASK 0xffffull
 #define ALL_DRM_XE_SYNCS_FLAGS (DRM_XE_SYNCS_FLAG_WAIT_FOR_OP)
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index badf3945083d..0447c79c40a2 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -276,6 +276,10 @@ struct xe_vm {
 struct xe_vma_op_map {
 	/** @vma: VMA to map */
 	struct xe_vma *vma;
+	/** @immediate: Immediate bind */
+	bool immediate;
+	/** @read_only: Read only */
+	bool read_only;
 	/** @is_null: is NULL binding */
 	bool is_null;
 	/** @dumpable: whether BO is dumped on GPU hang */
diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
index 95a8ecca21f4..1446c3bae515 100644
--- a/include/uapi/drm/xe_drm.h
+++ b/include/uapi/drm/xe_drm.h
@@ -871,6 +871,12 @@ struct drm_xe_vm_destroy {
  *  - %DRM_XE_VM_BIND_OP_PREFETCH
  *
  * and the @flags can be:
+ *  - %DRM_XE_VM_BIND_FLAG_READONLY - Setup the page tables as read-only
+ *    to ensure write protection
+ *  - %DRM_XE_VM_BIND_FLAG_IMMEDIATE - On a faulting VM, do the
+ *    MAP operation immediately rather than deferring the MAP to the page
+ *    fault handler. This is implied on a non-faulting VM as there is no
+ *    fault handler to defer to.
  *  - %DRM_XE_VM_BIND_FLAG_NULL - When the NULL flag is set, the page
  *    tables are setup with a special bit which indicates writes are
  *    dropped and all reads return zero. In the future, the NULL flags
@@ -963,6 +969,8 @@ struct drm_xe_vm_bind_op {
 	/** @op: Bind operation to perform */
 	__u32 op;
 
+#define DRM_XE_VM_BIND_FLAG_READONLY	(1 << 0)
+#define DRM_XE_VM_BIND_FLAG_IMMEDIATE	(1 << 1)
 #define DRM_XE_VM_BIND_FLAG_NULL	(1 << 2)
 #define DRM_XE_VM_BIND_FLAG_DUMPABLE	(1 << 3)
 	/** @flags: Bind flags */
-- 
cgit v1.2.3


From fcf8829fd993348af9f498e486af0a209996f548 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Apr 2024 19:54:20 -0700
Subject: scsi: core: Add kernel-doc for scsi_msg_to_host_byte()

Add entries for missing documentation to prevent kernel-doc warnings:

scsi_cmnd.h:365: warning: Function parameter or struct member 'cmd' not described in 'scsi_msg_to_host_byte'
scsi_cmnd.h:365: warning: Function parameter or struct member 'msg' not described in 'scsi_msg_to_host_byte'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240408025425.18778-4-rdunlap@infradead.org
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/scsi_cmnd.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 526def14e7fb..45c40d200154 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -353,6 +353,8 @@ static inline u8 get_host_byte(struct scsi_cmnd *cmd)
 
 /**
  * scsi_msg_to_host_byte() - translate message byte
+ * @cmd: the SCSI command
+ * @msg: the SCSI parallel message byte to translate
  *
  * Translate the SCSI parallel message byte to a matching
  * host byte setting. A message of COMMAND_COMPLETE indicates
-- 
cgit v1.2.3


From 11d99e91846a1c751173c8f21953bd755e7ae4df Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Apr 2024 19:54:21 -0700
Subject: scsi: iser: Fix @read_stag kernel-doc warning

Correct kernel-doc comments for struct iser_ctrl to prevent warnings:

iser.h:76: warning: Function parameter or struct member 'read_stag' not described in 'iser_ctrl'
iser.h:76: warning: Excess struct member 'reaf_stag' description in 'iser_ctrl'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240408025425.18778-5-rdunlap@infradead.org
Acked-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/iser.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/scsi/iser.h b/include/scsi/iser.h
index 2e678fa74eca..07a83bfa2b8e 100644
--- a/include/scsi/iser.h
+++ b/include/scsi/iser.h
@@ -63,7 +63,7 @@ struct iser_cm_hdr {
  * @rsvd:         reserved
  * @write_stag:   write rkey
  * @write_va:     write virtual address
- * @reaf_stag:    read rkey
+ * @read_stag:    read rkey
  * @read_va:      read virtual address
  */
 struct iser_ctrl {
-- 
cgit v1.2.3


From d9c91182414524f1fa0f30584eb90f8211724bdb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Apr 2024 19:54:22 -0700
Subject: scsi: libfcoe: Fix a slew of kernel-doc warnings

Fix all kernel-doc warnings in <scsi/libfcoe.h>:

libfcoe.h:163: warning: Function parameter or struct member 'ctlr' not described in 'fcoe_ctlr_priv'
libfcoe.h:163: warning: Excess function parameter 'cltr' description in 'fcoe_ctlr_priv'
libfcoe.h:163: warning: No description found for return value of 'fcoe_ctlr_priv'
libfcoe.h:218: warning: Function parameter or struct member 'fd_flags' not described in 'fcoe_fcf'
libfcoe.h:218: warning: Excess struct member 'event' description in 'fcoe_fcf'
libfcoe.h:240: warning: Function parameter or struct member 'rdata' not described in 'fcoe_rport'
libfcoe.h:273: warning: No description found for return value of 'is_fip_mode'
libfcoe.h:332: warning: Function parameter or struct member 'crc_eof_page' not described in 'fcoe_percpu_s'
libfcoe.h:332: warning: Function parameter or struct member 'lock' not described in 'fcoe_percpu_s'
libfcoe.h:332: warning: Excess struct member 'page' description in 'fcoe_percpu_s'
libfcoe.h:362: warning: Function parameter or struct member 'data_src_addr' not described in 'fcoe_port'
libfcoe.h:362: warning: Function parameter or struct member 'get_netdev' not described in 'fcoe_port'
libfcoe.h:362: warning: Excess struct member 'data_srt_addr' description in 'fcoe_port'
libfcoe.h:369: warning: No description found for return value of 'fcoe_get_netdev'
libfcoe.h:386: warning: missing initial short description on line:
 * struct netdev_list
libfcoe.h:393: warning: expecting prototype for struct netdev_list. Prototype was for struct fcoe_netdev_mapping instead

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240408025425.18778-6-rdunlap@infradead.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/libfcoe.h | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/scsi/libfcoe.h b/include/scsi/libfcoe.h
index 8300ef1a982e..3c5899290aed 100644
--- a/include/scsi/libfcoe.h
+++ b/include/scsi/libfcoe.h
@@ -157,7 +157,9 @@ struct fcoe_ctlr {
 
 /**
  * fcoe_ctlr_priv() - Return the private data from a fcoe_ctlr
- * @cltr: The fcoe_ctlr whose private data will be returned
+ * @ctlr: The fcoe_ctlr whose private data will be returned
+ *
+ * Returns: pointer to the private data
  */
 static inline void *fcoe_ctlr_priv(const struct fcoe_ctlr *ctlr)
 {
@@ -174,7 +176,6 @@ static inline void *fcoe_ctlr_priv(const struct fcoe_ctlr *ctlr)
  * struct fcoe_fcf - Fibre-Channel Forwarder
  * @list:	 list linkage
  * @event_work:  Work for FC Transport actions queue
- * @event:       The event to be processed
  * @fip:         The controller that the FCF was discovered on
  * @fcf_dev:     The associated fcoe_fcf_device instance
  * @time:	 system time (jiffies) when an advertisement was last received
@@ -188,6 +189,7 @@ static inline void *fcoe_ctlr_priv(const struct fcoe_ctlr *ctlr)
  * @flogi_sent:	 current FLOGI sent to this FCF
  * @flags:	 flags received from advertisement
  * @fka_period:	 keep-alive period, in jiffies
+ * @fd_flags:	 no need for FKA from ENode
  *
  * A Fibre-Channel Forwarder (FCF) is the entity on the Ethernet that
  * passes FCoE frames on to an FC fabric.  This structure represents
@@ -222,6 +224,7 @@ struct fcoe_fcf {
 
 /**
  * struct fcoe_rport - VN2VN remote port
+ * @rdata:	libfc remote port private data
  * @time:	time of create or last beacon packet received from node
  * @fcoe_len:	max FCoE frame size, not including VLAN or Ethernet headers
  * @flags:	flags from probe or claim
@@ -266,8 +269,10 @@ void fcoe_get_lesb(struct fc_lport *, struct fc_els_lesb *);
 void fcoe_ctlr_get_lesb(struct fcoe_ctlr_device *ctlr_dev);
 
 /**
- * is_fip_mode() - returns true if FIP mode selected.
+ * is_fip_mode() - test if FIP mode selected.
  * @fip:	FCoE controller.
+ *
+ * Returns: %true if FIP mode is selected
  */
 static inline bool is_fip_mode(struct fcoe_ctlr *fip)
 {
@@ -318,9 +323,10 @@ struct fcoe_transport {
  * @kthread:	    The thread context (used by bnx2fc)
  * @work:	    The work item (used by fcoe)
  * @fcoe_rx_list:   The queue of pending packets to process
- * @page:	    The memory page for calculating frame trailer CRCs
+ * @crc_eof_page:   The memory page for calculating frame trailer CRCs
  * @crc_eof_offset: The offset into the CRC page pointing to available
  *		    memory for a new trailer
+ * @lock:	    local lock for members of this struct
  */
 struct fcoe_percpu_s {
 	struct task_struct *kthread;
@@ -343,7 +349,8 @@ struct fcoe_percpu_s {
  * @timer:		       The queue timer
  * @destroy_work:	       Handle for work context
  *			       (to prevent RTNL deadlocks)
- * @data_srt_addr:	       Source address for data
+ * @data_src_addr:	       Source address for data
+ * @get_netdev:                function that returns a &net_device from @lport
  *
  * An instance of this structure is to be allocated along with the
  * Scsi_Host and libfc fc_lport structures.
@@ -364,6 +371,8 @@ struct fcoe_port {
 /**
  * fcoe_get_netdev() - Return the net device associated with a local port
  * @lport: The local port to get the net device from
+ *
+ * Returns: the &net_device associated with this @lport
  */
 static inline struct net_device *fcoe_get_netdev(const struct fc_lport *lport)
 {
@@ -383,8 +392,10 @@ void fcoe_fcf_get_selected(struct fcoe_fcf_device *);
 void fcoe_ctlr_set_fip_mode(struct fcoe_ctlr_device *);
 
 /**
- * struct netdev_list
- * A mapping from netdevice to fcoe_transport
+ * struct fcoe_netdev_mapping - A mapping from &net_device to &fcoe_transport
+ * @list: list linkage of the mappings
+ * @netdev: the &net_device
+ * @ft: the fcoe_transport associated with @netdev
  */
 struct fcoe_netdev_mapping {
 	struct list_head list;
-- 
cgit v1.2.3


From 8d523f0f5383a4f6ae74b6ccf3e0ec953a56dec6 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Apr 2024 19:54:23 -0700
Subject: scsi: core: Add function return kernel-doc for 2 functions

Add missing function return values to prevent kernel-doc warnings:

scsi.h:75: warning: No description found for return value of 'scsi_status_is_check_condition'
scsi.h:202: warning: No description found for return value of 'scsi_status_is_good'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240408025425.18778-7-rdunlap@infradead.org
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/scsi.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index 4498f845b112..d90645f06a3a 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -69,7 +69,7 @@ static inline int scsi_is_wlun(u64 lun)
  * @status: the status passed up from the driver (including host and
  *          driver components)
  *
- * This returns true if the status code is SAM_STAT_CHECK_CONDITION.
+ * Returns: %true if the status code is SAM_STAT_CHECK_CONDITION.
  */
 static inline int scsi_status_is_check_condition(int status)
 {
@@ -189,12 +189,13 @@ enum scsi_disposition {
 /* Used to obtain the PCI location of a device */
 #define SCSI_IOCTL_GET_PCI		0x5387
 
-/** scsi_status_is_good - check the status return.
+/**
+ * scsi_status_is_good - check the status return.
  *
  * @status: the status passed up from the driver (including host and
  *          driver components)
  *
- * This returns true for known good conditions that may be treated as
+ * Returns: %true for known good conditions that may be treated as
  * command completed normally
  */
 static inline bool scsi_status_is_good(int status)
-- 
cgit v1.2.3


From 007c04e535262417f8206b1bc0aaf1744467430c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Apr 2024 19:54:24 -0700
Subject: scsi: scsi_transport_fc: Add kernel-doc for function return

Add function return value to prevent a kernel-doc warning:

scsi_transport_fc.h:780: warning: No description found for return value of 'fc_remote_port_chkready'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240408025425.18778-8-rdunlap@infradead.org
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/scsi_transport_fc.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index 483513c57597..ebb1bd565103 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -770,10 +770,9 @@ struct fc_function_template {
 /**
  * fc_remote_port_chkready - called to validate the remote port state
  *   prior to initiating io to the port.
- *
- * Returns a scsi result code that can be returned by the LLDD.
- *
  * @rport:	remote port to be checked
+ *
+ * Returns: a scsi result code that can be returned by the LLDD.
  **/
 static inline int
 fc_remote_port_chkready(struct fc_rport *rport)
-- 
cgit v1.2.3


From a2530eb748fff3966d3b20d1be81393fb403ec51 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 7 Apr 2024 19:54:25 -0700
Subject: scsi: scsi_transport_srp: Fix a couple of kernel-doc warnings

Add a struct short description and a function return value to prevent
kernel-doc warnings:

scsi_transport_srp.h:77: warning: missing initial short description on line:
 * struct srp_function_template
scsi_transport_srp.h:132: warning: No description found for return value of 'srp_chkready'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240408025425.18778-9-rdunlap@infradead.org
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/scsi_transport_srp.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h
index dfc78aa112ad..5b70b538447e 100644
--- a/include/scsi/scsi_transport_srp.h
+++ b/include/scsi/scsi_transport_srp.h
@@ -74,7 +74,7 @@ struct srp_rport {
 };
 
 /**
- * struct srp_function_template
+ * struct srp_function_template - template for SRP initiator drivers
  *
  * Fields that are only relevant for SRP initiator drivers:
  * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and
@@ -124,7 +124,7 @@ enum scsi_timeout_action srp_timed_out(struct scsi_cmnd *scmd);
  * srp_chkready() - evaluate the transport layer state before I/O
  * @rport: SRP target port pointer.
  *
- * Returns a SCSI result code that can be returned by the LLD queuecommand()
+ * Returns: a SCSI result code that can be returned by the LLD queuecommand()
  * implementation. The role of this function is similar to that of
  * fc_remote_port_chkready().
  */
-- 
cgit v1.2.3


From 3f9f231236ce7e48780d8a4f1f8cb9fae2df1e4e Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Wed, 17 Jan 2024 11:19:45 +0800
Subject: md: Fix overflow in is_mddev_idle

UBSAN reports this problem:

  UBSAN: Undefined behaviour in drivers/md/md.c:8175:15
  signed integer overflow:
  -2147483291 - 2072033152 cannot be represented in type 'int'
  Call trace:
   dump_backtrace+0x0/0x310
   show_stack+0x28/0x38
   dump_stack+0xec/0x15c
   ubsan_epilogue+0x18/0x84
   handle_overflow+0x14c/0x19c
   __ubsan_handle_sub_overflow+0x34/0x44
   is_mddev_idle+0x338/0x3d8
   md_do_sync+0x1bb8/0x1cf8
   md_thread+0x220/0x288
   kthread+0x1d8/0x1e0
   ret_from_fork+0x10/0x18

'curr_events' will overflow when stat accum or 'sync_io' is greater than
INT_MAX.

Fix it by changing sync_io, last_events and curr_events to 64bit.

Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20240117031946.2324519-2-linan666@huaweicloud.com
Signed-off-by: Song Liu <song@kernel.org>
---
 drivers/md/md.c        | 7 ++++---
 drivers/md/md.h        | 4 ++--
 include/linux/blkdev.h | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index fd574b015d78..f28e41f9ac4a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8577,14 +8577,15 @@ static int is_mddev_idle(struct mddev *mddev, int init)
 {
 	struct md_rdev *rdev;
 	int idle;
-	int curr_events;
+	long long curr_events;
 
 	idle = 1;
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev) {
 		struct gendisk *disk = rdev->bdev->bd_disk;
-		curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
-			      atomic_read(&disk->sync_io);
+		curr_events =
+			(long long)part_stat_read_accum(disk->part0, sectors) -
+			atomic64_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
 		 * disk_stats is counted when it completes.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 097d9dbd69b8..d0db98c0d33b 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -51,7 +51,7 @@ struct md_rdev {
 
 	sector_t sectors;		/* Device size (in 512bytes sectors) */
 	struct mddev *mddev;		/* RAID array if running */
-	int last_events;		/* IO event timestamp */
+	long long last_events;		/* IO event timestamp */
 
 	/*
 	 * If meta_bdev is non-NULL, it means that a separate device is
@@ -621,7 +621,7 @@ extern void mddev_unlock(struct mddev *mddev);
 
 static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
 {
-	atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
+	atomic64_add(nr_sectors, &bdev->bd_disk->sync_io);
 }
 
 static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3e8f7cf96be..d0fa4f1c29a2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -172,7 +172,7 @@ struct gendisk {
 	struct list_head slave_bdevs;
 #endif
 	struct timer_rand_state *random;
-	atomic_t sync_io;		/* RAID */
+	atomic64_t sync_io;		/* RAID */
 	struct disk_events *ev;
 
 #ifdef CONFIG_BLK_DEV_ZONED
-- 
cgit v1.2.3


From 009e12561369e2fcb992af47001a06f016e9d6b5 Mon Sep 17 00:00:00 2001
From: Paweł Anikiel <panikiel@google.com>
Date: Fri, 5 Apr 2024 14:13:56 +0000
Subject: media: v4l2-subdev: Add pad versions of dv timing subdev calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, subdev dv timing calls (i.e. g/s/query_dv_timings) are video
ops without a pad argument. This is a problem if the subdevice can have
different dv timings for each pad (e.g. a DisplayPort receiver with
multiple virtual channels).

To solve this, change these calls to include a pad argument, and put
them into pad ops. Keep the old ones temporarily to make the switch
easier.

Signed-off-by: Paweł Anikiel <panikiel@google.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-subdev.c | 33 +++++++++++++++++++++++++++++++++
 include/media/v4l2-subdev.h           | 14 ++++++++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 4c6198c48dd6..07759cdd0844 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -369,6 +369,36 @@ static int call_set_edid(struct v4l2_subdev *sd, struct v4l2_subdev_edid *edid)
 	return check_edid(sd, edid) ? : sd->ops->pad->set_edid(sd, edid);
 }
 
+static int call_s_dv_timings(struct v4l2_subdev *sd, unsigned int pad,
+			     struct v4l2_dv_timings *timings)
+{
+	if (!timings)
+		return -EINVAL;
+
+	return check_pad(sd, pad) ? :
+	       sd->ops->pad->s_dv_timings(sd, pad, timings);
+}
+
+static int call_g_dv_timings(struct v4l2_subdev *sd, unsigned int pad,
+			     struct v4l2_dv_timings *timings)
+{
+	if (!timings)
+		return -EINVAL;
+
+	return check_pad(sd, pad) ? :
+	       sd->ops->pad->g_dv_timings(sd, pad, timings);
+}
+
+static int call_query_dv_timings(struct v4l2_subdev *sd, unsigned int pad,
+				 struct v4l2_dv_timings *timings)
+{
+	if (!timings)
+		return -EINVAL;
+
+	return check_pad(sd, pad) ? :
+	       sd->ops->pad->query_dv_timings(sd, pad, timings);
+}
+
 static int call_dv_timings_cap(struct v4l2_subdev *sd,
 			       struct v4l2_dv_timings_cap *cap)
 {
@@ -487,6 +517,9 @@ static const struct v4l2_subdev_pad_ops v4l2_subdev_call_pad_wrappers = {
 	.set_frame_interval	= call_set_frame_interval,
 	.get_edid		= call_get_edid,
 	.set_edid		= call_set_edid,
+	.s_dv_timings		= call_s_dv_timings,
+	.g_dv_timings		= call_g_dv_timings,
+	.query_dv_timings	= call_query_dv_timings,
 	.dv_timings_cap		= call_dv_timings_cap,
 	.enum_dv_timings	= call_enum_dv_timings,
 	.get_frame_desc		= call_get_frame_desc,
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index a9e6b8146279..a5213411ef2b 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -791,6 +791,14 @@ struct v4l2_subdev_state {
  *
  * @set_edid: callback for VIDIOC_SUBDEV_S_EDID() ioctl handler code.
  *
+ * @s_dv_timings: Set custom dv timings in the sub device. This is used
+ *	when sub device is capable of setting detailed timing information
+ *	in the hardware to generate/detect the video signal.
+ *
+ * @g_dv_timings: Get custom dv timings in the sub device.
+ *
+ * @query_dv_timings: callback for VIDIOC_QUERY_DV_TIMINGS() ioctl handler code.
+ *
  * @dv_timings_cap: callback for VIDIOC_SUBDEV_DV_TIMINGS_CAP() ioctl handler
  *		    code.
  *
@@ -864,6 +872,12 @@ struct v4l2_subdev_pad_ops {
 				  struct v4l2_subdev_frame_interval *interval);
 	int (*get_edid)(struct v4l2_subdev *sd, struct v4l2_edid *edid);
 	int (*set_edid)(struct v4l2_subdev *sd, struct v4l2_edid *edid);
+	int (*s_dv_timings)(struct v4l2_subdev *sd, unsigned int pad,
+			    struct v4l2_dv_timings *timings);
+	int (*g_dv_timings)(struct v4l2_subdev *sd, unsigned int pad,
+			    struct v4l2_dv_timings *timings);
+	int (*query_dv_timings)(struct v4l2_subdev *sd, unsigned int pad,
+				struct v4l2_dv_timings *timings);
 	int (*dv_timings_cap)(struct v4l2_subdev *sd,
 			      struct v4l2_dv_timings_cap *cap);
 	int (*enum_dv_timings)(struct v4l2_subdev *sd,
-- 
cgit v1.2.3


From d8c9a6e204f1466d228428174ce6d40ccdfb583e Mon Sep 17 00:00:00 2001
From: Paweł Anikiel <panikiel@google.com>
Date: Fri, 5 Apr 2024 14:14:11 +0000
Subject: media: v4l2-subdev: Remove non-pad dv timing callbacks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the conversion of dv timing calls to use a pad argument is done,
remove the old callbacks. Update the subdev ioctl handlers to use the
new callbacks.

Signed-off-by: Paweł Anikiel <panikiel@google.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-subdev.c |  6 +++---
 include/media/v4l2-subdev.h           | 14 --------------
 2 files changed, 3 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 07759cdd0844..6572667fd5c4 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -906,16 +906,16 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 	}
 
 	case VIDIOC_SUBDEV_QUERY_DV_TIMINGS:
-		return v4l2_subdev_call(sd, video, query_dv_timings, arg);
+		return v4l2_subdev_call(sd, pad, query_dv_timings, 0, arg);
 
 	case VIDIOC_SUBDEV_G_DV_TIMINGS:
-		return v4l2_subdev_call(sd, video, g_dv_timings, arg);
+		return v4l2_subdev_call(sd, pad, g_dv_timings, 0, arg);
 
 	case VIDIOC_SUBDEV_S_DV_TIMINGS:
 		if (ro_subdev)
 			return -EPERM;
 
-		return v4l2_subdev_call(sd, video, s_dv_timings, arg);
+		return v4l2_subdev_call(sd, pad, s_dv_timings, 0, arg);
 
 	case VIDIOC_SUBDEV_G_STD:
 		return v4l2_subdev_call(sd, video, g_std, arg);
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index a5213411ef2b..1af16b16f0bf 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -452,14 +452,6 @@ enum v4l2_subdev_pre_streamon_flags {
  *
  * @g_pixelaspect: callback to return the pixelaspect ratio.
  *
- * @s_dv_timings: Set custom dv timings in the sub device. This is used
- *	when sub device is capable of setting detailed timing information
- *	in the hardware to generate/detect the video signal.
- *
- * @g_dv_timings: Get custom dv timings in the sub device.
- *
- * @query_dv_timings: callback for VIDIOC_QUERY_DV_TIMINGS() ioctl handler code.
- *
  * @s_rx_buffer: set a host allocated memory buffer for the subdev. The subdev
  *	can adjust @size to a lower value and must not write more data to the
  *	buffer starting at @data than the original value of @size.
@@ -490,12 +482,6 @@ struct v4l2_subdev_video_ops {
 	int (*g_input_status)(struct v4l2_subdev *sd, u32 *status);
 	int (*s_stream)(struct v4l2_subdev *sd, int enable);
 	int (*g_pixelaspect)(struct v4l2_subdev *sd, struct v4l2_fract *aspect);
-	int (*s_dv_timings)(struct v4l2_subdev *sd,
-			struct v4l2_dv_timings *timings);
-	int (*g_dv_timings)(struct v4l2_subdev *sd,
-			struct v4l2_dv_timings *timings);
-	int (*query_dv_timings)(struct v4l2_subdev *sd,
-			struct v4l2_dv_timings *timings);
 	int (*s_rx_buffer)(struct v4l2_subdev *sd, void *buf,
 			   unsigned int *size);
 	int (*pre_streamon)(struct v4l2_subdev *sd, u32 flags);
-- 
cgit v1.2.3


From ee20cc17e9d8fd85225e18351637460f3482be2f Mon Sep 17 00:00:00 2001
From: Junxian Huang <huangjunxian6@hisilicon.com>
Date: Fri, 15 Mar 2024 17:35:51 +0800
Subject: RDMA/hns: Support DSCP

Add support for DSCP configuration. For DSCP, get dscp-prio mapping
via hns3 nic driver api .get_dscp_prio() and fill the SL (in WQE for
UD or in QPC for RC) with the priority value. The prio-tc mapping is
configured to HW by hns3 nic driver. HW will select a corresponding
TC according to SL and the prio-tc mapping.

Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://lore.kernel.org/r/20240315093551.1650088-1-huangjunxian6@hisilicon.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/hns/hns_roce_ah.c     | 33 +++++++----
 drivers/infiniband/hw/hns/hns_roce_device.h |  6 ++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c  | 85 +++++++++++++++++++++++------
 drivers/infiniband/hw/hns/hns_roce_qp.c     | 13 +++++
 include/uapi/rdma/hns-abi.h                 |  9 ++-
 5 files changed, 116 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index b4209b6aed8d..3e02c474f59f 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -59,8 +59,10 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device);
 	struct hns_roce_ib_create_ah_resp resp = {};
 	struct hns_roce_ah *ah = to_hr_ah(ibah);
-	int ret = 0;
-	u32 max_sl;
+	u8 tclass = get_tclass(grh);
+	u8 priority = 0;
+	u8 tc_mode = 0;
+	int ret;
 
 	if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata)
 		return -EOPNOTSUPP;
@@ -74,16 +76,23 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 	ah->av.hop_limit = grh->hop_limit;
 	ah->av.flowlabel = grh->flow_label;
 	ah->av.udp_sport = get_ah_udp_sport(ah_attr);
-	ah->av.tclass = get_tclass(grh);
-
-	ah->av.sl = rdma_ah_get_sl(ah_attr);
-	max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1);
-	if (unlikely(ah->av.sl > max_sl)) {
-		ibdev_err_ratelimited(&hr_dev->ib_dev,
-				      "failed to set sl, sl (%u) shouldn't be larger than %u.\n",
-				      ah->av.sl, max_sl);
+	ah->av.tclass = tclass;
+
+	ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority);
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
+
+	if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+		return ret;
+
+	if (tc_mode == HNAE3_TC_MAP_MODE_DSCP &&
+	    grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+		ah->av.sl = priority;
+	else
+		ah->av.sl = rdma_ah_get_sl(ah_attr);
+
+	if (!check_sl_valid(hr_dev, ah->av.sl))
 		return -EINVAL;
-	}
 
 	memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE);
 	memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
@@ -99,6 +108,8 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 	}
 
 	if (udata) {
+		resp.priority = ah->av.sl;
+		resp.tc_mode = tc_mode;
 		memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN);
 		ret = ib_copy_to_udata(udata, &resp,
 				       min(udata->outlen, sizeof(resp)));
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index c3cbd0a494bf..78b4d19ff848 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -645,6 +645,8 @@ struct hns_roce_qp {
 	struct hns_user_mmap_entry *dwqe_mmap_entry;
 	u32			config;
 	enum hns_roce_cong_type	cong_type;
+	u8			tc_mode;
+	u8			priority;
 };
 
 struct hns_roce_ib_iboe {
@@ -950,6 +952,8 @@ struct hns_roce_hw {
 	int (*query_sccc)(struct hns_roce_dev *hr_dev, u32 qpn, void *buffer);
 	int (*query_hw_counter)(struct hns_roce_dev *hr_dev,
 				u64 *stats, u32 port, int *hw_counters);
+	int (*get_dscp)(struct hns_roce_dev *hr_dev, u8 dscp,
+			u8 *tc_mode, u8 *priority);
 	const struct ib_device_ops *hns_roce_dev_ops;
 	const struct ib_device_ops *hns_roce_dev_srq_ops;
 };
@@ -1292,4 +1296,6 @@ struct hns_user_mmap_entry *
 hns_roce_user_mmap_entry_insert(struct ib_ucontext *ucontext, u64 address,
 				size_t length,
 				enum hns_roce_mmap_type mmap_type);
+bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl);
+
 #endif /* _HNS_ROCE_DEVICE_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index ba7ae792d279..423ab66c5856 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -443,10 +443,6 @@ static int fill_ud_av(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe,
 	hr_reg_write(ud_sq_wqe, UD_SEND_WQE_HOPLIMIT, ah->av.hop_limit);
 	hr_reg_write(ud_sq_wqe, UD_SEND_WQE_TCLASS, ah->av.tclass);
 	hr_reg_write(ud_sq_wqe, UD_SEND_WQE_FLOW_LABEL, ah->av.flowlabel);
-
-	if (WARN_ON(ah->av.sl > MAX_SERVICE_LEVEL))
-		return -EINVAL;
-
 	hr_reg_write(ud_sq_wqe, UD_SEND_WQE_SL, ah->av.sl);
 
 	ud_sq_wqe->sgid_index = ah->av.gid_index;
@@ -4828,6 +4824,69 @@ static int fill_cong_field(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
 	return 0;
 }
 
+static int hns_roce_hw_v2_get_dscp(struct hns_roce_dev *hr_dev, u8 dscp,
+				   u8 *tc_mode, u8 *priority)
+{
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
+	struct hnae3_handle *handle = priv->handle;
+	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+
+	if (!ops->get_dscp_prio)
+		return -EOPNOTSUPP;
+
+	return ops->get_dscp_prio(handle, dscp, tc_mode, priority);
+}
+
+bool check_sl_valid(struct hns_roce_dev *hr_dev, u8 sl)
+{
+	u32 max_sl;
+
+	max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1);
+	if (unlikely(sl > max_sl)) {
+		ibdev_err_ratelimited(&hr_dev->ib_dev,
+				      "failed to set SL(%u). Shouldn't be larger than %u.\n",
+				      sl, max_sl);
+		return false;
+	}
+
+	return true;
+}
+
+static int hns_roce_set_sl(struct ib_qp *ibqp,
+			   const struct ib_qp_attr *attr,
+			   struct hns_roce_v2_qp_context *context,
+			   struct hns_roce_v2_qp_context *qpc_mask)
+{
+	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
+
+	ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh),
+				      &hr_qp->tc_mode, &hr_qp->priority);
+	if (ret && ret != -EOPNOTSUPP &&
+	    grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
+		ibdev_err_ratelimited(ibdev,
+				      "failed to get dscp, ret = %d.\n", ret);
+		return ret;
+	}
+
+	if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP &&
+	    grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
+		hr_qp->sl = hr_qp->priority;
+	else
+		hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+
+	if (!check_sl_valid(hr_dev, hr_qp->sl))
+		return -EINVAL;
+
+	hr_reg_write(context, QPC_SL, hr_qp->sl);
+	hr_reg_clear(qpc_mask, QPC_SL);
+
+	return 0;
+}
+
 static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 				const struct ib_qp_attr *attr,
 				int attr_mask,
@@ -4843,25 +4902,18 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 	int is_roce_protocol;
 	u16 vlan_id = 0xffff;
 	bool is_udp = false;
-	u32 max_sl;
 	u8 ib_port;
 	u8 hr_port;
 	int ret;
 
-	max_sl = min_t(u32, MAX_SERVICE_LEVEL, hr_dev->caps.sl_num - 1);
-	if (unlikely(sl > max_sl)) {
-		ibdev_err_ratelimited(ibdev,
-				      "failed to fill QPC, sl (%u) shouldn't be larger than %u.\n",
-				      sl, max_sl);
-		return -EINVAL;
-	}
-
 	/*
 	 * If free_mr_en of qp is set, it means that this qp comes from
 	 * free mr. This qp will perform the loopback operation.
 	 * In the loopback scenario, only sl needs to be set.
 	 */
 	if (hr_qp->free_mr_en) {
+		if (!check_sl_valid(hr_dev, sl))
+			return -EINVAL;
 		hr_reg_write(context, QPC_SL, sl);
 		hr_reg_clear(qpc_mask, QPC_SL);
 		hr_qp->sl = sl;
@@ -4931,11 +4983,7 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp,
 	memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
 	memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
 
-	hr_qp->sl = sl;
-	hr_reg_write(context, QPC_SL, hr_qp->sl);
-	hr_reg_clear(qpc_mask, QPC_SL);
-
-	return 0;
+	return  hns_roce_set_sl(ibqp, attr, context, qpc_mask);
 }
 
 static bool check_qp_state(enum ib_qp_state cur_state,
@@ -6735,6 +6783,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
 	.query_srqc = hns_roce_v2_query_srqc,
 	.query_sccc = hns_roce_v2_query_sccc,
 	.query_hw_counter = hns_roce_hw_v2_query_counter,
+	.get_dscp = hns_roce_hw_v2_get_dscp,
 	.hns_roce_dev_ops = &hns_roce_v2_dev_ops,
 	.hns_roce_dev_srq_ops = &hns_roce_v2_dev_srq_ops,
 };
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index f35a66325d9a..697230f964b1 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -1386,6 +1386,7 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		       int attr_mask, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_ib_modify_qp_resp resp = {};
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	enum ib_qp_state cur_state, new_state;
 	int ret = -EINVAL;
@@ -1427,6 +1428,18 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 
 	ret = hr_dev->hw->modify_qp(ibqp, attr, attr_mask, cur_state,
 				    new_state, udata);
+	if (ret)
+		goto out;
+
+	if (udata && udata->outlen) {
+		resp.tc_mode = hr_qp->tc_mode;
+		resp.priority = hr_qp->sl;
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
+		if (ret)
+			ibdev_err_ratelimited(&hr_dev->ib_dev,
+					      "failed to copy modify qp resp.\n");
+	}
 
 out:
 	mutex_unlock(&hr_qp->mutex);
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index 158670da2b2a..94e861870e27 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -109,6 +109,12 @@ struct hns_roce_ib_create_qp_resp {
 	__aligned_u64 dwqe_mmap_key;
 };
 
+struct hns_roce_ib_modify_qp_resp {
+	__u8	tc_mode;
+	__u8	priority;
+	__u8	reserved[6];
+};
+
 enum {
 	HNS_ROCE_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
@@ -143,7 +149,8 @@ struct hns_roce_ib_alloc_pd_resp {
 
 struct hns_roce_ib_create_ah_resp {
 	__u8 dmac[6];
-	__u8 reserved[2];
+	__u8 priority;
+	__u8 tc_mode;
 };
 
 #endif /* HNS_ABI_USER_H */
-- 
cgit v1.2.3


From e1900d7ba9c9fd9edb214c43d2826876a5a35057 Mon Sep 17 00:00:00 2001
From: Diogo Ivo <diogo.ivo@siemens.com>
Date: Wed, 3 Apr 2024 11:48:12 +0100
Subject: eth: Move IPv4/IPv6 multicast address bases to their own symbols

As these addresses can be useful outside of checking if an address
is a multicast address (for example in device drivers) make them
accessible to users of etherdevice.h to avoid code duplication.

Signed-off-by: Diogo Ivo <diogo.ivo@siemens.com>
Reviewed-by: MD Danish Anwar <danishanwar@ti.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/etherdevice.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 224645f17c33..8d6daf828427 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -71,6 +71,12 @@ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
 { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
 #define eth_stp_addr eth_reserved_addr_base
 
+static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) =
+{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
+
+static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) =
+{ 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };
+
 /**
  * is_link_local_ether_addr - Determine if given Ethernet address is link-local
  * @addr: Pointer to a six-byte array containing the Ethernet address
@@ -430,18 +436,16 @@ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
 
 static inline bool ether_addr_is_ipv4_mcast(const u8 *addr)
 {
-	u8 base[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };
 	u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 };
 
-	return ether_addr_equal_masked(addr, base, mask);
+	return ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base, mask);
 }
 
 static inline bool ether_addr_is_ipv6_mcast(const u8 *addr)
 {
-	u8 base[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };
 	u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 };
 
-	return ether_addr_equal_masked(addr, base, mask);
+	return ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base, mask);
 }
 
 static inline bool ether_addr_is_ip_mcast(const u8 *addr)
-- 
cgit v1.2.3


From 886b94d25a8eba4c42634dddc3cbfd6391a24d25 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 9 Apr 2024 10:41:33 +0200
Subject: fs: Add FOP_HUGE_PAGES

Instead of checking for specific file_operations, add a bit to
file_operations which denotes a file that only contain hugetlb pages.
This lets us make hugetlbfs_file_operations static, and removes
is_file_shm_hugepages() completely.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Link: https://lore.kernel.org/r/20240407201122.3783877-1-willy@infradead.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/hugetlbfs/inode.c    |  5 +++--
 include/linux/fs.h      |  2 ++
 include/linux/hugetlb.h |  8 ++------
 include/linux/shm.h     |  5 -----
 ipc/shm.c               | 10 +++-------
 5 files changed, 10 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6502c7e776d1..34ac73cc36b1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -40,7 +40,7 @@
 #include <linux/sched/mm.h>
 
 static const struct address_space_operations hugetlbfs_aops;
-const struct file_operations hugetlbfs_file_operations;
+static const struct file_operations hugetlbfs_file_operations;
 static const struct inode_operations hugetlbfs_dir_inode_operations;
 static const struct inode_operations hugetlbfs_inode_operations;
 
@@ -1301,13 +1301,14 @@ static void init_once(void *foo)
 	inode_init_once(&ei->vfs_inode);
 }
 
-const struct file_operations hugetlbfs_file_operations = {
+static const struct file_operations hugetlbfs_file_operations = {
 	.read_iter		= hugetlbfs_read_iter,
 	.mmap			= hugetlbfs_file_mmap,
 	.fsync			= noop_fsync,
 	.get_unmapped_area	= hugetlb_get_unmapped_area,
 	.llseek			= default_llseek,
 	.fallocate		= hugetlbfs_fallocate,
+	.fop_flags		= FOP_HUGE_PAGES,
 };
 
 static const struct inode_operations hugetlbfs_dir_inode_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba98930c4c4..b053f98393e6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2052,6 +2052,8 @@ struct file_operations {
 #define FOP_MMAP_SYNC		((__force fop_flags_t)(1 << 2))
 /* Supports non-exclusive O_DIRECT writes from multiple threads */
 #define FOP_DIO_PARALLEL_WRITE	((__force fop_flags_t)(1 << 3))
+/* Contains huge pages */
+#define FOP_HUGE_PAGES		((__force fop_flags_t)(1 << 4))
 
 /* Wrap a directory iterator that needs exclusive inode access */
 int wrap_directory_iterator(struct file *, struct dir_context *,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..b06f7c426d38 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -554,17 +554,13 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
 	return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
 }
 
-extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				int creat_flags, int page_size_log);
 
-static inline bool is_file_hugepages(struct file *file)
+static inline bool is_file_hugepages(const struct file *file)
 {
-	if (file->f_op == &hugetlbfs_file_operations)
-		return true;
-
-	return is_file_shm_hugepages(file);
+	return file->f_op->fop_flags & FOP_HUGE_PAGES;
 }
 
 static inline struct hstate *hstate_inode(struct inode *i)
diff --git a/include/linux/shm.h b/include/linux/shm.h
index c55bef0538e5..1d3d3ae958fb 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -16,7 +16,6 @@ struct sysv_shm {
 
 long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
 	      unsigned long shmlba);
-bool is_file_shm_hugepages(struct file *file);
 void exit_shm(struct task_struct *task);
 #define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
 #else
@@ -30,10 +29,6 @@ static inline long do_shmat(int shmid, char __user *shmaddr,
 {
 	return -ENOSYS;
 }
-static inline bool is_file_shm_hugepages(struct file *file)
-{
-	return false;
-}
 static inline void exit_shm(struct task_struct *task)
 {
 }
diff --git a/ipc/shm.c b/ipc/shm.c
index a89f001a8bf0..3e3071252dac 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -662,8 +662,8 @@ static const struct file_operations shm_file_operations = {
 };
 
 /*
- * shm_file_operations_huge is now identical to shm_file_operations,
- * but we keep it distinct for the sake of is_file_shm_hugepages().
+ * shm_file_operations_huge is now identical to shm_file_operations
+ * except for fop_flags
  */
 static const struct file_operations shm_file_operations_huge = {
 	.mmap		= shm_mmap,
@@ -672,13 +672,9 @@ static const struct file_operations shm_file_operations_huge = {
 	.get_unmapped_area	= shm_get_unmapped_area,
 	.llseek		= noop_llseek,
 	.fallocate	= shm_fallocate,
+	.fop_flags	= FOP_HUGE_PAGES,
 };
 
-bool is_file_shm_hugepages(struct file *file)
-{
-	return file->f_op == &shm_file_operations_huge;
-}
-
 static const struct vm_operations_struct shm_vm_ops = {
 	.open	= shm_open,	/* callback for a new vm-area open */
 	.close	= shm_close,	/* callback for when the vm-area is released */
-- 
cgit v1.2.3


From b9e810405880c99baafd550ada7043e86465396e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 7 Apr 2024 09:33:21 +0000
Subject: tcp: propagate tcp_tw_isn via an extra parameter to ->route_req()

tcp_v6_init_req() reads TCP_SKB_CB(skb)->tcp_tw_isn to find
out if the request socket is created by a SYN hitting a TIMEWAIT socket.

This has been buggy for a decade, lets directly pass the information
from tcp_conn_request().

This is a preparatory patch to make the following one easier to review.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp.h    |  3 ++-
 net/ipv4/tcp_input.c |  2 +-
 net/ipv4/tcp_ipv4.c  |  3 ++-
 net/ipv6/tcp_ipv6.c  | 10 ++++++----
 net/mptcp/subflow.c  | 10 ++++++----
 5 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9ab5b37e9d53..fa0ab77acee2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2284,7 +2284,8 @@ struct tcp_request_sock_ops {
 	struct dst_entry *(*route_req)(const struct sock *sk,
 				       struct sk_buff *skb,
 				       struct flowi *fl,
-				       struct request_sock *req);
+				       struct request_sock *req,
+				       u32 tw_isn);
 	u32 (*init_seq)(const struct sk_buff *skb);
 	u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1f28a2561795..48c275e6ef02 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7160,7 +7160,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
 	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
 
-	dst = af_ops->route_req(sk, skb, &fl, req);
+	dst = af_ops->route_req(sk, skb, &fl, req, isn);
 	if (!dst)
 		goto drop_and_free;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 52963c3bb8ca..81e2f05c244d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1666,7 +1666,8 @@ static void tcp_v4_init_req(struct request_sock *req,
 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
 					  struct sk_buff *skb,
 					  struct flowi *fl,
-					  struct request_sock *req)
+					  struct request_sock *req,
+					  u32 tw_isn)
 {
 	tcp_v4_init_req(req, sk, skb);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index cffebaec66f1..5141f7033abd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -793,7 +793,8 @@ clear_hash_nostart:
 
 static void tcp_v6_init_req(struct request_sock *req,
 			    const struct sock *sk_listener,
-			    struct sk_buff *skb)
+			    struct sk_buff *skb,
+			    u32 tw_isn)
 {
 	bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
 	struct inet_request_sock *ireq = inet_rsk(req);
@@ -807,7 +808,7 @@ static void tcp_v6_init_req(struct request_sock *req,
 	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
 		ireq->ir_iif = tcp_v6_iif(skb);
 
-	if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
+	if (!tw_isn &&
 	    (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
 	     np->rxopt.bits.rxinfo ||
 	     np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
@@ -820,9 +821,10 @@ static void tcp_v6_init_req(struct request_sock *req,
 static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
 					  struct sk_buff *skb,
 					  struct flowi *fl,
-					  struct request_sock *req)
+					  struct request_sock *req,
+					  u32 tw_isn)
 {
-	tcp_v6_init_req(req, sk, skb);
+	tcp_v6_init_req(req, sk, skb, tw_isn);
 
 	if (security_inet_conn_request(sk, skb, req))
 		return NULL;
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 162b218d9858..b94d1dca1094 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -289,7 +289,8 @@ EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);
 static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
 					      struct sk_buff *skb,
 					      struct flowi *fl,
-					      struct request_sock *req)
+					      struct request_sock *req,
+					      u32 tw_isn)
 {
 	struct dst_entry *dst;
 	int err;
@@ -297,7 +298,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
 	tcp_rsk(req)->is_mptcp = 1;
 	subflow_init_req(req, sk);
 
-	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
+	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req, tw_isn);
 	if (!dst)
 		return NULL;
 
@@ -356,7 +357,8 @@ static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
 					      struct sk_buff *skb,
 					      struct flowi *fl,
-					      struct request_sock *req)
+					      struct request_sock *req,
+					      u32 tw_isn)
 {
 	struct dst_entry *dst;
 	int err;
@@ -364,7 +366,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
 	tcp_rsk(req)->is_mptcp = 1;
 	subflow_init_req(req, sk);
 
-	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
+	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req, tw_isn);
 	if (!dst)
 		return NULL;
 
-- 
cgit v1.2.3


From 41eecbd712b73f0d5dcf1152b9a1c27b1f238028 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Sun, 7 Apr 2024 09:33:22 +0000
Subject: tcp: replace TCP_SKB_CB(skb)->tcp_tw_isn with a per-cpu field

TCP can transform a TIMEWAIT socket into a SYN_RECV one from
a SYN packet, and the ISN of the SYNACK packet is normally
generated using TIMEWAIT tw_snd_nxt :

tcp_timewait_state_process()
...
    u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
    if (isn == 0)
        isn++;
    TCP_SKB_CB(skb)->tcp_tw_isn = isn;
    return TCP_TW_SYN;

This SYN packet also bypasses normal checks against listen queue
being full or not.

tcp_conn_request()
...
       __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
...
        /* TW buckets are converted to open requests without
         * limitations, they conserve resources and peer is
         * evidently real one.
         */
        if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
                want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
                if (!want_cookie)
                        goto drop;
        }

This was using TCP_SKB_CB(skb)->tcp_tw_isn field in skb.

Unfortunately this field has been accidentally cleared
after the call to tcp_timewait_state_process() returning
TCP_TW_SYN.

Using a field in TCP_SKB_CB(skb) for a temporary state
is overkill.

Switch instead to a per-cpu variable.

As a bonus, we do not have to clear tcp_tw_isn in TCP receive
fast path.
It is temporarily set then cleared only in the TCP_TW_SYN dance.

Fixes: 4ad19de8774e ("net: tcp6: fix double call of tcp_v6_fill_cb()")
Fixes: eeea10b83a13 ("tcp: add tcp_v4_fill_cb()/tcp_v4_restore_cb()")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp.h        | 10 +++++-----
 net/ipv4/tcp.c           |  3 +++
 net/ipv4/tcp_input.c     | 26 ++++++++++++++++----------
 net/ipv4/tcp_ipv4.c      |  5 +++--
 net/ipv4/tcp_minisocks.c |  4 ++--
 net/ipv6/tcp_ipv6.c      |  5 +++--
 6 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fa0ab77acee2..ba6c5ae86e22 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -52,6 +52,8 @@ extern struct inet_hashinfo tcp_hashinfo;
 DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
 int tcp_orphan_count_sum(void);
 
+DECLARE_PER_CPU(u32, tcp_tw_isn);
+
 void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 #define MAX_TCP_HEADER	L1_CACHE_ALIGN(128 + MAX_HEADER)
@@ -392,7 +394,8 @@ enum tcp_tw_status {
 
 enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
 					      struct sk_buff *skb,
-					      const struct tcphdr *th);
+					      const struct tcphdr *th,
+					      u32 *tw_isn);
 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 			   struct request_sock *req, bool fastopen,
 			   bool *lost_race);
@@ -935,13 +938,10 @@ struct tcp_skb_cb {
 	__u32		seq;		/* Starting sequence number	*/
 	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
 	union {
-		/* Note : tcp_tw_isn is used in input path only
-		 *	  (isn chosen by tcp_timewait_state_process())
-		 *
+		/* Note :
 		 * 	  tcp_gso_segs/size are used in write queue only,
 		 *	  cf tcp_skb_pcount()/tcp_skb_mss()
 		 */
-		__u32		tcp_tw_isn;
 		struct {
 			u16	tcp_gso_segs;
 			u16	tcp_gso_size;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 664c8ecb076b..b07aa71b24ec 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -290,6 +290,9 @@ enum {
 DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
 EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
 
+DEFINE_PER_CPU(u32, tcp_tw_isn);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
+
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
 
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 48c275e6ef02..5a45a0923a1f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7097,7 +7097,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 		     struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_fastopen_cookie foc = { .len = -1 };
-	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
 	struct tcp_options_received tmp_opt;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
@@ -7107,21 +7106,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	struct dst_entry *dst;
 	struct flowi fl;
 	u8 syncookies;
+	u32 isn;
 
 #ifdef CONFIG_TCP_AO
 	const struct tcp_ao_hdr *aoh;
 #endif
 
-	syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+	isn = __this_cpu_read(tcp_tw_isn);
+	if (isn) {
+		/* TW buckets are converted to open requests without
+		 * limitations, they conserve resources and peer is
+		 * evidently real one.
+		 */
+		__this_cpu_write(tcp_tw_isn, 0);
+	} else {
+		syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
 
-	/* TW buckets are converted to open requests without
-	 * limitations, they conserve resources and peer is
-	 * evidently real one.
-	 */
-	if ((syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) {
-		want_cookie = tcp_syn_flood_action(sk, rsk_ops->slab_name);
-		if (!want_cookie)
-			goto drop;
+		if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
+			want_cookie = tcp_syn_flood_action(sk,
+							   rsk_ops->slab_name);
+			if (!want_cookie)
+				goto drop;
+		}
 	}
 
 	if (sk_acceptq_is_full(sk)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 81e2f05c244d..1e650ec71d2f 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2146,7 +2146,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
 				    skb->len - th->doff * 4);
 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
-	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
 	TCP_SKB_CB(skb)->sacked	 = 0;
 	TCP_SKB_CB(skb)->has_rxtstamp =
@@ -2168,6 +2167,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	bool refcounted;
 	struct sock *sk;
 	int ret;
+	u32 isn;
 
 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
 	if (skb->pkt_type != PACKET_HOST)
@@ -2383,7 +2383,7 @@ do_time_wait:
 		inet_twsk_put(inet_twsk(sk));
 		goto csum_error;
 	}
-	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
 	case TCP_TW_SYN: {
 		struct sock *sk2 = inet_lookup_listener(net,
 							net->ipv4.tcp_death_row.hashinfo,
@@ -2397,6 +2397,7 @@ do_time_wait:
 			sk = sk2;
 			tcp_v4_restore_cb(skb);
 			refcounted = false;
+			__this_cpu_write(tcp_tw_isn, isn);
 			goto process;
 		}
 	}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 5b21a07ddf9a..f53c7ada2ace 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -95,7 +95,7 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq)
  */
 enum tcp_tw_status
 tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
-			   const struct tcphdr *th)
+			   const struct tcphdr *th, u32 *tw_isn)
 {
 	struct tcp_options_received tmp_opt;
 	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
@@ -228,7 +228,7 @@ kill:
 		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
 		if (isn == 0)
 			isn++;
-		TCP_SKB_CB(skb)->tcp_tw_isn = isn;
+		*tw_isn = isn;
 		return TCP_TW_SYN;
 	}
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 5141f7033abd..3aa9da5c9a66 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1741,7 +1741,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 				    skb->len - th->doff*4);
 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
 	TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
-	TCP_SKB_CB(skb)->tcp_tw_isn = 0;
 	TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
 	TCP_SKB_CB(skb)->sacked = 0;
 	TCP_SKB_CB(skb)->has_rxtstamp =
@@ -1758,6 +1757,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
 	bool refcounted;
 	struct sock *sk;
 	int ret;
+	u32 isn;
 	struct net *net = dev_net(skb->dev);
 
 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
@@ -1967,7 +1967,7 @@ do_time_wait:
 		goto csum_error;
 	}
 
-	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) {
 	case TCP_TW_SYN:
 	{
 		struct sock *sk2;
@@ -1985,6 +1985,7 @@ do_time_wait:
 			sk = sk2;
 			tcp_v6_restore_cb(skb);
 			refcounted = false;
+			__this_cpu_write(tcp_tw_isn, isn);
 			goto process;
 		}
 	}
-- 
cgit v1.2.3


From c342b42fa47f4257fccfeadc8e32c51b1be17a1f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 23 Feb 2024 15:16:20 -0800
Subject: rcu-tasks: Make Tasks RCU wait idly for grace-period delays

Currently, all waits for grace periods sleep at TASK_UNINTERRUPTIBLE,
regardless of RCU flavor.  This has worked well, but there have been
cases where a longer-than-average Tasks RCU grace period has triggered
softlockup splats, many of them, before the Tasks RCU CPU stall warning
appears.  These softlockup splats unnecessarily consume console bandwidth
and complicate diagnosis of the underlying problem.  Plus a long but not
pathologically long Tasks RCU grace period might trigger a few softlockup
splats before completing normally, which generates noise for no good
reason.

This commit therefore causes Tasks RCU grace periods to sleep at TASK_IDLE
priority.  If there really is a persistent problem, the eventual Tasks
RCU CPU stall warning will flag it, and without the extra noise.

Reported-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/rcupdate_wait.h | 18 +++++++++---------
 kernel/rcu/tasks.h            |  6 +++++-
 kernel/rcu/update.c           |  4 ++--
 3 files changed, 16 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index d07f0848802e..303ab9bee155 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -19,18 +19,18 @@ struct rcu_synchronize {
 };
 void wakeme_after_rcu(struct rcu_head *head);
 
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
 		   struct rcu_synchronize *rs_array);
 
-#define _wait_rcu_gp(checktiny, ...) \
-do {									\
-	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };		\
-	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];	\
-	__wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array),		\
-			__crcu_array, __rs_array);			\
+#define _wait_rcu_gp(checktiny, state, ...) \
+do {												\
+	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };					\
+	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];				\
+	__wait_rcu_gp(checktiny, state, ARRAY_SIZE(__crcu_array), __crcu_array, __rs_array);	\
 } while (0)
 
-#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)
+#define wait_rcu_gp(...) _wait_rcu_gp(false, TASK_UNINTERRUPTIBLE, __VA_ARGS__)
+#define wait_rcu_gp_state(state, ...) _wait_rcu_gp(false, state, __VA_ARGS__)
 
 /**
  * synchronize_rcu_mult - Wait concurrently for multiple grace periods
@@ -54,7 +54,7 @@ do {									\
  * grace period.
  */
 #define synchronize_rcu_mult(...) \
-	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
+	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), TASK_UNINTERRUPTIBLE, __VA_ARGS__)
 
 static inline void cond_resched_rcu(void)
 {
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 147b5945d67a..82e458ea0728 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -74,6 +74,7 @@ struct rcu_tasks_percpu {
  * @holdouts_func: This flavor's holdout-list scan function (optional).
  * @postgp_func: This flavor's post-grace-period function (optional).
  * @call_func: This flavor's call_rcu()-equivalent function.
+ * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
  * @rtpcpu: This flavor's rcu_tasks_percpu structure.
  * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
  * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
@@ -107,6 +108,7 @@ struct rcu_tasks {
 	holdouts_func_t holdouts_func;
 	postgp_func_t postgp_func;
 	call_rcu_func_t call_func;
+	unsigned int wait_state;
 	struct rcu_tasks_percpu __percpu *rtpcpu;
 	int percpu_enqueue_shift;
 	int percpu_enqueue_lim;
@@ -134,6 +136,7 @@ static struct rcu_tasks rt_name =							\
 	.tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex),			\
 	.gp_func = gp,									\
 	.call_func = call,								\
+	.wait_state = TASK_UNINTERRUPTIBLE,						\
 	.rtpcpu = &rt_name ## __percpu,							\
 	.lazy_jiffies = DIV_ROUND_UP(HZ, 4),						\
 	.name = n,									\
@@ -638,7 +641,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
 
 	// If the grace-period kthread is running, use it.
 	if (READ_ONCE(rtp->kthread_ptr)) {
-		wait_rcu_gp(rtp->call_func);
+		wait_rcu_gp_state(rtp->wait_state, rtp->call_func);
 		return;
 	}
 	rcu_tasks_one_gp(rtp, true);
@@ -1160,6 +1163,7 @@ static int __init rcu_spawn_tasks_kthread(void)
 	rcu_tasks.postscan_func = rcu_tasks_postscan;
 	rcu_tasks.holdouts_func = check_all_holdout_tasks;
 	rcu_tasks.postgp_func = rcu_tasks_postgp;
+	rcu_tasks.wait_state = TASK_IDLE;
 	rcu_spawn_tasks_kthread_generic(&rcu_tasks);
 	return 0;
 }
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 46aaaa9fe339..f8436969e0c8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head)
 }
 EXPORT_SYMBOL_GPL(wakeme_after_rcu);
 
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
 		   struct rcu_synchronize *rs_array)
 {
 	int i;
@@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 			if (crcu_array[j] == crcu_array[i])
 				break;
 		if (j == i) {
-			wait_for_completion(&rs_array[i].completion);
+			wait_for_completion_state(&rs_array[i].completion, state);
 			destroy_rcu_head_on_stack(&rs_array[i].head);
 		}
 	}
-- 
cgit v1.2.3


From 779087fe2fecf8b8aa7bdd69982f9c0c8266927b Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 5 Apr 2024 08:08:12 +0200
Subject: kfifo: drop __kfifo_dma_out_finish_r()

It is the same as __kfifo_skip_r(), so:
* drop __kfifo_dma_out_finish_r() completely, and
* replace its (only) use by __kfifo_skip_r().

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240405060826.2521-2-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kfifo.h | 4 +---
 lib/kfifo.c           | 8 --------
 2 files changed, 1 insertion(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 0b35a41440ff..bc7a1f5bb0ce 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -797,7 +797,7 @@ __kfifo_int_must_check_helper( \
 	const size_t __recsize = sizeof(*__tmp->rectype); \
 	struct __kfifo *__kfifo = &__tmp->kfifo; \
 	if (__recsize) \
-		__kfifo_dma_out_finish_r(__kfifo, __recsize); \
+		__kfifo_skip_r(__kfifo, __recsize); \
 	else \
 		__kfifo->out += __len / sizeof(*__tmp->type); \
 })
@@ -879,8 +879,6 @@ extern void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
 extern unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
 	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
 
-extern void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize);
-
 extern unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize);
 
 extern void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize);
diff --git a/lib/kfifo.c b/lib/kfifo.c
index 12f5a347aa13..958099cc4914 100644
--- a/lib/kfifo.c
+++ b/lib/kfifo.c
@@ -582,11 +582,3 @@ unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
 }
 EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
 
-void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize)
-{
-	unsigned int len;
-
-	len = __kfifo_peek_n(fifo, recsize);
-	fifo->out += len + recsize;
-}
-EXPORT_SYMBOL(__kfifo_dma_out_finish_r);
-- 
cgit v1.2.3


From 76738194be605c9fb279f87d8a6b0b95904bfb60 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 5 Apr 2024 08:08:13 +0200
Subject: kfifo: introduce and use kfifo_skip_count()

kfifo_skip_count() is an extended version of kfifo_skip(), accepting
also count. This will be useful in the serial code later.

Now, it can be used to implement both kfifo_skip() and
kfifo_dma_out_finish(). In the latter, 'len' only needs to be divided by
'type' size (as it was until now).

And stop using statement expressions when the return value is cast to
'void'. Use classic 'do {} while (0)' instead.

Note: perhaps we should skip 'count' records for the 'recsize' case, but
the original (kfifo_dma_out_finish()) used to skip only one record. So
this is kept unchanged and 'count' is still ignored in the recsize case.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240405060826.2521-3-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kfifo.h | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index bc7a1f5bb0ce..8f3369ec528b 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -304,19 +304,25 @@ __kfifo_uint_must_check_helper( \
 )
 
 /**
- * kfifo_skip - skip output data
+ * kfifo_skip_count - skip output data
  * @fifo: address of the fifo to be used
+ * @count: count of data to skip
  */
-#define	kfifo_skip(fifo) \
-(void)({ \
+#define	kfifo_skip_count(fifo, count) do { \
 	typeof((fifo) + 1) __tmp = (fifo); \
 	const size_t __recsize = sizeof(*__tmp->rectype); \
 	struct __kfifo *__kfifo = &__tmp->kfifo; \
 	if (__recsize) \
 		__kfifo_skip_r(__kfifo, __recsize); \
 	else \
-		__kfifo->out++; \
-})
+		__kfifo->out += (count); \
+} while(0)
+
+/**
+ * kfifo_skip - skip output data
+ * @fifo: address of the fifo to be used
+ */
+#define	kfifo_skip(fifo)	kfifo_skip_count(fifo, 1)
 
 /**
  * kfifo_peek_len - gets the size of the next fifo record
@@ -790,17 +796,10 @@ __kfifo_int_must_check_helper( \
  * Note that with only one concurrent reader and one concurrent
  * writer, you don't need extra locking to use these macros.
  */
-#define kfifo_dma_out_finish(fifo, len) \
-(void)({ \
-	typeof((fifo) + 1) __tmp = (fifo); \
-	unsigned int __len = (len); \
-	const size_t __recsize = sizeof(*__tmp->rectype); \
-	struct __kfifo *__kfifo = &__tmp->kfifo; \
-	if (__recsize) \
-		__kfifo_skip_r(__kfifo, __recsize); \
-	else \
-		__kfifo->out += __len / sizeof(*__tmp->type); \
-})
+#define kfifo_dma_out_finish(fifo, len) do { \
+	typeof((fifo) + 1) ___tmp = (fifo); \
+	kfifo_skip_count(___tmp, (len) / sizeof(*___tmp->type)); \
+} while (0)
 
 /**
  * kfifo_out_peek - gets some data from the fifo
-- 
cgit v1.2.3


From 4edd7e96a1f159f43bd1cb82616f81eaddd54262 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 5 Apr 2024 08:08:14 +0200
Subject: kfifo: add kfifo_out_linear{,_ptr}()

These are helpers which are going to be used in the serial layer. We
need a wrapper around kfifo which provides us with a tail (sometimes
"tail" offset, sometimes a pointer) to the kfifo data. And which returns
count of available data -- but not larger than to the end of the buffer
(hence _linear in the names). I.e. something like CIRC_CNT_TO_END() in
the legacy circ_buf.

This patch adds such two helpers.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240405060826.2521-4-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kfifo.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/kfifo.c           | 26 +++++++++++++++++++++
 2 files changed, 89 insertions(+)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 8f3369ec528b..3def70e1a3e3 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -827,6 +827,63 @@ __kfifo_uint_must_check_helper( \
 }) \
 )
 
+/**
+ * kfifo_out_linear - gets a tail of/offset to available data
+ * @fifo: address of the fifo to be used
+ * @tail: pointer to an unsigned int to store the value of tail
+ * @n: max. number of elements to point at
+ *
+ * This macro obtains the offset (tail) to the available data in the fifo
+ * buffer and returns the
+ * numbers of elements available. It returns the available count till the end
+ * of data or till the end of the buffer. So that it can be used for linear
+ * data processing (like memcpy() of (@fifo->data + @tail) with count
+ * returned).
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define kfifo_out_linear(fifo, tail, n) \
+__kfifo_uint_must_check_helper( \
+({ \
+	typeof((fifo) + 1) __tmp = (fifo); \
+	unsigned int *__tail = (tail); \
+	unsigned long __n = (n); \
+	const size_t __recsize = sizeof(*__tmp->rectype); \
+	struct __kfifo *__kfifo = &__tmp->kfifo; \
+	(__recsize) ? \
+	__kfifo_out_linear_r(__kfifo, __tail, __n, __recsize) : \
+	__kfifo_out_linear(__kfifo, __tail, __n); \
+}) \
+)
+
+/**
+ * kfifo_out_linear_ptr - gets a pointer to the available data
+ * @fifo: address of the fifo to be used
+ * @ptr: pointer to data to store the pointer to tail
+ * @n: max. number of elements to point at
+ *
+ * Similarly to kfifo_out_linear(), this macro obtains the pointer to the
+ * available data in the fifo buffer and returns the numbers of elements
+ * available. It returns the available count till the end of available data or
+ * till the end of the buffer. So that it can be used for linear data
+ * processing (like memcpy() of @ptr with count returned).
+ *
+ * Note that with only one concurrent reader and one concurrent
+ * writer, you don't need extra locking to use these macro.
+ */
+#define kfifo_out_linear_ptr(fifo, ptr, n) \
+__kfifo_uint_must_check_helper( \
+({ \
+	typeof((fifo) + 1) ___tmp = (fifo); \
+	unsigned int ___tail; \
+	unsigned int ___n = kfifo_out_linear(___tmp, &___tail, (n)); \
+	*(ptr) = ___tmp->kfifo.data + ___tail * kfifo_esize(___tmp); \
+	___n; \
+}) \
+)
+
+
 extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size,
 	size_t esize, gfp_t gfp_mask);
 
@@ -856,6 +913,9 @@ extern unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
 extern unsigned int __kfifo_out_peek(struct __kfifo *fifo,
 	void *buf, unsigned int len);
 
+extern unsigned int __kfifo_out_linear(struct __kfifo *fifo,
+	unsigned int *tail, unsigned int n);
+
 extern unsigned int __kfifo_in_r(struct __kfifo *fifo,
 	const void *buf, unsigned int len, size_t recsize);
 
@@ -885,6 +945,9 @@ extern void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize);
 extern unsigned int __kfifo_out_peek_r(struct __kfifo *fifo,
 	void *buf, unsigned int len, size_t recsize);
 
+extern unsigned int __kfifo_out_linear_r(struct __kfifo *fifo,
+	unsigned int *tail, unsigned int n, size_t recsize);
+
 extern unsigned int __kfifo_max_r(unsigned int len, size_t recsize);
 
 #endif
diff --git a/lib/kfifo.c b/lib/kfifo.c
index 958099cc4914..a36bfdbdb17d 100644
--- a/lib/kfifo.c
+++ b/lib/kfifo.c
@@ -163,6 +163,19 @@ unsigned int __kfifo_out_peek(struct __kfifo *fifo,
 }
 EXPORT_SYMBOL(__kfifo_out_peek);
 
+unsigned int __kfifo_out_linear(struct __kfifo *fifo,
+		unsigned int *tail, unsigned int n)
+{
+	unsigned int size = fifo->mask + 1;
+	unsigned int off = fifo->out & fifo->mask;
+
+	if (tail)
+		*tail = off;
+
+	return min3(n, fifo->in - fifo->out, size - off);
+}
+EXPORT_SYMBOL(__kfifo_out_linear);
+
 unsigned int __kfifo_out(struct __kfifo *fifo,
 		void *buf, unsigned int len)
 {
@@ -473,6 +486,19 @@ unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf,
 }
 EXPORT_SYMBOL(__kfifo_out_peek_r);
 
+unsigned int __kfifo_out_linear_r(struct __kfifo *fifo,
+		unsigned int *tail, unsigned int n, size_t recsize)
+{
+	if (fifo->in == fifo->out)
+		return 0;
+
+	if (tail)
+		*tail = fifo->out + recsize;
+
+	return min(n, __kfifo_peek_n(fifo, recsize));
+}
+EXPORT_SYMBOL(__kfifo_out_linear_r);
+
 unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf,
 		unsigned int len, size_t recsize)
 {
-- 
cgit v1.2.3


From d52b761e4b1acc897e65bcc8eff42b0537ac0134 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 5 Apr 2024 08:08:18 +0200
Subject: kfifo: add kfifo_dma_out_prepare_mapped()

When the kfifo buffer is already dma-mapped, one cannot use the kfifo
API to fill in an SG list.

Add kfifo_dma_in_prepare_mapped() which allows exactly this. A mapped
dma_addr_t is passed and it is filled into provided sgl too. Including
the dma_len.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240405060826.2521-8-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kfifo.h | 37 +++++++++++++++++++++++++------------
 lib/kfifo.c           | 34 ++++++++++++++++++++++------------
 2 files changed, 47 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 3def70e1a3e3..c7cc25b2808b 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -36,6 +36,7 @@
  * to lock the reader.
  */
 
+#include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
@@ -709,11 +710,12 @@ __kfifo_int_must_check_helper( \
 )
 
 /**
- * kfifo_dma_in_prepare - setup a scatterlist for DMA input
+ * kfifo_dma_in_prepare_mapped - setup a scatterlist for DMA input
  * @fifo: address of the fifo to be used
  * @sgl: pointer to the scatterlist array
  * @nents: number of entries in the scatterlist array
  * @len: number of elements to transfer
+ * @dma: mapped dma address to fill into @sgl
  *
  * This macro fills a scatterlist for DMA input.
  * It returns the number entries in the scatterlist array.
@@ -721,7 +723,7 @@ __kfifo_int_must_check_helper( \
  * Note that with only one concurrent reader and one concurrent
  * writer, you don't need extra locking to use these macros.
  */
-#define	kfifo_dma_in_prepare(fifo, sgl, nents, len) \
+#define	kfifo_dma_in_prepare_mapped(fifo, sgl, nents, len, dma) \
 ({ \
 	typeof((fifo) + 1) __tmp = (fifo); \
 	struct scatterlist *__sgl = (sgl); \
@@ -730,10 +732,14 @@ __kfifo_int_must_check_helper( \
 	const size_t __recsize = sizeof(*__tmp->rectype); \
 	struct __kfifo *__kfifo = &__tmp->kfifo; \
 	(__recsize) ? \
-	__kfifo_dma_in_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
-	__kfifo_dma_in_prepare(__kfifo, __sgl, __nents, __len); \
+	__kfifo_dma_in_prepare_r(__kfifo, __sgl, __nents, __len, __recsize, \
+				 dma) : \
+	__kfifo_dma_in_prepare(__kfifo, __sgl, __nents, __len, dma); \
 })
 
+#define kfifo_dma_in_prepare(fifo, sgl, nents, len) \
+	kfifo_dma_in_prepare_mapped(fifo, sgl, nents, len, DMA_MAPPING_ERROR)
+
 /**
  * kfifo_dma_in_finish - finish a DMA IN operation
  * @fifo: address of the fifo to be used
@@ -758,11 +764,12 @@ __kfifo_int_must_check_helper( \
 })
 
 /**
- * kfifo_dma_out_prepare - setup a scatterlist for DMA output
+ * kfifo_dma_out_prepare_mapped - setup a scatterlist for DMA output
  * @fifo: address of the fifo to be used
  * @sgl: pointer to the scatterlist array
  * @nents: number of entries in the scatterlist array
  * @len: number of elements to transfer
+ * @dma: mapped dma address to fill into @sgl
  *
  * This macro fills a scatterlist for DMA output which at most @len bytes
  * to transfer.
@@ -772,7 +779,7 @@ __kfifo_int_must_check_helper( \
  * Note that with only one concurrent reader and one concurrent
  * writer, you don't need extra locking to use these macros.
  */
-#define	kfifo_dma_out_prepare(fifo, sgl, nents, len) \
+#define	kfifo_dma_out_prepare_mapped(fifo, sgl, nents, len, dma) \
 ({ \
 	typeof((fifo) + 1) __tmp = (fifo);  \
 	struct scatterlist *__sgl = (sgl); \
@@ -781,10 +788,14 @@ __kfifo_int_must_check_helper( \
 	const size_t __recsize = sizeof(*__tmp->rectype); \
 	struct __kfifo *__kfifo = &__tmp->kfifo; \
 	(__recsize) ? \
-	__kfifo_dma_out_prepare_r(__kfifo, __sgl, __nents, __len, __recsize) : \
-	__kfifo_dma_out_prepare(__kfifo, __sgl, __nents, __len); \
+	__kfifo_dma_out_prepare_r(__kfifo, __sgl, __nents, __len, __recsize, \
+				  dma) : \
+	__kfifo_dma_out_prepare(__kfifo, __sgl, __nents, __len, dma); \
 })
 
+#define	kfifo_dma_out_prepare(fifo, sgl, nents, len) \
+	kfifo_dma_out_prepare_mapped(fifo, sgl, nents, len, DMA_MAPPING_ERROR)
+
 /**
  * kfifo_dma_out_finish - finish a DMA OUT operation
  * @fifo: address of the fifo to be used
@@ -905,10 +916,10 @@ extern int __kfifo_to_user(struct __kfifo *fifo,
 	void __user *to, unsigned long len, unsigned int *copied);
 
 extern unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len);
+	struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma);
 
 extern unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len);
+	struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma);
 
 extern unsigned int __kfifo_out_peek(struct __kfifo *fifo,
 	void *buf, unsigned int len);
@@ -930,13 +941,15 @@ extern int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
 	unsigned long len, unsigned int *copied, size_t recsize);
 
 extern unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize,
+	dma_addr_t dma);
 
 extern void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
 	unsigned int len, size_t recsize);
 
 extern unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize);
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize,
+	dma_addr_t dma);
 
 extern unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize);
 
diff --git a/lib/kfifo.c b/lib/kfifo.c
index 3a249ce4f281..75ce9225548a 100644
--- a/lib/kfifo.c
+++ b/lib/kfifo.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/dma-mapping.h>
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/err.h>
@@ -307,7 +308,7 @@ EXPORT_SYMBOL(__kfifo_to_user);
 
 static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl,
 				  unsigned int data_offset, int nents,
-				  unsigned int len)
+				  unsigned int len, dma_addr_t dma)
 {
 	const void *buf = fifo->data + data_offset;
 
@@ -316,11 +317,16 @@ static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl,
 
 	sg_set_buf(sgl, buf, len);
 
+	if (dma != DMA_MAPPING_ERROR) {
+		sg_dma_address(sgl) = dma + data_offset;
+		sg_dma_len(sgl) = len;
+	}
+
 	return 1;
 }
 
 static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
-		int nents, unsigned int len, unsigned int off)
+		int nents, unsigned int len, unsigned int off, dma_addr_t dma)
 {
 	unsigned int size = fifo->mask + 1;
 	unsigned int esize = fifo->esize;
@@ -335,14 +341,15 @@ static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl,
 	}
 	len_to_end = min(len, size - off);
 
-	n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end);
-	n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end);
+	n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma);
+	n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma);
 
 	return n;
 }
 
 unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
-		struct scatterlist *sgl, int nents, unsigned int len)
+		struct scatterlist *sgl, int nents, unsigned int len,
+		dma_addr_t dma)
 {
 	unsigned int l;
 
@@ -350,12 +357,13 @@ unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo,
 	if (len > l)
 		len = l;
 
-	return setup_sgl(fifo, sgl, nents, len, fifo->in);
+	return setup_sgl(fifo, sgl, nents, len, fifo->in, dma);
 }
 EXPORT_SYMBOL(__kfifo_dma_in_prepare);
 
 unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
-		struct scatterlist *sgl, int nents, unsigned int len)
+		struct scatterlist *sgl, int nents, unsigned int len,
+		dma_addr_t dma)
 {
 	unsigned int l;
 
@@ -363,7 +371,7 @@ unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo,
 	if (len > l)
 		len = l;
 
-	return setup_sgl(fifo, sgl, nents, len, fifo->out);
+	return setup_sgl(fifo, sgl, nents, len, fifo->out, dma);
 }
 EXPORT_SYMBOL(__kfifo_dma_out_prepare);
 
@@ -547,7 +555,8 @@ int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to,
 EXPORT_SYMBOL(__kfifo_to_user_r);
 
 unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize,
+	dma_addr_t dma)
 {
 	BUG_ON(!nents);
 
@@ -556,7 +565,7 @@ unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo,
 	if (len + recsize > kfifo_unused(fifo))
 		return 0;
 
-	return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize);
+	return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize, dma);
 }
 EXPORT_SYMBOL(__kfifo_dma_in_prepare_r);
 
@@ -570,7 +579,8 @@ void __kfifo_dma_in_finish_r(struct __kfifo *fifo,
 EXPORT_SYMBOL(__kfifo_dma_in_finish_r);
 
 unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
-	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize)
+	struct scatterlist *sgl, int nents, unsigned int len, size_t recsize,
+	dma_addr_t dma)
 {
 	BUG_ON(!nents);
 
@@ -579,7 +589,7 @@ unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo,
 	if (len + recsize > fifo->in - fifo->out)
 		return 0;
 
-	return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize);
+	return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma);
 }
 EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
 
-- 
cgit v1.2.3


From 2ab682d221551d5eb9606dad1efa01e184de7b00 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 5 Apr 2024 08:08:19 +0200
Subject: kfifo: fix typos in kernel-doc

Obviously:
"This macro finish" -> "This macro finishes"
and similar.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: https://lore.kernel.org/r/20240405060826.2521-9-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/kfifo.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index c7cc25b2808b..d613748de7ff 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -585,7 +585,7 @@ __kfifo_uint_must_check_helper( \
  * @buf: pointer to the storage buffer
  * @n: max. number of elements to get
  *
- * This macro get some data from the fifo and return the numbers of elements
+ * This macro gets some data from the fifo and returns the numbers of elements
  * copied.
  *
  * Note that with only one concurrent reader and one concurrent
@@ -612,7 +612,7 @@ __kfifo_uint_must_check_helper( \
  * @n: max. number of elements to get
  * @lock: pointer to the spinlock to use for locking
  *
- * This macro get the data from the fifo and return the numbers of elements
+ * This macro gets the data from the fifo and returns the numbers of elements
  * copied.
  */
 #define	kfifo_out_spinlocked(fifo, buf, n, lock) \
@@ -745,7 +745,7 @@ __kfifo_int_must_check_helper( \
  * @fifo: address of the fifo to be used
  * @len: number of bytes to received
  *
- * This macro finish a DMA IN operation. The in counter will be updated by
+ * This macro finishes a DMA IN operation. The in counter will be updated by
  * the len parameter. No error checking will be done.
  *
  * Note that with only one concurrent reader and one concurrent
@@ -801,7 +801,7 @@ __kfifo_int_must_check_helper( \
  * @fifo: address of the fifo to be used
  * @len: number of bytes transferred
  *
- * This macro finish a DMA OUT operation. The out counter will be updated by
+ * This macro finishes a DMA OUT operation. The out counter will be updated by
  * the len parameter. No error checking will be done.
  *
  * Note that with only one concurrent reader and one concurrent
@@ -818,7 +818,7 @@ __kfifo_int_must_check_helper( \
  * @buf: pointer to the storage buffer
  * @n: max. number of elements to get
  *
- * This macro get the data from the fifo and return the numbers of elements
+ * This macro gets the data from the fifo and returns the numbers of elements
  * copied. The data is not removed from the fifo.
  *
  * Note that with only one concurrent reader and one concurrent
-- 
cgit v1.2.3


From 1788cf6a91d9fa9aa61fc2917afe192c23d67f6a Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Fri, 5 Apr 2024 08:08:23 +0200
Subject: tty: serial: switch from circ_buf to kfifo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Switch from struct circ_buf to proper kfifo. kfifo provides much better
API, esp. when wrap-around of the buffer needs to be taken into account.
Look at pl011_dma_tx_refill() or cpm_uart_tx_pump() changes for example.

Kfifo API can also fill in scatter-gather DMA structures, so it easier
for that use case too. Look at lpuart_dma_tx() for example. Note that
not all drivers can be converted to that (like atmel_serial), they
handle DMA specially.

Note that usb-serial uses kfifo for TX for ages.

omap needed a bit more care as it needs to put a char into FIFO to start
the DMA transfer when OMAP_DMA_TX_KICK is set. In that case, we have to
do kfifo_dma_out_prepare twice: once to find out the tx_size (to find
out if it is worths to do DMA at all -- size >= 4), the second time for
the actual transfer.

All traces of circ_buf are removed from serial_core.h (and its struct
uart_state).

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Al Cooper <alcooperx@gmail.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Kumaravel Thiagarajan <kumaravel.thiagarajan@microchip.com>
Cc: Tharun Kumar P <tharunkumar.pasumarthi@microchip.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Richard Genoud <richard.genoud@gmail.com>
Cc: Nicolas Ferre <nicolas.ferre@microchip.com>
Cc: Alexandre Belloni <alexandre.belloni@bootlin.com>
Cc: Claudiu Beznea <claudiu.beznea@tuxon.dev>
Cc: Alexander Shiyan <shc_work@mail.ru>
Cc: Baruch Siach <baruch@tkos.co.il>
Cc: Maciej W. Rozycki <macro@orcam.me.uk>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Sascha Hauer <s.hauer@pengutronix.de>
Cc: Fabio Estevam <festevam@gmail.com>
Cc: Neil Armstrong <neil.armstrong@linaro.org>
Cc: Kevin Hilman <khilman@baylibre.com>
Cc: Jerome Brunet <jbrunet@baylibre.com>
Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
Cc: Taichi Sugaya <sugaya.taichi@socionext.com>
Cc: Takao Orito <orito.takao@socionext.com>
Cc: Bjorn Andersson <andersson@kernel.org>
Cc: Konrad Dybcio <konrad.dybcio@linaro.org>
Cc: Pali Rohár <pali@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Cc: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Cc: Alim Akhtar <alim.akhtar@samsung.com>
Cc: Laxman Dewangan <ldewangan@nvidia.com>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Jonathan Hunter <jonathanh@nvidia.com>
Cc: Orson Zhai <orsonzhai@gmail.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: Patrice Chotard <patrice.chotard@foss.st.com>
Cc: Maxime Coquelin <mcoquelin.stm32@gmail.com>
Cc: Alexandre Torgue <alexandre.torgue@foss.st.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Hammer Hsieh <hammerh0314@gmail.com>
Cc: Peter Korsgaard <jacmet@sunsite.dk>
Cc: Timur Tabi <timur@kernel.org>
Cc: Michal Simek <michal.simek@amd.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Christian König <christian.koenig@amd.com>
Link: https://lore.kernel.org/r/20240405060826.2521-13-jirislaby@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/8250/8250_bcm7271.c  | 14 ++++----
 drivers/tty/serial/8250/8250_core.c     |  3 +-
 drivers/tty/serial/8250/8250_dma.c      | 23 +++++++------
 drivers/tty/serial/8250/8250_exar.c     |  5 +--
 drivers/tty/serial/8250/8250_mtk.c      |  2 +-
 drivers/tty/serial/8250/8250_omap.c     | 47 ++++++++++++++++++---------
 drivers/tty/serial/8250/8250_pci1xxxx.c | 50 +++++++++++++----------------
 drivers/tty/serial/8250/8250_port.c     | 22 +++++++------
 drivers/tty/serial/amba-pl011.c         | 46 +++++++++++---------------
 drivers/tty/serial/ar933x_uart.c        | 15 ++++-----
 drivers/tty/serial/arc_uart.c           |  8 ++---
 drivers/tty/serial/atmel_serial.c       | 57 +++++++++++++++++----------------
 drivers/tty/serial/clps711x.c           | 12 +++----
 drivers/tty/serial/cpm_uart.c           | 20 ++++--------
 drivers/tty/serial/digicolor-usart.c    | 12 +++----
 drivers/tty/serial/dz.c                 | 13 ++++----
 drivers/tty/serial/fsl_linflexuart.c    | 17 +++++-----
 drivers/tty/serial/fsl_lpuart.c         | 39 ++++++++++------------
 drivers/tty/serial/icom.c               | 25 ++++-----------
 drivers/tty/serial/imx.c                | 54 +++++++++++++------------------
 drivers/tty/serial/ip22zilog.c          | 26 +++++++--------
 drivers/tty/serial/jsm/jsm_cls.c        | 29 +++++------------
 drivers/tty/serial/jsm/jsm_neo.c        | 38 ++++++++--------------
 drivers/tty/serial/max3100.c            | 14 ++++----
 drivers/tty/serial/max310x.c            | 35 +++++++++-----------
 drivers/tty/serial/men_z135_uart.c      | 26 ++++++---------
 drivers/tty/serial/meson_uart.c         | 11 +++----
 drivers/tty/serial/milbeaut_usio.c      | 15 +++++----
 drivers/tty/serial/msm_serial.c         | 30 ++++++++---------
 drivers/tty/serial/mvebu-uart.c         |  8 ++---
 drivers/tty/serial/mxs-auart.c          | 23 ++++---------
 drivers/tty/serial/pch_uart.c           | 21 ++++++------
 drivers/tty/serial/pic32_uart.c         | 15 ++++-----
 drivers/tty/serial/pmac_zilog.c         | 24 +++++++-------
 drivers/tty/serial/qcom_geni_serial.c   | 36 ++++++++++-----------
 drivers/tty/serial/rda-uart.c           | 17 ++++------
 drivers/tty/serial/samsung_tty.c        | 54 ++++++++++++++++---------------
 drivers/tty/serial/sb1250-duart.c       | 13 ++++----
 drivers/tty/serial/sc16is7xx.c          | 40 +++++++++--------------
 drivers/tty/serial/sccnxp.c             | 16 +++++----
 drivers/tty/serial/serial-tegra.c       | 43 ++++++++++++++-----------
 drivers/tty/serial/serial_core.c        | 56 ++++++++++++--------------------
 drivers/tty/serial/serial_port.c        |  2 +-
 drivers/tty/serial/sh-sci.c             | 51 +++++++++++++++--------------
 drivers/tty/serial/sprd_serial.c        | 20 ++++++------
 drivers/tty/serial/st-asc.c             |  4 +--
 drivers/tty/serial/stm32-usart.c        | 52 ++++++++++++------------------
 drivers/tty/serial/sunhv.c              | 35 +++++++++++---------
 drivers/tty/serial/sunplus-uart.c       | 16 +++++----
 drivers/tty/serial/sunsab.c             | 30 +++++++++--------
 drivers/tty/serial/sunsu.c              | 15 +++++----
 drivers/tty/serial/sunzilog.c           | 27 ++++++++--------
 drivers/tty/serial/tegra-tcu.c          | 10 +++---
 drivers/tty/serial/timbuart.c           | 17 ++++------
 drivers/tty/serial/uartlite.c           | 13 +++++---
 drivers/tty/serial/ucc_uart.c           | 20 ++++--------
 drivers/tty/serial/xilinx_uartps.c      | 20 ++++++------
 drivers/tty/serial/zs.c                 | 13 ++++----
 include/linux/serial_core.h             | 49 +++++++++++++++++-----------
 59 files changed, 688 insertions(+), 780 deletions(-)

(limited to 'include')

diff --git a/drivers/tty/serial/8250/8250_bcm7271.c b/drivers/tty/serial/8250/8250_bcm7271.c
index 5daa38d9c64e..de270863eb5e 100644
--- a/drivers/tty/serial/8250/8250_bcm7271.c
+++ b/drivers/tty/serial/8250/8250_bcm7271.c
@@ -413,20 +413,18 @@ static int stop_tx_dma(struct uart_8250_port *p)
 static int brcmuart_tx_dma(struct uart_8250_port *p)
 {
 	struct brcmuart_priv *priv = p->port.private_data;
-	struct circ_buf *xmit = &p->port.state->xmit;
+	struct tty_port *tport = &p->port.state->port;
 	u32 tx_size;
 
 	if (uart_tx_stopped(&p->port) || priv->tx_running ||
-		uart_circ_empty(xmit)) {
+		kfifo_is_empty(&tport->xmit_fifo)) {
 		return 0;
 	}
-	tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
 
 	priv->dma.tx_err = 0;
-	memcpy(priv->tx_buf, &xmit->buf[xmit->tail], tx_size);
-	uart_xmit_advance(&p->port, tx_size);
+	tx_size = uart_fifo_out(&p->port, priv->tx_buf, UART_XMIT_SIZE);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&p->port);
 
 	udma_writel(priv, REGS_DMA_TX, UDMA_TX_TRANSFER_LEN, tx_size);
@@ -540,7 +538,7 @@ static void brcmuart_tx_isr(struct uart_port *up, u32 isr)
 	struct brcmuart_priv *priv = up->private_data;
 	struct device *dev = up->dev;
 	struct uart_8250_port *port_8250 = up_to_u8250p(up);
-	struct circ_buf	*xmit = &port_8250->port.state->xmit;
+	struct tty_port *tport = &port_8250->port.state->port;
 
 	if (isr & UDMA_INTR_TX_ABORT) {
 		if (priv->tx_running)
@@ -548,7 +546,7 @@ static void brcmuart_tx_isr(struct uart_port *up, u32 isr)
 		return;
 	}
 	priv->tx_running = false;
-	if (!uart_circ_empty(xmit) && !uart_tx_stopped(up))
+	if (!kfifo_is_empty(&tport->xmit_fifo) && !uart_tx_stopped(up))
 		brcmuart_tx_dma(port_8250);
 }
 
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index b62ad9006780..3a1936b7f05f 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -280,7 +280,8 @@ static void serial8250_backup_timeout(struct timer_list *t)
 	 */
 	lsr = serial_lsr_in(up);
 	if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) &&
-	    (!uart_circ_empty(&up->port.state->xmit) || up->port.x_char) &&
+	    (!kfifo_is_empty(&up->port.state->port.xmit_fifo) ||
+	     up->port.x_char) &&
 	    (lsr & UART_LSR_THRE)) {
 		iir &= ~(UART_IIR_ID | UART_IIR_NO_INT);
 		iir |= UART_IIR_THRI;
diff --git a/drivers/tty/serial/8250/8250_dma.c b/drivers/tty/serial/8250/8250_dma.c
index 8b2c3f478b17..8a353e3cc3dd 100644
--- a/drivers/tty/serial/8250/8250_dma.c
+++ b/drivers/tty/serial/8250/8250_dma.c
@@ -15,7 +15,7 @@ static void __dma_tx_complete(void *param)
 {
 	struct uart_8250_port	*p = param;
 	struct uart_8250_dma	*dma = p->dma;
-	struct circ_buf		*xmit = &p->port.state->xmit;
+	struct tty_port		*tport = &p->port.state->port;
 	unsigned long	flags;
 	int		ret;
 
@@ -28,7 +28,7 @@ static void __dma_tx_complete(void *param)
 
 	uart_xmit_advance(&p->port, dma->tx_size);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&p->port);
 
 	ret = serial8250_tx_dma(p);
@@ -86,7 +86,7 @@ static void dma_rx_complete(void *param)
 int serial8250_tx_dma(struct uart_8250_port *p)
 {
 	struct uart_8250_dma		*dma = p->dma;
-	struct circ_buf			*xmit = &p->port.state->xmit;
+	struct tty_port			*tport = &p->port.state->port;
 	struct dma_async_tx_descriptor	*desc;
 	struct uart_port		*up = &p->port;
 	struct scatterlist sg;
@@ -103,18 +103,23 @@ int serial8250_tx_dma(struct uart_8250_port *p)
 		uart_xchar_out(up, UART_TX);
 	}
 
-	if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) {
+	if (uart_tx_stopped(&p->port) || kfifo_is_empty(&tport->xmit_fifo)) {
 		/* We have been called from __dma_tx_complete() */
 		return 0;
 	}
 
-	dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
-
 	serial8250_do_prepare_tx_dma(p);
 
 	sg_init_table(&sg, 1);
-	sg_dma_address(&sg) = dma->tx_addr + xmit->tail;
-	sg_dma_len(&sg) = dma->tx_size;
+	/* kfifo can do more than one sg, we don't (quite yet) */
+	ret = kfifo_dma_out_prepare_mapped(&tport->xmit_fifo, &sg, 1,
+					   UART_XMIT_SIZE, dma->tx_addr);
+
+	/* we already checked empty fifo above, so there should be something */
+	if (WARN_ON_ONCE(ret != 1))
+		return 0;
+
+	dma->tx_size = sg_dma_len(&sg);
 
 	desc = dmaengine_prep_slave_sg(dma->txchan, &sg, 1,
 				       DMA_MEM_TO_DEV,
@@ -257,7 +262,7 @@ int serial8250_request_dma(struct uart_8250_port *p)
 
 	/* TX buffer */
 	dma->tx_addr = dma_map_single(dma->txchan->device->dev,
-					p->port.state->xmit.buf,
+					p->port.state->port.xmit_buf,
 					UART_XMIT_SIZE,
 					DMA_TO_DEVICE);
 	if (dma_mapping_error(dma->txchan->device->dev, dma->tx_addr)) {
diff --git a/drivers/tty/serial/8250/8250_exar.c b/drivers/tty/serial/8250/8250_exar.c
index 0440df7de1ed..f89dabfc0f97 100644
--- a/drivers/tty/serial/8250/8250_exar.c
+++ b/drivers/tty/serial/8250/8250_exar.c
@@ -214,7 +214,7 @@ static void exar_shutdown(struct uart_port *port)
 {
 	bool tx_complete = false;
 	struct uart_8250_port *up = up_to_u8250p(port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	int i = 0;
 	u16 lsr;
 
@@ -225,7 +225,8 @@ static void exar_shutdown(struct uart_port *port)
 		else
 			tx_complete = false;
 		usleep_range(1000, 1100);
-	} while (!uart_circ_empty(xmit) && !tx_complete && i++ < 1000);
+	} while (!kfifo_is_empty(&tport->xmit_fifo) &&
+			!tx_complete && i++ < 1000);
 
 	serial8250_do_shutdown(port);
 }
diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c
index 9ff6bbe9c086..c365a349421a 100644
--- a/drivers/tty/serial/8250/8250_mtk.c
+++ b/drivers/tty/serial/8250/8250_mtk.c
@@ -199,7 +199,7 @@ static int mtk8250_startup(struct uart_port *port)
 
 	if (up->dma) {
 		data->rx_status = DMA_RX_START;
-		uart_circ_clear(&port->state->xmit);
+		kfifo_reset(&port->state->port.xmit_fifo);
 	}
 #endif
 	memset(&port->icount, 0, sizeof(port->icount));
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c
index 879e77b30701..c07d924d5add 100644
--- a/drivers/tty/serial/8250/8250_omap.c
+++ b/drivers/tty/serial/8250/8250_omap.c
@@ -1094,7 +1094,7 @@ static void omap_8250_dma_tx_complete(void *param)
 {
 	struct uart_8250_port	*p = param;
 	struct uart_8250_dma	*dma = p->dma;
-	struct circ_buf		*xmit = &p->port.state->xmit;
+	struct tty_port		*tport = &p->port.state->port;
 	unsigned long		flags;
 	bool			en_thri = false;
 	struct omap8250_priv	*priv = p->port.private_data;
@@ -1113,10 +1113,10 @@ static void omap_8250_dma_tx_complete(void *param)
 		omap8250_restore_regs(p);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&p->port);
 
-	if (!uart_circ_empty(xmit) && !uart_tx_stopped(&p->port)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo) && !uart_tx_stopped(&p->port)) {
 		int ret;
 
 		ret = omap_8250_tx_dma(p);
@@ -1138,15 +1138,15 @@ static int omap_8250_tx_dma(struct uart_8250_port *p)
 {
 	struct uart_8250_dma		*dma = p->dma;
 	struct omap8250_priv		*priv = p->port.private_data;
-	struct circ_buf			*xmit = &p->port.state->xmit;
+	struct tty_port			*tport = &p->port.state->port;
 	struct dma_async_tx_descriptor	*desc;
 	struct scatterlist sg;
-	unsigned int	skip_byte = 0;
+	int skip_byte = -1;
 	int ret;
 
 	if (dma->tx_running)
 		return 0;
-	if (uart_tx_stopped(&p->port) || uart_circ_empty(xmit)) {
+	if (uart_tx_stopped(&p->port) || kfifo_is_empty(&tport->xmit_fifo)) {
 
 		/*
 		 * Even if no data, we need to return an error for the two cases
@@ -1161,8 +1161,18 @@ static int omap_8250_tx_dma(struct uart_8250_port *p)
 		return 0;
 	}
 
-	dma->tx_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	sg_init_table(&sg, 1);
+	ret = kfifo_dma_out_prepare_mapped(&tport->xmit_fifo, &sg, 1,
+					   UART_XMIT_SIZE, dma->tx_addr);
+	if (ret != 1) {
+		serial8250_clear_THRI(p);
+		return 0;
+	}
+
+	dma->tx_size = sg_dma_len(&sg);
+
 	if (priv->habit & OMAP_DMA_TX_KICK) {
+		unsigned char c;
 		u8 tx_lvl;
 
 		/*
@@ -1189,13 +1199,16 @@ static int omap_8250_tx_dma(struct uart_8250_port *p)
 			ret = -EINVAL;
 			goto err;
 		}
-		skip_byte = 1;
+		if (!kfifo_get(&tport->xmit_fifo, &c)) {
+			ret = -EINVAL;
+			goto err;
+		}
+		skip_byte = c;
+		/* now we need to recompute due to kfifo_get */
+		kfifo_dma_out_prepare_mapped(&tport->xmit_fifo, &sg, 1,
+				UART_XMIT_SIZE, dma->tx_addr);
 	}
 
-	sg_init_table(&sg, 1);
-	sg_dma_address(&sg) = dma->tx_addr + xmit->tail + skip_byte;
-	sg_dma_len(&sg) = dma->tx_size - skip_byte;
-
 	desc = dmaengine_prep_slave_sg(dma->txchan, &sg, 1, DMA_MEM_TO_DEV,
 			DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
 	if (!desc) {
@@ -1218,11 +1231,13 @@ static int omap_8250_tx_dma(struct uart_8250_port *p)
 		dma->tx_err = 0;
 
 	serial8250_clear_THRI(p);
-	if (skip_byte)
-		serial_out(p, UART_TX, xmit->buf[xmit->tail]);
-	return 0;
+	ret = 0;
+	goto out_skip;
 err:
 	dma->tx_err = 1;
+out_skip:
+	if (skip_byte >= 0)
+		serial_out(p, UART_TX, skip_byte);
 	return ret;
 }
 
@@ -1311,7 +1326,7 @@ static int omap_8250_dma_handle_irq(struct uart_port *port)
 	serial8250_modem_status(up);
 	if (status & UART_LSR_THRE && up->dma->tx_err) {
 		if (uart_tx_stopped(&up->port) ||
-		    uart_circ_empty(&up->port.state->xmit)) {
+		    kfifo_is_empty(&up->port.state->port.xmit_fifo)) {
 			up->dma->tx_err = 0;
 			serial8250_tx_chars(up);
 		} else  {
diff --git a/drivers/tty/serial/8250/8250_pci1xxxx.c b/drivers/tty/serial/8250/8250_pci1xxxx.c
index 2fbb5851f788..d3930bf32fe4 100644
--- a/drivers/tty/serial/8250/8250_pci1xxxx.c
+++ b/drivers/tty/serial/8250/8250_pci1xxxx.c
@@ -382,10 +382,10 @@ static void pci1xxxx_rx_burst(struct uart_port *port, u32 uart_status)
 }
 
 static void pci1xxxx_process_write_data(struct uart_port *port,
-					struct circ_buf *xmit,
 					int *data_empty_count,
 					u32 *valid_byte_count)
 {
+	struct tty_port *tport = &port->state->port;
 	u32 valid_burst_count = *valid_byte_count / UART_BURST_SIZE;
 
 	/*
@@ -395,41 +395,36 @@ static void pci1xxxx_process_write_data(struct uart_port *port,
 	 * one byte at a time.
 	 */
 	while (valid_burst_count) {
+		u32 c;
+
 		if (*data_empty_count - UART_BURST_SIZE < 0)
 			break;
-		if (xmit->tail > (UART_XMIT_SIZE - UART_BURST_SIZE))
+		if (kfifo_len(&tport->xmit_fifo) < UART_BURST_SIZE)
+			break;
+		if (WARN_ON(kfifo_out(&tport->xmit_fifo, (u8 *)&c, sizeof(c)) !=
+		    sizeof(c)))
 			break;
-		writel(*(unsigned int *)&xmit->buf[xmit->tail],
-		       port->membase + UART_TX_BURST_FIFO);
+		writel(c, port->membase + UART_TX_BURST_FIFO);
 		*valid_byte_count -= UART_BURST_SIZE;
 		*data_empty_count -= UART_BURST_SIZE;
 		valid_burst_count -= UART_BYTE_SIZE;
-
-		xmit->tail = (xmit->tail + UART_BURST_SIZE) &
-			     (UART_XMIT_SIZE - 1);
 	}
 
 	while (*valid_byte_count) {
-		if (*data_empty_count - UART_BYTE_SIZE < 0)
+		u8 c;
+
+		if (!kfifo_get(&tport->xmit_fifo, &c))
 			break;
-		writeb(xmit->buf[xmit->tail], port->membase +
-		       UART_TX_BYTE_FIFO);
+		writeb(c, port->membase + UART_TX_BYTE_FIFO);
 		*data_empty_count -= UART_BYTE_SIZE;
 		*valid_byte_count -= UART_BYTE_SIZE;
 
-		/*
-		 * When the tail of the circular buffer is reached, the next
-		 * byte is transferred to the beginning of the buffer.
-		 */
-		xmit->tail = (xmit->tail + UART_BYTE_SIZE) &
-			     (UART_XMIT_SIZE - 1);
-
 		/*
 		 * If there are any pending burst count, data is handled by
 		 * transmitting DWORDs at a time.
 		 */
-		if (valid_burst_count && (xmit->tail <
-		   (UART_XMIT_SIZE - UART_BURST_SIZE)))
+		if (valid_burst_count &&
+		    kfifo_len(&tport->xmit_fifo) >= UART_BURST_SIZE)
 			break;
 	}
 }
@@ -437,11 +432,9 @@ static void pci1xxxx_process_write_data(struct uart_port *port,
 static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
 {
 	struct uart_8250_port *up = up_to_u8250p(port);
+	struct tty_port *tport = &port->state->port;
 	u32 valid_byte_count;
 	int data_empty_count;
-	struct circ_buf *xmit;
-
-	xmit = &port->state->xmit;
 
 	if (port->x_char) {
 		writeb(port->x_char, port->membase + UART_TX);
@@ -450,25 +443,25 @@ static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
 		return;
 	}
 
-	if ((uart_tx_stopped(port)) || (uart_circ_empty(xmit))) {
+	if ((uart_tx_stopped(port)) || kfifo_is_empty(&tport->xmit_fifo)) {
 		port->ops->stop_tx(port);
 	} else {
 		data_empty_count = (pci1xxxx_read_burst_status(port) &
 				    UART_BST_STAT_TX_COUNT_MASK) >> 8;
 		do {
-			valid_byte_count = uart_circ_chars_pending(xmit);
+			valid_byte_count = kfifo_len(&tport->xmit_fifo);
 
-			pci1xxxx_process_write_data(port, xmit,
+			pci1xxxx_process_write_data(port,
 						    &data_empty_count,
 						    &valid_byte_count);
 
 			port->icount.tx++;
-			if (uart_circ_empty(xmit))
+			if (kfifo_is_empty(&tport->xmit_fifo))
 				break;
 		} while (data_empty_count && valid_byte_count);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	 /*
@@ -476,7 +469,8 @@ static void pci1xxxx_tx_burst(struct uart_port *port, u32 uart_status)
 	  * the HW can go idle. So we get here once again with empty FIFO and
 	  * disable the interrupt and RPM in __stop_tx()
 	  */
-	if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM))
+	if (kfifo_is_empty(&tport->xmit_fifo) &&
+	    !(up->capabilities & UART_CAP_RPM))
 		port->ops->stop_tx(port);
 }
 
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index fc9dd5d45295..0c19451d0332 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -1630,7 +1630,7 @@ static void serial8250_start_tx(struct uart_port *port)
 	/* Port locked to synchronize UART_IER access against the console. */
 	lockdep_assert_held_once(&port->lock);
 
-	if (!port->x_char && uart_circ_empty(&port->state->xmit))
+	if (!port->x_char && kfifo_is_empty(&port->state->port.xmit_fifo))
 		return;
 
 	serial8250_rpm_get_tx(up);
@@ -1778,7 +1778,7 @@ EXPORT_SYMBOL_GPL(serial8250_rx_chars);
 void serial8250_tx_chars(struct uart_8250_port *up)
 {
 	struct uart_port *port = &up->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	int count;
 
 	if (port->x_char) {
@@ -1789,14 +1789,19 @@ void serial8250_tx_chars(struct uart_8250_port *up)
 		serial8250_stop_tx(port);
 		return;
 	}
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		__stop_tx(up);
 		return;
 	}
 
 	count = up->tx_loadsz;
 	do {
-		serial_out(up, UART_TX, xmit->buf[xmit->tail]);
+		unsigned char c;
+
+		if (!uart_fifo_get(port, &c))
+			break;
+
+		serial_out(up, UART_TX, c);
 		if (up->bugs & UART_BUG_TXRACE) {
 			/*
 			 * The Aspeed BMC virtual UARTs have a bug where data
@@ -1809,9 +1814,7 @@ void serial8250_tx_chars(struct uart_8250_port *up)
 			 */
 			serial_in(up, UART_SCR);
 		}
-		uart_xmit_advance(port, 1);
-		if (uart_circ_empty(xmit))
-			break;
+
 		if ((up->capabilities & UART_CAP_HFIFO) &&
 		    !uart_lsr_tx_empty(serial_in(up, UART_LSR)))
 			break;
@@ -1821,7 +1824,7 @@ void serial8250_tx_chars(struct uart_8250_port *up)
 			break;
 	} while (--count > 0);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	/*
@@ -1829,7 +1832,8 @@ void serial8250_tx_chars(struct uart_8250_port *up)
 	 * HW can go idle. So we get here once again with empty FIFO and disable
 	 * the interrupt and RPM in __stop_tx()
 	 */
-	if (uart_circ_empty(xmit) && !(up->capabilities & UART_CAP_RPM))
+	if (kfifo_is_empty(&tport->xmit_fifo) &&
+	    !(up->capabilities & UART_CAP_RPM))
 		__stop_tx(up);
 }
 EXPORT_SYMBOL_GPL(serial8250_tx_chars);
diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c
index 2fa3fb30dc6c..51b202eb8296 100644
--- a/drivers/tty/serial/amba-pl011.c
+++ b/drivers/tty/serial/amba-pl011.c
@@ -535,6 +535,7 @@ static void pl011_start_tx_pio(struct uart_amba_port *uap);
 static void pl011_dma_tx_callback(void *data)
 {
 	struct uart_amba_port *uap = data;
+	struct tty_port *tport = &uap->port.state->port;
 	struct pl011_dmatx_data *dmatx = &uap->dmatx;
 	unsigned long flags;
 	u16 dmacr;
@@ -558,7 +559,7 @@ static void pl011_dma_tx_callback(void *data)
 	 * get further refills (hence we check dmacr).
 	 */
 	if (!(dmacr & UART011_TXDMAE) || uart_tx_stopped(&uap->port) ||
-	    uart_circ_empty(&uap->port.state->xmit)) {
+	    kfifo_is_empty(&tport->xmit_fifo)) {
 		uap->dmatx.queued = false;
 		uart_port_unlock_irqrestore(&uap->port, flags);
 		return;
@@ -588,7 +589,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
 	struct dma_chan *chan = dmatx->chan;
 	struct dma_device *dma_dev = chan->device;
 	struct dma_async_tx_descriptor *desc;
-	struct circ_buf *xmit = &uap->port.state->xmit;
+	struct tty_port *tport = &uap->port.state->port;
 	unsigned int count;
 
 	/*
@@ -597,7 +598,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
 	 * the standard interrupt handling.  This ensures that we
 	 * issue a uart_write_wakeup() at the appropriate time.
 	 */
-	count = uart_circ_chars_pending(xmit);
+	count = kfifo_len(&tport->xmit_fifo);
 	if (count < (uap->fifosize >> 1)) {
 		uap->dmatx.queued = false;
 		return 0;
@@ -613,21 +614,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
 	if (count > PL011_DMA_BUFFER_SIZE)
 		count = PL011_DMA_BUFFER_SIZE;
 
-	if (xmit->tail < xmit->head) {
-		memcpy(&dmatx->buf[0], &xmit->buf[xmit->tail], count);
-	} else {
-		size_t first = UART_XMIT_SIZE - xmit->tail;
-		size_t second;
-
-		if (first > count)
-			first = count;
-		second = count - first;
-
-		memcpy(&dmatx->buf[0], &xmit->buf[xmit->tail], first);
-		if (second)
-			memcpy(&dmatx->buf[first], &xmit->buf[0], second);
-	}
-
+	count = kfifo_out_peek(&tport->xmit_fifo, dmatx->buf, count);
 	dmatx->len = count;
 	dmatx->dma = dma_map_single(dma_dev->dev, dmatx->buf, count,
 				    DMA_TO_DEVICE);
@@ -670,7 +657,7 @@ static int pl011_dma_tx_refill(struct uart_amba_port *uap)
 	 */
 	uart_xmit_advance(&uap->port, count);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&uap->port);
 
 	return 1;
@@ -1454,7 +1441,7 @@ static bool pl011_tx_char(struct uart_amba_port *uap, unsigned char c,
 /* Returns true if tx interrupts have to be (kept) enabled  */
 static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq)
 {
-	struct circ_buf *xmit = &uap->port.state->xmit;
+	struct tty_port *tport = &uap->port.state->port;
 	int count = uap->fifosize >> 1;
 
 	if (uap->port.x_char) {
@@ -1463,7 +1450,7 @@ static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq)
 		uap->port.x_char = 0;
 		--count;
 	}
-	if (uart_circ_empty(xmit) || uart_tx_stopped(&uap->port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(&uap->port)) {
 		pl011_stop_tx(&uap->port);
 		return false;
 	}
@@ -1472,20 +1459,25 @@ static bool pl011_tx_chars(struct uart_amba_port *uap, bool from_irq)
 	if (pl011_dma_tx_irq(uap))
 		return true;
 
-	do {
+	while (1) {
+		unsigned char c;
+
 		if (likely(from_irq) && count-- == 0)
 			break;
 
-		if (!pl011_tx_char(uap, xmit->buf[xmit->tail], from_irq))
+		if (!kfifo_peek(&tport->xmit_fifo, &c))
+			break;
+
+		if (!pl011_tx_char(uap, c, from_irq))
 			break;
 
-		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
-	} while (!uart_circ_empty(xmit));
+		kfifo_skip(&tport->xmit_fifo);
+	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&uap->port);
 
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		pl011_stop_tx(&uap->port);
 		return false;
 	}
diff --git a/drivers/tty/serial/ar933x_uart.c b/drivers/tty/serial/ar933x_uart.c
index 7790cbc57391..f2836ecc5c19 100644
--- a/drivers/tty/serial/ar933x_uart.c
+++ b/drivers/tty/serial/ar933x_uart.c
@@ -390,7 +390,7 @@ static void ar933x_uart_rx_chars(struct ar933x_uart_port *up)
 
 static void ar933x_uart_tx_chars(struct ar933x_uart_port *up)
 {
-	struct circ_buf *xmit = &up->port.state->xmit;
+	struct tty_port *tport = &up->port.state->port;
 	struct serial_rs485 *rs485conf = &up->port.rs485;
 	int count;
 	bool half_duplex_send = false;
@@ -399,7 +399,7 @@ static void ar933x_uart_tx_chars(struct ar933x_uart_port *up)
 		return;
 
 	if ((rs485conf->flags & SER_RS485_ENABLED) &&
-	    (up->port.x_char || !uart_circ_empty(xmit))) {
+	    (up->port.x_char || !kfifo_is_empty(&tport->xmit_fifo))) {
 		ar933x_uart_stop_rx_interrupt(up);
 		gpiod_set_value(up->rts_gpiod, !!(rs485conf->flags & SER_RS485_RTS_ON_SEND));
 		half_duplex_send = true;
@@ -408,6 +408,7 @@ static void ar933x_uart_tx_chars(struct ar933x_uart_port *up)
 	count = up->port.fifosize;
 	do {
 		unsigned int rdata;
+		unsigned char c;
 
 		rdata = ar933x_uart_read(up, AR933X_UART_DATA_REG);
 		if ((rdata & AR933X_UART_DATA_TX_CSR) == 0)
@@ -420,18 +421,16 @@ static void ar933x_uart_tx_chars(struct ar933x_uart_port *up)
 			continue;
 		}
 
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(&up->port, &c))
 			break;
 
-		ar933x_uart_putc(up, xmit->buf[xmit->tail]);
-
-		uart_xmit_advance(&up->port, 1);
+		ar933x_uart_putc(up, c);
 	} while (--count > 0);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&up->port);
 
-	if (!uart_circ_empty(xmit)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo)) {
 		ar933x_uart_start_tx_interrupt(up);
 	} else if (half_duplex_send) {
 		ar933x_uart_wait_tx_complete(up);
diff --git a/drivers/tty/serial/arc_uart.c b/drivers/tty/serial/arc_uart.c
index 1aa5b2b49c26..5c4895d154c0 100644
--- a/drivers/tty/serial/arc_uart.c
+++ b/drivers/tty/serial/arc_uart.c
@@ -155,7 +155,7 @@ static unsigned int arc_serial_tx_empty(struct uart_port *port)
  */
 static void arc_serial_tx_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	int sent = 0;
 	unsigned char ch;
 
@@ -164,9 +164,7 @@ static void arc_serial_tx_chars(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 		sent = 1;
-	} else if (!uart_circ_empty(xmit)) {
-		ch = xmit->buf[xmit->tail];
-		uart_xmit_advance(port, 1);
+	} else if (uart_fifo_get(port, &ch)) {
 		while (!(UART_GET_STATUS(port) & TXEMPTY))
 			cpu_relax();
 		UART_SET_DATA(port, ch);
@@ -177,7 +175,7 @@ static void arc_serial_tx_chars(struct uart_port *port)
 	 * If num chars in xmit buffer are too few, ask tty layer for more.
 	 * By Hard ISR to schedule processing in software interrupt part
 	 */
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	if (sent)
diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c
index 85667f709515..5bb5e4303754 100644
--- a/drivers/tty/serial/atmel_serial.c
+++ b/drivers/tty/serial/atmel_serial.c
@@ -857,7 +857,7 @@ static void atmel_complete_tx_dma(void *arg)
 {
 	struct atmel_uart_port *atmel_port = arg;
 	struct uart_port *port = &atmel_port->uart;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct dma_chan *chan = atmel_port->chan_tx;
 	unsigned long flags;
 
@@ -873,15 +873,15 @@ static void atmel_complete_tx_dma(void *arg)
 	atmel_port->desc_tx = NULL;
 	spin_unlock(&atmel_port->lock_tx);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	/*
-	 * xmit is a circular buffer so, if we have just send data from
-	 * xmit->tail to the end of xmit->buf, now we have to transmit the
-	 * remaining data from the beginning of xmit->buf to xmit->head.
+	 * xmit is a circular buffer so, if we have just send data from the
+	 * tail to the end, now we have to transmit the remaining data from the
+	 * beginning to the head.
 	 */
-	if (!uart_circ_empty(xmit))
+	if (!kfifo_is_empty(&tport->xmit_fifo))
 		atmel_tasklet_schedule(atmel_port, &atmel_port->tasklet_tx);
 	else if (atmel_uart_is_half_duplex(port)) {
 		/*
@@ -919,18 +919,18 @@ static void atmel_release_tx_dma(struct uart_port *port)
 static void atmel_tx_dma(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct dma_chan *chan = atmel_port->chan_tx;
 	struct dma_async_tx_descriptor *desc;
 	struct scatterlist sgl[2], *sg, *sg_tx = &atmel_port->sg_tx;
-	unsigned int tx_len, part1_len, part2_len, sg_len;
+	unsigned int tx_len, tail, part1_len, part2_len, sg_len;
 	dma_addr_t phys_addr;
 
 	/* Make sure we have an idle channel */
 	if (atmel_port->desc_tx != NULL)
 		return;
 
-	if (!uart_circ_empty(xmit) && !uart_tx_stopped(port)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo) && !uart_tx_stopped(port)) {
 		/*
 		 * DMA is idle now.
 		 * Port xmit buffer is already mapped,
@@ -940,9 +940,8 @@ static void atmel_tx_dma(struct uart_port *port)
 		 * Take the port lock to get a
 		 * consistent xmit buffer state.
 		 */
-		tx_len = CIRC_CNT_TO_END(xmit->head,
-					 xmit->tail,
-					 UART_XMIT_SIZE);
+		tx_len = kfifo_out_linear(&tport->xmit_fifo, &tail,
+				UART_XMIT_SIZE);
 
 		if (atmel_port->fifo_size) {
 			/* multi data mode */
@@ -956,7 +955,7 @@ static void atmel_tx_dma(struct uart_port *port)
 
 		sg_init_table(sgl, 2);
 		sg_len = 0;
-		phys_addr = sg_dma_address(sg_tx) + xmit->tail;
+		phys_addr = sg_dma_address(sg_tx) + tail;
 		if (part1_len) {
 			sg = &sgl[sg_len++];
 			sg_dma_address(sg) = phys_addr;
@@ -973,7 +972,7 @@ static void atmel_tx_dma(struct uart_port *port)
 
 		/*
 		 * save tx_len so atmel_complete_tx_dma() will increase
-		 * xmit->tail correctly
+		 * tail correctly
 		 */
 		atmel_port->tx_len = tx_len;
 
@@ -1003,13 +1002,14 @@ static void atmel_tx_dma(struct uart_port *port)
 		dma_async_issue_pending(chan);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
 static int atmel_prepare_tx_dma(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
+	struct tty_port *tport = &port->state->port;
 	struct device *mfd_dev = port->dev->parent;
 	dma_cap_mask_t		mask;
 	struct dma_slave_config config;
@@ -1031,11 +1031,11 @@ static int atmel_prepare_tx_dma(struct uart_port *port)
 	spin_lock_init(&atmel_port->lock_tx);
 	sg_init_table(&atmel_port->sg_tx, 1);
 	/* UART circular tx buffer is an aligned page. */
-	BUG_ON(!PAGE_ALIGNED(port->state->xmit.buf));
+	BUG_ON(!PAGE_ALIGNED(tport->xmit_buf));
 	sg_set_page(&atmel_port->sg_tx,
-			virt_to_page(port->state->xmit.buf),
+			virt_to_page(tport->xmit_buf),
 			UART_XMIT_SIZE,
-			offset_in_page(port->state->xmit.buf));
+			offset_in_page(tport->xmit_buf));
 	nent = dma_map_sg(port->dev,
 				&atmel_port->sg_tx,
 				1,
@@ -1047,7 +1047,7 @@ static int atmel_prepare_tx_dma(struct uart_port *port)
 	} else {
 		dev_dbg(port->dev, "%s: mapped %d@%p to %pad\n", __func__,
 			sg_dma_len(&atmel_port->sg_tx),
-			port->state->xmit.buf,
+			tport->xmit_buf,
 			&sg_dma_address(&atmel_port->sg_tx));
 	}
 
@@ -1459,9 +1459,8 @@ static void atmel_release_tx_pdc(struct uart_port *port)
 static void atmel_tx_pdc(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct atmel_dma_buffer *pdc = &atmel_port->pdc_tx;
-	int count;
 
 	/* nothing left to transmit? */
 	if (atmel_uart_readl(port, ATMEL_PDC_TCR))
@@ -1474,17 +1473,19 @@ static void atmel_tx_pdc(struct uart_port *port)
 	/* disable PDC transmit */
 	atmel_uart_writel(port, ATMEL_PDC_PTCR, ATMEL_PDC_TXTDIS);
 
-	if (!uart_circ_empty(xmit) && !uart_tx_stopped(port)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo) && !uart_tx_stopped(port)) {
+		unsigned int count, tail;
+
 		dma_sync_single_for_device(port->dev,
 					   pdc->dma_addr,
 					   pdc->dma_size,
 					   DMA_TO_DEVICE);
 
-		count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+		count = kfifo_out_linear(&tport->xmit_fifo, &tail,
+				UART_XMIT_SIZE);
 		pdc->ofs = count;
 
-		atmel_uart_writel(port, ATMEL_PDC_TPR,
-				  pdc->dma_addr + xmit->tail);
+		atmel_uart_writel(port, ATMEL_PDC_TPR, pdc->dma_addr + tail);
 		atmel_uart_writel(port, ATMEL_PDC_TCR, count);
 		/* re-enable PDC transmit */
 		atmel_uart_writel(port, ATMEL_PDC_PTCR, ATMEL_PDC_TXTEN);
@@ -1498,7 +1499,7 @@ static void atmel_tx_pdc(struct uart_port *port)
 		}
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
@@ -1506,9 +1507,9 @@ static int atmel_prepare_tx_pdc(struct uart_port *port)
 {
 	struct atmel_uart_port *atmel_port = to_atmel_uart_port(port);
 	struct atmel_dma_buffer *pdc = &atmel_port->pdc_tx;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
-	pdc->buf = xmit->buf;
+	pdc->buf = tport->xmit_buf;
 	pdc->dma_addr = dma_map_single(port->dev,
 					pdc->buf,
 					UART_XMIT_SIZE,
diff --git a/drivers/tty/serial/clps711x.c b/drivers/tty/serial/clps711x.c
index 7927725b8957..30425a3d19fb 100644
--- a/drivers/tty/serial/clps711x.c
+++ b/drivers/tty/serial/clps711x.c
@@ -146,7 +146,8 @@ static irqreturn_t uart_clps711x_int_tx(int irq, void *dev_id)
 {
 	struct uart_port *port = dev_id;
 	struct clps711x_port *s = dev_get_drvdata(port->dev);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
+	unsigned char c;
 
 	if (port->x_char) {
 		writew(port->x_char, port->membase + UARTDR_OFFSET);
@@ -155,7 +156,7 @@ static irqreturn_t uart_clps711x_int_tx(int irq, void *dev_id)
 		return IRQ_HANDLED;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		if (s->tx_enabled) {
 			disable_irq_nosync(port->irq);
 			s->tx_enabled = 0;
@@ -163,18 +164,17 @@ static irqreturn_t uart_clps711x_int_tx(int irq, void *dev_id)
 		return IRQ_HANDLED;
 	}
 
-	while (!uart_circ_empty(xmit)) {
+	while (uart_fifo_get(port, &c)) {
 		u32 sysflg = 0;
 
-		writew(xmit->buf[xmit->tail], port->membase + UARTDR_OFFSET);
-		uart_xmit_advance(port, 1);
+		writew(c, port->membase + UARTDR_OFFSET);
 
 		regmap_read(s->syscon, SYSFLG_OFFSET, &sysflg);
 		if (sysflg & SYSFLG_UTXFF)
 			break;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	return IRQ_HANDLED;
diff --git a/drivers/tty/serial/cpm_uart.c b/drivers/tty/serial/cpm_uart.c
index df56c6c5afd0..a927478f581d 100644
--- a/drivers/tty/serial/cpm_uart.c
+++ b/drivers/tty/serial/cpm_uart.c
@@ -648,7 +648,7 @@ static int cpm_uart_tx_pump(struct uart_port *port)
 	int count;
 	struct uart_cpm_port *pinfo =
 		container_of(port, struct uart_cpm_port, port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
 	/* Handle xon/xoff */
 	if (port->x_char) {
@@ -673,7 +673,7 @@ static int cpm_uart_tx_pump(struct uart_port *port)
 		return 1;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		cpm_uart_stop_tx(port);
 		return 0;
 	}
@@ -681,16 +681,10 @@ static int cpm_uart_tx_pump(struct uart_port *port)
 	/* Pick next descriptor and fill from buffer */
 	bdp = pinfo->tx_cur;
 
-	while (!(in_be16(&bdp->cbd_sc) & BD_SC_READY) && !uart_circ_empty(xmit)) {
-		count = 0;
+	while (!(in_be16(&bdp->cbd_sc) & BD_SC_READY) &&
+			!kfifo_is_empty(&tport->xmit_fifo)) {
 		p = cpm2cpu_addr(in_be32(&bdp->cbd_bufaddr), pinfo);
-		while (count < pinfo->tx_fifosize) {
-			*p++ = xmit->buf[xmit->tail];
-			uart_xmit_advance(port, 1);
-			count++;
-			if (uart_circ_empty(xmit))
-				break;
-		}
+		count = uart_fifo_out(port, p, pinfo->tx_fifosize);
 		out_be16(&bdp->cbd_datlen, count);
 		setbits16(&bdp->cbd_sc, BD_SC_READY);
 		/* Get next BD. */
@@ -701,10 +695,10 @@ static int cpm_uart_tx_pump(struct uart_port *port)
 	}
 	pinfo->tx_cur = bdp;
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		cpm_uart_stop_tx(port);
 		return 0;
 	}
diff --git a/drivers/tty/serial/digicolor-usart.c b/drivers/tty/serial/digicolor-usart.c
index e419c4bde8b7..2ccd13cc0a89 100644
--- a/drivers/tty/serial/digicolor-usart.c
+++ b/drivers/tty/serial/digicolor-usart.c
@@ -179,8 +179,9 @@ static void digicolor_uart_rx(struct uart_port *port)
 
 static void digicolor_uart_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned long flags;
+	unsigned char c;
 
 	if (digicolor_uart_tx_full(port))
 		return;
@@ -194,20 +195,19 @@ static void digicolor_uart_tx(struct uart_port *port)
 		goto out;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		digicolor_uart_stop_tx(port);
 		goto out;
 	}
 
-	while (!uart_circ_empty(xmit)) {
-		writeb(xmit->buf[xmit->tail], port->membase + UA_EMI_REC);
-		uart_xmit_advance(port, 1);
+	while (uart_fifo_get(port, &c)) {
+		writeb(c, port->membase + UA_EMI_REC);
 
 		if (digicolor_uart_tx_full(port))
 			break;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 out:
diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c
index 6df7af9edc1c..eba91daedef8 100644
--- a/drivers/tty/serial/dz.c
+++ b/drivers/tty/serial/dz.c
@@ -252,13 +252,13 @@ static inline void dz_receive_chars(struct dz_mux *mux)
 static inline void dz_transmit_chars(struct dz_mux *mux)
 {
 	struct dz_port *dport = &mux->dport[0];
-	struct circ_buf *xmit;
+	struct tty_port *tport;
 	unsigned char tmp;
 	u16 status;
 
 	status = dz_in(dport, DZ_CSR);
 	dport = &mux->dport[LINE(status)];
-	xmit = &dport->port.state->xmit;
+	tport = &dport->port.state->port;
 
 	if (dport->port.x_char) {		/* XON/XOFF chars */
 		dz_out(dport, DZ_TDR, dport->port.x_char);
@@ -267,7 +267,8 @@ static inline void dz_transmit_chars(struct dz_mux *mux)
 		return;
 	}
 	/* If nothing to do or stopped or hardware stopped. */
-	if (uart_circ_empty(xmit) || uart_tx_stopped(&dport->port)) {
+	if (uart_tx_stopped(&dport->port) ||
+			!uart_fifo_get(&dport->port, &tmp)) {
 		uart_port_lock(&dport->port);
 		dz_stop_tx(&dport->port);
 		uart_port_unlock(&dport->port);
@@ -278,15 +279,13 @@ static inline void dz_transmit_chars(struct dz_mux *mux)
 	 * If something to do... (remember the dz has no output fifo,
 	 * so we go one char at a time) :-<
 	 */
-	tmp = xmit->buf[xmit->tail];
 	dz_out(dport, DZ_TDR, tmp);
-	uart_xmit_advance(&dport->port, 1);
 
-	if (uart_circ_chars_pending(xmit) < DZ_WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < DZ_WAKEUP_CHARS)
 		uart_write_wakeup(&dport->port);
 
 	/* Are we are done. */
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		uart_port_lock(&dport->port);
 		dz_stop_tx(&dport->port);
 		uart_port_unlock(&dport->port);
diff --git a/drivers/tty/serial/fsl_linflexuart.c b/drivers/tty/serial/fsl_linflexuart.c
index 5426322b5f0c..e972df4b188d 100644
--- a/drivers/tty/serial/fsl_linflexuart.c
+++ b/drivers/tty/serial/fsl_linflexuart.c
@@ -174,17 +174,18 @@ static void linflex_put_char(struct uart_port *sport, unsigned char c)
 
 static inline void linflex_transmit_buffer(struct uart_port *sport)
 {
-	struct circ_buf *xmit = &sport->state->xmit;
+	struct tty_port *tport = &sport->state->port;
+	unsigned char c;
 
-	while (!uart_circ_empty(xmit)) {
-		linflex_put_char(sport, xmit->buf[xmit->tail]);
-		uart_xmit_advance(sport, 1);
+	while (uart_fifo_get(sport, &c)) {
+		linflex_put_char(sport, c);
+		sport->icount.tx++;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(sport);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		linflex_stop_tx(sport);
 }
 
@@ -200,7 +201,7 @@ static void linflex_start_tx(struct uart_port *port)
 static irqreturn_t linflex_txint(int irq, void *dev_id)
 {
 	struct uart_port *sport = dev_id;
-	struct circ_buf *xmit = &sport->state->xmit;
+	struct tty_port *tport = &sport->state->port;
 	unsigned long flags;
 
 	uart_port_lock_irqsave(sport, &flags);
@@ -210,7 +211,7 @@ static irqreturn_t linflex_txint(int irq, void *dev_id)
 		goto out;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(sport)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(sport)) {
 		linflex_stop_tx(sport);
 		goto out;
 	}
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index bbcbc91482af..ae5e1ecc48fc 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -7,6 +7,7 @@
 
 #include <linux/bitfield.h>
 #include <linux/bits.h>
+#include <linux/circ_buf.h>
 #include <linux/clk.h>
 #include <linux/console.h>
 #include <linux/delay.h>
@@ -473,7 +474,7 @@ static void lpuart32_stop_rx(struct uart_port *port)
 
 static void lpuart_dma_tx(struct lpuart_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.state->xmit;
+	struct tty_port *tport = &sport->port.state->port;
 	struct scatterlist *sgl = sport->tx_sgl;
 	struct device *dev = sport->port.dev;
 	struct dma_chan *chan = sport->dma_tx_chan;
@@ -482,18 +483,10 @@ static void lpuart_dma_tx(struct lpuart_port *sport)
 	if (sport->dma_tx_in_progress)
 		return;
 
-	sport->dma_tx_bytes = uart_circ_chars_pending(xmit);
-
-	if (xmit->tail < xmit->head || xmit->head == 0) {
-		sport->dma_tx_nents = 1;
-		sg_init_one(sgl, xmit->buf + xmit->tail, sport->dma_tx_bytes);
-	} else {
-		sport->dma_tx_nents = 2;
-		sg_init_table(sgl, 2);
-		sg_set_buf(sgl, xmit->buf + xmit->tail,
-				UART_XMIT_SIZE - xmit->tail);
-		sg_set_buf(sgl + 1, xmit->buf, xmit->head);
-	}
+	sg_init_table(sgl, ARRAY_SIZE(sport->tx_sgl));
+	sport->dma_tx_bytes = kfifo_len(&tport->xmit_fifo);
+	sport->dma_tx_nents = kfifo_dma_out_prepare(&tport->xmit_fifo, sgl,
+			ARRAY_SIZE(sport->tx_sgl), sport->dma_tx_bytes);
 
 	ret = dma_map_sg(chan->device->dev, sgl, sport->dma_tx_nents,
 			 DMA_TO_DEVICE);
@@ -521,14 +514,15 @@ static void lpuart_dma_tx(struct lpuart_port *sport)
 
 static bool lpuart_stopped_or_empty(struct uart_port *port)
 {
-	return uart_circ_empty(&port->state->xmit) || uart_tx_stopped(port);
+	return kfifo_is_empty(&port->state->port.xmit_fifo) ||
+		uart_tx_stopped(port);
 }
 
 static void lpuart_dma_tx_complete(void *arg)
 {
 	struct lpuart_port *sport = arg;
 	struct scatterlist *sgl = &sport->tx_sgl[0];
-	struct circ_buf *xmit = &sport->port.state->xmit;
+	struct tty_port *tport = &sport->port.state->port;
 	struct dma_chan *chan = sport->dma_tx_chan;
 	unsigned long flags;
 
@@ -545,7 +539,7 @@ static void lpuart_dma_tx_complete(void *arg)
 	sport->dma_tx_in_progress = false;
 	uart_port_unlock_irqrestore(&sport->port, flags);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&sport->port);
 
 	if (waitqueue_active(&sport->dma_wait)) {
@@ -756,8 +750,9 @@ static inline void lpuart_transmit_buffer(struct lpuart_port *sport)
 
 static inline void lpuart32_transmit_buffer(struct lpuart_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.state->xmit;
+	struct tty_port *tport = &sport->port.state->port;
 	unsigned long txcnt;
+	unsigned char c;
 
 	if (sport->port.x_char) {
 		lpuart32_write(&sport->port, sport->port.x_char, UARTDATA);
@@ -774,18 +769,18 @@ static inline void lpuart32_transmit_buffer(struct lpuart_port *sport)
 	txcnt = lpuart32_read(&sport->port, UARTWATER);
 	txcnt = txcnt >> UARTWATER_TXCNT_OFF;
 	txcnt &= UARTWATER_COUNT_MASK;
-	while (!uart_circ_empty(xmit) && (txcnt < sport->txfifo_size)) {
-		lpuart32_write(&sport->port, xmit->buf[xmit->tail], UARTDATA);
-		uart_xmit_advance(&sport->port, 1);
+	while (txcnt < sport->txfifo_size &&
+			uart_fifo_get(&sport->port, &c)) {
+		lpuart32_write(&sport->port, c, UARTDATA);
 		txcnt = lpuart32_read(&sport->port, UARTWATER);
 		txcnt = txcnt >> UARTWATER_TXCNT_OFF;
 		txcnt &= UARTWATER_COUNT_MASK;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&sport->port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		lpuart32_stop_tx(&sport->port);
 }
 
diff --git a/drivers/tty/serial/icom.c b/drivers/tty/serial/icom.c
index a75eafbcbea3..29e42831df39 100644
--- a/drivers/tty/serial/icom.c
+++ b/drivers/tty/serial/icom.c
@@ -877,10 +877,10 @@ unlock:
 static int icom_write(struct uart_port *port)
 {
 	struct icom_port *icom_port = to_icom_port(port);
+	struct tty_port *tport = &port->state->port;
 	unsigned long data_count;
 	unsigned char cmdReg;
 	unsigned long offset;
-	int temp_tail = port->state->xmit.tail;
 
 	trace(icom_port, "WRITE", 0);
 
@@ -890,16 +890,8 @@ static int icom_write(struct uart_port *port)
 		return 0;
 	}
 
-	data_count = 0;
-	while ((port->state->xmit.head != temp_tail) &&
-	       (data_count <= XMIT_BUFF_SZ)) {
-
-		icom_port->xmit_buf[data_count++] =
-		    port->state->xmit.buf[temp_tail];
-
-		temp_tail++;
-		temp_tail &= (UART_XMIT_SIZE - 1);
-	}
+	data_count = kfifo_out_peek(&tport->xmit_fifo, icom_port->xmit_buf,
+			XMIT_BUFF_SZ);
 
 	if (data_count) {
 		icom_port->statStg->xmit[0].flags =
@@ -956,7 +948,8 @@ static inline void check_modem_status(struct icom_port *icom_port)
 
 static void xmit_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 {
-	u16 count, i;
+	struct tty_port *tport = &icom_port->uart_port.state->port;
+	u16 count;
 
 	if (port_int_reg & (INT_XMIT_COMPLETED)) {
 		trace(icom_port, "XMIT_COMPLETE", 0);
@@ -968,13 +961,7 @@ static void xmit_interrupt(u16 port_int_reg, struct icom_port *icom_port)
 		count = le16_to_cpu(icom_port->statStg->xmit[0].leLength);
 		icom_port->uart_port.icount.tx += count;
 
-		for (i=0; i<count &&
-			!uart_circ_empty(&icom_port->uart_port.state->xmit); i++) {
-
-			icom_port->uart_port.state->xmit.tail++;
-			icom_port->uart_port.state->xmit.tail &=
-				(UART_XMIT_SIZE - 1);
-		}
+		kfifo_skip_count(&tport->xmit_fifo, count);
 
 		if (!icom_write(&icom_port->uart_port))
 			/* activate write queue */
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index e14813250616..56b76a221082 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -8,6 +8,7 @@
  * Copyright (C) 2004 Pengutronix
  */
 
+#include <linux/circ_buf.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
@@ -521,7 +522,8 @@ static void imx_uart_dma_tx(struct imx_port *sport);
 /* called with port.lock taken and irqs off */
 static inline void imx_uart_transmit_buffer(struct imx_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.state->xmit;
+	struct tty_port *tport = &sport->port.state->port;
+	unsigned char c;
 
 	if (sport->port.x_char) {
 		/* Send next char */
@@ -531,7 +533,8 @@ static inline void imx_uart_transmit_buffer(struct imx_port *sport)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(&sport->port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) ||
+			uart_tx_stopped(&sport->port)) {
 		imx_uart_stop_tx(&sport->port);
 		return;
 	}
@@ -555,26 +558,22 @@ static inline void imx_uart_transmit_buffer(struct imx_port *sport)
 		return;
 	}
 
-	while (!uart_circ_empty(xmit) &&
-	       !(imx_uart_readl(sport, imx_uart_uts_reg(sport)) & UTS_TXFULL)) {
-		/* send xmit->buf[xmit->tail]
-		 * out the port here */
-		imx_uart_writel(sport, xmit->buf[xmit->tail], URTX0);
-		uart_xmit_advance(&sport->port, 1);
-	}
+	while (!(imx_uart_readl(sport, imx_uart_uts_reg(sport)) & UTS_TXFULL) &&
+			uart_fifo_get(&sport->port, &c))
+		imx_uart_writel(sport, c, URTX0);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&sport->port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		imx_uart_stop_tx(&sport->port);
 }
 
 static void imx_uart_dma_tx_callback(void *data)
 {
 	struct imx_port *sport = data;
+	struct tty_port *tport = &sport->port.state->port;
 	struct scatterlist *sgl = &sport->tx_sgl[0];
-	struct circ_buf *xmit = &sport->port.state->xmit;
 	unsigned long flags;
 	u32 ucr1;
 
@@ -592,10 +591,11 @@ static void imx_uart_dma_tx_callback(void *data)
 
 	sport->dma_is_txing = 0;
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&sport->port);
 
-	if (!uart_circ_empty(xmit) && !uart_tx_stopped(&sport->port))
+	if (!kfifo_is_empty(&tport->xmit_fifo) &&
+			!uart_tx_stopped(&sport->port))
 		imx_uart_dma_tx(sport);
 	else if (sport->port.rs485.flags & SER_RS485_ENABLED) {
 		u32 ucr4 = imx_uart_readl(sport, UCR4);
@@ -609,7 +609,7 @@ static void imx_uart_dma_tx_callback(void *data)
 /* called with port.lock taken and irqs off */
 static void imx_uart_dma_tx(struct imx_port *sport)
 {
-	struct circ_buf *xmit = &sport->port.state->xmit;
+	struct tty_port *tport = &sport->port.state->port;
 	struct scatterlist *sgl = sport->tx_sgl;
 	struct dma_async_tx_descriptor *desc;
 	struct dma_chan	*chan = sport->dma_chan_tx;
@@ -624,18 +624,10 @@ static void imx_uart_dma_tx(struct imx_port *sport)
 	ucr4 &= ~UCR4_TCEN;
 	imx_uart_writel(sport, ucr4, UCR4);
 
-	sport->tx_bytes = uart_circ_chars_pending(xmit);
-
-	if (xmit->tail < xmit->head || xmit->head == 0) {
-		sport->dma_tx_nents = 1;
-		sg_init_one(sgl, xmit->buf + xmit->tail, sport->tx_bytes);
-	} else {
-		sport->dma_tx_nents = 2;
-		sg_init_table(sgl, 2);
-		sg_set_buf(sgl, xmit->buf + xmit->tail,
-				UART_XMIT_SIZE - xmit->tail);
-		sg_set_buf(sgl + 1, xmit->buf, xmit->head);
-	}
+	sg_init_table(sgl, ARRAY_SIZE(sport->tx_sgl));
+	sport->tx_bytes = kfifo_len(&tport->xmit_fifo);
+	sport->dma_tx_nents = kfifo_dma_out_prepare(&tport->xmit_fifo, sgl,
+			ARRAY_SIZE(sport->tx_sgl), sport->tx_bytes);
 
 	ret = dma_map_sg(dev, sgl, sport->dma_tx_nents, DMA_TO_DEVICE);
 	if (ret == 0) {
@@ -653,8 +645,7 @@ static void imx_uart_dma_tx(struct imx_port *sport)
 	desc->callback = imx_uart_dma_tx_callback;
 	desc->callback_param = sport;
 
-	dev_dbg(dev, "TX: prepare to send %lu bytes by DMA.\n",
-			uart_circ_chars_pending(xmit));
+	dev_dbg(dev, "TX: prepare to send %u bytes by DMA.\n", sport->tx_bytes);
 
 	ucr1 = imx_uart_readl(sport, UCR1);
 	ucr1 |= UCR1_TXDMAEN;
@@ -671,9 +662,10 @@ static void imx_uart_dma_tx(struct imx_port *sport)
 static void imx_uart_start_tx(struct uart_port *port)
 {
 	struct imx_port *sport = (struct imx_port *)port;
+	struct tty_port *tport = &sport->port.state->port;
 	u32 ucr1;
 
-	if (!sport->port.x_char && uart_circ_empty(&port->state->xmit))
+	if (!sport->port.x_char && kfifo_is_empty(&tport->xmit_fifo))
 		return;
 
 	/*
@@ -749,7 +741,7 @@ static void imx_uart_start_tx(struct uart_port *port)
 			return;
 		}
 
-		if (!uart_circ_empty(&port->state->xmit) &&
+		if (!kfifo_is_empty(&tport->xmit_fifo) &&
 		    !uart_tx_stopped(port))
 			imx_uart_dma_tx(sport);
 		return;
diff --git a/drivers/tty/serial/ip22zilog.c b/drivers/tty/serial/ip22zilog.c
index 320b29cd4683..c2cae50f06f3 100644
--- a/drivers/tty/serial/ip22zilog.c
+++ b/drivers/tty/serial/ip22zilog.c
@@ -355,7 +355,8 @@ static void ip22zilog_status_handle(struct uart_ip22zilog_port *up,
 static void ip22zilog_transmit_chars(struct uart_ip22zilog_port *up,
 				    struct zilog_channel *channel)
 {
-	struct circ_buf *xmit;
+	struct tty_port *tport;
+	unsigned char c;
 
 	if (ZS_IS_CONS(up)) {
 		unsigned char status = readb(&channel->control);
@@ -398,20 +399,18 @@ static void ip22zilog_transmit_chars(struct uart_ip22zilog_port *up,
 
 	if (up->port.state == NULL)
 		goto ack_tx_int;
-	xmit = &up->port.state->xmit;
-	if (uart_circ_empty(xmit))
-		goto ack_tx_int;
+	tport = &up->port.state->port;
 	if (uart_tx_stopped(&up->port))
 		goto ack_tx_int;
+	if (!uart_fifo_get(&up->port, &c))
+		goto ack_tx_int;
 
 	up->flags |= IP22ZILOG_FLAG_TX_ACTIVE;
-	writeb(xmit->buf[xmit->tail], &channel->data);
+	writeb(c, &channel->data);
 	ZSDELAY();
 	ZS_WSYNC(channel);
 
-	uart_xmit_advance(&up->port, 1);
-
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&up->port);
 
 	return;
@@ -600,17 +599,16 @@ static void ip22zilog_start_tx(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 	} else {
-		struct circ_buf *xmit = &port->state->xmit;
+		struct tty_port *tport = &port->state->port;
+		unsigned char c;
 
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(port, &c))
 			return;
-		writeb(xmit->buf[xmit->tail], &channel->data);
+		writeb(c, &channel->data);
 		ZSDELAY();
 		ZS_WSYNC(channel);
 
-		uart_xmit_advance(port, 1);
-
-		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 			uart_write_wakeup(&up->port);
 	}
 }
diff --git a/drivers/tty/serial/jsm/jsm_cls.c b/drivers/tty/serial/jsm/jsm_cls.c
index ddbd42c09637..6e40792f92cf 100644
--- a/drivers/tty/serial/jsm/jsm_cls.c
+++ b/drivers/tty/serial/jsm/jsm_cls.c
@@ -443,20 +443,14 @@ static void cls_copy_data_from_uart_to_queue(struct jsm_channel *ch)
 
 static void cls_copy_data_from_queue_to_uart(struct jsm_channel *ch)
 {
-	u16 tail;
+	struct tty_port *tport;
 	int n;
-	int qlen;
 	u32 len_written = 0;
-	struct circ_buf *circ;
 
 	if (!ch)
 		return;
 
-	circ = &ch->uart_port.state->xmit;
-
-	/* No data to write to the UART */
-	if (uart_circ_empty(circ))
-		return;
+	tport = &ch->uart_port.state->port;
 
 	/* If port is "stopped", don't send any data to the UART */
 	if ((ch->ch_flags & CH_STOP) || (ch->ch_flags & CH_BREAK_SENDING))
@@ -467,29 +461,22 @@ static void cls_copy_data_from_queue_to_uart(struct jsm_channel *ch)
 		return;
 
 	n = 32;
+	while (n > 0) {
+		unsigned char c;
 
-	/* cache tail of queue */
-	tail = circ->tail & (UART_XMIT_SIZE - 1);
-	qlen = uart_circ_chars_pending(circ);
-
-	/* Find minimum of the FIFO space, versus queue length */
-	n = min(n, qlen);
+		if (!kfifo_get(&tport->xmit_fifo, &c))
+			break;
 
-	while (n > 0) {
-		writeb(circ->buf[tail], &ch->ch_cls_uart->txrx);
-		tail = (tail + 1) & (UART_XMIT_SIZE - 1);
+		writeb(c, &ch->ch_cls_uart->txrx);
 		n--;
 		ch->ch_txcount++;
 		len_written++;
 	}
 
-	/* Update the final tail */
-	circ->tail = tail & (UART_XMIT_SIZE - 1);
-
 	if (len_written > ch->ch_t_tlevel)
 		ch->ch_flags &= ~(CH_TX_FIFO_EMPTY | CH_TX_FIFO_LWM);
 
-	if (uart_circ_empty(circ))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		uart_write_wakeup(&ch->uart_port);
 }
 
diff --git a/drivers/tty/serial/jsm/jsm_neo.c b/drivers/tty/serial/jsm/jsm_neo.c
index 1fa10f19368f..e8e13bf056e2 100644
--- a/drivers/tty/serial/jsm/jsm_neo.c
+++ b/drivers/tty/serial/jsm/jsm_neo.c
@@ -474,21 +474,21 @@ static void neo_copy_data_from_uart_to_queue(struct jsm_channel *ch)
 
 static void neo_copy_data_from_queue_to_uart(struct jsm_channel *ch)
 {
-	u16 head;
-	u16 tail;
+	struct tty_port *tport;
+	unsigned char *tail;
+	unsigned char c;
 	int n;
 	int s;
 	int qlen;
 	u32 len_written = 0;
-	struct circ_buf *circ;
 
 	if (!ch)
 		return;
 
-	circ = &ch->uart_port.state->xmit;
+	tport = &ch->uart_port.state->port;
 
 	/* No data to write to the UART */
-	if (uart_circ_empty(circ))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		return;
 
 	/* If port is "stopped", don't send any data to the UART */
@@ -504,10 +504,9 @@ static void neo_copy_data_from_queue_to_uart(struct jsm_channel *ch)
 		if (ch->ch_cached_lsr & UART_LSR_THRE) {
 			ch->ch_cached_lsr &= ~(UART_LSR_THRE);
 
-			writeb(circ->buf[circ->tail], &ch->ch_neo_uart->txrx);
-			jsm_dbg(WRITE, &ch->ch_bd->pci_dev,
-				"Tx data: %x\n", circ->buf[circ->tail]);
-			circ->tail = (circ->tail + 1) & (UART_XMIT_SIZE - 1);
+			WARN_ON_ONCE(!kfifo_get(&tport->xmit_fifo, &c));
+			writeb(c, &ch->ch_neo_uart->txrx);
+			jsm_dbg(WRITE, &ch->ch_bd->pci_dev, "Tx data: %x\n", c);
 			ch->ch_txcount++;
 		}
 		return;
@@ -520,38 +519,27 @@ static void neo_copy_data_from_queue_to_uart(struct jsm_channel *ch)
 		return;
 
 	n = UART_17158_TX_FIFOSIZE - ch->ch_t_tlevel;
-
-	/* cache head and tail of queue */
-	head = circ->head & (UART_XMIT_SIZE - 1);
-	tail = circ->tail & (UART_XMIT_SIZE - 1);
-	qlen = uart_circ_chars_pending(circ);
+	qlen = kfifo_len(&tport->xmit_fifo);
 
 	/* Find minimum of the FIFO space, versus queue length */
 	n = min(n, qlen);
 
 	while (n > 0) {
-
-		s = ((head >= tail) ? head : UART_XMIT_SIZE) - tail;
-		s = min(s, n);
-
+		s = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail, n);
 		if (s <= 0)
 			break;
 
-		memcpy_toio(&ch->ch_neo_uart->txrxburst, circ->buf + tail, s);
-		/* Add and flip queue if needed */
-		tail = (tail + s) & (UART_XMIT_SIZE - 1);
+		memcpy_toio(&ch->ch_neo_uart->txrxburst, tail, s);
+		kfifo_skip_count(&tport->xmit_fifo, s);
 		n -= s;
 		ch->ch_txcount += s;
 		len_written += s;
 	}
 
-	/* Update the final tail */
-	circ->tail = tail & (UART_XMIT_SIZE - 1);
-
 	if (len_written >= ch->ch_t_tlevel)
 		ch->ch_flags &= ~(CH_TX_FIFO_EMPTY | CH_TX_FIFO_LWM);
 
-	if (uart_circ_empty(circ))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		uart_write_wakeup(&ch->uart_port);
 }
 
diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 5efb2b593be3..b83eee37c17d 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -257,10 +257,11 @@ static int max3100_handlerx(struct max3100_port *s, u16 rx)
 static void max3100_work(struct work_struct *w)
 {
 	struct max3100_port *s = container_of(w, struct max3100_port, work);
+	struct tty_port *tport = &s->port.state->port;
+	unsigned char ch;
 	int rxchars;
 	u16 tx, rx;
 	int conf, cconf, crts;
-	struct circ_buf *xmit = &s->port.state->xmit;
 
 	dev_dbg(&s->spi->dev, "%s\n", __func__);
 
@@ -290,10 +291,9 @@ static void max3100_work(struct work_struct *w)
 				tx = s->port.x_char;
 				s->port.icount.tx++;
 				s->port.x_char = 0;
-			} else if (!uart_circ_empty(xmit) &&
-				   !uart_tx_stopped(&s->port)) {
-				tx = xmit->buf[xmit->tail];
-				uart_xmit_advance(&s->port, 1);
+			} else if (!uart_tx_stopped(&s->port) &&
+					uart_fifo_get(&s->port, &ch)) {
+				tx = ch;
 			}
 			if (tx != 0xffff) {
 				max3100_calc_parity(s, &tx);
@@ -307,13 +307,13 @@ static void max3100_work(struct work_struct *w)
 			tty_flip_buffer_push(&s->port.state->port);
 			rxchars = 0;
 		}
-		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 			uart_write_wakeup(&s->port);
 
 	} while (!s->force_end_work &&
 		 !freezing(current) &&
 		 ((rx & MAX3100_R) ||
-		  (!uart_circ_empty(xmit) &&
+		  (!kfifo_is_empty(&tport->xmit_fifo) &&
 		   !uart_tx_stopped(&s->port))));
 
 	if (rxchars > 0)
diff --git a/drivers/tty/serial/max310x.c b/drivers/tty/serial/max310x.c
index 14dd9cfaa9f7..f0eb96429dae 100644
--- a/drivers/tty/serial/max310x.c
+++ b/drivers/tty/serial/max310x.c
@@ -747,8 +747,9 @@ static void max310x_handle_rx(struct uart_port *port, unsigned int rxlen)
 
 static void max310x_handle_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
-	unsigned int txlen, to_send, until_end;
+	struct tty_port *tport = &port->state->port;
+	unsigned int txlen, to_send;
+	unsigned char *tail;
 
 	if (unlikely(port->x_char)) {
 		max310x_port_write(port, MAX310X_THR_REG, port->x_char);
@@ -757,32 +758,26 @@ static void max310x_handle_tx(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))
 		return;
 
-	/* Get length of data pending in circular buffer */
-	to_send = uart_circ_chars_pending(xmit);
-	until_end = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
-	if (likely(to_send)) {
+	/*
+	 * It's a circ buffer -- wrap around.
+	 * We could do that in one SPI transaction, but meh.
+	 */
+	while (!kfifo_is_empty(&tport->xmit_fifo)) {
 		/* Limit to space available in TX FIFO */
 		txlen = max310x_port_read(port, MAX310X_TXFIFOLVL_REG);
 		txlen = port->fifosize - txlen;
-		to_send = (to_send > txlen) ? txlen : to_send;
-
-		if (until_end < to_send) {
-			/*
-			 * It's a circ buffer -- wrap around.
-			 * We could do that in one SPI transaction, but meh.
-			 */
-			max310x_batch_write(port, xmit->buf + xmit->tail, until_end);
-			max310x_batch_write(port, xmit->buf, to_send - until_end);
-		} else {
-			max310x_batch_write(port, xmit->buf + xmit->tail, to_send);
-		}
+		if (!txlen)
+			break;
+
+		to_send = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail, txlen);
+		max310x_batch_write(port, tail, to_send);
 		uart_xmit_advance(port, to_send);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
diff --git a/drivers/tty/serial/men_z135_uart.c b/drivers/tty/serial/men_z135_uart.c
index 8048fa542fc4..4bff422bb1bc 100644
--- a/drivers/tty/serial/men_z135_uart.c
+++ b/drivers/tty/serial/men_z135_uart.c
@@ -293,17 +293,14 @@ static void men_z135_handle_rx(struct men_z135_port *uart)
 static void men_z135_handle_tx(struct men_z135_port *uart)
 {
 	struct uart_port *port = &uart->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
+	unsigned char *tail;
+	unsigned int n, txfree;
 	u32 txc;
 	u32 wptr;
 	int qlen;
-	int n;
-	int txfree;
-	int head;
-	int tail;
-	int s;
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		goto out;
 
 	if (uart_tx_stopped(port))
@@ -313,7 +310,7 @@ static void men_z135_handle_tx(struct men_z135_port *uart)
 		goto out;
 
 	/* calculate bytes to copy */
-	qlen = uart_circ_chars_pending(xmit);
+	qlen = kfifo_len(&tport->xmit_fifo);
 	if (qlen <= 0)
 		goto out;
 
@@ -345,21 +342,18 @@ static void men_z135_handle_tx(struct men_z135_port *uart)
 	if (n <= 0)
 		goto irq_en;
 
-	head = xmit->head & (UART_XMIT_SIZE - 1);
-	tail = xmit->tail & (UART_XMIT_SIZE - 1);
-
-	s = ((head >= tail) ? head : UART_XMIT_SIZE) - tail;
-	n = min(n, s);
+	n = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail,
+			min_t(unsigned int, UART_XMIT_SIZE, n));
+	memcpy_toio(port->membase + MEN_Z135_TX_RAM, tail, n);
 
-	memcpy_toio(port->membase + MEN_Z135_TX_RAM, &xmit->buf[xmit->tail], n);
 	iowrite32(n & 0x3ff, port->membase + MEN_Z135_TX_CTRL);
 	uart_xmit_advance(port, n);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 irq_en:
-	if (!uart_circ_empty(xmit))
+	if (!kfifo_is_empty(&tport->xmit_fifo))
 		men_z135_reg_set(uart, MEN_Z135_CONF_REG, MEN_Z135_IER_TXCIEN);
 	else
 		men_z135_reg_clr(uart, MEN_Z135_CONF_REG, MEN_Z135_IER_TXCIEN);
diff --git a/drivers/tty/serial/meson_uart.c b/drivers/tty/serial/meson_uart.c
index 6feac459c0cf..4587ed4d4d5d 100644
--- a/drivers/tty/serial/meson_uart.c
+++ b/drivers/tty/serial/meson_uart.c
@@ -141,8 +141,8 @@ static void meson_uart_shutdown(struct uart_port *port)
 
 static void meson_uart_start_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
-	unsigned int ch;
+	struct tty_port *tport = &port->state->port;
+	unsigned char ch;
 	u32 val;
 
 	if (uart_tx_stopped(port)) {
@@ -158,21 +158,20 @@ static void meson_uart_start_tx(struct uart_port *port)
 			continue;
 		}
 
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(port, &ch))
 			break;
 
-		ch = xmit->buf[xmit->tail];
 		writel(ch, port->membase + AML_UART_WFIFO);
 		uart_xmit_advance(port, 1);
 	}
 
-	if (!uart_circ_empty(xmit)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo)) {
 		val = readl(port->membase + AML_UART_CONTROL);
 		val |= AML_UART_TX_INT_EN;
 		writel(val, port->membase + AML_UART_CONTROL);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
diff --git a/drivers/tty/serial/milbeaut_usio.c b/drivers/tty/serial/milbeaut_usio.c
index da4c6f7e2a30..fb082ee73d5b 100644
--- a/drivers/tty/serial/milbeaut_usio.c
+++ b/drivers/tty/serial/milbeaut_usio.c
@@ -72,7 +72,7 @@ static void mlb_usio_stop_tx(struct uart_port *port)
 
 static void mlb_usio_tx_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	int count;
 
 	writew(readw(port->membase + MLB_USIO_REG_FCR) & ~MLB_USIO_FCR_FTIE,
@@ -87,7 +87,7 @@ static void mlb_usio_tx_chars(struct uart_port *port)
 		port->x_char = 0;
 		return;
 	}
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		mlb_usio_stop_tx(port);
 		return;
 	}
@@ -96,12 +96,13 @@ static void mlb_usio_tx_chars(struct uart_port *port)
 		(readw(port->membase + MLB_USIO_REG_FBYTE) & 0xff);
 
 	do {
-		writew(xmit->buf[xmit->tail], port->membase + MLB_USIO_REG_DR);
+		unsigned char ch;
 
-		uart_xmit_advance(port, 1);
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(port, &ch))
 			break;
 
+		writew(ch, port->membase + MLB_USIO_REG_DR);
+		port->icount.tx++;
 	} while (--count > 0);
 
 	writew(readw(port->membase + MLB_USIO_REG_FCR) & ~MLB_USIO_FCR_FDRQ,
@@ -110,10 +111,10 @@ static void mlb_usio_tx_chars(struct uart_port *port)
 	writeb(readb(port->membase + MLB_USIO_REG_SCR) | MLB_USIO_SCR_TBIE,
 	       port->membase + MLB_USIO_REG_SCR);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		mlb_usio_stop_tx(port);
 }
 
diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c
index 7bf30e632313..ae7a8e3cf467 100644
--- a/drivers/tty/serial/msm_serial.c
+++ b/drivers/tty/serial/msm_serial.c
@@ -452,7 +452,7 @@ static void msm_complete_tx_dma(void *args)
 {
 	struct msm_port *msm_port = args;
 	struct uart_port *port = &msm_port->uart;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct msm_dma *dma = &msm_port->tx_dma;
 	struct dma_tx_state state;
 	unsigned long flags;
@@ -486,7 +486,7 @@ static void msm_complete_tx_dma(void *args)
 	msm_port->imr |= MSM_UART_IMR_TXLEV;
 	msm_write(port, msm_port->imr, MSM_UART_IMR);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	msm_handle_tx(port);
@@ -496,14 +496,14 @@ done:
 
 static int msm_handle_tx_dma(struct msm_port *msm_port, unsigned int count)
 {
-	struct circ_buf *xmit = &msm_port->uart.state->xmit;
 	struct uart_port *port = &msm_port->uart;
+	struct tty_port *tport = &port->state->port;
 	struct msm_dma *dma = &msm_port->tx_dma;
 	int ret;
 	u32 val;
 
 	sg_init_table(&dma->tx_sg, 1);
-	sg_set_buf(&dma->tx_sg, &xmit->buf[xmit->tail], count);
+	kfifo_dma_out_prepare(&tport->xmit_fifo, &dma->tx_sg, 1, count);
 
 	ret = dma_map_sg(port->dev, &dma->tx_sg, 1, dma->dir);
 	if (ret)
@@ -843,8 +843,8 @@ static void msm_handle_rx(struct uart_port *port)
 
 static void msm_handle_tx_pio(struct uart_port *port, unsigned int tx_count)
 {
-	struct circ_buf *xmit = &port->state->xmit;
 	struct msm_port *msm_port = to_msm_port(port);
+	struct tty_port *tport = &port->state->port;
 	unsigned int num_chars;
 	unsigned int tf_pointer = 0;
 	void __iomem *tf;
@@ -858,8 +858,7 @@ static void msm_handle_tx_pio(struct uart_port *port, unsigned int tx_count)
 		msm_reset_dm_count(port, tx_count);
 
 	while (tf_pointer < tx_count) {
-		int i;
-		char buf[4] = { 0 };
+		unsigned char buf[4] = { 0 };
 
 		if (!(msm_read(port, MSM_UART_SR) & MSM_UART_SR_TX_READY))
 			break;
@@ -870,26 +869,23 @@ static void msm_handle_tx_pio(struct uart_port *port, unsigned int tx_count)
 		else
 			num_chars = 1;
 
-		for (i = 0; i < num_chars; i++)
-			buf[i] = xmit->buf[xmit->tail + i];
-
+		num_chars = uart_fifo_out(port, buf, num_chars);
 		iowrite32_rep(tf, buf, 1);
-		uart_xmit_advance(port, num_chars);
 		tf_pointer += num_chars;
 	}
 
 	/* disable tx interrupts if nothing more to send */
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		msm_stop_tx(port);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
 static void msm_handle_tx(struct uart_port *port)
 {
 	struct msm_port *msm_port = to_msm_port(port);
-	struct circ_buf *xmit = &msm_port->uart.state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct msm_dma *dma = &msm_port->tx_dma;
 	unsigned int pio_count, dma_count, dma_min;
 	char buf[4] = { 0 };
@@ -913,13 +909,13 @@ static void msm_handle_tx(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		msm_stop_tx(port);
 		return;
 	}
 
-	pio_count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
-	dma_count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	dma_count = pio_count = kfifo_out_linear(&tport->xmit_fifo, NULL,
+			UART_XMIT_SIZE);
 
 	dma_min = 1;	/* Always DMA */
 	if (msm_port->is_uartdm > UARTDM_1P3) {
diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c
index 0255646bc175..5de57b77abdb 100644
--- a/drivers/tty/serial/mvebu-uart.c
+++ b/drivers/tty/serial/mvebu-uart.c
@@ -219,12 +219,10 @@ static void mvebu_uart_stop_tx(struct uart_port *port)
 static void mvebu_uart_start_tx(struct uart_port *port)
 {
 	unsigned int ctl;
-	struct circ_buf *xmit = &port->state->xmit;
+	unsigned char c;
 
-	if (IS_EXTENDED(port) && !uart_circ_empty(xmit)) {
-		writel(xmit->buf[xmit->tail], port->membase + UART_TSH(port));
-		uart_xmit_advance(port, 1);
-	}
+	if (IS_EXTENDED(port) && uart_fifo_get(port, &c))
+		writel(c, port->membase + UART_TSH(port));
 
 	ctl = readl(port->membase + UART_INTR(port));
 	ctl |= CTRL_TX_RDY_INT(port);
diff --git a/drivers/tty/serial/mxs-auart.c b/drivers/tty/serial/mxs-auart.c
index 4749331fe618..144b35d31497 100644
--- a/drivers/tty/serial/mxs-auart.c
+++ b/drivers/tty/serial/mxs-auart.c
@@ -517,7 +517,7 @@ static void mxs_auart_tx_chars(struct mxs_auart_port *s);
 static void dma_tx_callback(void *param)
 {
 	struct mxs_auart_port *s = param;
-	struct circ_buf *xmit = &s->port.state->xmit;
+	struct tty_port *tport = &s->port.state->port;
 
 	dma_unmap_sg(s->dev, &s->tx_sgl, 1, DMA_TO_DEVICE);
 
@@ -526,7 +526,7 @@ static void dma_tx_callback(void *param)
 	smp_mb__after_atomic();
 
 	/* wake up the possible processes. */
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&s->port);
 
 	mxs_auart_tx_chars(s);
@@ -568,33 +568,22 @@ static int mxs_auart_dma_tx(struct mxs_auart_port *s, int size)
 
 static void mxs_auart_tx_chars(struct mxs_auart_port *s)
 {
-	struct circ_buf *xmit = &s->port.state->xmit;
+	struct tty_port *tport = &s->port.state->port;
 	bool pending;
 	u8 ch;
 
 	if (auart_dma_enabled(s)) {
 		u32 i = 0;
-		int size;
 		void *buffer = s->tx_dma_buf;
 
 		if (test_and_set_bit(MXS_AUART_DMA_TX_SYNC, &s->flags))
 			return;
 
-		while (!uart_circ_empty(xmit) && !uart_tx_stopped(&s->port)) {
-			size = min_t(u32, UART_XMIT_SIZE - i,
-				     CIRC_CNT_TO_END(xmit->head,
-						     xmit->tail,
-						     UART_XMIT_SIZE));
-			memcpy(buffer + i, xmit->buf + xmit->tail, size);
-			xmit->tail = (xmit->tail + size) & (UART_XMIT_SIZE - 1);
-
-			i += size;
-			if (i >= UART_XMIT_SIZE)
-				break;
-		}
-
 		if (uart_tx_stopped(&s->port))
 			mxs_auart_stop_tx(&s->port);
+		else
+			i = kfifo_out(&tport->xmit_fifo, buffer,
+					UART_XMIT_SIZE);
 
 		if (i) {
 			mxs_auart_dma_tx(s, i);
diff --git a/drivers/tty/serial/pch_uart.c b/drivers/tty/serial/pch_uart.c
index 89257cddf540..c7cee5fee603 100644
--- a/drivers/tty/serial/pch_uart.c
+++ b/drivers/tty/serial/pch_uart.c
@@ -808,7 +808,7 @@ static int dma_handle_rx(struct eg20t_port *priv)
 static unsigned int handle_tx(struct eg20t_port *priv)
 {
 	struct uart_port *port = &priv->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	unsigned char ch;
 	int fifo_size;
 	int tx_empty;
 
@@ -830,9 +830,9 @@ static unsigned int handle_tx(struct eg20t_port *priv)
 		fifo_size--;
 	}
 
-	while (!uart_tx_stopped(port) && !uart_circ_empty(xmit) && fifo_size) {
-		iowrite8(xmit->buf[xmit->tail], priv->membase + PCH_UART_THR);
-		uart_xmit_advance(port, 1);
+	while (!uart_tx_stopped(port) && fifo_size &&
+			uart_fifo_get(port, &ch)) {
+		iowrite8(ch, priv->membase + PCH_UART_THR);
 		fifo_size--;
 		tx_empty = 0;
 	}
@@ -850,14 +850,14 @@ static unsigned int handle_tx(struct eg20t_port *priv)
 static unsigned int dma_handle_tx(struct eg20t_port *priv)
 {
 	struct uart_port *port = &priv->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct scatterlist *sg;
 	int nent;
 	int fifo_size;
 	struct dma_async_tx_descriptor *desc;
+	unsigned int bytes, tail;
 	int num;
 	int i;
-	int bytes;
 	int size;
 	int rem;
 
@@ -886,7 +886,7 @@ static unsigned int dma_handle_tx(struct eg20t_port *priv)
 		fifo_size--;
 	}
 
-	bytes = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	bytes = kfifo_out_linear(&tport->xmit_fifo, &tail, UART_XMIT_SIZE);
 	if (!bytes) {
 		dev_dbg(priv->port.dev, "%s 0 bytes return\n", __func__);
 		pch_uart_hal_disable_interrupt(priv, PCH_UART_HAL_TX_INT);
@@ -920,10 +920,10 @@ static unsigned int dma_handle_tx(struct eg20t_port *priv)
 
 	for (i = 0; i < num; i++, sg++) {
 		if (i == (num - 1))
-			sg_set_page(sg, virt_to_page(xmit->buf),
+			sg_set_page(sg, virt_to_page(tport->xmit_buf),
 				    rem, fifo_size * i);
 		else
-			sg_set_page(sg, virt_to_page(xmit->buf),
+			sg_set_page(sg, virt_to_page(tport->xmit_buf),
 				    size, fifo_size * i);
 	}
 
@@ -937,8 +937,7 @@ static unsigned int dma_handle_tx(struct eg20t_port *priv)
 	priv->nent = nent;
 
 	for (i = 0; i < nent; i++, sg++) {
-		sg->offset = (xmit->tail & (UART_XMIT_SIZE - 1)) +
-			      fifo_size * i;
+		sg->offset = tail + fifo_size * i;
 		sg_dma_address(sg) = (sg_dma_address(sg) &
 				    ~(UART_XMIT_SIZE - 1)) + sg->offset;
 		if (i == (nent - 1))
diff --git a/drivers/tty/serial/pic32_uart.c b/drivers/tty/serial/pic32_uart.c
index bbb46e6e98a2..f5af336a869b 100644
--- a/drivers/tty/serial/pic32_uart.c
+++ b/drivers/tty/serial/pic32_uart.c
@@ -342,7 +342,7 @@ static void pic32_uart_do_rx(struct uart_port *port)
 static void pic32_uart_do_tx(struct uart_port *port)
 {
 	struct pic32_sport *sport = to_pic32_sport(port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned int max_count = PIC32_UART_TX_FIFO_DEPTH;
 
 	if (port->x_char) {
@@ -357,7 +357,7 @@ static void pic32_uart_do_tx(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		goto txq_empty;
 
 	/* keep stuffing chars into uart tx buffer
@@ -371,21 +371,20 @@ static void pic32_uart_do_tx(struct uart_port *port)
 	 */
 	while (!(PIC32_UART_STA_UTXBF &
 		pic32_uart_readl(sport, PIC32_UART_STA))) {
-		unsigned int c = xmit->buf[xmit->tail];
+		unsigned char c;
 
+		if (!uart_fifo_get(port, &c))
+			break;
 		pic32_uart_writel(sport, PIC32_UART_TX, c);
 
-		uart_xmit_advance(port, 1);
-		if (uart_circ_empty(xmit))
-			break;
 		if (--max_count == 0)
 			break;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		goto txq_empty;
 
 	return;
diff --git a/drivers/tty/serial/pmac_zilog.c b/drivers/tty/serial/pmac_zilog.c
index 05d97e89511e..63bc726273fd 100644
--- a/drivers/tty/serial/pmac_zilog.c
+++ b/drivers/tty/serial/pmac_zilog.c
@@ -347,7 +347,8 @@ static void pmz_status_handle(struct uart_pmac_port *uap)
 
 static void pmz_transmit_chars(struct uart_pmac_port *uap)
 {
-	struct circ_buf *xmit;
+	struct tty_port *tport;
+	unsigned char ch;
 
 	if (ZS_IS_CONS(uap)) {
 		unsigned char status = read_zsreg(uap, R0);
@@ -398,8 +399,8 @@ static void pmz_transmit_chars(struct uart_pmac_port *uap)
 
 	if (uap->port.state == NULL)
 		goto ack_tx_int;
-	xmit = &uap->port.state->xmit;
-	if (uart_circ_empty(xmit)) {
+	tport = &uap->port.state->port;
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		uart_write_wakeup(&uap->port);
 		goto ack_tx_int;
 	}
@@ -407,12 +408,11 @@ static void pmz_transmit_chars(struct uart_pmac_port *uap)
 		goto ack_tx_int;
 
 	uap->flags |= PMACZILOG_FLAG_TX_ACTIVE;
-	write_zsdata(uap, xmit->buf[xmit->tail]);
+	WARN_ON(!uart_fifo_get(&uap->port, &ch));
+	write_zsdata(uap, ch);
 	zssync(uap);
 
-	uart_xmit_advance(&uap->port, 1);
-
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&uap->port);
 
 	return;
@@ -620,15 +620,15 @@ static void pmz_start_tx(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 	} else {
-		struct circ_buf *xmit = &port->state->xmit;
+		struct tty_port *tport = &port->state->port;
+		unsigned char ch;
 
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(&uap->port, &ch))
 			return;
-		write_zsdata(uap, xmit->buf[xmit->tail]);
+		write_zsdata(uap, ch);
 		zssync(uap);
-		uart_xmit_advance(port, 1);
 
-		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 			uart_write_wakeup(&uap->port);
 	}
 }
diff --git a/drivers/tty/serial/qcom_geni_serial.c b/drivers/tty/serial/qcom_geni_serial.c
index f9f7ac1a10df..7814982f1921 100644
--- a/drivers/tty/serial/qcom_geni_serial.c
+++ b/drivers/tty/serial/qcom_geni_serial.c
@@ -505,7 +505,7 @@ static void qcom_geni_serial_console_write(struct console *co, const char *s,
 		 */
 		qcom_geni_serial_poll_tx_done(uport);
 
-		if (!uart_circ_empty(&uport->state->xmit)) {
+		if (!kfifo_is_empty(&uport->state->port.xmit_fifo)) {
 			irq_en = readl(uport->membase + SE_GENI_M_IRQ_EN);
 			writel(irq_en | M_TX_FIFO_WATERMARK_EN,
 					uport->membase + SE_GENI_M_IRQ_EN);
@@ -620,22 +620,24 @@ static void qcom_geni_serial_stop_tx_dma(struct uart_port *uport)
 static void qcom_geni_serial_start_tx_dma(struct uart_port *uport)
 {
 	struct qcom_geni_serial_port *port = to_dev_port(uport);
-	struct circ_buf *xmit = &uport->state->xmit;
+	struct tty_port *tport = &uport->state->port;
 	unsigned int xmit_size;
+	u8 *tail;
 	int ret;
 
 	if (port->tx_dma_addr)
 		return;
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		return;
 
-	xmit_size = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	xmit_size = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail,
+			UART_XMIT_SIZE);
 
 	qcom_geni_serial_setup_tx(uport, xmit_size);
 
-	ret = geni_se_tx_dma_prep(&port->se, &xmit->buf[xmit->tail],
-				  xmit_size, &port->tx_dma_addr);
+	ret = geni_se_tx_dma_prep(&port->se, tail, xmit_size,
+				  &port->tx_dma_addr);
 	if (ret) {
 		dev_err(uport->dev, "unable to start TX SE DMA: %d\n", ret);
 		qcom_geni_serial_stop_tx_dma(uport);
@@ -853,18 +855,16 @@ static void qcom_geni_serial_send_chunk_fifo(struct uart_port *uport,
 					     unsigned int chunk)
 {
 	struct qcom_geni_serial_port *port = to_dev_port(uport);
-	struct circ_buf *xmit = &uport->state->xmit;
-	unsigned int tx_bytes, c, remaining = chunk;
+	struct tty_port *tport = &uport->state->port;
+	unsigned int tx_bytes, remaining = chunk;
 	u8 buf[BYTES_PER_FIFO_WORD];
 
 	while (remaining) {
 		memset(buf, 0, sizeof(buf));
 		tx_bytes = min(remaining, BYTES_PER_FIFO_WORD);
 
-		for (c = 0; c < tx_bytes ; c++) {
-			buf[c] = xmit->buf[xmit->tail];
-			uart_xmit_advance(uport, 1);
-		}
+		tx_bytes = kfifo_out(&tport->xmit_fifo, buf, tx_bytes);
+		uart_xmit_advance(uport, tx_bytes);
 
 		iowrite32_rep(uport->membase + SE_GENI_TX_FIFOn, buf, 1);
 
@@ -877,7 +877,7 @@ static void qcom_geni_serial_handle_tx_fifo(struct uart_port *uport,
 					    bool done, bool active)
 {
 	struct qcom_geni_serial_port *port = to_dev_port(uport);
-	struct circ_buf *xmit = &uport->state->xmit;
+	struct tty_port *tport = &uport->state->port;
 	size_t avail;
 	size_t pending;
 	u32 status;
@@ -890,7 +890,7 @@ static void qcom_geni_serial_handle_tx_fifo(struct uart_port *uport,
 	if (active)
 		pending = port->tx_remaining;
 	else
-		pending = uart_circ_chars_pending(xmit);
+		pending = kfifo_len(&tport->xmit_fifo);
 
 	/* All data has been transmitted and acknowledged as received */
 	if (!pending && !status && done) {
@@ -933,24 +933,24 @@ out_write_wakeup:
 					uport->membase + SE_GENI_M_IRQ_EN);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(uport);
 }
 
 static void qcom_geni_serial_handle_tx_dma(struct uart_port *uport)
 {
 	struct qcom_geni_serial_port *port = to_dev_port(uport);
-	struct circ_buf *xmit = &uport->state->xmit;
+	struct tty_port *tport = &uport->state->port;
 
 	uart_xmit_advance(uport, port->tx_remaining);
 	geni_se_tx_dma_unprep(&port->se, port->tx_dma_addr, port->tx_remaining);
 	port->tx_dma_addr = 0;
 	port->tx_remaining = 0;
 
-	if (!uart_circ_empty(xmit))
+	if (!kfifo_is_empty(&tport->xmit_fifo))
 		qcom_geni_serial_start_tx_dma(uport);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(uport);
 }
 
diff --git a/drivers/tty/serial/rda-uart.c b/drivers/tty/serial/rda-uart.c
index 82def9b8632a..663e35e424bd 100644
--- a/drivers/tty/serial/rda-uart.c
+++ b/drivers/tty/serial/rda-uart.c
@@ -330,8 +330,8 @@ static void rda_uart_set_termios(struct uart_port *port,
 
 static void rda_uart_send_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
-	unsigned int ch;
+	struct tty_port *tport = &port->state->port;
+	unsigned char ch;
 	u32 val;
 
 	if (uart_tx_stopped(port))
@@ -347,19 +347,14 @@ static void rda_uart_send_chars(struct uart_port *port)
 		port->x_char = 0;
 	}
 
-	while (rda_uart_read(port, RDA_UART_STATUS) & RDA_UART_TX_FIFO_MASK) {
-		if (uart_circ_empty(xmit))
-			break;
-
-		ch = xmit->buf[xmit->tail];
+	while ((rda_uart_read(port, RDA_UART_STATUS) & RDA_UART_TX_FIFO_MASK) &&
+			uart_fifo_get(port, &ch))
 		rda_uart_write(port, ch, RDA_UART_RXTX_BUFFER);
-		uart_xmit_advance(port, 1);
-	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (!uart_circ_empty(xmit)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo)) {
 		/* Re-enable Tx FIFO interrupt */
 		val = rda_uart_read(port, RDA_UART_IRQ_MASK);
 		val |= RDA_UART_TX_DATA_NEEDED;
diff --git a/drivers/tty/serial/samsung_tty.c b/drivers/tty/serial/samsung_tty.c
index a2d07e05c502..dc35eb77d2ef 100644
--- a/drivers/tty/serial/samsung_tty.c
+++ b/drivers/tty/serial/samsung_tty.c
@@ -329,7 +329,7 @@ static void s3c24xx_serial_tx_dma_complete(void *args)
 {
 	struct s3c24xx_uart_port *ourport = args;
 	struct uart_port *port = &ourport->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct s3c24xx_uart_dma *dma = ourport->dma;
 	struct dma_tx_state state;
 	unsigned long flags;
@@ -348,7 +348,7 @@ static void s3c24xx_serial_tx_dma_complete(void *args)
 	uart_xmit_advance(port, count);
 	ourport->tx_in_progress = 0;
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	s3c24xx_serial_start_next_tx(ourport);
@@ -431,17 +431,15 @@ static void s3c24xx_serial_start_tx_pio(struct s3c24xx_uart_port *ourport)
 }
 
 static int s3c24xx_serial_start_tx_dma(struct s3c24xx_uart_port *ourport,
-				      unsigned int count)
+				      unsigned int count, unsigned int tail)
 {
-	struct uart_port *port = &ourport->port;
-	struct circ_buf *xmit = &port->state->xmit;
 	struct s3c24xx_uart_dma *dma = ourport->dma;
 
 	if (ourport->tx_mode != S3C24XX_TX_DMA)
 		enable_tx_dma(ourport);
 
 	dma->tx_size = count & ~(dma_get_cache_alignment() - 1);
-	dma->tx_transfer_addr = dma->tx_addr + xmit->tail;
+	dma->tx_transfer_addr = dma->tx_addr + tail;
 
 	dma_sync_single_for_device(dma->tx_chan->device->dev,
 				   dma->tx_transfer_addr, dma->tx_size,
@@ -468,11 +466,11 @@ static int s3c24xx_serial_start_tx_dma(struct s3c24xx_uart_port *ourport,
 static void s3c24xx_serial_start_next_tx(struct s3c24xx_uart_port *ourport)
 {
 	struct uart_port *port = &ourport->port;
-	struct circ_buf *xmit = &port->state->xmit;
-	unsigned long count;
+	struct tty_port *tport = &port->state->port;
+	unsigned int count, tail;
 
 	/* Get data size up to the end of buffer */
-	count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	count = kfifo_out_linear(&tport->xmit_fifo, &tail, UART_XMIT_SIZE);
 
 	if (!count) {
 		s3c24xx_serial_stop_tx(port);
@@ -481,16 +479,16 @@ static void s3c24xx_serial_start_next_tx(struct s3c24xx_uart_port *ourport)
 
 	if (!ourport->dma || !ourport->dma->tx_chan ||
 	    count < ourport->min_dma_size ||
-	    xmit->tail & (dma_get_cache_alignment() - 1))
+	    tail & (dma_get_cache_alignment() - 1))
 		s3c24xx_serial_start_tx_pio(ourport);
 	else
-		s3c24xx_serial_start_tx_dma(ourport, count);
+		s3c24xx_serial_start_tx_dma(ourport, count, tail);
 }
 
 static void s3c24xx_serial_start_tx(struct uart_port *port)
 {
 	struct s3c24xx_uart_port *ourport = to_ourport(port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
 	if (!ourport->tx_enabled) {
 		if (port->flags & UPF_CONS_FLOW)
@@ -502,7 +500,8 @@ static void s3c24xx_serial_start_tx(struct uart_port *port)
 	}
 
 	if (ourport->dma && ourport->dma->tx_chan) {
-		if (!uart_circ_empty(xmit) && !ourport->tx_in_progress)
+		if (!kfifo_is_empty(&tport->xmit_fifo) &&
+				!ourport->tx_in_progress)
 			s3c24xx_serial_start_next_tx(ourport);
 	}
 }
@@ -868,18 +867,19 @@ static irqreturn_t s3c24xx_serial_rx_irq(int irq, void *dev_id)
 static void s3c24xx_serial_tx_chars(struct s3c24xx_uart_port *ourport)
 {
 	struct uart_port *port = &ourport->port;
-	struct circ_buf *xmit = &port->state->xmit;
-	int count, dma_count = 0;
+	struct tty_port *tport = &port->state->port;
+	unsigned int count, dma_count = 0, tail;
 
-	count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	count = kfifo_out_linear(&tport->xmit_fifo, &tail, UART_XMIT_SIZE);
 
 	if (ourport->dma && ourport->dma->tx_chan &&
 	    count >= ourport->min_dma_size) {
 		int align = dma_get_cache_alignment() -
-			(xmit->tail & (dma_get_cache_alignment() - 1));
+			(tail & (dma_get_cache_alignment() - 1));
 		if (count - align >= ourport->min_dma_size) {
 			dma_count = count - align;
 			count = align;
+			tail += align;
 		}
 	}
 
@@ -894,7 +894,7 @@ static void s3c24xx_serial_tx_chars(struct s3c24xx_uart_port *ourport)
 	 * stopped, disable the uart and exit
 	 */
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		s3c24xx_serial_stop_tx(port);
 		return;
 	}
@@ -906,24 +906,25 @@ static void s3c24xx_serial_tx_chars(struct s3c24xx_uart_port *ourport)
 		dma_count = 0;
 	}
 
-	while (!uart_circ_empty(xmit) && count > 0) {
-		if (rd_regl(port, S3C2410_UFSTAT) & ourport->info->tx_fifofull)
+	while (!(rd_regl(port, S3C2410_UFSTAT) & ourport->info->tx_fifofull)) {
+		unsigned char ch;
+
+		if (!uart_fifo_get(port, &ch))
 			break;
 
-		wr_reg(port, S3C2410_UTXH, xmit->buf[xmit->tail]);
-		uart_xmit_advance(port, 1);
+		wr_reg(port, S3C2410_UTXH, ch);
 		count--;
 	}
 
 	if (!count && dma_count) {
-		s3c24xx_serial_start_tx_dma(ourport, dma_count);
+		s3c24xx_serial_start_tx_dma(ourport, dma_count, tail);
 		return;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		s3c24xx_serial_stop_tx(port);
 }
 
@@ -1118,7 +1119,8 @@ static int s3c24xx_serial_request_dma(struct s3c24xx_uart_port *p)
 
 	/* TX buffer */
 	dma->tx_addr = dma_map_single(dma->tx_chan->device->dev,
-				      p->port.state->xmit.buf, UART_XMIT_SIZE,
+				      p->port.state->port.xmit_buf,
+				      UART_XMIT_SIZE,
 				      DMA_TO_DEVICE);
 	if (dma_mapping_error(dma->tx_chan->device->dev, dma->tx_addr)) {
 		reason = "DMA mapping error for TX buffer";
diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c
index dbec29d9a6c3..b4e1b90e5960 100644
--- a/drivers/tty/serial/sb1250-duart.c
+++ b/drivers/tty/serial/sb1250-duart.c
@@ -382,7 +382,8 @@ static void sbd_receive_chars(struct sbd_port *sport)
 static void sbd_transmit_chars(struct sbd_port *sport)
 {
 	struct uart_port *uport = &sport->port;
-	struct circ_buf *xmit = &sport->port.state->xmit;
+	struct tty_port *tport = &sport->port.state->port;
+	unsigned char ch;
 	unsigned int mask;
 	int stop_tx;
 
@@ -395,19 +396,19 @@ static void sbd_transmit_chars(struct sbd_port *sport)
 	}
 
 	/* If nothing to do or stopped or hardware stopped.  */
-	stop_tx = (uart_circ_empty(xmit) || uart_tx_stopped(&sport->port));
+	stop_tx = uart_tx_stopped(&sport->port) ||
+		!uart_fifo_get(&sport->port, &ch);
 
 	/* Send char.  */
 	if (!stop_tx) {
-		write_sbdchn(sport, R_DUART_TX_HOLD, xmit->buf[xmit->tail]);
-		uart_xmit_advance(&sport->port, 1);
+		write_sbdchn(sport, R_DUART_TX_HOLD, ch);
 
-		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 			uart_write_wakeup(&sport->port);
 	}
 
 	/* Are we are done?  */
-	if (stop_tx || uart_circ_empty(xmit)) {
+	if (stop_tx || kfifo_is_empty(&tport->xmit_fifo)) {
 		/* Disable tx interrupts.  */
 		mask = read_sbdshr(sport, R_DUART_IMRREG((uport->line) % 2));
 		mask &= ~M_DUART_IMR_TX;
diff --git a/drivers/tty/serial/sc16is7xx.c b/drivers/tty/serial/sc16is7xx.c
index 929206a9a6e1..c6983b7bd78c 100644
--- a/drivers/tty/serial/sc16is7xx.c
+++ b/drivers/tty/serial/sc16is7xx.c
@@ -676,9 +676,9 @@ static void sc16is7xx_handle_rx(struct uart_port *port, unsigned int rxlen,
 static void sc16is7xx_handle_tx(struct uart_port *port)
 {
 	struct sc16is7xx_port *s = dev_get_drvdata(port->dev);
-	struct circ_buf *xmit = &port->state->xmit;
-	unsigned int txlen, to_send, i;
+	struct tty_port *tport = &port->state->port;
 	unsigned long flags;
+	unsigned int txlen;
 
 	if (unlikely(port->x_char)) {
 		sc16is7xx_port_write(port, SC16IS7XX_THR_REG, port->x_char);
@@ -687,40 +687,30 @@ static void sc16is7xx_handle_tx(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		uart_port_lock_irqsave(port, &flags);
 		sc16is7xx_stop_tx(port);
 		uart_port_unlock_irqrestore(port, flags);
 		return;
 	}
 
-	/* Get length of data pending in circular buffer */
-	to_send = uart_circ_chars_pending(xmit);
-	if (likely(to_send)) {
-		/* Limit to space available in TX FIFO */
-		txlen = sc16is7xx_port_read(port, SC16IS7XX_TXLVL_REG);
-		if (txlen > SC16IS7XX_FIFO_SIZE) {
-			dev_err_ratelimited(port->dev,
-				"chip reports %d free bytes in TX fifo, but it only has %d",
-				txlen, SC16IS7XX_FIFO_SIZE);
-			txlen = 0;
-		}
-		to_send = (to_send > txlen) ? txlen : to_send;
-
-		/* Convert to linear buffer */
-		for (i = 0; i < to_send; ++i) {
-			s->buf[i] = xmit->buf[xmit->tail];
-			uart_xmit_advance(port, 1);
-		}
-
-		sc16is7xx_fifo_write(port, s->buf, to_send);
+	/* Limit to space available in TX FIFO */
+	txlen = sc16is7xx_port_read(port, SC16IS7XX_TXLVL_REG);
+	if (txlen > SC16IS7XX_FIFO_SIZE) {
+		dev_err_ratelimited(port->dev,
+			"chip reports %d free bytes in TX fifo, but it only has %d",
+			txlen, SC16IS7XX_FIFO_SIZE);
+		txlen = 0;
 	}
 
+	txlen = uart_fifo_out(port, s->buf, txlen);
+	sc16is7xx_fifo_write(port, s->buf, txlen);
+
 	uart_port_lock_irqsave(port, &flags);
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		sc16is7xx_stop_tx(port);
 	else
 		sc16is7xx_ier_set(port, SC16IS7XX_IER_THRI_BIT);
diff --git a/drivers/tty/serial/sccnxp.c b/drivers/tty/serial/sccnxp.c
index f24217a560d7..6d1d142fd216 100644
--- a/drivers/tty/serial/sccnxp.c
+++ b/drivers/tty/serial/sccnxp.c
@@ -439,7 +439,7 @@ static void sccnxp_handle_rx(struct uart_port *port)
 static void sccnxp_handle_tx(struct uart_port *port)
 {
 	u8 sr;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct sccnxp_port *s = dev_get_drvdata(port->dev);
 
 	if (unlikely(port->x_char)) {
@@ -449,7 +449,7 @@ static void sccnxp_handle_tx(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		/* Disable TX if FIFO is empty */
 		if (sccnxp_port_read(port, SCCNXP_SR_REG) & SR_TXEMT) {
 			sccnxp_disable_irq(port, IMR_TXRDY);
@@ -461,16 +461,20 @@ static void sccnxp_handle_tx(struct uart_port *port)
 		return;
 	}
 
-	while (!uart_circ_empty(xmit)) {
+	while (1) {
+		unsigned char ch;
+
 		sr = sccnxp_port_read(port, SCCNXP_SR_REG);
 		if (!(sr & SR_TXRDY))
 			break;
 
-		sccnxp_port_write(port, SCCNXP_THR_REG, xmit->buf[xmit->tail]);
-		uart_xmit_advance(port, 1);
+		if (!uart_fifo_get(port, &ch))
+			break;
+
+		sccnxp_port_write(port, SCCNXP_THR_REG, ch);
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
diff --git a/drivers/tty/serial/serial-tegra.c b/drivers/tty/serial/serial-tegra.c
index 525f3a2f7bd4..1183ca54ab92 100644
--- a/drivers/tty/serial/serial-tegra.c
+++ b/drivers/tty/serial/serial-tegra.c
@@ -484,18 +484,18 @@ static void tegra_uart_release_port(struct uart_port *u)
 
 static void tegra_uart_fill_tx_fifo(struct tegra_uart_port *tup, int max_bytes)
 {
-	struct circ_buf *xmit = &tup->uport.state->xmit;
+	unsigned char ch;
 	int i;
 
 	for (i = 0; i < max_bytes; i++) {
-		BUG_ON(uart_circ_empty(xmit));
 		if (tup->cdata->tx_fifo_full_status) {
 			unsigned long lsr = tegra_uart_read(tup, UART_LSR);
 			if ((lsr & TEGRA_UART_LSR_TXFIFO_FULL))
 				break;
 		}
-		tegra_uart_write(tup, xmit->buf[xmit->tail], UART_TX);
-		uart_xmit_advance(&tup->uport, 1);
+		if (WARN_ON_ONCE(!uart_fifo_get(&tup->uport, &ch)))
+			break;
+		tegra_uart_write(tup, ch, UART_TX);
 	}
 }
 
@@ -514,7 +514,7 @@ static void tegra_uart_start_pio_tx(struct tegra_uart_port *tup,
 static void tegra_uart_tx_dma_complete(void *args)
 {
 	struct tegra_uart_port *tup = args;
-	struct circ_buf *xmit = &tup->uport.state->xmit;
+	struct tty_port *tport = &tup->uport.state->port;
 	struct dma_tx_state state;
 	unsigned long flags;
 	unsigned int count;
@@ -525,7 +525,7 @@ static void tegra_uart_tx_dma_complete(void *args)
 	uart_port_lock_irqsave(&tup->uport, &flags);
 	uart_xmit_advance(&tup->uport, count);
 	tup->tx_in_progress = 0;
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&tup->uport);
 	tegra_uart_start_next_tx(tup);
 	uart_port_unlock_irqrestore(&tup->uport, flags);
@@ -534,11 +534,14 @@ static void tegra_uart_tx_dma_complete(void *args)
 static int tegra_uart_start_tx_dma(struct tegra_uart_port *tup,
 		unsigned long count)
 {
-	struct circ_buf *xmit = &tup->uport.state->xmit;
+	struct tty_port *tport = &tup->uport.state->port;
 	dma_addr_t tx_phys_addr;
+	unsigned int tail;
 
 	tup->tx_bytes = count & ~(0xF);
-	tx_phys_addr = tup->tx_dma_buf_phys + xmit->tail;
+	WARN_ON_ONCE(kfifo_out_linear(&tport->xmit_fifo, &tail,
+			UART_XMIT_SIZE) < count);
+	tx_phys_addr = tup->tx_dma_buf_phys + tail;
 
 	dma_sync_single_for_device(tup->uport.dev, tx_phys_addr,
 				   tup->tx_bytes, DMA_TO_DEVICE);
@@ -562,18 +565,21 @@ static int tegra_uart_start_tx_dma(struct tegra_uart_port *tup,
 
 static void tegra_uart_start_next_tx(struct tegra_uart_port *tup)
 {
+	struct tty_port *tport = &tup->uport.state->port;
+	unsigned char *tail_ptr;
 	unsigned long tail;
-	unsigned long count;
-	struct circ_buf *xmit = &tup->uport.state->xmit;
+	unsigned int count;
 
 	if (!tup->current_baud)
 		return;
 
-	tail = (unsigned long)&xmit->buf[xmit->tail];
-	count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	count = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail_ptr,
+			UART_XMIT_SIZE);
 	if (!count)
 		return;
 
+	tail = (unsigned long)tail_ptr;
+
 	if (tup->use_tx_pio || count < TEGRA_UART_MIN_DMA)
 		tegra_uart_start_pio_tx(tup, count);
 	else if (BYTES_TO_ALIGN(tail) > 0)
@@ -586,9 +592,9 @@ static void tegra_uart_start_next_tx(struct tegra_uart_port *tup)
 static void tegra_uart_start_tx(struct uart_port *u)
 {
 	struct tegra_uart_port *tup = to_tegra_uport(u);
-	struct circ_buf *xmit = &u->state->xmit;
+	struct tty_port *tport = &u->state->port;
 
-	if (!uart_circ_empty(xmit) && !tup->tx_in_progress)
+	if (!kfifo_is_empty(&tport->xmit_fifo) && !tup->tx_in_progress)
 		tegra_uart_start_next_tx(tup);
 }
 
@@ -628,11 +634,11 @@ static void tegra_uart_stop_tx(struct uart_port *u)
 
 static void tegra_uart_handle_tx_pio(struct tegra_uart_port *tup)
 {
-	struct circ_buf *xmit = &tup->uport.state->xmit;
+	struct tty_port *tport = &tup->uport.state->port;
 
 	tegra_uart_fill_tx_fifo(tup, tup->tx_bytes);
 	tup->tx_in_progress = 0;
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&tup->uport);
 	tegra_uart_start_next_tx(tup);
 }
@@ -1169,15 +1175,14 @@ static int tegra_uart_dma_channel_allocate(struct tegra_uart_port *tup,
 		tup->rx_dma_buf_virt = dma_buf;
 		tup->rx_dma_buf_phys = dma_phys;
 	} else {
+		dma_buf = tup->uport.state->port.xmit_buf;
 		dma_phys = dma_map_single(tup->uport.dev,
-			tup->uport.state->xmit.buf, UART_XMIT_SIZE,
-			DMA_TO_DEVICE);
+			dma_buf, UART_XMIT_SIZE, DMA_TO_DEVICE);
 		if (dma_mapping_error(tup->uport.dev, dma_phys)) {
 			dev_err(tup->uport.dev, "dma_map_single tx failed\n");
 			dma_release_channel(dma_chan);
 			return -ENOMEM;
 		}
-		dma_buf = tup->uport.state->xmit.buf;
 		dma_sconfig.dst_addr = tup->uport.mapbase;
 		dma_sconfig.dst_addr_width = DMA_SLAVE_BUSWIDTH_1_BYTE;
 		dma_sconfig.dst_maxburst = 16;
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index ff85ebd3a007..3c0931fba1c6 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -272,9 +272,10 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state,
 		return -ENOMEM;
 
 	uart_port_lock(state, flags);
-	if (!state->xmit.buf) {
-		state->xmit.buf = (unsigned char *) page;
-		uart_circ_clear(&state->xmit);
+	if (!state->port.xmit_buf) {
+		state->port.xmit_buf = (unsigned char *)page;
+		kfifo_init(&state->port.xmit_fifo, state->port.xmit_buf,
+				PAGE_SIZE);
 		uart_port_unlock(uport, flags);
 	} else {
 		uart_port_unlock(uport, flags);
@@ -387,8 +388,9 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state)
 	 * can endup in printk() recursion.
 	 */
 	uart_port_lock(state, flags);
-	xmit_buf = state->xmit.buf;
-	state->xmit.buf = NULL;
+	xmit_buf = port->xmit_buf;
+	port->xmit_buf = NULL;
+	INIT_KFIFO(port->xmit_fifo);
 	uart_port_unlock(uport, flags);
 
 	free_page((unsigned long)xmit_buf);
@@ -552,22 +554,17 @@ static int uart_put_char(struct tty_struct *tty, u8 c)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
-	struct circ_buf *circ;
 	unsigned long flags;
 	int ret = 0;
 
-	circ = &state->xmit;
 	port = uart_port_lock(state, flags);
-	if (!circ->buf) {
+	if (WARN_ON_ONCE(!state->port.xmit_buf)) {
 		uart_port_unlock(port, flags);
 		return 0;
 	}
 
-	if (port && uart_circ_chars_free(circ) != 0) {
-		circ->buf[circ->head] = c;
-		circ->head = (circ->head + 1) & (UART_XMIT_SIZE - 1);
-		ret = 1;
-	}
+	if (port)
+		ret = kfifo_put(&state->port.xmit_fifo, c);
 	uart_port_unlock(port, flags);
 	return ret;
 }
@@ -581,9 +578,8 @@ static ssize_t uart_write(struct tty_struct *tty, const u8 *buf, size_t count)
 {
 	struct uart_state *state = tty->driver_data;
 	struct uart_port *port;
-	struct circ_buf *circ;
 	unsigned long flags;
-	int c, ret = 0;
+	int ret = 0;
 
 	/*
 	 * This means you called this function _after_ the port was
@@ -593,24 +589,13 @@ static ssize_t uart_write(struct tty_struct *tty, const u8 *buf, size_t count)
 		return -EL3HLT;
 
 	port = uart_port_lock(state, flags);
-	circ = &state->xmit;
-	if (!circ->buf) {
+	if (WARN_ON_ONCE(!state->port.xmit_buf)) {
 		uart_port_unlock(port, flags);
 		return 0;
 	}
 
-	while (port) {
-		c = CIRC_SPACE_TO_END(circ->head, circ->tail, UART_XMIT_SIZE);
-		if (count < c)
-			c = count;
-		if (c <= 0)
-			break;
-		memcpy(circ->buf + circ->head, buf, c);
-		circ->head = (circ->head + c) & (UART_XMIT_SIZE - 1);
-		buf += c;
-		count -= c;
-		ret += c;
-	}
+	if (port)
+		ret = kfifo_in(&state->port.xmit_fifo, buf, count);
 
 	__uart_start(state);
 	uart_port_unlock(port, flags);
@@ -625,7 +610,7 @@ static unsigned int uart_write_room(struct tty_struct *tty)
 	unsigned int ret;
 
 	port = uart_port_lock(state, flags);
-	ret = uart_circ_chars_free(&state->xmit);
+	ret = kfifo_avail(&state->port.xmit_fifo);
 	uart_port_unlock(port, flags);
 	return ret;
 }
@@ -638,7 +623,7 @@ static unsigned int uart_chars_in_buffer(struct tty_struct *tty)
 	unsigned int ret;
 
 	port = uart_port_lock(state, flags);
-	ret = uart_circ_chars_pending(&state->xmit);
+	ret = kfifo_len(&state->port.xmit_fifo);
 	uart_port_unlock(port, flags);
 	return ret;
 }
@@ -661,7 +646,7 @@ static void uart_flush_buffer(struct tty_struct *tty)
 	port = uart_port_lock(state, flags);
 	if (!port)
 		return;
-	uart_circ_clear(&state->xmit);
+	kfifo_reset(&state->port.xmit_fifo);
 	if (port->ops->flush_buffer)
 		port->ops->flush_buffer(port);
 	uart_port_unlock(port, flags);
@@ -1064,7 +1049,7 @@ static int uart_get_lsr_info(struct tty_struct *tty,
 	 * interrupt happens).
 	 */
 	if (uport->x_char ||
-	    ((uart_circ_chars_pending(&state->xmit) > 0) &&
+	    (!kfifo_is_empty(&state->port.xmit_fifo) &&
 	     !uart_tx_stopped(uport)))
 		result &= ~TIOCSER_TEMT;
 
@@ -1788,8 +1773,9 @@ static void uart_tty_port_shutdown(struct tty_port *port)
 	 * Free the transmit buffer.
 	 */
 	uart_port_lock_irq(uport);
-	buf = state->xmit.buf;
-	state->xmit.buf = NULL;
+	buf = port->xmit_buf;
+	port->xmit_buf = NULL;
+	INIT_KFIFO(port->xmit_fifo);
 	uart_port_unlock_irq(uport);
 
 	free_page((unsigned long)buf);
diff --git a/drivers/tty/serial/serial_port.c b/drivers/tty/serial/serial_port.c
index 22b9eeb23e68..3408c8827561 100644
--- a/drivers/tty/serial/serial_port.c
+++ b/drivers/tty/serial/serial_port.c
@@ -23,7 +23,7 @@
 static int __serial_port_busy(struct uart_port *port)
 {
 	return !uart_tx_stopped(port) &&
-		uart_circ_chars_pending(&port->state->xmit);
+		!kfifo_is_empty(&port->state->port.xmit_fifo);
 }
 
 static int serial_port_runtime_resume(struct device *dev)
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index e512eaa57ed5..97031db26ae4 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -585,7 +585,7 @@ static void sci_start_tx(struct uart_port *port)
 			sci_serial_out(port, SCSCR, new);
 	}
 
-	if (s->chan_tx && !uart_circ_empty(&s->port.state->xmit) &&
+	if (s->chan_tx && !kfifo_is_empty(&port->state->port.xmit_fifo) &&
 	    dma_submit_error(s->cookie_tx)) {
 		if (s->cfg->regtype == SCIx_RZ_SCIFA_REGTYPE)
 			/* Switch irq from SCIF to DMA */
@@ -817,7 +817,7 @@ static int sci_rxfill(struct uart_port *port)
 
 static void sci_transmit_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned int stopped = uart_tx_stopped(port);
 	unsigned short status;
 	unsigned short ctrl;
@@ -826,7 +826,7 @@ static void sci_transmit_chars(struct uart_port *port)
 	status = sci_serial_in(port, SCxSR);
 	if (!(status & SCxSR_TDxE(port))) {
 		ctrl = sci_serial_in(port, SCSCR);
-		if (uart_circ_empty(xmit))
+		if (kfifo_is_empty(&tport->xmit_fifo))
 			ctrl &= ~SCSCR_TIE;
 		else
 			ctrl |= SCSCR_TIE;
@@ -842,15 +842,14 @@ static void sci_transmit_chars(struct uart_port *port)
 		if (port->x_char) {
 			c = port->x_char;
 			port->x_char = 0;
-		} else if (!uart_circ_empty(xmit) && !stopped) {
-			c = xmit->buf[xmit->tail];
-			xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
-		} else if (port->type == PORT_SCI && uart_circ_empty(xmit)) {
-			ctrl = sci_serial_in(port, SCSCR);
-			ctrl &= ~SCSCR_TE;
-			sci_serial_out(port, SCSCR, ctrl);
-			return;
-		} else {
+		} else if (stopped || !kfifo_get(&tport->xmit_fifo, &c)) {
+			if (port->type == PORT_SCI &&
+				   kfifo_is_empty(&tport->xmit_fifo)) {
+				ctrl = sci_serial_in(port, SCSCR);
+				ctrl &= ~SCSCR_TE;
+				sci_serial_out(port, SCSCR, ctrl);
+				return;
+			}
 			break;
 		}
 
@@ -861,9 +860,9 @@ static void sci_transmit_chars(struct uart_port *port)
 
 	sci_clear_SCxSR(port, SCxSR_TDxE_CLEAR(port));
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		if (port->type == PORT_SCI) {
 			ctrl = sci_serial_in(port, SCSCR);
 			ctrl &= ~SCSCR_TIE;
@@ -1199,7 +1198,7 @@ static void sci_dma_tx_complete(void *arg)
 {
 	struct sci_port *s = arg;
 	struct uart_port *port = &s->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned long flags;
 
 	dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
@@ -1208,10 +1207,10 @@ static void sci_dma_tx_complete(void *arg)
 
 	uart_xmit_advance(port, s->tx_dma_len);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (!uart_circ_empty(xmit)) {
+	if (!kfifo_is_empty(&tport->xmit_fifo)) {
 		s->cookie_tx = 0;
 		schedule_work(&s->work_tx);
 	} else {
@@ -1424,10 +1423,10 @@ static void sci_dma_tx_work_fn(struct work_struct *work)
 	struct dma_async_tx_descriptor *desc;
 	struct dma_chan *chan = s->chan_tx;
 	struct uart_port *port = &s->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned long flags;
+	unsigned int tail;
 	dma_addr_t buf;
-	int head, tail;
 
 	/*
 	 * DMA is idle now.
@@ -1437,10 +1436,9 @@ static void sci_dma_tx_work_fn(struct work_struct *work)
 	 * consistent xmit buffer state.
 	 */
 	uart_port_lock_irq(port);
-	head = xmit->head;
-	tail = xmit->tail;
+	s->tx_dma_len = kfifo_out_linear(&tport->xmit_fifo, &tail,
+			UART_XMIT_SIZE);
 	buf = s->tx_dma_addr + tail;
-	s->tx_dma_len = CIRC_CNT_TO_END(head, tail, UART_XMIT_SIZE);
 	if (!s->tx_dma_len) {
 		/* Transmit buffer has been flushed */
 		uart_port_unlock_irq(port);
@@ -1469,8 +1467,8 @@ static void sci_dma_tx_work_fn(struct work_struct *work)
 	}
 
 	uart_port_unlock_irq(port);
-	dev_dbg(port->dev, "%s: %p: %d...%d, cookie %d\n",
-		__func__, xmit->buf, tail, head, s->cookie_tx);
+	dev_dbg(port->dev, "%s: %p: %u, cookie %d\n",
+		__func__, tport->xmit_buf, tail, s->cookie_tx);
 
 	dma_async_issue_pending(chan);
 	return;
@@ -1585,6 +1583,7 @@ static struct dma_chan *sci_request_dma_chan(struct uart_port *port,
 static void sci_request_dma(struct uart_port *port)
 {
 	struct sci_port *s = to_sci_port(port);
+	struct tty_port *tport = &port->state->port;
 	struct dma_chan *chan;
 
 	dev_dbg(port->dev, "%s: port %d\n", __func__, port->line);
@@ -1613,7 +1612,7 @@ static void sci_request_dma(struct uart_port *port)
 	if (chan) {
 		/* UART circular tx buffer is an aligned page. */
 		s->tx_dma_addr = dma_map_single(chan->device->dev,
-						port->state->xmit.buf,
+						tport->xmit_buf,
 						UART_XMIT_SIZE,
 						DMA_TO_DEVICE);
 		if (dma_mapping_error(chan->device->dev, s->tx_dma_addr)) {
@@ -1622,7 +1621,7 @@ static void sci_request_dma(struct uart_port *port)
 		} else {
 			dev_dbg(port->dev, "%s: mapped %lu@%p to %pad\n",
 				__func__, UART_XMIT_SIZE,
-				port->state->xmit.buf, &s->tx_dma_addr);
+				tport->xmit_buf, &s->tx_dma_addr);
 
 			INIT_WORK(&s->work_tx, sci_dma_tx_work_fn);
 			s->chan_tx_saved = s->chan_tx = chan;
diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c
index 15f14fa593da..3fc54cc02a1f 100644
--- a/drivers/tty/serial/sprd_serial.c
+++ b/drivers/tty/serial/sprd_serial.c
@@ -227,13 +227,13 @@ static int sprd_tx_buf_remap(struct uart_port *port)
 {
 	struct sprd_uart_port *sp =
 		container_of(port, struct sprd_uart_port, port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
+	unsigned char *tail;
 
-	sp->tx_dma.trans_len =
-		CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+	sp->tx_dma.trans_len = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail,
+			UART_XMIT_SIZE);
 
-	sp->tx_dma.phys_addr = dma_map_single(port->dev,
-					      (void *)&(xmit->buf[xmit->tail]),
+	sp->tx_dma.phys_addr = dma_map_single(port->dev, tail,
 					      sp->tx_dma.trans_len,
 					      DMA_TO_DEVICE);
 	return dma_mapping_error(port->dev, sp->tx_dma.phys_addr);
@@ -244,7 +244,7 @@ static void sprd_complete_tx_dma(void *data)
 	struct uart_port *port = (struct uart_port *)data;
 	struct sprd_uart_port *sp =
 		container_of(port, struct sprd_uart_port, port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned long flags;
 
 	uart_port_lock_irqsave(port, &flags);
@@ -253,10 +253,10 @@ static void sprd_complete_tx_dma(void *data)
 
 	uart_xmit_advance(port, sp->tx_dma.trans_len);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit) || sprd_tx_buf_remap(port) ||
+	if (kfifo_is_empty(&tport->xmit_fifo) || sprd_tx_buf_remap(port) ||
 	    sprd_tx_dma_config(port))
 		sp->tx_dma.trans_len = 0;
 
@@ -319,7 +319,7 @@ static void sprd_start_tx_dma(struct uart_port *port)
 {
 	struct sprd_uart_port *sp =
 		container_of(port, struct sprd_uart_port, port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
 	if (port->x_char) {
 		serial_out(port, SPRD_TXD, port->x_char);
@@ -328,7 +328,7 @@ static void sprd_start_tx_dma(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		sprd_stop_tx_dma(port);
 		return;
 	}
diff --git a/drivers/tty/serial/st-asc.c b/drivers/tty/serial/st-asc.c
index a23e59551848..f91753a40a69 100644
--- a/drivers/tty/serial/st-asc.c
+++ b/drivers/tty/serial/st-asc.c
@@ -387,9 +387,9 @@ static unsigned int asc_get_mctrl(struct uart_port *port)
 /* There are probably characters waiting to be transmitted. */
 static void asc_start_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
-	if (!uart_circ_empty(xmit))
+	if (!kfifo_is_empty(&tport->xmit_fifo))
 		asc_enable_tx_interrupts(port);
 }
 
diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c
index 58d169e5c1db..8c66abcfe6ca 100644
--- a/drivers/tty/serial/stm32-usart.c
+++ b/drivers/tty/serial/stm32-usart.c
@@ -696,18 +696,23 @@ static void stm32_usart_transmit_chars_pio(struct uart_port *port)
 {
 	struct stm32_port *stm32_port = to_stm32_port(port);
 	const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
+
+	while (1) {
+		unsigned char ch;
 
-	while (!uart_circ_empty(xmit)) {
 		/* Check that TDR is empty before filling FIFO */
 		if (!(readl_relaxed(port->membase + ofs->isr) & USART_SR_TXE))
 			break;
-		writel_relaxed(xmit->buf[xmit->tail], port->membase + ofs->tdr);
-		uart_xmit_advance(port, 1);
+
+		if (!uart_fifo_get(port, &ch))
+			break;
+
+		writel_relaxed(ch, port->membase + ofs->tdr);
 	}
 
 	/* rely on TXE irq (mask or unmask) for sending remaining data */
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		stm32_usart_tx_interrupt_disable(port);
 	else
 		stm32_usart_tx_interrupt_enable(port);
@@ -716,7 +721,7 @@ static void stm32_usart_transmit_chars_pio(struct uart_port *port)
 static void stm32_usart_transmit_chars_dma(struct uart_port *port)
 {
 	struct stm32_port *stm32port = to_stm32_port(port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	struct dma_async_tx_descriptor *desc = NULL;
 	unsigned int count;
 	int ret;
@@ -728,25 +733,8 @@ static void stm32_usart_transmit_chars_dma(struct uart_port *port)
 		return;
 	}
 
-	count = uart_circ_chars_pending(xmit);
-
-	if (count > TX_BUF_L)
-		count = TX_BUF_L;
-
-	if (xmit->tail < xmit->head) {
-		memcpy(&stm32port->tx_buf[0], &xmit->buf[xmit->tail], count);
-	} else {
-		size_t one = UART_XMIT_SIZE - xmit->tail;
-		size_t two;
-
-		if (one > count)
-			one = count;
-		two = count - one;
-
-		memcpy(&stm32port->tx_buf[0], &xmit->buf[xmit->tail], one);
-		if (two)
-			memcpy(&stm32port->tx_buf[one], &xmit->buf[0], two);
-	}
+	count =	kfifo_out_peek(&tport->xmit_fifo, &stm32port->tx_buf[0],
+			TX_BUF_L);
 
 	desc = dmaengine_prep_slave_single(stm32port->tx_ch,
 					   stm32port->tx_dma_buf,
@@ -792,14 +780,14 @@ static void stm32_usart_transmit_chars(struct uart_port *port)
 {
 	struct stm32_port *stm32_port = to_stm32_port(port);
 	const struct stm32_usart_offsets *ofs = &stm32_port->info->ofs;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	u32 isr;
 	int ret;
 
 	if (!stm32_port->hw_flow_control &&
 	    port->rs485.flags & SER_RS485_ENABLED &&
 	    (port->x_char ||
-	     !(uart_circ_empty(xmit) || uart_tx_stopped(port)))) {
+	     !(kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)))) {
 		stm32_usart_tc_interrupt_disable(port);
 		stm32_usart_rs485_rts_enable(port);
 	}
@@ -826,7 +814,7 @@ static void stm32_usart_transmit_chars(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		stm32_usart_tx_interrupt_disable(port);
 		return;
 	}
@@ -841,10 +829,10 @@ static void stm32_usart_transmit_chars(struct uart_port *port)
 	else
 		stm32_usart_transmit_chars_pio(port);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		stm32_usart_tx_interrupt_disable(port);
 		if (!stm32_port->hw_flow_control &&
 		    port->rs485.flags & SER_RS485_ENABLED) {
@@ -967,9 +955,9 @@ static void stm32_usart_stop_tx(struct uart_port *port)
 /* There are probably characters waiting to be transmitted. */
 static void stm32_usart_start_tx(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
-	if (uart_circ_empty(xmit) && !port->x_char) {
+	if (kfifo_is_empty(&tport->xmit_fifo) && !port->x_char) {
 		stm32_usart_rs485_rts_disable(port);
 		return;
 	}
diff --git a/drivers/tty/serial/sunhv.c b/drivers/tty/serial/sunhv.c
index 8d612ab80680..7f60679fdde1 100644
--- a/drivers/tty/serial/sunhv.c
+++ b/drivers/tty/serial/sunhv.c
@@ -39,10 +39,13 @@ static char *con_read_page;
 
 static int hung_up = 0;
 
-static void transmit_chars_putchar(struct uart_port *port, struct circ_buf *xmit)
+static void transmit_chars_putchar(struct uart_port *port,
+		struct tty_port *tport)
 {
-	while (!uart_circ_empty(xmit)) {
-		long status = sun4v_con_putchar(xmit->buf[xmit->tail]);
+	unsigned char ch;
+
+	while (kfifo_peek(&tport->xmit_fifo, &ch)) {
+		long status = sun4v_con_putchar(ch);
 
 		if (status != HV_EOK)
 			break;
@@ -51,14 +54,16 @@ static void transmit_chars_putchar(struct uart_port *port, struct circ_buf *xmit
 	}
 }
 
-static void transmit_chars_write(struct uart_port *port, struct circ_buf *xmit)
+static void transmit_chars_write(struct uart_port *port, struct tty_port *tport)
 {
-	while (!uart_circ_empty(xmit)) {
-		unsigned long ra = __pa(xmit->buf + xmit->tail);
-		unsigned long len, status, sent;
+	while (!kfifo_is_empty(&tport->xmit_fifo)) {
+		unsigned long len, ra, status, sent;
+		unsigned char *tail;
+
+		len = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail,
+				UART_XMIT_SIZE);
+		ra = __pa(tail);
 
-		len = CIRC_CNT_TO_END(xmit->head, xmit->tail,
-				      UART_XMIT_SIZE);
 		status = sun4v_con_write(ra, len, &sent);
 		if (status != HV_EOK)
 			break;
@@ -165,7 +170,7 @@ static int receive_chars_read(struct uart_port *port)
 }
 
 struct sunhv_ops {
-	void (*transmit_chars)(struct uart_port *port, struct circ_buf *xmit);
+	void (*transmit_chars)(struct uart_port *port, struct tty_port *tport);
 	int (*receive_chars)(struct uart_port *port);
 };
 
@@ -196,18 +201,18 @@ static struct tty_port *receive_chars(struct uart_port *port)
 
 static void transmit_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit;
+	struct tty_port *tport;
 
 	if (!port->state)
 		return;
 
-	xmit = &port->state->xmit;
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+	tport = &port->state->port;
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))
 		return;
 
-	sunhv_ops->transmit_chars(port, xmit);
+	sunhv_ops->transmit_chars(port, tport);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 }
 
diff --git a/drivers/tty/serial/sunplus-uart.c b/drivers/tty/serial/sunplus-uart.c
index f5e29eb4a4ce..abf7c449308d 100644
--- a/drivers/tty/serial/sunplus-uart.c
+++ b/drivers/tty/serial/sunplus-uart.c
@@ -200,7 +200,7 @@ static void sunplus_break_ctl(struct uart_port *port, int ctl)
 
 static void transmit_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
 	if (port->x_char) {
 		sp_uart_put_char(port, port->x_char);
@@ -209,22 +209,24 @@ static void transmit_chars(struct uart_port *port)
 		return;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		sunplus_stop_tx(port);
 		return;
 	}
 
 	do {
-		sp_uart_put_char(port, xmit->buf[xmit->tail]);
-		uart_xmit_advance(port, 1);
-		if (uart_circ_empty(xmit))
+		unsigned char ch;
+
+		if (!uart_fifo_get(port, &ch))
 			break;
+
+		sp_uart_put_char(port, ch);
 	} while (sunplus_tx_buf_not_full(port));
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		sunplus_stop_tx(port);
 }
 
diff --git a/drivers/tty/serial/sunsab.c b/drivers/tty/serial/sunsab.c
index 1ea2f33a07a7..1acbe2fba746 100644
--- a/drivers/tty/serial/sunsab.c
+++ b/drivers/tty/serial/sunsab.c
@@ -232,7 +232,7 @@ static void sunsab_tx_idle(struct uart_sunsab_port *);
 static void transmit_chars(struct uart_sunsab_port *up,
 			   union sab82532_irq_status *stat)
 {
-	struct circ_buf *xmit = &up->port.state->xmit;
+	struct tty_port *tport = &up->port.state->port;
 	int i;
 
 	if (stat->sreg.isr1 & SAB82532_ISR1_ALLS) {
@@ -252,7 +252,7 @@ static void transmit_chars(struct uart_sunsab_port *up,
 	set_bit(SAB82532_XPR, &up->irqflags);
 	sunsab_tx_idle(up);
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(&up->port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(&up->port)) {
 		up->interrupt_mask1 |= SAB82532_IMR1_XPR;
 		writeb(up->interrupt_mask1, &up->regs->w.imr1);
 		return;
@@ -265,21 +265,22 @@ static void transmit_chars(struct uart_sunsab_port *up,
 	/* Stuff 32 bytes into Transmit FIFO. */
 	clear_bit(SAB82532_XPR, &up->irqflags);
 	for (i = 0; i < up->port.fifosize; i++) {
-		writeb(xmit->buf[xmit->tail],
-		       &up->regs->w.xfifo[i]);
-		uart_xmit_advance(&up->port, 1);
-		if (uart_circ_empty(xmit))
+		unsigned char ch;
+
+		if (!uart_fifo_get(&up->port, &ch))
 			break;
+
+		writeb(ch, &up->regs->w.xfifo[i]);
 	}
 
 	/* Issue a Transmit Frame command. */
 	sunsab_cec_wait(up);
 	writeb(SAB82532_CMDR_XF, &up->regs->w.cmdr);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&up->port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		sunsab_stop_tx(&up->port);
 }
 
@@ -435,10 +436,10 @@ static void sunsab_start_tx(struct uart_port *port)
 {
 	struct uart_sunsab_port *up =
 		container_of(port, struct uart_sunsab_port, port);
-	struct circ_buf *xmit = &up->port.state->xmit;
+	struct tty_port *tport = &up->port.state->port;
 	int i;
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))
 		return;
 
 	up->interrupt_mask1 &= ~(SAB82532_IMR1_ALLS|SAB82532_IMR1_XPR);
@@ -451,11 +452,12 @@ static void sunsab_start_tx(struct uart_port *port)
 	clear_bit(SAB82532_XPR, &up->irqflags);
 
 	for (i = 0; i < up->port.fifosize; i++) {
-		writeb(xmit->buf[xmit->tail],
-		       &up->regs->w.xfifo[i]);
-		uart_xmit_advance(&up->port, 1);
-		if (uart_circ_empty(xmit))
+		unsigned char ch;
+
+		if (!uart_fifo_get(&up->port, &ch))
 			break;
+
+		writeb(ch, &up->regs->w.xfifo[i]);
 	}
 
 	/* Issue a Transmit Frame command.  */
diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c
index c8b65f4b2710..67a5fc70bb4b 100644
--- a/drivers/tty/serial/sunsu.c
+++ b/drivers/tty/serial/sunsu.c
@@ -396,7 +396,8 @@ receive_chars(struct uart_sunsu_port *up, unsigned char *status)
 
 static void transmit_chars(struct uart_sunsu_port *up)
 {
-	struct circ_buf *xmit = &up->port.state->xmit;
+	struct tty_port *tport = &up->port.state->port;
+	unsigned char ch;
 	int count;
 
 	if (up->port.x_char) {
@@ -409,23 +410,23 @@ static void transmit_chars(struct uart_sunsu_port *up)
 		sunsu_stop_tx(&up->port);
 		return;
 	}
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		__stop_tx(up);
 		return;
 	}
 
 	count = up->port.fifosize;
 	do {
-		serial_out(up, UART_TX, xmit->buf[xmit->tail]);
-		uart_xmit_advance(&up->port, 1);
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(&up->port, &ch))
 			break;
+
+		serial_out(up, UART_TX, ch);
 	} while (--count > 0);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&up->port);
 
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		__stop_tx(up);
 }
 
diff --git a/drivers/tty/serial/sunzilog.c b/drivers/tty/serial/sunzilog.c
index c99289c6c8f8..71758ad4241c 100644
--- a/drivers/tty/serial/sunzilog.c
+++ b/drivers/tty/serial/sunzilog.c
@@ -453,7 +453,8 @@ static void sunzilog_status_handle(struct uart_sunzilog_port *up,
 static void sunzilog_transmit_chars(struct uart_sunzilog_port *up,
 				    struct zilog_channel __iomem *channel)
 {
-	struct circ_buf *xmit;
+	struct tty_port *tport;
+	unsigned char ch;
 
 	if (ZS_IS_CONS(up)) {
 		unsigned char status = readb(&channel->control);
@@ -496,21 +497,20 @@ static void sunzilog_transmit_chars(struct uart_sunzilog_port *up,
 
 	if (up->port.state == NULL)
 		goto ack_tx_int;
-	xmit = &up->port.state->xmit;
-	if (uart_circ_empty(xmit))
-		goto ack_tx_int;
+	tport = &up->port.state->port;
 
 	if (uart_tx_stopped(&up->port))
 		goto ack_tx_int;
 
+	if (!uart_fifo_get(&up->port, &ch))
+		goto ack_tx_int;
+
 	up->flags |= SUNZILOG_FLAG_TX_ACTIVE;
-	writeb(xmit->buf[xmit->tail], &channel->data);
+	writeb(ch, &channel->data);
 	ZSDELAY();
 	ZS_WSYNC(channel);
 
-	uart_xmit_advance(&up->port, 1);
-
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&up->port);
 
 	return;
@@ -700,17 +700,16 @@ static void sunzilog_start_tx(struct uart_port *port)
 		port->icount.tx++;
 		port->x_char = 0;
 	} else {
-		struct circ_buf *xmit = &port->state->xmit;
+		struct tty_port *tport = &port->state->port;
+		unsigned char ch;
 
-		if (uart_circ_empty(xmit))
+		if (!uart_fifo_get(&up->port, &ch))
 			return;
-		writeb(xmit->buf[xmit->tail], &channel->data);
+		writeb(ch, &channel->data);
 		ZSDELAY();
 		ZS_WSYNC(channel);
 
-		uart_xmit_advance(port, 1);
-
-		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 			uart_write_wakeup(&up->port);
 	}
 }
diff --git a/drivers/tty/serial/tegra-tcu.c b/drivers/tty/serial/tegra-tcu.c
index d9c78320eb02..21ca5fcadf49 100644
--- a/drivers/tty/serial/tegra-tcu.c
+++ b/drivers/tty/serial/tegra-tcu.c
@@ -91,15 +91,17 @@ static void tegra_tcu_write(struct tegra_tcu *tcu, const char *s,
 static void tegra_tcu_uart_start_tx(struct uart_port *port)
 {
 	struct tegra_tcu *tcu = port->private_data;
-	struct circ_buf *xmit = &port->state->xmit;
-	unsigned long count;
+	struct tty_port *tport = &port->state->port;
+	unsigned char *tail;
+	unsigned int count;
 
 	for (;;) {
-		count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE);
+		count = kfifo_out_linear_ptr(&tport->xmit_fifo, &tail,
+				UART_XMIT_SIZE);
 		if (!count)
 			break;
 
-		tegra_tcu_write(tcu, &xmit->buf[xmit->tail], count);
+		tegra_tcu_write(tcu, tail, count);
 		uart_xmit_advance(port, count);
 	}
 
diff --git a/drivers/tty/serial/timbuart.c b/drivers/tty/serial/timbuart.c
index 4bc89a9b380a..43fa0938b5e3 100644
--- a/drivers/tty/serial/timbuart.c
+++ b/drivers/tty/serial/timbuart.c
@@ -95,14 +95,11 @@ static void timbuart_rx_chars(struct uart_port *port)
 
 static void timbuart_tx_chars(struct uart_port *port)
 {
-	struct circ_buf *xmit = &port->state->xmit;
+	unsigned char ch;
 
 	while (!(ioread32(port->membase + TIMBUART_ISR) & TXBF) &&
-		!uart_circ_empty(xmit)) {
-		iowrite8(xmit->buf[xmit->tail],
-			port->membase + TIMBUART_TXFIFO);
-		uart_xmit_advance(port, 1);
-	}
+			uart_fifo_get(port, &ch))
+		iowrite8(ch, port->membase + TIMBUART_TXFIFO);
 
 	dev_dbg(port->dev,
 		"%s - total written %d bytes, CTL: %x, RTS: %x, baud: %x\n",
@@ -117,9 +114,9 @@ static void timbuart_handle_tx_port(struct uart_port *port, u32 isr, u32 *ier)
 {
 	struct timbuart_port *uart =
 		container_of(port, struct timbuart_port, port);
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))
 		return;
 
 	if (port->x_char)
@@ -130,7 +127,7 @@ static void timbuart_handle_tx_port(struct uart_port *port, u32 isr, u32 *ier)
 		/* clear all TX interrupts */
 		iowrite32(TXFLAGS, port->membase + TIMBUART_ISR);
 
-		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 			uart_write_wakeup(port);
 	} else
 		/* Re-enable any tx interrupt */
@@ -141,7 +138,7 @@ static void timbuart_handle_tx_port(struct uart_port *port, u32 isr, u32 *ier)
 	 * we wake up the upper layer later when we got the interrupt
 	 * to give it some time to go out...
 	 */
-	if (!uart_circ_empty(xmit))
+	if (!kfifo_is_empty(&tport->xmit_fifo))
 		*ier |= TXBAE;
 
 	dev_dbg(port->dev, "%s - leaving\n", __func__);
diff --git a/drivers/tty/serial/uartlite.c b/drivers/tty/serial/uartlite.c
index 10ba41b7be99..68357ac8ffe3 100644
--- a/drivers/tty/serial/uartlite.c
+++ b/drivers/tty/serial/uartlite.c
@@ -189,7 +189,8 @@ static int ulite_receive(struct uart_port *port, int stat)
 
 static int ulite_transmit(struct uart_port *port, int stat)
 {
-	struct circ_buf *xmit  = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
+	unsigned char ch;
 
 	if (stat & ULITE_STATUS_TXFULL)
 		return 0;
@@ -201,14 +202,16 @@ static int ulite_transmit(struct uart_port *port, int stat)
 		return 1;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+	if (uart_tx_stopped(port))
+		return 0;
+
+	if (!uart_fifo_get(port, &ch))
 		return 0;
 
-	uart_out32(xmit->buf[xmit->tail], ULITE_TX, port);
-	uart_xmit_advance(port, 1);
+	uart_out32(ch, ULITE_TX, port);
 
 	/* wake up */
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	return 1;
diff --git a/drivers/tty/serial/ucc_uart.c b/drivers/tty/serial/ucc_uart.c
index 397b95dff7ed..53bb8c5ef499 100644
--- a/drivers/tty/serial/ucc_uart.c
+++ b/drivers/tty/serial/ucc_uart.c
@@ -334,7 +334,7 @@ static int qe_uart_tx_pump(struct uart_qe_port *qe_port)
 	unsigned char *p;
 	unsigned int count;
 	struct uart_port *port = &qe_port->port;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 
 	/* Handle xon/xoff */
 	if (port->x_char) {
@@ -358,7 +358,7 @@ static int qe_uart_tx_pump(struct uart_qe_port *qe_port)
 		return 1;
 	}
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		qe_uart_stop_tx(port);
 		return 0;
 	}
@@ -366,16 +366,10 @@ static int qe_uart_tx_pump(struct uart_qe_port *qe_port)
 	/* Pick next descriptor and fill from buffer */
 	bdp = qe_port->tx_cur;
 
-	while (!(ioread16be(&bdp->status) & BD_SC_READY) && !uart_circ_empty(xmit)) {
-		count = 0;
+	while (!(ioread16be(&bdp->status) & BD_SC_READY) &&
+	       !kfifo_is_empty(&tport->xmit_fifo)) {
 		p = qe2cpu_addr(ioread32be(&bdp->buf), qe_port);
-		while (count < qe_port->tx_fifosize) {
-			*p++ = xmit->buf[xmit->tail];
-			uart_xmit_advance(port, 1);
-			count++;
-			if (uart_circ_empty(xmit))
-				break;
-		}
+		count =	uart_fifo_out(port, p, qe_port->tx_fifosize);
 
 		iowrite16be(count, &bdp->length);
 		qe_setbits_be16(&bdp->status, BD_SC_READY);
@@ -388,10 +382,10 @@ static int qe_uart_tx_pump(struct uart_qe_port *qe_port)
 	}
 	qe_port->tx_cur = bdp;
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_empty(xmit)) {
+	if (kfifo_is_empty(&tport->xmit_fifo)) {
 		/* The kernel buffer is empty, so turn off TX interrupts.  We
 		   don't need to be told when the QE is finished transmitting
 		   the data. */
diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c
index 5f48ec37cb25..de3487206bcb 100644
--- a/drivers/tty/serial/xilinx_uartps.c
+++ b/drivers/tty/serial/xilinx_uartps.c
@@ -425,32 +425,32 @@ static void cdns_uart_handle_tx(void *dev_id)
 {
 	struct uart_port *port = (struct uart_port *)dev_id;
 	struct cdns_uart *cdns_uart = port->private_data;
-	struct circ_buf *xmit = &port->state->xmit;
+	struct tty_port *tport = &port->state->port;
 	unsigned int numbytes;
+	unsigned char ch;
 
-	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+	if (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port)) {
 		/* Disable the TX Empty interrupt */
 		writel(CDNS_UART_IXR_TXEMPTY, port->membase + CDNS_UART_IDR);
 		return;
 	}
 
 	numbytes = port->fifosize;
-	while (numbytes && !uart_circ_empty(xmit) &&
-	       !(readl(port->membase + CDNS_UART_SR) & CDNS_UART_SR_TXFULL)) {
-
-		writel(xmit->buf[xmit->tail], port->membase + CDNS_UART_FIFO);
-		uart_xmit_advance(port, 1);
+	while (numbytes &&
+	       !(readl(port->membase + CDNS_UART_SR) & CDNS_UART_SR_TXFULL) &&
+	       uart_fifo_get(port, &ch)) {
+		writel(ch, port->membase + CDNS_UART_FIFO);
 		numbytes--;
 	}
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
 	/* Enable the TX Empty interrupt */
 	writel(CDNS_UART_IXR_TXEMPTY, cdns_uart->port->membase + CDNS_UART_IER);
 
 	if (cdns_uart->port->rs485.flags & SER_RS485_ENABLED &&
-	    (uart_circ_empty(xmit) || uart_tx_stopped(port))) {
+	    (kfifo_is_empty(&tport->xmit_fifo) || uart_tx_stopped(port))) {
 		cdns_uart->tx_timer.function = &cdns_rs485_rx_callback;
 		hrtimer_start(&cdns_uart->tx_timer,
 			      ns_to_ktime(cdns_calc_after_tx_delay(cdns_uart)), HRTIMER_MODE_REL);
@@ -723,7 +723,7 @@ static void cdns_uart_start_tx(struct uart_port *port)
 	status |= CDNS_UART_CR_TX_EN;
 	writel(status, port->membase + CDNS_UART_CR);
 
-	if (uart_circ_empty(&port->state->xmit))
+	if (kfifo_is_empty(&port->state->port.xmit_fifo))
 		return;
 
 	/* Clear the TX Empty interrupt */
diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c
index 65ca4da6e368..79ea7108a0f3 100644
--- a/drivers/tty/serial/zs.c
+++ b/drivers/tty/serial/zs.c
@@ -606,7 +606,8 @@ static void zs_receive_chars(struct zs_port *zport)
 
 static void zs_raw_transmit_chars(struct zs_port *zport)
 {
-	struct circ_buf *xmit = &zport->port.state->xmit;
+	struct tty_port *tport = &zport->port.state->port;
+	unsigned char ch;
 
 	/* XON/XOFF chars.  */
 	if (zport->port.x_char) {
@@ -617,20 +618,20 @@ static void zs_raw_transmit_chars(struct zs_port *zport)
 	}
 
 	/* If nothing to do or stopped or hardware stopped.  */
-	if (uart_circ_empty(xmit) || uart_tx_stopped(&zport->port)) {
+	if (uart_tx_stopped(&zport->port) ||
+			!uart_fifo_get(&zport->port, &ch)) {
 		zs_raw_stop_tx(zport);
 		return;
 	}
 
 	/* Send char.  */
-	write_zsdata(zport, xmit->buf[xmit->tail]);
-	uart_xmit_advance(&zport->port, 1);
+	write_zsdata(zport, ch);
 
-	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+	if (kfifo_len(&tport->xmit_fifo) < WAKEUP_CHARS)
 		uart_write_wakeup(&zport->port);
 
 	/* Are we are done?  */
-	if (uart_circ_empty(xmit))
+	if (kfifo_is_empty(&tport->xmit_fifo))
 		zs_raw_stop_tx(zport);
 }
 
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 0a0f6e21d40e..8cb65f50e830 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -11,7 +11,6 @@
 #include <linux/compiler.h>
 #include <linux/console.h>
 #include <linux/interrupt.h>
-#include <linux/circ_buf.h>
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/tty.h>
@@ -699,7 +698,6 @@ struct uart_state {
 	struct tty_port		port;
 
 	enum uart_pm_state	pm_state;
-	struct circ_buf		xmit;
 
 	atomic_t		refcount;
 	wait_queue_head_t	remove_wait;
@@ -723,12 +721,35 @@ struct uart_state {
  */
 static inline void uart_xmit_advance(struct uart_port *up, unsigned int chars)
 {
-	struct circ_buf *xmit = &up->state->xmit;
+	struct tty_port *tport = &up->state->port;
 
-	xmit->tail = (xmit->tail + chars) & (UART_XMIT_SIZE - 1);
+	kfifo_skip_count(&tport->xmit_fifo, chars);
 	up->icount.tx += chars;
 }
 
+static inline unsigned int uart_fifo_out(struct uart_port *up,
+		unsigned char *buf, unsigned int chars)
+{
+	struct tty_port *tport = &up->state->port;
+
+	chars = kfifo_out(&tport->xmit_fifo, buf, chars);
+	up->icount.tx += chars;
+
+	return chars;
+}
+
+static inline unsigned int uart_fifo_get(struct uart_port *up,
+		unsigned char *ch)
+{
+	struct tty_port *tport = &up->state->port;
+	unsigned int chars;
+
+	chars = kfifo_get(&tport->xmit_fifo, ch);
+	up->icount.tx += chars;
+
+	return chars;
+}
+
 struct module;
 struct tty_driver;
 
@@ -764,7 +785,7 @@ enum UART_TX_FLAGS {
 		       for_test, for_post)				      \
 ({									      \
 	struct uart_port *__port = (uport);				      \
-	struct circ_buf *xmit = &__port->state->xmit;			      \
+	struct tty_port *__tport = &__port->state->port;		      \
 	unsigned int pending;						      \
 									      \
 	for (; (for_test) && (tx_ready); (for_post), __port->icount.tx++) {   \
@@ -775,17 +796,18 @@ enum UART_TX_FLAGS {
 			continue;					      \
 		}							      \
 									      \
-		if (uart_circ_empty(xmit) || uart_tx_stopped(__port))	      \
+		if (uart_tx_stopped(__port))				      \
+			break;						      \
+									      \
+		if (!kfifo_get(&__tport->xmit_fifo, &(ch)))		      \
 			break;						      \
 									      \
-		(ch) = xmit->buf[xmit->tail];				      \
 		(put_char);						      \
-		xmit->tail = (xmit->tail + 1) % UART_XMIT_SIZE;		      \
 	}								      \
 									      \
 	(tx_done);							      \
 									      \
-	pending = uart_circ_chars_pending(xmit);			      \
+	pending = kfifo_len(&__tport->xmit_fifo);			      \
 	if (pending < WAKEUP_CHARS) {					      \
 		uart_write_wakeup(__port);				      \
 									      \
@@ -974,15 +996,6 @@ bool uart_match_port(const struct uart_port *port1,
 int uart_suspend_port(struct uart_driver *reg, struct uart_port *port);
 int uart_resume_port(struct uart_driver *reg, struct uart_port *port);
 
-#define uart_circ_empty(circ)		((circ)->head == (circ)->tail)
-#define uart_circ_clear(circ)		((circ)->head = (circ)->tail = 0)
-
-#define uart_circ_chars_pending(circ)	\
-	(CIRC_CNT((circ)->head, (circ)->tail, UART_XMIT_SIZE))
-
-#define uart_circ_chars_free(circ)	\
-	(CIRC_SPACE((circ)->head, (circ)->tail, UART_XMIT_SIZE))
-
 static inline int uart_tx_stopped(struct uart_port *port)
 {
 	struct tty_struct *tty = port->state->port.tty;
-- 
cgit v1.2.3


From f03e8c1060f86c23eb49bafee99d9fcbd1c1bd77 Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Wed, 27 Mar 2024 12:59:35 +0200
Subject: printk: Save console options for add_preferred_console_match()

Driver subsystems may need to translate the preferred console name to the
character device name used. We already do some of this in console_setup()
with a few hardcoded names, but that does not scale well.

The console options are parsed early in console_setup(), and the consoles
are added with __add_preferred_console(). At this point we don't know much
about the character device names and device drivers getting probed.

To allow driver subsystems to set up a preferred console, let's save the
kernel command line console options. To add a preferred console from a
driver subsystem with optional character device name translation, let's
add a new function add_preferred_console_match().

This allows the serial core layer to support console=DEVNAME:0.0 style
hardware based addressing in addition to the current console=ttyS0 style
naming. And we can start moving console_setup() character device parsing
to the driver subsystem specific code.

We use a separate array from the console_cmdline array as the character
device name and index may be unknown at the console_setup() time. And
eventually there's no need to call __add_preferred_console() until the
subsystem is ready to handle the console.

Adding the console name in addition to the character device name, and a
flag for an added console, could be added to the struct console_cmdline.
And the console_cmdline array handling could be modified accordingly. But
that complicates things compared saving the console options, and then
adding the consoles when the subsystems handling the consoles are ready.

Co-developed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240327110021.59793-2-tony@atomide.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/printk.h          |   3 +
 kernel/printk/Makefile          |   2 +-
 kernel/printk/conopt.c          | 146 ++++++++++++++++++++++++++++++++++++++++
 kernel/printk/console_cmdline.h |   6 ++
 kernel/printk/printk.c          |  14 +++-
 5 files changed, 167 insertions(+), 4 deletions(-)
 create mode 100644 kernel/printk/conopt.c

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 955e31860095..5e038e782256 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -60,6 +60,9 @@ static inline const char *printk_skip_headers(const char *buffer)
 #define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
 #define CONSOLE_LOGLEVEL_QUIET	 CONFIG_CONSOLE_LOGLEVEL_QUIET
 
+int add_preferred_console_match(const char *match, const char *name,
+				const short idx);
+
 extern int console_printk[];
 
 #define console_loglevel (console_printk[0])
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 39a2b61c7232..040fe7d1eda2 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y	= printk.o
+obj-y	= printk.o conopt.o
 obj-$(CONFIG_PRINTK)	+= printk_safe.o nbcon.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE)	+= braille.o
 obj-$(CONFIG_PRINTK_INDEX)	+= index.o
diff --git a/kernel/printk/conopt.c b/kernel/printk/conopt.c
new file mode 100644
index 000000000000..9d507bac3657
--- /dev/null
+++ b/kernel/printk/conopt.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Kernel command line console options for hardware based addressing
+ *
+ * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/
+ * Author: Tony Lindgren <tony@atomide.com>
+ */
+
+#include <linux/console.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include <asm/errno.h>
+
+#include "console_cmdline.h"
+
+/*
+ * Allow longer DEVNAME:0.0 style console naming such as abcd0000.serial:0.0
+ * in addition to the legacy ttyS0 style naming.
+ */
+#define CONSOLE_NAME_MAX	32
+
+#define CONSOLE_OPT_MAX		16
+#define CONSOLE_BRL_OPT_MAX	16
+
+struct console_option {
+	char name[CONSOLE_NAME_MAX];
+	char opt[CONSOLE_OPT_MAX];
+	char brl_opt[CONSOLE_BRL_OPT_MAX];
+	u8 has_brl_opt:1;
+};
+
+/* Updated only at console_setup() time, no locking needed */
+static struct console_option conopt[MAX_CMDLINECONSOLES];
+
+/**
+ * console_opt_save - Saves kernel command line console option for driver use
+ * @str: Kernel command line console name and option
+ * @brl_opt: Braille console options
+ *
+ * Saves a kernel command line console option for driver subsystems to use for
+ * adding a preferred console during init. Called from console_setup() only.
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int __init console_opt_save(const char *str, const char *brl_opt)
+{
+	struct console_option *con;
+	size_t namelen, optlen;
+	const char *opt;
+	int i;
+
+	namelen = strcspn(str, ",");
+	if (namelen == 0 || namelen >= CONSOLE_NAME_MAX)
+		return -EINVAL;
+
+	opt = str + namelen;
+	if (*opt == ',')
+		opt++;
+
+	optlen = strlen(opt);
+	if (optlen >= CONSOLE_OPT_MAX)
+		return -EINVAL;
+
+	for (i = 0; i < MAX_CMDLINECONSOLES; i++) {
+		con = &conopt[i];
+
+		if (con->name[0]) {
+			if (!strncmp(str, con->name, namelen))
+				return 0;
+			continue;
+		}
+
+		/*
+		 * The name isn't terminated, only opt is. Empty opt is fine,
+		 * but brl_opt can be either empty or NULL. For more info, see
+		 * _braille_console_setup().
+		 */
+		strscpy(con->name, str, namelen + 1);
+		strscpy(con->opt, opt, CONSOLE_OPT_MAX);
+		if (brl_opt) {
+			strscpy(con->brl_opt, brl_opt, CONSOLE_BRL_OPT_MAX);
+			con->has_brl_opt = 1;
+		}
+
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+static struct console_option *console_opt_find(const char *name)
+{
+	struct console_option *con;
+	int i;
+
+	for (i = 0; i < MAX_CMDLINECONSOLES; i++) {
+		con = &conopt[i];
+		if (!strcmp(name, con->name))
+			return con;
+	}
+
+	return NULL;
+}
+
+/**
+ * add_preferred_console_match - Adds a preferred console if a match is found
+ * @match: Expected console on kernel command line, such as console=DEVNAME:0.0
+ * @name: Name of the console character device to add such as ttyS
+ * @idx: Index for the console
+ *
+ * Allows driver subsystems to add a console after translating the command
+ * line name to the character device name used for the console. Options are
+ * added automatically based on the kernel command line. Duplicate preferred
+ * consoles are ignored by __add_preferred_console().
+ *
+ * Return: 0 on success, negative error code on failure.
+ */
+int add_preferred_console_match(const char *match, const char *name,
+				const short idx)
+{
+	struct console_option *con;
+	char *brl_opt = NULL;
+
+	if (!match || !strlen(match) || !name || !strlen(name) ||
+	    idx < 0)
+		return -EINVAL;
+
+	con = console_opt_find(match);
+	if (!con)
+		return -ENOENT;
+
+	/*
+	 * See __add_preferred_console(). It checks for NULL brl_options to set
+	 * the preferred_console flag. Empty brl_opt instead of NULL leads into
+	 * the preferred_console flag not set, and CON_CONSDEV not being set,
+	 * and the boot console won't get disabled at the end of console_setup().
+	 */
+	if (con->has_brl_opt)
+		brl_opt = con->brl_opt;
+
+	console_opt_add_preferred_console(name, idx, con->opt, brl_opt);
+
+	return 0;
+}
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
index 3ca74ad391d6..a125e0235589 100644
--- a/kernel/printk/console_cmdline.h
+++ b/kernel/printk/console_cmdline.h
@@ -2,6 +2,12 @@
 #ifndef _CONSOLE_CMDLINE_H
 #define _CONSOLE_CMDLINE_H
 
+#define MAX_CMDLINECONSOLES 8
+
+int console_opt_save(const char *str, const char *brl_opt);
+int console_opt_add_preferred_console(const char *name, const short idx,
+				      char *options, char *brl_options);
+
 struct console_cmdline
 {
 	char	name[16];			/* Name of the driver	    */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index adf99c05adca..05919491c5a2 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -383,9 +383,6 @@ static int console_locked;
 /*
  *	Array of consoles built from command line options (console=)
  */
-
-#define MAX_CMDLINECONSOLES 8
-
 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
 
 static int preferred_console = -1;
@@ -2503,6 +2500,10 @@ static int __init console_setup(char *str)
 	if (_braille_console_setup(&str, &brl_options))
 		return 1;
 
+	/* Save the console for driver subsystem use */
+	if (console_opt_save(str, brl_options))
+		return 1;
+
 	/*
 	 * Decode str into name, index, options.
 	 */
@@ -2533,6 +2534,13 @@ static int __init console_setup(char *str)
 }
 __setup("console=", console_setup);
 
+/* Only called from add_preferred_console_match() */
+int console_opt_add_preferred_console(const char *name, const short idx,
+				      char *options, char *brl_options)
+{
+	return __add_preferred_console(name, idx, options, brl_options, true);
+}
+
 /**
  * add_preferred_console - add a device to the list of preferred consoles.
  * @name: device name
-- 
cgit v1.2.3


From 0487724912abc14eb0e95c352a29e6691a733631 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 2 Apr 2024 22:50:31 +0300
Subject: serial: max3100: Make struct plat_max3100 local

There is no user of the struct plat_max3100 outside the driver.
Inline its contents into the driver. While at it, drop outdated
example in the comment.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240402195306.269276-5-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/serial/max3100.c   | 38 ++++++++++++++++-----------------
 include/linux/serial_max3100.h | 48 ------------------------------------------
 2 files changed, 18 insertions(+), 68 deletions(-)
 delete mode 100644 include/linux/serial_max3100.h

(limited to 'include')

diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 8c6eaf908bce..0e30201e9689 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0+
 /*
- *
  *  Copyright (C) 2008 Christian Pellegrin <chripell@evolware.org>
  *
  * Notes: the MAX3100 doesn't provide an interrupt on CTS so we have
@@ -8,24 +7,6 @@
  * writing conf clears FIFO buffer and we cannot have this interrupt
  * always asking us for attention.
  *
- * Example platform data:
-
- static struct plat_max3100 max3100_plat_data = {
- .loopback = 0,
- .crystal = 0,
- .poll_time = 100,
- };
-
- static struct spi_board_info spi_board_info[] = {
- {
- .modalias	= "max3100",
- .platform_data	= &max3100_plat_data,
- .irq		= IRQ_EINT12,
- .max_speed_hz	= 5*1000*1000,
- .chip_select	= 0,
- },
- };
-
  * The initial minor number is 209 in the low-density serial port:
  * mknod /dev/ttyMAX0 c 204 209
  */
@@ -49,7 +30,24 @@
 
 #include <asm/unaligned.h>
 
-#include <linux/serial_max3100.h>
+/**
+ * struct plat_max3100 - MAX3100 SPI UART platform data
+ * @loopback:            force MAX3100 in loopback
+ * @crystal:             1 for 3.6864 Mhz, 0 for 1.8432
+ * @max3100_hw_suspend:  MAX3100 has a shutdown pin. This is a hook
+ *                       called on suspend and resume to activate it.
+ * @poll_time:           poll time for CTS signal in ms, 0 disables (so no hw
+ *                       flow ctrl is possible but you have less CPU usage)
+ *
+ * You should use this structure in your machine description to specify
+ * how the MAX3100 is connected.
+ */
+struct plat_max3100 {
+	int loopback;
+	int crystal;
+	void (*max3100_hw_suspend) (int suspend);
+	int poll_time;
+};
 
 #define MAX3100_C    (1<<14)
 #define MAX3100_D    (0<<14)
diff --git a/include/linux/serial_max3100.h b/include/linux/serial_max3100.h
deleted file mode 100644
index befd55c08a7c..000000000000
--- a/include/linux/serial_max3100.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- *
- *  Copyright (C) 2007 Christian Pellegrin
- */
-
-
-#ifndef _LINUX_SERIAL_MAX3100_H
-#define _LINUX_SERIAL_MAX3100_H 1
-
-
-/**
- * struct plat_max3100 - MAX3100 SPI UART platform data
- * @loopback:            force MAX3100 in loopback
- * @crystal:             1 for 3.6864 Mhz, 0 for 1.8432
- * @max3100_hw_suspend:  MAX3100 has a shutdown pin. This is a hook
- *                       called on suspend and resume to activate it.
- * @poll_time:           poll time for CTS signal in ms, 0 disables (so no hw
- *                       flow ctrl is possible but you have less CPU usage)
- *
- * You should use this structure in your machine description to specify
- * how the MAX3100 is connected. Example:
- *
- * static struct plat_max3100 max3100_plat_data = {
- *  .loopback = 0,
- *  .crystal = 0,
- *  .poll_time = 100,
- * };
- *
- * static struct spi_board_info spi_board_info[] = {
- * {
- *  .modalias	= "max3100",
- *  .platform_data	= &max3100_plat_data,
- *  .irq		= IRQ_EINT12,
- *  .max_speed_hz	= 5*1000*1000,
- *  .chip_select	= 0,
- * },
- * };
- *
- **/
-struct plat_max3100 {
-	int loopback;
-	int crystal;
-	void (*max3100_hw_suspend) (int suspend);
-	int poll_time;
-};
-
-#endif
-- 
cgit v1.2.3


From eebab7e3eb4bb906a8ebc3b70d28059ff1d9271c Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@amd.com>
Date: Thu, 7 Mar 2024 13:20:06 +1100
Subject: PCI/DOE: Support discovery version 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCIe r6.1, sec 6.30.1.1 defines a "DOE Discovery Version" field in
the DOE Discovery Request Data Object Contents (3rd DW) as:

15:8 DOE Discovery Version – must be 02h if the Capability Version in
the Data Object Exchange Extended Capability is 02h or greater.

Add support for the version on devices with the DOE v2 capability.

Link: https://lore.kernel.org/r/20240307022006.3657433-1-aik@amd.com
Signed-off-by: Alexey Kardashevskiy <aik@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
---
 drivers/pci/doe.c             | 12 +++++++++---
 include/uapi/linux/pci_regs.h |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/doe.c b/drivers/pci/doe.c
index e3aab5edaf70..652d63df9d22 100644
--- a/drivers/pci/doe.c
+++ b/drivers/pci/doe.c
@@ -383,11 +383,13 @@ static void pci_doe_task_complete(struct pci_doe_task *task)
 	complete(task->private);
 }
 
-static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 *index, u16 *vid,
+static int pci_doe_discovery(struct pci_doe_mb *doe_mb, u8 capver, u8 *index, u16 *vid,
 			     u8 *protocol)
 {
 	u32 request_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX,
-				    *index);
+				    *index) |
+			 FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_REQ_3_VER,
+				    (capver >= 2) ? 2 : 0);
 	__le32 request_pl_le = cpu_to_le32(request_pl);
 	__le32 response_pl_le;
 	u32 response_pl;
@@ -421,13 +423,17 @@ static int pci_doe_cache_protocols(struct pci_doe_mb *doe_mb)
 {
 	u8 index = 0;
 	u8 xa_idx = 0;
+	u32 hdr = 0;
+
+	pci_read_config_dword(doe_mb->pdev, doe_mb->cap_offset, &hdr);
 
 	do {
 		int rc;
 		u16 vid;
 		u8 prot;
 
-		rc = pci_doe_discovery(doe_mb, &index, &vid, &prot);
+		rc = pci_doe_discovery(doe_mb, PCI_EXT_CAP_VER(hdr), &index,
+				       &vid, &prot);
 		if (rc)
 			return rc;
 
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index a39193213ff2..fbca743b2b86 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1144,6 +1144,7 @@
 #define PCI_DOE_DATA_OBJECT_HEADER_2_LENGTH		0x0003ffff
 
 #define PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX		0x000000ff
+#define PCI_DOE_DATA_OBJECT_DISC_REQ_3_VER		0x0000ff00
 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID		0x0000ffff
 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL		0x00ff0000
 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX	0xff000000
-- 
cgit v1.2.3


From 897fa2c38c076c801bd1f1238af0af927e339c8f Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 3 Apr 2024 23:59:41 +0800
Subject: cpumask: add cpumask_any_and_but()

In some cases, it's useful to be able to select a random cpu from the
intersection of two masks, excluding a particular CPU.

For example, in some systems an uncore PMU is shared by a subset of
CPUs, and management of this PMU is assigned to some arbitrary CPU in
this set. Whenever the management CPU is hotplugged out, we wish to
migrate responsibility to another arbitrary CPU which is both in this
set and online.

Today we can use cpumask_any_and() to select an arbitrary CPU in the
intersection of two masks. We can also use cpumask_any_but() to select
any arbitrary cpu in a mask excluding, a particular CPU.

To do both, we either need to use a temporary cpumask, which is
wasteful, or use some lower-level cpumask helpers, which can be unclear.

This patch adds a new cpumask_any_and_but() to cater for these cases.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Dawei Li <dawei.li@shingroup.cn>
Acked-by: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/r/20240403155950.2068109-2-dawei.li@shingroup.cn
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/cpumask.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1c29947db848..121f3ac757ff 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -388,6 +388,29 @@ unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
 	return i;
 }
 
+/**
+ * cpumask_any_and_but - pick a "random" cpu from *mask1 & *mask2, but not this one.
+ * @mask1: the first input cpumask
+ * @mask2: the second input cpumask
+ * @cpu: the cpu to ignore
+ *
+ * Returns >= nr_cpu_ids if no cpus set.
+ */
+static inline
+unsigned int cpumask_any_and_but(const struct cpumask *mask1,
+				 const struct cpumask *mask2,
+				 unsigned int cpu)
+{
+	unsigned int i;
+
+	cpumask_check(cpu);
+	i = cpumask_first_and(mask1, mask2);
+	if (i != cpu)
+		return i;
+
+	return cpumask_next_and(cpu, mask1, mask2);
+}
+
 /**
  * cpumask_nth - get the Nth cpu in a cpumask
  * @srcp: the cpumask pointer
-- 
cgit v1.2.3


From 904fb8f843a99ae7473d184412b6c7bd46a51593 Mon Sep 17 00:00:00 2001
From: Bastien Curutchet <bastien.curutchet@bootlin.com>
Date: Tue, 2 Apr 2024 09:12:03 +0200
Subject: ASoC: ti: davinci-i2s: Remove the unused clk_input_pin attribute

The clk_input_pin attribute of davinci_mcbsp_dev struct is not set since
commit 257ade78b601 ("ASoC: davinci-i2s: Convert to use edma-pcm").

Remove the attribute.
Keep the behaviour of the MCBSP_CLKR case as MCBSP_CLKR == 0.
I can't test the BC_FP format so I added back the initial comment that
was removed by commit ec6375533748 ("ASoC: DaVinci: Added selection of
clk input pin for McBSP"). This was the last dependency to
linux/platform_data/davinci_asp.h so it is not included anymore.

Remove the enum mcbsp_clk_input_pin from davinci_asp.h as it is not used
anywhere else.

Signed-off-by: Bastien Curutchet <bastien.curutchet@bootlin.com>
Acked-by: Peter Ujfalusi <peter.ujfalusi@gmail.com>
Link: https://msgid.link/r/20240402071213.11671-4-bastien.curutchet@bootlin.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/platform_data/davinci_asp.h | 15 ---------------
 sound/soc/ti/davinci-i2s.c                | 24 ++++--------------------
 2 files changed, 4 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/platform_data/davinci_asp.h b/include/linux/platform_data/davinci_asp.h
index c8645b2ed3c0..b9c8520b4bd3 100644
--- a/include/linux/platform_data/davinci_asp.h
+++ b/include/linux/platform_data/davinci_asp.h
@@ -25,16 +25,6 @@ struct davinci_mcasp_pdata {
 	unsigned sram_size_capture;
 	struct gen_pool *sram_pool;
 
-	/*
-	 * If McBSP peripheral gets the clock from an external pin,
-	 * there are three chooses, that are MCBSP_CLKX, MCBSP_CLKR
-	 * and MCBSP_CLKS.
-	 * Depending on different hardware connections it is possible
-	 * to use this setting to change the behaviour of McBSP
-	 * driver.
-	 */
-	int clk_input_pin;
-
 	/*
 	 * This flag works when both clock and FS are outputs for the cpu
 	 * and makes clock more accurate (FS is not symmetrical and the
@@ -91,11 +81,6 @@ enum {
 	MCASP_VERSION_OMAP,	/* OMAP4/5 */
 };
 
-enum mcbsp_clk_input_pin {
-	MCBSP_CLKR = 0,		/* as in DM365 */
-	MCBSP_CLKS,
-};
-
 #define INACTIVE_MODE	0
 #define TX_MODE		1
 #define RX_MODE		2
diff --git a/sound/soc/ti/davinci-i2s.c b/sound/soc/ti/davinci-i2s.c
index 07c8b2259208..5c906641640e 100644
--- a/sound/soc/ti/davinci-i2s.c
+++ b/sound/soc/ti/davinci-i2s.c
@@ -19,7 +19,6 @@
 #include <linux/delay.h>
 #include <linux/io.h>
 #include <linux/clk.h>
-#include <linux/platform_data/davinci_asp.h>
 
 #include <sound/core.h>
 #include <sound/pcm.h>
@@ -159,7 +158,6 @@ struct davinci_mcbsp_dev {
 
 	unsigned int fmt;
 	int clk_div;
-	int clk_input_pin;
 	bool i2s_accurate_sck;
 };
 
@@ -239,26 +237,12 @@ static int davinci_i2s_set_dai_fmt(struct snd_soc_dai *cpu_dai,
 			DAVINCI_MCBSP_PCR_CLKRM;
 		break;
 	case SND_SOC_DAIFMT_BC_FP:
-		pcr = DAVINCI_MCBSP_PCR_FSRM | DAVINCI_MCBSP_PCR_FSXM;
 		/*
-		 * Selection of the clock input pin that is the
-		 * input for the Sample Rate Generator.
-		 * McBSP FSR and FSX are driven by the Sample Rate
-		 * Generator.
+		 * McBSP CLKR pin is the input for the Sample Rate Generator.
+		 * McBSP FSR and FSX are driven by the Sample Rate Generator.
 		 */
-		switch (dev->clk_input_pin) {
-		case MCBSP_CLKS:
-			pcr |= DAVINCI_MCBSP_PCR_CLKXM |
-				DAVINCI_MCBSP_PCR_CLKRM;
-			break;
-		case MCBSP_CLKR:
-			pcr |= DAVINCI_MCBSP_PCR_SCLKME;
-			break;
-		default:
-			dev_err(dev->dev, "bad clk_input_pin\n");
-			return -EINVAL;
-		}
-
+		pcr = DAVINCI_MCBSP_PCR_FSRM | DAVINCI_MCBSP_PCR_FSXM;
+		pcr |= DAVINCI_MCBSP_PCR_SCLKME;
 		break;
 	case SND_SOC_DAIFMT_BC_FC:
 		/* codec is master */
-- 
cgit v1.2.3


From d503a04f8bc0c75dc9db9452d8cc79d748afb752 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Fri, 5 Apr 2024 16:11:33 -0700
Subject: bpf: Add support for certain atomics in bpf_arena to x86 JIT

Support atomics in bpf_arena that can be JITed as a single x86 instruction.
Instructions that are JITed as loops are not supported at the moment,
since they require more complex extable and loop logic.

JITs can choose to do smarter things with bpf_jit_supports_insn().
Like arm64 may decide to support all bpf atomics instructions
when emit_lse_atomic is available and none in ll_sc mode.

bpf_jit_supports_percpu_insn(), bpf_jit_supports_ptr_xchg() and
other such callbacks can be replaced with bpf_jit_supports_insn()
in the future.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Eduard Zingerman <eddyz87@gmail.com>
Link: https://lore.kernel.org/r/20240405231134.17274-1-alexei.starovoitov@gmail.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c | 72 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/filter.h      |  4 +++
 kernel/bpf/core.c           |  5 ++++
 kernel/bpf/verifier.c       | 19 +++++++++++-
 4 files changed, 99 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 6cf9a5697c09..2b5a475c4dd0 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1172,6 +1172,54 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
 	return 0;
 }
 
+static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size,
+			     u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+	u8 *prog = *pprog;
+
+	EMIT1(0xF0); /* lock prefix */
+	switch (size) {
+	case BPF_W:
+		EMIT1(add_3mod(0x40, dst_reg, src_reg, index_reg));
+		break;
+	case BPF_DW:
+		EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg));
+		break;
+	default:
+		pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n");
+		return -EFAULT;
+	}
+
+	/* emit opcode */
+	switch (atomic_op) {
+	case BPF_ADD:
+	case BPF_AND:
+	case BPF_OR:
+	case BPF_XOR:
+		/* lock *(u32/u64*)(dst_reg + idx_reg + off) <op>= src_reg */
+		EMIT1(simple_alu_opcodes[atomic_op]);
+		break;
+	case BPF_ADD | BPF_FETCH:
+		/* src_reg = atomic_fetch_add(dst_reg + idx_reg + off, src_reg); */
+		EMIT2(0x0F, 0xC1);
+		break;
+	case BPF_XCHG:
+		/* src_reg = atomic_xchg(dst_reg + idx_reg + off, src_reg); */
+		EMIT1(0x87);
+		break;
+	case BPF_CMPXCHG:
+		/* r0 = atomic_cmpxchg(dst_reg + idx_reg + off, r0, src_reg); */
+		EMIT2(0x0F, 0xB1);
+		break;
+	default:
+		pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op);
+		return -EFAULT;
+	}
+	emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
+	*pprog = prog;
+	return 0;
+}
+
 #define DONT_CLEAR 1
 
 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
@@ -1982,6 +2030,15 @@ populate_extable:
 				return err;
 			break;
 
+		case BPF_STX | BPF_PROBE_ATOMIC | BPF_W:
+		case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW:
+			start_of_ldx = prog;
+			err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code),
+						dst_reg, src_reg, X86_REG_R12, insn->off);
+			if (err)
+				return err;
+			goto populate_extable;
+
 			/* call */
 		case BPF_JMP | BPF_CALL: {
 			int offs;
@@ -3486,6 +3543,21 @@ bool bpf_jit_supports_arena(void)
 	return true;
 }
 
+bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+	if (!in_arena)
+		return true;
+	switch (insn->code) {
+	case BPF_STX | BPF_ATOMIC | BPF_W:
+	case BPF_STX | BPF_ATOMIC | BPF_DW:
+		if (insn->imm == (BPF_AND | BPF_FETCH) ||
+		    insn->imm == (BPF_OR | BPF_FETCH) ||
+		    insn->imm == (BPF_XOR | BPF_FETCH))
+			return false;
+	}
+	return true;
+}
+
 bool bpf_jit_supports_ptr_xchg(void)
 {
 	return true;
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 161d5f7b64ed..7a27f19bf44d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -75,6 +75,9 @@ struct ctl_table_header;
 /* unused opcode to mark special load instruction. Same as BPF_MSH */
 #define BPF_PROBE_MEM32	0xa0
 
+/* unused opcode to mark special atomic instruction */
+#define BPF_PROBE_ATOMIC 0xe0
+
 /* unused opcode to mark call to interpreter with arguments */
 #define BPF_CALL_ARGS	0xe0
 
@@ -997,6 +1000,7 @@ bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
+bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7a33a3a7e63c..a41718eaeefe 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2965,6 +2965,11 @@ bool __weak bpf_jit_supports_arena(void)
 	return false;
 }
 
+bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena)
+{
+	return false;
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 590db4e4c071..2aad6d90550f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6970,6 +6970,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	return err;
 }
 
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+			     bool allow_trust_missmatch);
+
 static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
 {
 	int load_reg;
@@ -7030,7 +7033,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	    is_pkt_reg(env, insn->dst_reg) ||
 	    is_flow_key_reg(env, insn->dst_reg) ||
 	    is_sk_reg(env, insn->dst_reg) ||
-	    is_arena_reg(env, insn->dst_reg)) {
+	    (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) {
 		verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
 			insn->dst_reg,
 			reg_type_str(env, reg_state(env, insn->dst_reg)->type));
@@ -7066,6 +7069,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
 	if (err)
 		return err;
 
+	if (is_arena_reg(env, insn->dst_reg)) {
+		err = save_aux_ptr_type(env, PTR_TO_ARENA, false);
+		if (err)
+			return err;
+	}
 	/* Check whether we can write into the same memory. */
 	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_WRITE, -1, true, false);
@@ -18955,6 +18963,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			   insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
 			   insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
 			type = BPF_WRITE;
+		} else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) ||
+			    insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) &&
+			   env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) {
+			insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code);
+			env->prog->aux->num_exentries++;
+			continue;
 		} else {
 			continue;
 		}
@@ -19226,6 +19240,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
 			     BPF_CLASS(insn->code) == BPF_ST) &&
 			     BPF_MODE(insn->code) == BPF_PROBE_MEM32)
 				num_exentries++;
+			if (BPF_CLASS(insn->code) == BPF_STX &&
+			     BPF_MODE(insn->code) == BPF_PROBE_ATOMIC)
+				num_exentries++;
 		}
 		func[i]->aux->num_exentries = num_exentries;
 		func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable;
-- 
cgit v1.2.3


From 3253804773c0613a1bad5bfea2edf172b760d8b6 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Fri, 23 Feb 2024 12:25:05 -0500
Subject: security: allow finer granularity in permitting copy-up of security
 xattrs

Copying up xattrs is solely based on the security xattr name. For finer
granularity add a dentry parameter to the security_inode_copy_up_xattr
hook definition, allowing decisions to be based on the xattr content as
well.

Co-developed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Acked-by: Amir Goldstein <amir73il@gmail.com>
Acked-by: Paul Moore <paul@paul-moore.com> (LSM,SELinux)
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 fs/overlayfs/copy_up.c            | 2 +-
 include/linux/lsm_hook_defs.h     | 3 ++-
 include/linux/security.h          | 4 ++--
 security/integrity/evm/evm_main.c | 2 +-
 security/security.c               | 5 +++--
 security/selinux/hooks.c          | 2 +-
 security/smack/smack_lsm.c        | 2 +-
 7 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 0762575a1e70..a5ef2005a2cc 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -114,7 +114,7 @@ int ovl_copy_xattr(struct super_block *sb, const struct path *oldpath, struct de
 		if (ovl_is_private_xattr(sb, name))
 			continue;
 
-		error = security_inode_copy_up_xattr(name);
+		error = security_inode_copy_up_xattr(old, name);
 		if (error < 0 && error != -EOPNOTSUPP)
 			break;
 		if (error == 1) {
diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h
index 334e00efbde4..f804b76cde44 100644
--- a/include/linux/lsm_hook_defs.h
+++ b/include/linux/lsm_hook_defs.h
@@ -176,7 +176,8 @@ LSM_HOOK(int, 0, inode_listsecurity, struct inode *inode, char *buffer,
 	 size_t buffer_size)
 LSM_HOOK(void, LSM_RET_VOID, inode_getsecid, struct inode *inode, u32 *secid)
 LSM_HOOK(int, 0, inode_copy_up, struct dentry *src, struct cred **new)
-LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, const char *name)
+LSM_HOOK(int, -EOPNOTSUPP, inode_copy_up_xattr, struct dentry *src,
+	 const char *name)
 LSM_HOOK(int, 0, kernfs_init_security, struct kernfs_node *kn_dir,
 	 struct kernfs_node *kn)
 LSM_HOOK(int, 0, file_permission, struct file *file, int mask)
diff --git a/include/linux/security.h b/include/linux/security.h
index 41a8f667bdfa..21cf70346b33 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -398,7 +398,7 @@ int security_inode_setsecurity(struct inode *inode, const char *name, const void
 int security_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size);
 void security_inode_getsecid(struct inode *inode, u32 *secid);
 int security_inode_copy_up(struct dentry *src, struct cred **new);
-int security_inode_copy_up_xattr(const char *name);
+int security_inode_copy_up_xattr(struct dentry *src, const char *name);
 int security_kernfs_init_security(struct kernfs_node *kn_dir,
 				  struct kernfs_node *kn);
 int security_file_permission(struct file *file, int mask);
@@ -1016,7 +1016,7 @@ static inline int security_kernfs_init_security(struct kernfs_node *kn_dir,
 	return 0;
 }
 
-static inline int security_inode_copy_up_xattr(const char *name)
+static inline int security_inode_copy_up_xattr(struct dentry *src, const char *name)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index 81dbade5b9b3..b0a862bfd74a 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -946,7 +946,7 @@ static void evm_inode_post_setattr(struct mnt_idmap *idmap,
 		evm_update_evmxattr(dentry, NULL, NULL, 0);
 }
 
-static int evm_inode_copy_up_xattr(const char *name)
+static int evm_inode_copy_up_xattr(struct dentry *src, const char *name)
 {
 	if (strcmp(name, XATTR_NAME_EVM) == 0)
 		return 1; /* Discard */
diff --git a/security/security.c b/security/security.c
index 0a9a0ac3f266..e5da848c50b9 100644
--- a/security/security.c
+++ b/security/security.c
@@ -2628,6 +2628,7 @@ EXPORT_SYMBOL(security_inode_copy_up);
 
 /**
  * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op
+ * @src: union dentry of copy-up file
  * @name: xattr name
  *
  * Filter the xattrs being copied up when a unioned file is copied up from a
@@ -2638,7 +2639,7 @@ EXPORT_SYMBOL(security_inode_copy_up);
  *         if the security module does not know about attribute, or a negative
  *         error code to abort the copy up.
  */
-int security_inode_copy_up_xattr(const char *name)
+int security_inode_copy_up_xattr(struct dentry *src, const char *name)
 {
 	int rc;
 
@@ -2647,7 +2648,7 @@ int security_inode_copy_up_xattr(const char *name)
 	 * xattr), -EOPNOTSUPP if it does not know anything about the xattr or
 	 * any other error code in case of an error.
 	 */
-	rc = call_int_hook(inode_copy_up_xattr, name);
+	rc = call_int_hook(inode_copy_up_xattr, src, name);
 	if (rc != LSM_RET_DEFAULT(inode_copy_up_xattr))
 		return rc;
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3448454c82d0..9f018f51e47a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3530,7 +3530,7 @@ static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
 	return 0;
 }
 
-static int selinux_inode_copy_up_xattr(const char *name)
+static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name)
 {
 	/* The copy_up hook above sets the initial context on an inode, but we
 	 * don't then want to overwrite it by blindly copying all the lower
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 146667937811..79d8dc633b9a 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -4885,7 +4885,7 @@ static int smack_inode_copy_up(struct dentry *dentry, struct cred **new)
 	return 0;
 }
 
-static int smack_inode_copy_up_xattr(const char *name)
+static int smack_inode_copy_up_xattr(struct dentry *src, const char *name)
 {
 	/*
 	 * Return 1 if this is the smack access Smack attribute.
-- 
cgit v1.2.3


From 309e2b775da8b2c28fccc4ac2621801f06920ce0 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Fri, 23 Feb 2024 12:25:08 -0500
Subject: ima: Move file-change detection variables into new structure

Move all the variables used for file change detection into a structure
that can be used by IMA and EVM. Implement an inline function for storing
the identification of an inode and one for detecting changes to an inode
based on this new structure.

Co-developed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/integrity.h         | 34 ++++++++++++++++++++++++++++++++++
 security/integrity/ima/ima.h      |  4 +---
 security/integrity/ima/ima_api.c  | 10 +++++-----
 security/integrity/ima/ima_iint.c |  2 +-
 security/integrity/ima/ima_main.c |  7 +++----
 5 files changed, 44 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/integrity.h b/include/linux/integrity.h
index 459b79683783..f5842372359b 100644
--- a/include/linux/integrity.h
+++ b/include/linux/integrity.h
@@ -8,6 +8,7 @@
 #define _LINUX_INTEGRITY_H
 
 #include <linux/fs.h>
+#include <linux/iversion.h>
 
 enum integrity_status {
 	INTEGRITY_PASS = 0,
@@ -28,4 +29,37 @@ static inline void integrity_load_keys(void)
 }
 #endif /* CONFIG_INTEGRITY */
 
+/* An inode's attributes for detection of changes */
+struct integrity_inode_attributes {
+	u64 version;		/* track inode changes */
+	unsigned long ino;
+	dev_t dev;
+};
+
+/*
+ * On stacked filesystems the i_version alone is not enough to detect file data
+ * or metadata change. Additional metadata is required.
+ */
+static inline void
+integrity_inode_attrs_store(struct integrity_inode_attributes *attrs,
+			    u64 i_version, const struct inode *inode)
+{
+	attrs->version = i_version;
+	attrs->dev = inode->i_sb->s_dev;
+	attrs->ino = inode->i_ino;
+}
+
+/*
+ * On stacked filesystems detect whether the inode or its content has changed.
+ */
+static inline bool
+integrity_inode_attrs_changed(const struct integrity_inode_attributes *attrs,
+			      const struct inode *inode)
+{
+	return (inode->i_sb->s_dev != attrs->dev ||
+		inode->i_ino != attrs->ino ||
+		!inode_eq_iversion(inode, attrs->version));
+}
+
+
 #endif /* _LINUX_INTEGRITY_H */
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 11d7c0332207..9151b5369cdc 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -175,12 +175,10 @@ struct ima_kexec_hdr {
 /* IMA integrity metadata associated with an inode */
 struct ima_iint_cache {
 	struct mutex mutex;	/* protects: version, flags, digest */
-	u64 version;		/* track inode changes */
+	struct integrity_inode_attributes real_inode;
 	unsigned long flags;
 	unsigned long measured_pcrs;
 	unsigned long atomic_flags;
-	unsigned long real_ino;
-	dev_t real_dev;
 	enum integrity_status ima_file_status:4;
 	enum integrity_status ima_mmap_status:4;
 	enum integrity_status ima_bprm_status:4;
diff --git a/security/integrity/ima/ima_api.c b/security/integrity/ima/ima_api.c
index 3d286de231e1..984e861f6e33 100644
--- a/security/integrity/ima/ima_api.c
+++ b/security/integrity/ima/ima_api.c
@@ -305,11 +305,11 @@ int ima_collect_measurement(struct ima_iint_cache *iint, struct file *file,
 
 	iint->ima_hash = tmpbuf;
 	memcpy(iint->ima_hash, &hash, length);
-	iint->version = i_version;
-	if (real_inode != inode) {
-		iint->real_ino = real_inode->i_ino;
-		iint->real_dev = real_inode->i_sb->s_dev;
-	}
+	if (real_inode == inode)
+		iint->real_inode.version = i_version;
+	else
+		integrity_inode_attrs_store(&iint->real_inode, i_version,
+					    real_inode);
 
 	/* Possibly temporary failure due to type of read (eg. O_DIRECT) */
 	if (!result)
diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c
index e7c9c216c1c6..e23412a2c56b 100644
--- a/security/integrity/ima/ima_iint.c
+++ b/security/integrity/ima/ima_iint.c
@@ -59,7 +59,7 @@ static void ima_iint_init_always(struct ima_iint_cache *iint,
 				 struct inode *inode)
 {
 	iint->ima_hash = NULL;
-	iint->version = 0;
+	iint->real_inode.version = 0;
 	iint->flags = 0UL;
 	iint->atomic_flags = 0UL;
 	iint->ima_file_status = INTEGRITY_UNKNOWN;
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index eebf629f192e..4b215d85c14b 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -173,7 +173,7 @@ static void ima_check_last_writer(struct ima_iint_cache *iint,
 				      STATX_CHANGE_COOKIE,
 				      AT_STATX_SYNC_AS_STAT) ||
 		    !(stat.result_mask & STATX_CHANGE_COOKIE) ||
-		    stat.change_cookie != iint->version) {
+		    stat.change_cookie != iint->real_inode.version) {
 			iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
 			iint->measured_pcrs = 0;
 			if (update)
@@ -292,9 +292,8 @@ static int process_measurement(struct file *file, const struct cred *cred,
 	if (real_inode != inode &&
 	    (action & IMA_DO_MASK) && (iint->flags & IMA_DONE_MASK)) {
 		if (!IS_I_VERSION(real_inode) ||
-		    real_inode->i_sb->s_dev != iint->real_dev ||
-		    real_inode->i_ino != iint->real_ino ||
-		    !inode_eq_iversion(real_inode, iint->version)) {
+		    integrity_inode_attrs_changed(&iint->real_inode,
+						  real_inode)) {
 			iint->flags &= ~IMA_DONE_MASK;
 			iint->measured_pcrs = 0;
 		}
-- 
cgit v1.2.3


From a652aa59068bd78d96a1ab6ea7c0c4d3c79fb5e8 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Fri, 23 Feb 2024 12:25:09 -0500
Subject: evm: Store and detect metadata inode attributes changes

On stacked filesystem the metadata inode may be different than the one
file data inode and therefore changes to it need to be detected
independently. Therefore, store the i_version, device number, and inode
number associated with the file metadata inode.

Implement a function to detect changes to the inode and if a change is
detected reset the evm_status. This function will be called by IMA when
IMA detects that the metadata inode is different from the file's inode.

Co-developed-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 include/linux/evm.h                 |  8 ++++++++
 security/integrity/evm/evm.h        |  6 ++++--
 security/integrity/evm/evm_crypto.c | 23 +++++++++++++++++------
 security/integrity/evm/evm_main.c   | 30 ++++++++++++++++++++++++++++--
 4 files changed, 57 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/evm.h b/include/linux/evm.h
index d48d6da32315..ddece4a6b25d 100644
--- a/include/linux/evm.h
+++ b/include/linux/evm.h
@@ -26,6 +26,8 @@ extern int evm_protected_xattr_if_enabled(const char *req_xattr_name);
 extern int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
 				     int buffer_size, char type,
 				     bool canonical_fmt);
+extern bool evm_metadata_changed(struct inode *inode,
+				 struct inode *metadata_inode);
 #ifdef CONFIG_FS_POSIX_ACL
 extern int posix_xattr_acl(const char *xattrname);
 #else
@@ -76,5 +78,11 @@ static inline int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
 	return -EOPNOTSUPP;
 }
 
+static inline bool evm_metadata_changed(struct inode *inode,
+					struct inode *metadata_inode)
+{
+	return false;
+}
+
 #endif /* CONFIG_EVM */
 #endif /* LINUX_EVM_H */
diff --git a/security/integrity/evm/evm.h b/security/integrity/evm/evm.h
index 72e3341ae6f7..51aba5a54275 100644
--- a/security/integrity/evm/evm.h
+++ b/security/integrity/evm/evm.h
@@ -39,6 +39,7 @@ struct xattr_list {
 struct evm_iint_cache {
 	unsigned long flags;
 	enum integrity_status evm_status:4;
+	struct integrity_inode_attributes metadata_inode;
 };
 
 extern struct lsm_blob_sizes evm_blob_sizes;
@@ -74,11 +75,12 @@ int evm_update_evmxattr(struct dentry *dentry,
 			size_t req_xattr_value_len);
 int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name,
 		  const char *req_xattr_value,
-		  size_t req_xattr_value_len, struct evm_digest *data);
+		  size_t req_xattr_value_len, struct evm_digest *data,
+		  struct evm_iint_cache *iint);
 int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name,
 		  const char *req_xattr_value,
 		  size_t req_xattr_value_len, char type,
-		  struct evm_digest *data);
+		  struct evm_digest *data, struct evm_iint_cache *iint);
 int evm_init_hmac(struct inode *inode, const struct xattr *xattrs,
 		  char *hmac_val);
 int evm_init_secfs(void);
diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c
index 35416f55391c..7c06ffd633d2 100644
--- a/security/integrity/evm/evm_crypto.c
+++ b/security/integrity/evm/evm_crypto.c
@@ -221,7 +221,8 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 				 const char *req_xattr_name,
 				 const char *req_xattr_value,
 				 size_t req_xattr_value_len,
-				 uint8_t type, struct evm_digest *data)
+				 uint8_t type, struct evm_digest *data,
+				 struct evm_iint_cache *iint)
 {
 	struct inode *inode = d_inode(d_real(dentry, D_REAL_METADATA));
 	struct xattr_list *xattr;
@@ -231,6 +232,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 	int error;
 	int size, user_space_size;
 	bool ima_present = false;
+	u64 i_version = 0;
 
 	if (!(inode->i_opflags & IOP_XATTR) ||
 	    inode->i_sb->s_user_ns != &init_user_ns)
@@ -294,6 +296,13 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry,
 	}
 	hmac_add_misc(desc, inode, type, data->digest);
 
+	if (inode != d_backing_inode(dentry) && iint) {
+		if (IS_I_VERSION(inode))
+			i_version = inode_query_iversion(inode);
+		integrity_inode_attrs_store(&iint->metadata_inode, i_version,
+					    inode);
+	}
+
 	/* Portable EVM signatures must include an IMA hash */
 	if (type == EVM_XATTR_PORTABLE_DIGSIG && !ima_present)
 		error = -EPERM;
@@ -305,18 +314,19 @@ out:
 
 int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name,
 		  const char *req_xattr_value, size_t req_xattr_value_len,
-		  struct evm_digest *data)
+		  struct evm_digest *data, struct evm_iint_cache *iint)
 {
 	return evm_calc_hmac_or_hash(dentry, req_xattr_name, req_xattr_value,
-				    req_xattr_value_len, EVM_XATTR_HMAC, data);
+				    req_xattr_value_len, EVM_XATTR_HMAC, data,
+				    iint);
 }
 
 int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name,
 		  const char *req_xattr_value, size_t req_xattr_value_len,
-		  char type, struct evm_digest *data)
+		  char type, struct evm_digest *data, struct evm_iint_cache *iint)
 {
 	return evm_calc_hmac_or_hash(dentry, req_xattr_name, req_xattr_value,
-				     req_xattr_value_len, type, data);
+				     req_xattr_value_len, type, data, iint);
 }
 
 static int evm_is_immutable(struct dentry *dentry, struct inode *inode)
@@ -357,6 +367,7 @@ int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name,
 			const char *xattr_value, size_t xattr_value_len)
 {
 	struct inode *inode = d_backing_inode(dentry);
+	struct evm_iint_cache *iint = evm_iint_inode(inode);
 	struct evm_digest data;
 	int rc = 0;
 
@@ -372,7 +383,7 @@ int evm_update_evmxattr(struct dentry *dentry, const char *xattr_name,
 
 	data.hdr.algo = HASH_ALGO_SHA1;
 	rc = evm_calc_hmac(dentry, xattr_name, xattr_value,
-			   xattr_value_len, &data);
+			   xattr_value_len, &data, iint);
 	if (rc == 0) {
 		data.hdr.xattr.sha1.type = EVM_XATTR_HMAC;
 		rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry,
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index c658d2f1494b..c1ca0894cd8a 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -226,7 +226,7 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 
 		digest.hdr.algo = HASH_ALGO_SHA1;
 		rc = evm_calc_hmac(dentry, xattr_name, xattr_value,
-				   xattr_value_len, &digest);
+				   xattr_value_len, &digest, iint);
 		if (rc)
 			break;
 		rc = crypto_memneq(xattr_data->data, digest.digest,
@@ -247,7 +247,8 @@ static enum integrity_status evm_verify_hmac(struct dentry *dentry,
 		hdr = (struct signature_v2_hdr *)xattr_data;
 		digest.hdr.algo = hdr->hash_algo;
 		rc = evm_calc_hash(dentry, xattr_name, xattr_value,
-				   xattr_value_len, xattr_data->type, &digest);
+				   xattr_value_len, xattr_data->type, &digest,
+				   iint);
 		if (rc)
 			break;
 		rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM,
@@ -733,6 +734,31 @@ static void evm_reset_status(struct inode *inode)
 		iint->evm_status = INTEGRITY_UNKNOWN;
 }
 
+/**
+ * evm_metadata_changed: Detect changes to the metadata
+ * @inode: a file's inode
+ * @metadata_inode: metadata inode
+ *
+ * On a stacked filesystem detect whether the metadata has changed. If this is
+ * the case reset the evm_status associated with the inode that represents the
+ * file.
+ */
+bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode)
+{
+	struct evm_iint_cache *iint = evm_iint_inode(inode);
+	bool ret = false;
+
+	if (iint) {
+		ret = (!IS_I_VERSION(metadata_inode) ||
+		       integrity_inode_attrs_changed(&iint->metadata_inode,
+						     metadata_inode));
+		if (ret)
+			iint->evm_status = INTEGRITY_UNKNOWN;
+	}
+
+	return ret;
+}
+
 /**
  * evm_revalidate_status - report whether EVM status re-validation is necessary
  * @xattr_name: pointer to the affected extended attribute name
-- 
cgit v1.2.3


From 1f65e57dc5417b166843438bef31c70b9a5208fe Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Fri, 23 Feb 2024 12:25:12 -0500
Subject: fs: Rename SB_I_EVM_UNSUPPORTED to SB_I_EVM_HMAC_UNSUPPORTED

Now that EVM supports RSA signatures for previously completely
unsupported filesystems rename the flag SB_I_EVM_UNSUPPORTED to
SB_I_EVM_HMAC_UNSUPPORTED to reflect that only HMAC is not supported.

Suggested-by: Amir Goldstein <amir73il@gmail.com>
Suggested-by: Mimi Zohar <zohar@linux.ibm.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Acked-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Mimi Zohar <zohar@linux.ibm.com>
---
 fs/overlayfs/super.c              | 2 +-
 include/linux/fs.h                | 2 +-
 security/integrity/evm/evm_main.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a40fc7e05525..06a231970cb5 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1460,7 +1460,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
 	 * lead to unexpected results.
 	 */
 	sb->s_iflags |= SB_I_NOUMASK;
-	sb->s_iflags |= SB_I_EVM_UNSUPPORTED;
+	sb->s_iflags |= SB_I_EVM_HMAC_UNSUPPORTED;
 
 	err = -ENOMEM;
 	root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8dfd53b52744..7bbd5e2d2a20 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1177,7 +1177,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
 #define SB_I_IMA_UNVERIFIABLE_SIGNATURE	0x00000020
 #define SB_I_UNTRUSTED_MOUNTER		0x00000040
-#define SB_I_EVM_UNSUPPORTED		0x00000080
+#define SB_I_EVM_HMAC_UNSUPPORTED	0x00000080
 
 #define SB_I_SKIP_SYNC	0x00000100	/* Skip superblock at global sync */
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c
index cfb4f9809369..c4a6081ce596 100644
--- a/security/integrity/evm/evm_main.c
+++ b/security/integrity/evm/evm_main.c
@@ -155,7 +155,7 @@ static int is_unsupported_fs(struct dentry *dentry)
 {
 	struct inode *inode = d_backing_inode(dentry);
 
-	if (inode->i_sb->s_iflags & SB_I_EVM_UNSUPPORTED) {
+	if (inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED) {
 		pr_info_once("%s not supported\n", inode->i_sb->s_type->name);
 		return 1;
 	}
-- 
cgit v1.2.3


From 959fa5c188bf095558c417554e4772ac1fda3531 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Mon, 8 Apr 2024 08:29:56 -0700
Subject: net: make napi_frag_unref reuse skb_page_unref

The implementations of these 2 functions are almost identical. Remove
the implementation of napi_frag_unref, and make it a call into
skb_page_unref so we don't duplicate the implementation.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://lore.kernel.org/r/20240408153000.2152844-2-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 12 +++---------
 net/ipv4/esp4.c        |  2 +-
 net/ipv6/esp6.c        |  2 +-
 3 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7dfb906d92f7..c0ff85bb087a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3522,10 +3522,10 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 bool napi_pp_put_page(struct page *page);
 
 static inline void
-skb_page_unref(const struct sk_buff *skb, struct page *page)
+skb_page_unref(struct page *page, bool recycle)
 {
 #ifdef CONFIG_PAGE_POOL
-	if (skb->pp_recycle && napi_pp_put_page(page))
+	if (recycle && napi_pp_put_page(page))
 		return;
 #endif
 	put_page(page);
@@ -3534,13 +3534,7 @@ skb_page_unref(const struct sk_buff *skb, struct page *page)
 static inline void
 napi_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	struct page *page = skb_frag_page(frag);
-
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_put_page(page))
-		return;
-#endif
-	put_page(page);
+	skb_page_unref(skb_frag_page(frag), recycle);
 }
 
 /**
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 3d647c9a7a21..40330253f076 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -114,7 +114,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg));
+			skb_page_unref(sg_page(sg), skb->pp_recycle);
 }
 
 #ifdef CONFIG_INET_ESPINTCP
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index fe8d53f5a5ee..fb431d0a3475 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -131,7 +131,7 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
 	 */
 	if (req->src != req->dst)
 		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
-			skb_page_unref(skb, sg_page(sg));
+			skb_page_unref(sg_page(sg), skb->pp_recycle);
 }
 
 #ifdef CONFIG_INET6_ESPINTCP
-- 
cgit v1.2.3


From f58f3c9563409e618e591d0d540316286cb0665f Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Mon, 8 Apr 2024 08:29:58 -0700
Subject: net: remove napi_frag_unref

With the changes in the last patches, napi_frag_unref() is now
reduandant. Remove it and use skb_page_unref directly.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://lore.kernel.org/r/20240408153000.2152844-4-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h | 8 +-------
 net/core/skbuff.c      | 2 +-
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c0ff85bb087a..7135a3e94afd 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3531,12 +3531,6 @@ skb_page_unref(struct page *page, bool recycle)
 	put_page(page);
 }
 
-static inline void
-napi_frag_unref(skb_frag_t *frag, bool recycle)
-{
-	skb_page_unref(skb_frag_page(frag), recycle);
-}
-
 /**
  * __skb_frag_unref - release a reference on a paged fragment.
  * @frag: the paged fragment
@@ -3547,7 +3541,7 @@ napi_frag_unref(skb_frag_t *frag, bool recycle)
  */
 static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 {
-	napi_frag_unref(frag, recycle);
+	skb_page_unref(skb_frag_page(frag), recycle);
 }
 
 /**
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 21cd01641f4c..888874ef8566 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1101,7 +1101,7 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
 	}
 
 	for (i = 0; i < shinfo->nr_frags; i++)
-		napi_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+		__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 
 free_head:
 	if (shinfo->frag_list)
-- 
cgit v1.2.3


From a88d970c8bb55dc20f319ce3e33c7ce0e9647480 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 17 Mar 2024 14:44:38 -0700
Subject: lib: Add one-byte emulation function

Architectures are required to provide four-byte cmpxchg() and 64-bit
architectures are additionally required to provide eight-byte cmpxchg().
However, there are cases where one-byte cmpxchg() would be extremely
useful.  Therefore, provide cmpxchg_emu_u8() that emulates one-byte
cmpxchg() in terms of four-byte cmpxchg().

Note that this emulations is fully ordered, and can (for example) cause
one-byte cmpxchg_relaxed() to incur the overhead of full ordering.
If this causes problems for a given architecture, that architecture is
free to provide its own lighter-weight primitives.

[ paulmck: Apply Marco Elver feedback. ]
[ paulmck: Apply kernel test robot feedback. ]
[ paulmck: Drop two-byte support per Arnd Bergmann feedback. ]

Link: https://lore.kernel.org/all/0733eb10-5e7a-4450-9b8a-527b97c842ff@paulmck-laptop/

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Marco Elver <elver@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: <linux-arch@vger.kernel.org>
---
 arch/Kconfig                |  3 +++
 include/linux/cmpxchg-emu.h | 15 +++++++++++++++
 lib/Makefile                |  1 +
 lib/cmpxchg-emu.c           | 45 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 64 insertions(+)
 create mode 100644 include/linux/cmpxchg-emu.h
 create mode 100644 lib/cmpxchg-emu.c

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..284663392eef 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1609,4 +1609,7 @@ config CC_HAS_SANE_FUNCTION_ALIGNMENT
 	# strict alignment always, even with -falign-functions.
 	def_bool CC_HAS_MIN_FUNCTION_ALIGNMENT || CC_IS_CLANG
 
+config ARCH_NEED_CMPXCHG_1_EMU
+	bool
+
 endmenu
diff --git a/include/linux/cmpxchg-emu.h b/include/linux/cmpxchg-emu.h
new file mode 100644
index 000000000000..998deec67740
--- /dev/null
+++ b/include/linux/cmpxchg-emu.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Emulated 1-byte and 2-byte cmpxchg operations for architectures
+ * lacking direct support for these sizes.  These are implemented in terms
+ * of 4-byte cmpxchg operations.
+ *
+ * Copyright (C) 2024 Paul E. McKenney.
+ */
+
+#ifndef __LINUX_CMPXCHG_EMU_H
+#define __LINUX_CMPXCHG_EMU_H
+
+uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new);
+
+#endif /* __LINUX_CMPXCHG_EMU_H */
diff --git a/lib/Makefile b/lib/Makefile
index ffc6b2341b45..cc3d52fdb477 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -236,6 +236,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
+obj-$(CONFIG_ARCH_NEED_CMPXCHG_1_EMU) += cmpxchg-emu.o
 
 obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o
 #ensure exported functions have prototypes
diff --git a/lib/cmpxchg-emu.c b/lib/cmpxchg-emu.c
new file mode 100644
index 000000000000..27f6f97cb60d
--- /dev/null
+++ b/lib/cmpxchg-emu.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Emulated 1-byte cmpxchg operation for architectures lacking direct
+ * support for this size.  This is implemented in terms of 4-byte cmpxchg
+ * operations.
+ *
+ * Copyright (C) 2024 Paul E. McKenney.
+ */
+
+#include <linux/types.h>
+#include <linux/export.h>
+#include <linux/instrumented.h>
+#include <linux/atomic.h>
+#include <linux/panic.h>
+#include <linux/bug.h>
+#include <asm-generic/rwonce.h>
+#include <linux/cmpxchg-emu.h>
+
+union u8_32 {
+	u8 b[4];
+	u32 w;
+};
+
+/* Emulate one-byte cmpxchg() in terms of 4-byte cmpxchg. */
+uintptr_t cmpxchg_emu_u8(volatile u8 *p, uintptr_t old, uintptr_t new)
+{
+	u32 *p32 = (u32 *)(((uintptr_t)p) & ~0x3);
+	int i = ((uintptr_t)p) & 0x3;
+	union u8_32 old32;
+	union u8_32 new32;
+	u32 ret;
+
+	ret = READ_ONCE(*p32);
+	do {
+		old32.w = ret;
+		if (old32.b[i] != old)
+			return old32.b[i];
+		new32.w = old32.w;
+		new32.b[i] = new;
+		instrument_atomic_read_write(p, 1);
+		ret = data_race(cmpxchg(p32, old32.w, new32.w)); // Overridden above.
+	} while (ret != old32.w);
+	return old;
+}
+EXPORT_SYMBOL_GPL(cmpxchg_emu_u8);
-- 
cgit v1.2.3


From 575bc7b477e3f6c505f49c3d99d7be965594d640 Mon Sep 17 00:00:00 2001
From: Sascha Hauer <s.hauer@pengutronix.de>
Date: Fri, 5 Apr 2024 09:38:36 +0200
Subject: dt-bindings: clock: rockchip: add USB480M_PHY mux

The USB480M clock can source from a MUX that selects the clock to come
from either of the USB-phy internal 480MHz PLLs. These clocks are
provided by the USB phy driver. This adds the define for it.

Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240405-clk-rk3568-usb480m-phy-mux-v1-1-6c89de20a6ff@pengutronix.de
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 include/dt-bindings/clock/rk3568-cru.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/rk3568-cru.h b/include/dt-bindings/clock/rk3568-cru.h
index d29890865150..5263085c5b23 100644
--- a/include/dt-bindings/clock/rk3568-cru.h
+++ b/include/dt-bindings/clock/rk3568-cru.h
@@ -78,6 +78,7 @@
 #define CPLL_333M		9
 #define ARMCLK			10
 #define USB480M			11
+#define USB480M_PHY		12
 #define ACLK_CORE_NIU2BUS	18
 #define CLK_CORE_PVTM		19
 #define CLK_CORE_PVTM_CORE	20
-- 
cgit v1.2.3


From ca151fd56b5736a7adbdba5675b9d87d70f20b23 Mon Sep 17 00:00:00 2001
From: Shreeya Patel <shreeya.patel@collabora.com>
Date: Thu, 28 Mar 2024 04:20:52 +0530
Subject: dt-bindings: reset: Define reset id used for HDMI Receiver

Add reset id used for HDMI Receiver in RK3588 SoCs

Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Shreeya Patel <shreeya.patel@collabora.com>
Link: https://lore.kernel.org/r/20240327225057.672304-2-shreeya.patel@collabora.com
Signed-off-by: Heiko Stuebner <heiko@sntech.de>
---
 include/dt-bindings/reset/rockchip,rk3588-cru.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/reset/rockchip,rk3588-cru.h b/include/dt-bindings/reset/rockchip,rk3588-cru.h
index d4264db2a07f..e2fe4bd5f7f0 100644
--- a/include/dt-bindings/reset/rockchip,rk3588-cru.h
+++ b/include/dt-bindings/reset/rockchip,rk3588-cru.h
@@ -751,4 +751,6 @@
 #define SRST_P_TRNG_CHK			658
 #define SRST_TRNG_S			659
 
+#define SRST_A_HDMIRX_BIU		660
+
 #endif
-- 
cgit v1.2.3


From f5d65f40bdd0b2ebd541c580efde975ee3db288c Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Mon, 8 Apr 2024 13:42:30 +0300
Subject: drm: remove unused header gma_drm.h

gma_drm.h has become an empty, unused header. Remove.

Cc: Patrik Jakobsson <patrik.r.jakobsson@gmail.com>
Reviewed-by: Javier Martinez Canillas <javierm@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240408104230.3191827-1-jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 include/drm/gma_drm.h | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 include/drm/gma_drm.h

(limited to 'include')

diff --git a/include/drm/gma_drm.h b/include/drm/gma_drm.h
deleted file mode 100644
index 228f43e8df89..000000000000
--- a/include/drm/gma_drm.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/**************************************************************************
- * Copyright (c) 2007-2011, Intel Corporation.
- * All Rights Reserved.
- * Copyright (c) 2008, Tungsten Graphics Inc.  Cedar Park, TX., USA.
- * All Rights Reserved.
- *
- **************************************************************************/
-
-#ifndef _GMA_DRM_H_
-#define _GMA_DRM_H_
-
-#endif
-- 
cgit v1.2.3


From 7cb8a625355d22c8e50726fc2ca663544eb8631f Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 22 Nov 2017 17:30:45 +0200
Subject: media: mc: Add nop implementations of media_device_{init,cleanup}

To support compilation with Media controller disabled, drivers were
required to conditionally call media_device_init and media_device_cleanup.
Add nop implementations of both so drivers don't need to care (or at least
care less).

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/media/media-device.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/media/media-device.h b/include/media/media-device.h
index 2c146d0b2b1c..53d2a16a70b0 100644
--- a/include/media/media-device.h
+++ b/include/media/media-device.h
@@ -429,6 +429,9 @@ void __media_device_usb_init(struct media_device *mdev,
 			     const char *driver_name);
 
 #else
+static inline void media_device_init(struct media_device *mdev)
+{
+}
 static inline int media_device_register(struct media_device *mdev)
 {
 	return 0;
@@ -436,6 +439,9 @@ static inline int media_device_register(struct media_device *mdev)
 static inline void media_device_unregister(struct media_device *mdev)
 {
 }
+static inline void media_device_cleanup(struct media_device *mdev)
+{
+}
 static inline int media_device_register_entity(struct media_device *mdev,
 						struct media_entity *entity)
 {
-- 
cgit v1.2.3


From 5599f80196612efde96dbe6ef18f6ecc0cb4ba19 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Fri, 15 Mar 2024 15:44:39 +0100
Subject: HID: bpf: export hid_hw_output_report as a BPF kfunc

We currently only export hid_hw_raw_request() as a BPF kfunc.
However, some devices require an explicit write on the Output Report
instead of the use of the control channel.

So also export hid_hw_output_report to BPF

Link: https://lore.kernel.org/r/20240315-b4-hid-bpf-new-funcs-v4-2-079c282469d3@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 Documentation/hid/hid-bpf.rst      |   2 +-
 drivers/hid/bpf/hid_bpf_dispatch.c | 112 +++++++++++++++++++++++++++----------
 drivers/hid/hid-core.c             |   1 +
 include/linux/hid_bpf.h            |   1 +
 4 files changed, 86 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/Documentation/hid/hid-bpf.rst b/Documentation/hid/hid-bpf.rst
index 4fad83a6ebc3..a575004d9025 100644
--- a/Documentation/hid/hid-bpf.rst
+++ b/Documentation/hid/hid-bpf.rst
@@ -179,7 +179,7 @@ Available API that can be used in syscall HID-BPF programs:
 -----------------------------------------------------------
 
 .. kernel-doc:: drivers/hid/bpf/hid_bpf_dispatch.c
-   :functions: hid_bpf_attach_prog hid_bpf_hw_request hid_bpf_allocate_context hid_bpf_release_context
+   :functions: hid_bpf_attach_prog hid_bpf_hw_request hid_bpf_hw_output_report hid_bpf_allocate_context hid_bpf_release_context
 
 General overview of a HID-BPF program
 =====================================
diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index 52abb27426f4..0ccd7cdd29d8 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -376,6 +376,46 @@ hid_bpf_release_context(struct hid_bpf_ctx *ctx)
 	put_device(&hid->dev);
 }
 
+static int
+__hid_bpf_hw_check_params(struct hid_bpf_ctx *ctx, __u8 *buf, size_t *buf__sz,
+			  enum hid_report_type rtype)
+{
+	struct hid_report_enum *report_enum;
+	struct hid_report *report;
+	struct hid_device *hdev;
+	u32 report_len;
+
+	/* check arguments */
+	if (!ctx || !hid_bpf_ops || !buf)
+		return -EINVAL;
+
+	switch (rtype) {
+	case HID_INPUT_REPORT:
+	case HID_OUTPUT_REPORT:
+	case HID_FEATURE_REPORT:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (*buf__sz < 1)
+		return -EINVAL;
+
+	hdev = (struct hid_device *)ctx->hid; /* discard const */
+
+	report_enum = hdev->report_enum + rtype;
+	report = hid_bpf_ops->hid_get_report(report_enum, buf);
+	if (!report)
+		return -EINVAL;
+
+	report_len = hid_report_len(report);
+
+	if (*buf__sz > report_len)
+		*buf__sz = report_len;
+
+	return 0;
+}
+
 /**
  * hid_bpf_hw_request - Communicate with a HID device
  *
@@ -392,24 +432,14 @@ hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
 		   enum hid_report_type rtype, enum hid_class_request reqtype)
 {
 	struct hid_device *hdev;
-	struct hid_report *report;
-	struct hid_report_enum *report_enum;
+	size_t size = buf__sz;
 	u8 *dma_data;
-	u32 report_len;
 	int ret;
 
 	/* check arguments */
-	if (!ctx || !hid_bpf_ops || !buf)
-		return -EINVAL;
-
-	switch (rtype) {
-	case HID_INPUT_REPORT:
-	case HID_OUTPUT_REPORT:
-	case HID_FEATURE_REPORT:
-		break;
-	default:
-		return -EINVAL;
-	}
+	ret = __hid_bpf_hw_check_params(ctx, buf, &size, rtype);
+	if (ret)
+		return ret;
 
 	switch (reqtype) {
 	case HID_REQ_GET_REPORT:
@@ -423,29 +453,16 @@ hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
 		return -EINVAL;
 	}
 
-	if (buf__sz < 1)
-		return -EINVAL;
-
 	hdev = (struct hid_device *)ctx->hid; /* discard const */
 
-	report_enum = hdev->report_enum + rtype;
-	report = hid_bpf_ops->hid_get_report(report_enum, buf);
-	if (!report)
-		return -EINVAL;
-
-	report_len = hid_report_len(report);
-
-	if (buf__sz > report_len)
-		buf__sz = report_len;
-
-	dma_data = kmemdup(buf, buf__sz, GFP_KERNEL);
+	dma_data = kmemdup(buf, size, GFP_KERNEL);
 	if (!dma_data)
 		return -ENOMEM;
 
 	ret = hid_bpf_ops->hid_hw_raw_request(hdev,
 					      dma_data[0],
 					      dma_data,
-					      buf__sz,
+					      size,
 					      rtype,
 					      reqtype);
 
@@ -455,6 +472,42 @@ hid_bpf_hw_request(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz,
 	kfree(dma_data);
 	return ret;
 }
+
+/**
+ * hid_bpf_hw_output_report - Send an output report to a HID device
+ *
+ * @ctx: the HID-BPF context previously allocated in hid_bpf_allocate_context()
+ * @buf: a %PTR_TO_MEM buffer
+ * @buf__sz: the size of the data to transfer
+ *
+ * Returns the number of bytes transferred on success, a negative error code otherwise.
+ */
+__bpf_kfunc int
+hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz)
+{
+	struct hid_device *hdev;
+	size_t size = buf__sz;
+	u8 *dma_data;
+	int ret;
+
+	/* check arguments */
+	ret = __hid_bpf_hw_check_params(ctx, buf, &size, HID_OUTPUT_REPORT);
+	if (ret)
+		return ret;
+
+	hdev = (struct hid_device *)ctx->hid; /* discard const */
+
+	dma_data = kmemdup(buf, size, GFP_KERNEL);
+	if (!dma_data)
+		return -ENOMEM;
+
+	ret = hid_bpf_ops->hid_hw_output_report(hdev,
+						dma_data,
+						size);
+
+	kfree(dma_data);
+	return ret;
+}
 __bpf_kfunc_end_defs();
 
 /*
@@ -488,6 +541,7 @@ BTF_ID_FLAGS(func, hid_bpf_attach_prog)
 BTF_ID_FLAGS(func, hid_bpf_allocate_context, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, hid_bpf_release_context, KF_RELEASE)
 BTF_ID_FLAGS(func, hid_bpf_hw_request)
+BTF_ID_FLAGS(func, hid_bpf_hw_output_report)
 BTF_KFUNCS_END(hid_bpf_syscall_kfunc_ids)
 
 static const struct btf_kfunc_id_set hid_bpf_syscall_kfunc_set = {
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index de7a477d6665..1243595890ba 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -2974,6 +2974,7 @@ EXPORT_SYMBOL_GPL(hid_check_keys_pressed);
 static struct hid_bpf_ops hid_ops = {
 	.hid_get_report = hid_get_report,
 	.hid_hw_raw_request = hid_hw_raw_request,
+	.hid_hw_output_report = hid_hw_output_report,
 	.owner = THIS_MODULE,
 	.bus_type = &hid_bus_type,
 };
diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 7118ac28d468..5c7ff93dc73e 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -103,6 +103,7 @@ struct hid_bpf_ops {
 				  unsigned char reportnum, __u8 *buf,
 				  size_t len, enum hid_report_type rtype,
 				  enum hid_class_request reqtype);
+	int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len);
 	struct module *owner;
 	const struct bus_type *bus_type;
 };
-- 
cgit v1.2.3


From 9be50ac30a83896a753ab9f64e941763bb7900be Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Fri, 15 Mar 2024 15:44:42 +0100
Subject: HID: bpf: allow to inject HID event from BPF

It can be interesting to inject events from BPF as if the event were
to come from the device.
For example, some multitouch devices do not all the time send a proximity
out event, and we might want to send it for the physical device.

Compared to uhid, we can now inject events on any physical device, not
just uhid virtual ones.

Link: https://lore.kernel.org/r/20240315-b4-hid-bpf-new-funcs-v4-5-079c282469d3@kernel.org
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 Documentation/hid/hid-bpf.rst      |  2 +-
 drivers/hid/bpf/hid_bpf_dispatch.c | 29 +++++++++++++++++++++++++++++
 drivers/hid/hid-core.c             |  1 +
 include/linux/hid_bpf.h            |  2 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/hid/hid-bpf.rst b/Documentation/hid/hid-bpf.rst
index a575004d9025..0765b3298ecf 100644
--- a/Documentation/hid/hid-bpf.rst
+++ b/Documentation/hid/hid-bpf.rst
@@ -179,7 +179,7 @@ Available API that can be used in syscall HID-BPF programs:
 -----------------------------------------------------------
 
 .. kernel-doc:: drivers/hid/bpf/hid_bpf_dispatch.c
-   :functions: hid_bpf_attach_prog hid_bpf_hw_request hid_bpf_hw_output_report hid_bpf_allocate_context hid_bpf_release_context
+   :functions: hid_bpf_attach_prog hid_bpf_hw_request hid_bpf_hw_output_report hid_bpf_input_report hid_bpf_allocate_context hid_bpf_release_context
 
 General overview of a HID-BPF program
 =====================================
diff --git a/drivers/hid/bpf/hid_bpf_dispatch.c b/drivers/hid/bpf/hid_bpf_dispatch.c
index 0ccd7cdd29d8..83e5c73ce995 100644
--- a/drivers/hid/bpf/hid_bpf_dispatch.c
+++ b/drivers/hid/bpf/hid_bpf_dispatch.c
@@ -508,6 +508,34 @@ hid_bpf_hw_output_report(struct hid_bpf_ctx *ctx, __u8 *buf, size_t buf__sz)
 	kfree(dma_data);
 	return ret;
 }
+
+/**
+ * hid_bpf_input_report - Inject a HID report in the kernel from a HID device
+ *
+ * @ctx: the HID-BPF context previously allocated in hid_bpf_allocate_context()
+ * @type: the type of the report (%HID_INPUT_REPORT, %HID_FEATURE_REPORT, %HID_OUTPUT_REPORT)
+ * @buf: a %PTR_TO_MEM buffer
+ * @buf__sz: the size of the data to transfer
+ *
+ * Returns %0 on success, a negative error code otherwise.
+ */
+__bpf_kfunc int
+hid_bpf_input_report(struct hid_bpf_ctx *ctx, enum hid_report_type type, u8 *buf,
+		     const size_t buf__sz)
+{
+	struct hid_device *hdev;
+	size_t size = buf__sz;
+	int ret;
+
+	/* check arguments */
+	ret = __hid_bpf_hw_check_params(ctx, buf, &size, type);
+	if (ret)
+		return ret;
+
+	hdev = (struct hid_device *)ctx->hid; /* discard const */
+
+	return hid_input_report(hdev, type, buf, size, 0);
+}
 __bpf_kfunc_end_defs();
 
 /*
@@ -542,6 +570,7 @@ BTF_ID_FLAGS(func, hid_bpf_allocate_context, KF_ACQUIRE | KF_RET_NULL)
 BTF_ID_FLAGS(func, hid_bpf_release_context, KF_RELEASE)
 BTF_ID_FLAGS(func, hid_bpf_hw_request)
 BTF_ID_FLAGS(func, hid_bpf_hw_output_report)
+BTF_ID_FLAGS(func, hid_bpf_input_report)
 BTF_KFUNCS_END(hid_bpf_syscall_kfunc_ids)
 
 static const struct btf_kfunc_id_set hid_bpf_syscall_kfunc_set = {
diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c
index 1243595890ba..b1fa0378e8f4 100644
--- a/drivers/hid/hid-core.c
+++ b/drivers/hid/hid-core.c
@@ -2975,6 +2975,7 @@ static struct hid_bpf_ops hid_ops = {
 	.hid_get_report = hid_get_report,
 	.hid_hw_raw_request = hid_hw_raw_request,
 	.hid_hw_output_report = hid_hw_output_report,
+	.hid_input_report = hid_input_report,
 	.owner = THIS_MODULE,
 	.bus_type = &hid_bus_type,
 };
diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 5c7ff93dc73e..17b08f500098 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -104,6 +104,8 @@ struct hid_bpf_ops {
 				  size_t len, enum hid_report_type rtype,
 				  enum hid_class_request reqtype);
 	int (*hid_hw_output_report)(struct hid_device *hdev, __u8 *buf, size_t len);
+	int (*hid_input_report)(struct hid_device *hid, enum hid_report_type type,
+				u8 *data, u32 size, int interrupt);
 	struct module *owner;
 	const struct bus_type *bus_type;
 };
-- 
cgit v1.2.3


From a01e7214bef904723d26d293eb17586078610379 Mon Sep 17 00:00:00 2001
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Date: Wed, 27 Mar 2024 14:43:37 +0530
Subject: PCI: endpoint: Remove "core_init_notifier" flag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"core_init_notifier" flag is set by the glue drivers requiring refclk from
the host to complete the DWC core initialization. Also, those drivers will
send a notification to the EPF drivers once the initialization is fully
completed using the pci_epc_init_notify() API. Only then, the EPF drivers
will start functioning.

For the rest of the drivers generating refclk locally, EPF drivers will
start functioning post binding with them. EPF drivers rely on the
'core_init_notifier' flag to differentiate between the drivers.
Unfortunately, this creates two different flows for the EPF drivers.

So to avoid that, let's get rid of the "core_init_notifier" flag and follow
a single initialization flow for the EPF drivers. This is done by calling
the dw_pcie_ep_init_notify() from all glue drivers after the completion of
dw_pcie_ep_init_registers() API. This will allow all the glue drivers to
send the notification to the EPF drivers once the initialization is fully
completed.

Only difference here is that, the drivers requiring refclk from host will
send the notification once refclk is received, while others will send it
during probe time itself.

But this also requires the EPC core driver to deliver the notification
after EPF driver bind. Because, the glue driver can send the notification
before the EPF drivers bind() and in those cases the EPF drivers will miss
the event. To accommodate this, EPC core is now caching the state of the
EPC initialization in 'init_complete' flag and pci-ep-cfs driver sends the
notification to EPF drivers based on that after each EPF driver bind.

Link: https://lore.kernel.org/linux-pci/20240327-pci-dbi-rework-v12-8-082625472414@linaro.org
Tested-by: Niklas Cassel <cassel@kernel.org>
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Reviewed-by: Frank Li <Frank.Li@nxp.com>
Reviewed-by: Niklas Cassel <cassel@kernel.org>
---
 drivers/pci/controller/cadence/pcie-cadence-ep.c  |  2 ++
 drivers/pci/controller/dwc/pci-dra7xx.c           |  2 ++
 drivers/pci/controller/dwc/pci-imx6.c             |  2 ++
 drivers/pci/controller/dwc/pci-keystone.c         |  2 ++
 drivers/pci/controller/dwc/pci-layerscape-ep.c    |  2 ++
 drivers/pci/controller/dwc/pcie-artpec6.c         |  2 ++
 drivers/pci/controller/dwc/pcie-designware-ep.c   |  1 +
 drivers/pci/controller/dwc/pcie-designware-plat.c |  2 ++
 drivers/pci/controller/dwc/pcie-keembay.c         |  2 ++
 drivers/pci/controller/dwc/pcie-qcom-ep.c         |  1 -
 drivers/pci/controller/dwc/pcie-rcar-gen4.c       |  2 ++
 drivers/pci/controller/dwc/pcie-tegra194.c        |  1 -
 drivers/pci/controller/dwc/pcie-uniphier-ep.c     |  2 ++
 drivers/pci/controller/pcie-rcar-ep.c             |  2 ++
 drivers/pci/controller/pcie-rockchip-ep.c         |  2 ++
 drivers/pci/endpoint/functions/pci-epf-test.c     | 18 +++++-------------
 drivers/pci/endpoint/pci-ep-cfs.c                 |  9 +++++++++
 drivers/pci/endpoint/pci-epc-core.c               | 22 ++++++++++++++++++++++
 include/linux/pci-epc.h                           |  7 ++++---
 19 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/controller/cadence/pcie-cadence-ep.c b/drivers/pci/controller/cadence/pcie-cadence-ep.c
index 81c50dc64da9..55c42ca2b777 100644
--- a/drivers/pci/controller/cadence/pcie-cadence-ep.c
+++ b/drivers/pci/controller/cadence/pcie-cadence-ep.c
@@ -746,6 +746,8 @@ int cdns_pcie_ep_setup(struct cdns_pcie_ep *ep)
 
 	spin_lock_init(&ep->lock);
 
+	pci_epc_init_notify(epc);
+
 	return 0;
 
  free_epc_mem:
diff --git a/drivers/pci/controller/dwc/pci-dra7xx.c b/drivers/pci/controller/dwc/pci-dra7xx.c
index 395042b29ffc..d2d17d37d3e0 100644
--- a/drivers/pci/controller/dwc/pci-dra7xx.c
+++ b/drivers/pci/controller/dwc/pci-dra7xx.c
@@ -474,6 +474,8 @@ static int dra7xx_add_pcie_ep(struct dra7xx_pcie *dra7xx,
 		return ret;
 	}
 
+	dw_pcie_ep_init_notify(ep);
+
 	return 0;
 }
 
diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c
index 8d28ecc381bc..917c69edee1d 100644
--- a/drivers/pci/controller/dwc/pci-imx6.c
+++ b/drivers/pci/controller/dwc/pci-imx6.c
@@ -1131,6 +1131,8 @@ static int imx6_add_pcie_ep(struct imx6_pcie *imx6_pcie,
 		return ret;
 	}
 
+	dw_pcie_ep_init_notify(ep);
+
 	/* Start LTSSM. */
 	imx6_pcie_ltssm_enable(dev);
 
diff --git a/drivers/pci/controller/dwc/pci-keystone.c b/drivers/pci/controller/dwc/pci-keystone.c
index 81ebac520650..d3a7d14ee685 100644
--- a/drivers/pci/controller/dwc/pci-keystone.c
+++ b/drivers/pci/controller/dwc/pci-keystone.c
@@ -1293,6 +1293,8 @@ static int ks_pcie_probe(struct platform_device *pdev)
 			goto err_ep_init;
 		}
 
+		dw_pcie_ep_init_notify(&pci->ep);
+
 		break;
 	default:
 		dev_err(dev, "INVALID device type %d\n", mode);
diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index 9eb2233e3d7f..7dde6d5fa4d8 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -286,6 +286,8 @@ static int __init ls_pcie_ep_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	dw_pcie_ep_init_notify(&pci->ep);
+
 	return ls_pcie_ep_interrupt_init(pcie, pdev);
 }
 
diff --git a/drivers/pci/controller/dwc/pcie-artpec6.c b/drivers/pci/controller/dwc/pcie-artpec6.c
index a6095561db4a..a4630b92489b 100644
--- a/drivers/pci/controller/dwc/pcie-artpec6.c
+++ b/drivers/pci/controller/dwc/pcie-artpec6.c
@@ -452,6 +452,8 @@ static int artpec6_pcie_probe(struct platform_device *pdev)
 			return ret;
 		}
 
+		dw_pcie_ep_init_notify(&pci->ep);
+
 		break;
 	default:
 		dev_err(dev, "INVALID device type %d\n", artpec6_pcie->mode);
diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c
index 2063cf2049e5..47391d7d3a73 100644
--- a/drivers/pci/controller/dwc/pcie-designware-ep.c
+++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
@@ -632,6 +632,7 @@ void dw_pcie_ep_cleanup(struct dw_pcie_ep *ep)
 	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
 
 	dw_pcie_edma_remove(pci);
+	ep->epc->init_complete = false;
 }
 EXPORT_SYMBOL_GPL(dw_pcie_ep_cleanup);
 
diff --git a/drivers/pci/controller/dwc/pcie-designware-plat.c b/drivers/pci/controller/dwc/pcie-designware-plat.c
index ca9b22e654cd..8490c5d6ff9f 100644
--- a/drivers/pci/controller/dwc/pcie-designware-plat.c
+++ b/drivers/pci/controller/dwc/pcie-designware-plat.c
@@ -154,6 +154,8 @@ static int dw_plat_pcie_probe(struct platform_device *pdev)
 			dw_pcie_ep_deinit(&pci->ep);
 		}
 
+		dw_pcie_ep_init_notify(&pci->ep);
+
 		break;
 	default:
 		dev_err(dev, "INVALID device type %d\n", dw_plat_pcie->mode);
diff --git a/drivers/pci/controller/dwc/pcie-keembay.c b/drivers/pci/controller/dwc/pcie-keembay.c
index b2556dbcffb5..98bbc83182b4 100644
--- a/drivers/pci/controller/dwc/pcie-keembay.c
+++ b/drivers/pci/controller/dwc/pcie-keembay.c
@@ -442,6 +442,8 @@ static int keembay_pcie_probe(struct platform_device *pdev)
 			return ret;
 		}
 
+		dw_pcie_ep_init_notify(&pci->ep);
+
 		break;
 	default:
 		dev_err(dev, "Invalid device type %d\n", pcie->mode);
diff --git a/drivers/pci/controller/dwc/pcie-qcom-ep.c b/drivers/pci/controller/dwc/pcie-qcom-ep.c
index 3697b4a944cc..2fb8c15e7a91 100644
--- a/drivers/pci/controller/dwc/pcie-qcom-ep.c
+++ b/drivers/pci/controller/dwc/pcie-qcom-ep.c
@@ -775,7 +775,6 @@ static void qcom_pcie_ep_init_debugfs(struct qcom_pcie_ep *pcie_ep)
 
 static const struct pci_epc_features qcom_pcie_epc_features = {
 	.linkup_notifier = true,
-	.core_init_notifier = true,
 	.msi_capable = true,
 	.msix_capable = false,
 	.align = SZ_4K,
diff --git a/drivers/pci/controller/dwc/pcie-rcar-gen4.c b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
index e155a905fb4f..cfeccc2f9ee1 100644
--- a/drivers/pci/controller/dwc/pcie-rcar-gen4.c
+++ b/drivers/pci/controller/dwc/pcie-rcar-gen4.c
@@ -437,6 +437,8 @@ static int rcar_gen4_add_dw_pcie_ep(struct rcar_gen4_pcie *rcar)
 		rcar_gen4_pcie_ep_deinit(rcar);
 	}
 
+	dw_pcie_ep_init_notify(ep);
+
 	return ret;
 }
 
diff --git a/drivers/pci/controller/dwc/pcie-tegra194.c b/drivers/pci/controller/dwc/pcie-tegra194.c
index db043f579fbe..ddc23602eca7 100644
--- a/drivers/pci/controller/dwc/pcie-tegra194.c
+++ b/drivers/pci/controller/dwc/pcie-tegra194.c
@@ -2006,7 +2006,6 @@ static int tegra_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
 
 static const struct pci_epc_features tegra_pcie_epc_features = {
 	.linkup_notifier = true,
-	.core_init_notifier = true,
 	.msi_capable = false,
 	.msix_capable = false,
 	.bar[BAR_0] = { .type = BAR_FIXED, .fixed_size = SZ_1M,
diff --git a/drivers/pci/controller/dwc/pcie-uniphier-ep.c b/drivers/pci/controller/dwc/pcie-uniphier-ep.c
index 0e5e7344de48..a2b844268e28 100644
--- a/drivers/pci/controller/dwc/pcie-uniphier-ep.c
+++ b/drivers/pci/controller/dwc/pcie-uniphier-ep.c
@@ -410,6 +410,8 @@ static int uniphier_pcie_ep_probe(struct platform_device *pdev)
 		return ret;
 	}
 
+	dw_pcie_ep_init_notify(&priv->pci.ep);
+
 	return 0;
 }
 
diff --git a/drivers/pci/controller/pcie-rcar-ep.c b/drivers/pci/controller/pcie-rcar-ep.c
index 05967c6c0b42..047e2cef5afc 100644
--- a/drivers/pci/controller/pcie-rcar-ep.c
+++ b/drivers/pci/controller/pcie-rcar-ep.c
@@ -542,6 +542,8 @@ static int rcar_pcie_ep_probe(struct platform_device *pdev)
 		goto err_pm_put;
 	}
 
+	pci_epc_init_notify(epc);
+
 	return 0;
 
 err_pm_put:
diff --git a/drivers/pci/controller/pcie-rockchip-ep.c b/drivers/pci/controller/pcie-rockchip-ep.c
index c9046e97a1d2..8613df8184df 100644
--- a/drivers/pci/controller/pcie-rockchip-ep.c
+++ b/drivers/pci/controller/pcie-rockchip-ep.c
@@ -609,6 +609,8 @@ static int rockchip_pcie_ep_probe(struct platform_device *pdev)
 	rockchip_pcie_write(rockchip, PCIE_CLIENT_CONF_ENABLE,
 			    PCIE_CLIENT_CONFIG);
 
+	pci_epc_init_notify(epc);
+
 	return 0;
 err_epc_mem_exit:
 	pci_epc_mem_exit(epc);
diff --git a/drivers/pci/endpoint/functions/pci-epf-test.c b/drivers/pci/endpoint/functions/pci-epf-test.c
index cd4ffb39dcdc..212fc303fb63 100644
--- a/drivers/pci/endpoint/functions/pci-epf-test.c
+++ b/drivers/pci/endpoint/functions/pci-epf-test.c
@@ -753,6 +753,7 @@ static int pci_epf_test_core_init(struct pci_epf *epf)
 	const struct pci_epc_features *epc_features;
 	struct pci_epc *epc = epf->epc;
 	struct device *dev = &epf->dev;
+	bool linkup_notifier = false;
 	bool msix_capable = false;
 	bool msi_capable = true;
 	int ret;
@@ -795,6 +796,10 @@ static int pci_epf_test_core_init(struct pci_epf *epf)
 		}
 	}
 
+	linkup_notifier = epc_features->linkup_notifier;
+	if (!linkup_notifier)
+		queue_work(kpcitest_workqueue, &epf_test->cmd_handler.work);
+
 	return 0;
 }
 
@@ -890,8 +895,6 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 	const struct pci_epc_features *epc_features;
 	enum pci_barno test_reg_bar = BAR_0;
 	struct pci_epc *epc = epf->epc;
-	bool linkup_notifier = false;
-	bool core_init_notifier = false;
 
 	if (WARN_ON_ONCE(!epc))
 		return -EINVAL;
@@ -902,8 +905,6 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 		return -EOPNOTSUPP;
 	}
 
-	linkup_notifier = epc_features->linkup_notifier;
-	core_init_notifier = epc_features->core_init_notifier;
 	test_reg_bar = pci_epc_get_first_free_bar(epc_features);
 	if (test_reg_bar < 0)
 		return -EINVAL;
@@ -916,21 +917,12 @@ static int pci_epf_test_bind(struct pci_epf *epf)
 	if (ret)
 		return ret;
 
-	if (!core_init_notifier) {
-		ret = pci_epf_test_core_init(epf);
-		if (ret)
-			return ret;
-	}
-
 	epf_test->dma_supported = true;
 
 	ret = pci_epf_test_init_dma_chan(epf_test);
 	if (ret)
 		epf_test->dma_supported = false;
 
-	if (!linkup_notifier && !core_init_notifier)
-		queue_work(kpcitest_workqueue, &epf_test->cmd_handler.work);
-
 	return 0;
 }
 
diff --git a/drivers/pci/endpoint/pci-ep-cfs.c b/drivers/pci/endpoint/pci-ep-cfs.c
index 0ea64e24ed61..3b21e28f9b59 100644
--- a/drivers/pci/endpoint/pci-ep-cfs.c
+++ b/drivers/pci/endpoint/pci-ep-cfs.c
@@ -64,6 +64,9 @@ static int pci_secondary_epc_epf_link(struct config_item *epf_item,
 		return ret;
 	}
 
+	/* Send any pending EPC initialization complete to the EPF driver */
+	pci_epc_notify_pending_init(epc, epf);
+
 	return 0;
 }
 
@@ -125,6 +128,9 @@ static int pci_primary_epc_epf_link(struct config_item *epf_item,
 		return ret;
 	}
 
+	/* Send any pending EPC initialization complete to the EPF driver */
+	pci_epc_notify_pending_init(epc, epf);
+
 	return 0;
 }
 
@@ -230,6 +236,9 @@ static int pci_epc_epf_link(struct config_item *epc_item,
 		return ret;
 	}
 
+	/* Send any pending EPC initialization complete to the EPF driver */
+	pci_epc_notify_pending_init(epc, epf);
+
 	return 0;
 }
 
diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
index da3fc0795b0b..47d27ec7439d 100644
--- a/drivers/pci/endpoint/pci-epc-core.c
+++ b/drivers/pci/endpoint/pci-epc-core.c
@@ -748,10 +748,32 @@ void pci_epc_init_notify(struct pci_epc *epc)
 			epf->event_ops->core_init(epf);
 		mutex_unlock(&epf->lock);
 	}
+	epc->init_complete = true;
 	mutex_unlock(&epc->list_lock);
 }
 EXPORT_SYMBOL_GPL(pci_epc_init_notify);
 
+/**
+ * pci_epc_notify_pending_init() - Notify the pending EPC device initialization
+ *                                 complete to the EPF device
+ * @epc: the EPC device whose core initialization is pending to be notified
+ * @epf: the EPF device to be notified
+ *
+ * Invoke to notify the pending EPC device initialization complete to the EPF
+ * device. This is used to deliver the notification if the EPC initialization
+ * got completed before the EPF driver bind.
+ */
+void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf)
+{
+	if (epc->init_complete) {
+		mutex_lock(&epf->lock);
+		if (epf->event_ops && epf->event_ops->core_init)
+			epf->event_ops->core_init(epf);
+		mutex_unlock(&epf->lock);
+	}
+}
+EXPORT_SYMBOL_GPL(pci_epc_notify_pending_init);
+
 /**
  * pci_epc_bme_notify() - Notify the EPF device that the EPC device has received
  *			  the BME event from the Root complex
diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
index cc2f70d061c8..acc5f96161fe 100644
--- a/include/linux/pci-epc.h
+++ b/include/linux/pci-epc.h
@@ -128,6 +128,8 @@ struct pci_epc_mem {
  * @group: configfs group representing the PCI EPC device
  * @lock: mutex to protect pci_epc ops
  * @function_num_map: bitmap to manage physical function number
+ * @init_complete: flag to indicate whether the EPC initialization is complete
+ *                 or not
  */
 struct pci_epc {
 	struct device			dev;
@@ -143,6 +145,7 @@ struct pci_epc {
 	/* mutex to protect against concurrent access of EP controller */
 	struct mutex			lock;
 	unsigned long			function_num_map;
+	bool				init_complete;
 };
 
 /**
@@ -179,8 +182,6 @@ struct pci_epc_bar_desc {
 /**
  * struct pci_epc_features - features supported by a EPC device per function
  * @linkup_notifier: indicate if the EPC device can notify EPF driver on link up
- * @core_init_notifier: indicate cores that can notify about their availability
- *			for initialization
  * @msi_capable: indicate if the endpoint function has MSI capability
  * @msix_capable: indicate if the endpoint function has MSI-X capability
  * @bar: array specifying the hardware description for each BAR
@@ -188,7 +189,6 @@ struct pci_epc_bar_desc {
  */
 struct pci_epc_features {
 	unsigned int	linkup_notifier : 1;
-	unsigned int	core_init_notifier : 1;
 	unsigned int	msi_capable : 1;
 	unsigned int	msix_capable : 1;
 	struct	pci_epc_bar_desc bar[PCI_STD_NUM_BARS];
@@ -225,6 +225,7 @@ int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf,
 void pci_epc_linkup(struct pci_epc *epc);
 void pci_epc_linkdown(struct pci_epc *epc);
 void pci_epc_init_notify(struct pci_epc *epc);
+void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf);
 void pci_epc_bme_notify(struct pci_epc *epc);
 void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf,
 			enum pci_epc_interface_type type);
-- 
cgit v1.2.3


From 65f35aa76c0e21b0243fd734e513fd2263f22a18 Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Tue, 9 Apr 2024 16:25:16 -0700
Subject: ethtool: update tsinfo statistics attribute docs with correct type

nla_put_uint can either write a u32 or u64 netlink attribute value. The
size depends on whether the value can be represented with a u32 or requires
a u64. Use a uint annotation in various documentation to represent this.

Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Link: https://lore.kernel.org/r/20240409232520.237613-2-rrameshbabu@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ethtool-netlink.rst | 6 +++---
 include/uapi/linux/ethtool_netlink.h         | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 5dc42f7ce429..4e63d3708ed9 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -1254,9 +1254,9 @@ would be empty (no bit set).
 Additional hardware timestamping statistics response contents:
 
   =====================================  ======  ===================================
-  ``ETHTOOL_A_TS_STAT_TX_PKTS``          u64     Packets with Tx HW timestamps
-  ``ETHTOOL_A_TS_STAT_TX_LOST``          u64     Tx HW timestamp not arrived count
-  ``ETHTOOL_A_TS_STAT_TX_ERR``           u64     HW error request Tx timestamp count
+  ``ETHTOOL_A_TS_STAT_TX_PKTS``          uint    Packets with Tx HW timestamps
+  ``ETHTOOL_A_TS_STAT_TX_LOST``          uint    Tx HW timestamp not arrived count
+  ``ETHTOOL_A_TS_STAT_TX_ERR``           uint    HW error request Tx timestamp count
   =====================================  ======  ===================================
 
 CABLE_TEST
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 23e225f00fb0..b4f0d233d048 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -488,9 +488,9 @@ enum {
 enum {
 	ETHTOOL_A_TS_STAT_UNSPEC,
 
-	ETHTOOL_A_TS_STAT_TX_PKTS,			/* u64 */
-	ETHTOOL_A_TS_STAT_TX_LOST,			/* u64 */
-	ETHTOOL_A_TS_STAT_TX_ERR,			/* u64 */
+	ETHTOOL_A_TS_STAT_TX_PKTS,			/* uint */
+	ETHTOOL_A_TS_STAT_TX_LOST,			/* uint */
+	ETHTOOL_A_TS_STAT_TX_ERR,			/* uint */
 
 	/* add new constants above here */
 	__ETHTOOL_A_TS_STAT_CNT,
-- 
cgit v1.2.3


From 699c23f02c65cbfc3e638f14ce0d70c23a2e1f02 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yonghong.song@linux.dev>
Date: Tue, 9 Apr 2024 21:35:27 -0700
Subject: bpf: Add bpf_link support for sk_msg and sk_skb progs

Add bpf_link support for sk_msg and sk_skb programs. We have an
internal request to support bpf_link for sk_msg programs so user
space can have a uniform handling with bpf_link based libbpf
APIs. Using bpf_link based libbpf API also has a benefit which
makes system robust by decoupling prog life cycle and
attachment life cycle.

Reviewed-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240410043527.3737160-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h            |   6 +
 include/linux/skmsg.h          |   4 +
 include/uapi/linux/bpf.h       |   5 +
 kernel/bpf/syscall.c           |   4 +
 net/core/sock_map.c            | 263 ++++++++++++++++++++++++++++++++++++++---
 tools/include/uapi/linux/bpf.h |   5 +
 6 files changed, 271 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 62762390c93d..5034c1b4ded7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2996,6 +2996,7 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr);
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
 
 void sock_map_unhash(struct sock *sk);
 void sock_map_destroy(struct sock *sk);
@@ -3094,6 +3095,11 @@ static inline int sock_map_bpf_prog_query(const union bpf_attr *attr,
 {
 	return -EINVAL;
 }
+
+static inline int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e65ec3fd2799..9c8dd4c01412 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -58,6 +58,10 @@ struct sk_psock_progs {
 	struct bpf_prog			*stream_parser;
 	struct bpf_prog			*stream_verdict;
 	struct bpf_prog			*skb_verdict;
+	struct bpf_link			*msg_parser_link;
+	struct bpf_link			*stream_parser_link;
+	struct bpf_link			*stream_verdict_link;
+	struct bpf_link			*skb_verdict_link;
 };
 
 enum sk_psock_state_bits {
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6fe9f11c8abe..cee0a7915c08 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1135,6 +1135,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -6724,6 +6725,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e44c276e8617..7d392ec83655 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -5213,6 +5213,10 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_SK_LOOKUP:
 		ret = netns_bpf_link_create(attr, prog);
 		break;
+	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_SK_SKB:
+		ret = sock_map_link_create(attr, prog);
+		break;
 #ifdef CONFIG_NET
 	case BPF_PROG_TYPE_XDP:
 		ret = bpf_xdp_link_attach(attr, prog);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 27d733c0f65e..63c016b4c169 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -24,8 +24,16 @@ struct bpf_stab {
 #define SOCK_CREATE_FLAG_MASK				\
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
 
+/* This mutex is used to
+ *  - protect race between prog/link attach/detach and link prog update, and
+ *  - protect race between releasing and accessing map in bpf_link.
+ * A single global mutex lock is used since it is expected contention is low.
+ */
+static DEFINE_MUTEX(sockmap_mutex);
+
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-				struct bpf_prog *old, u32 which);
+				struct bpf_prog *old, struct bpf_link *link,
+				u32 which);
 static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
@@ -71,7 +79,9 @@ int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
 	map = __bpf_map_get(f);
 	if (IS_ERR(map))
 		return PTR_ERR(map);
-	ret = sock_map_prog_update(map, prog, NULL, attr->attach_type);
+	mutex_lock(&sockmap_mutex);
+	ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
+	mutex_unlock(&sockmap_mutex);
 	fdput(f);
 	return ret;
 }
@@ -103,7 +113,9 @@ int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
 		goto put_prog;
 	}
 
-	ret = sock_map_prog_update(map, NULL, prog, attr->attach_type);
+	mutex_lock(&sockmap_mutex);
+	ret = sock_map_prog_update(map, NULL, prog, NULL, attr->attach_type);
+	mutex_unlock(&sockmap_mutex);
 put_prog:
 	bpf_prog_put(prog);
 put_map:
@@ -1454,55 +1466,84 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
 	return NULL;
 }
 
-static int sock_map_prog_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
-				u32 which)
+static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+				     struct bpf_link ***plink, u32 which)
 {
 	struct sk_psock_progs *progs = sock_map_progs(map);
+	struct bpf_prog **cur_pprog;
+	struct bpf_link **cur_plink;
 
 	if (!progs)
 		return -EOPNOTSUPP;
 
 	switch (which) {
 	case BPF_SK_MSG_VERDICT:
-		*pprog = &progs->msg_parser;
+		cur_pprog = &progs->msg_parser;
+		cur_plink = &progs->msg_parser_link;
 		break;
 #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
 	case BPF_SK_SKB_STREAM_PARSER:
-		*pprog = &progs->stream_parser;
+		cur_pprog = &progs->stream_parser;
+		cur_plink = &progs->stream_parser_link;
 		break;
 #endif
 	case BPF_SK_SKB_STREAM_VERDICT:
 		if (progs->skb_verdict)
 			return -EBUSY;
-		*pprog = &progs->stream_verdict;
+		cur_pprog = &progs->stream_verdict;
+		cur_plink = &progs->stream_verdict_link;
 		break;
 	case BPF_SK_SKB_VERDICT:
 		if (progs->stream_verdict)
 			return -EBUSY;
-		*pprog = &progs->skb_verdict;
+		cur_pprog = &progs->skb_verdict;
+		cur_plink = &progs->skb_verdict_link;
 		break;
 	default:
 		return -EOPNOTSUPP;
 	}
 
+	*pprog = cur_pprog;
+	if (plink)
+		*plink = cur_plink;
 	return 0;
 }
 
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
 static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
-				struct bpf_prog *old, u32 which)
+				struct bpf_prog *old, struct bpf_link *link,
+				u32 which)
 {
 	struct bpf_prog **pprog;
+	struct bpf_link **plink;
 	int ret;
 
-	ret = sock_map_prog_lookup(map, &pprog, which);
+	ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);
 	if (ret)
 		return ret;
 
-	if (old)
-		return psock_replace_prog(pprog, prog, old);
+	/* for prog_attach/prog_detach/link_attach, return error if a bpf_link
+	 * exists for that prog.
+	 */
+	if ((!link || prog) && *plink)
+		return -EBUSY;
 
-	psock_set_prog(pprog, prog);
-	return 0;
+	if (old) {
+		ret = psock_replace_prog(pprog, prog, old);
+		if (!ret)
+			*plink = NULL;
+	} else {
+		psock_set_prog(pprog, prog);
+		if (link)
+			*plink = link;
+	}
+
+	return ret;
 }
 
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
@@ -1527,7 +1568,7 @@ int sock_map_bpf_prog_query(const union bpf_attr *attr,
 
 	rcu_read_lock();
 
-	ret = sock_map_prog_lookup(map, &pprog, attr->query.attach_type);
+	ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);
 	if (ret)
 		goto end;
 
@@ -1657,6 +1698,196 @@ void sock_map_close(struct sock *sk, long timeout)
 }
 EXPORT_SYMBOL_GPL(sock_map_close);
 
+struct sockmap_link {
+	struct bpf_link link;
+	struct bpf_map *map;
+	enum bpf_attach_type attach_type;
+};
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+	struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+
+	mutex_lock(&sockmap_mutex);
+	if (!sockmap_link->map)
+		goto out;
+
+	WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+					  sockmap_link->attach_type));
+
+	bpf_map_put_with_uref(sockmap_link->map);
+	sockmap_link->map = NULL;
+out:
+	mutex_unlock(&sockmap_mutex);
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+	sock_map_link_release(link);
+	return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+	kfree(link);
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+				     struct bpf_prog *prog,
+				     struct bpf_prog *old)
+{
+	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+	struct bpf_prog **pprog, *old_link_prog;
+	struct bpf_link **plink;
+	int ret = 0;
+
+	mutex_lock(&sockmap_mutex);
+
+	/* If old prog is not NULL, ensure old prog is the same as link->prog. */
+	if (old && link->prog != old) {
+		ret = -EPERM;
+		goto out;
+	}
+	/* Ensure link->prog has the same type/attach_type as the new prog. */
+	if (link->prog->type != prog->type ||
+	    link->prog->expected_attach_type != prog->expected_attach_type) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
+					sockmap_link->attach_type);
+	if (ret)
+		goto out;
+
+	/* return error if the stored bpf_link does not match the incoming bpf_link. */
+	if (link != *plink) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (old) {
+		ret = psock_replace_prog(pprog, prog, old);
+		if (ret)
+			goto out;
+	} else {
+		psock_set_prog(pprog, prog);
+	}
+
+	bpf_prog_inc(prog);
+	old_link_prog = xchg(&link->prog, prog);
+	bpf_prog_put(old_link_prog);
+
+out:
+	mutex_unlock(&sockmap_mutex);
+	return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+	u32 map_id = 0;
+
+	mutex_lock(&sockmap_mutex);
+	if (sockmap_link->map)
+		map_id = sockmap_link->map->id;
+	mutex_unlock(&sockmap_mutex);
+	return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+				   struct bpf_link_info *info)
+{
+	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+	u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+	info->sockmap.map_id = map_id;
+	info->sockmap.attach_type = sockmap_link->attach_type;
+	return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+				      struct seq_file *seq)
+{
+	const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+	u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+	seq_printf(seq, "map_id:\t%u\n", map_id);
+	seq_printf(seq, "attach_type:\t%u\n", sockmap_link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+	.release = sock_map_link_release,
+	.dealloc = sock_map_link_dealloc,
+	.detach = sock_map_link_detach,
+	.update_prog = sock_map_link_update_prog,
+	.fill_link_info = sock_map_link_fill_info,
+	.show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	struct bpf_link_primer link_primer;
+	struct sockmap_link *sockmap_link;
+	enum bpf_attach_type attach_type;
+	struct bpf_map *map;
+	int ret;
+
+	if (attr->link_create.flags)
+		return -EINVAL;
+
+	map = bpf_map_get_with_uref(attr->link_create.target_fd);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+	if (!sockmap_link) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	attach_type = attr->link_create.attach_type;
+	bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog);
+	sockmap_link->map = map;
+	sockmap_link->attach_type = attach_type;
+
+	ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+	if (ret) {
+		kfree(sockmap_link);
+		goto out;
+	}
+
+	mutex_lock(&sockmap_mutex);
+	ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+	mutex_unlock(&sockmap_mutex);
+	if (ret) {
+		bpf_link_cleanup(&link_primer);
+		goto out;
+	}
+
+	/* Increase refcnt for the prog since when old prog is replaced with
+	 * psock_replace_prog() and psock_set_prog() its refcnt will be decreased.
+	 *
+	 * Actually, we do not need to increase refcnt for the prog since bpf_link
+	 * will hold a reference. But in order to have less complexity w.r.t.
+	 * replacing/setting prog, let us increase the refcnt to make things simpler.
+	 */
+	bpf_prog_inc(prog);
+
+	return bpf_link_settle(&link_primer);
+
+out:
+	bpf_map_put_with_uref(map);
+	return ret;
+}
+
 static int sock_map_iter_attach_target(struct bpf_prog *prog,
 				       union bpf_iter_link_info *linfo,
 				       struct bpf_iter_aux_info *aux)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6fe9f11c8abe..cee0a7915c08 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1135,6 +1135,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_TCX = 11,
 	BPF_LINK_TYPE_UPROBE_MULTI = 12,
 	BPF_LINK_TYPE_NETKIT = 13,
+	BPF_LINK_TYPE_SOCKMAP = 14,
 	__MAX_BPF_LINK_TYPE,
 };
 
@@ -6724,6 +6725,10 @@ struct bpf_link_info {
 			__u32 ifindex;
 			__u32 attach_type;
 		} netkit;
+		struct {
+			__u32 map_id;
+			__u32 attach_type;
+		} sockmap;
 	};
 } __attribute__((aligned(8)));
 
-- 
cgit v1.2.3


From 0b1bfd15f3f9ffb1e36c88861e9ada3f58ccb234 Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Thu, 11 Apr 2024 10:58:06 +0800
Subject: dt-bindings: clock: Add Loongson-2K expand clock index

In the new Loongson-2K family of SoCs, more clock indexes are needed,
such as clock gates.
The patch adds these clock indexes

Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Acked-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/76844e0e4dae290425f7c8025f7f36810cb3a3a8.1712731524.git.zhoubinbin@loongson.cn
Acked-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/dt-bindings/clock/loongson,ls2k-clk.h | 54 +++++++++++++++++----------
 1 file changed, 35 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/dt-bindings/clock/loongson,ls2k-clk.h b/include/dt-bindings/clock/loongson,ls2k-clk.h
index 3bc4dfc193c2..4279ba595f1e 100644
--- a/include/dt-bindings/clock/loongson,ls2k-clk.h
+++ b/include/dt-bindings/clock/loongson,ls2k-clk.h
@@ -7,24 +7,40 @@
 #ifndef __DT_BINDINGS_CLOCK_LOONGSON2_H
 #define __DT_BINDINGS_CLOCK_LOONGSON2_H
 
-#define LOONGSON2_REF_100M				0
-#define LOONGSON2_NODE_PLL				1
-#define LOONGSON2_DDR_PLL				2
-#define LOONGSON2_DC_PLL				3
-#define LOONGSON2_PIX0_PLL				4
-#define LOONGSON2_PIX1_PLL				5
-#define LOONGSON2_NODE_CLK				6
-#define LOONGSON2_HDA_CLK				7
-#define LOONGSON2_GPU_CLK				8
-#define LOONGSON2_DDR_CLK				9
-#define LOONGSON2_GMAC_CLK				10
-#define LOONGSON2_DC_CLK				11
-#define LOONGSON2_APB_CLK				12
-#define LOONGSON2_USB_CLK				13
-#define LOONGSON2_SATA_CLK				14
-#define LOONGSON2_PIX0_CLK				15
-#define LOONGSON2_PIX1_CLK				16
-#define LOONGSON2_BOOT_CLK				17
-#define LOONGSON2_CLK_END				18
+#define LOONGSON2_REF_100M	0
+#define LOONGSON2_NODE_PLL	1
+#define LOONGSON2_DDR_PLL	2
+#define LOONGSON2_DC_PLL	3
+#define LOONGSON2_PIX0_PLL	4
+#define LOONGSON2_PIX1_PLL	5
+#define LOONGSON2_NODE_CLK	6
+#define LOONGSON2_HDA_CLK	7
+#define LOONGSON2_GPU_CLK	8
+#define LOONGSON2_DDR_CLK	9
+#define LOONGSON2_GMAC_CLK	10
+#define LOONGSON2_DC_CLK	11
+#define LOONGSON2_APB_CLK	12
+#define LOONGSON2_USB_CLK	13
+#define LOONGSON2_SATA_CLK	14
+#define LOONGSON2_PIX0_CLK	15
+#define LOONGSON2_PIX1_CLK	16
+#define LOONGSON2_BOOT_CLK	17
+#define LOONGSON2_OUT0_GATE	18
+#define LOONGSON2_GMAC_GATE	19
+#define LOONGSON2_RIO_GATE	20
+#define LOONGSON2_DC_GATE	21
+#define LOONGSON2_GPU_GATE	22
+#define LOONGSON2_DDR_GATE	23
+#define LOONGSON2_HDA_GATE	24
+#define LOONGSON2_NODE_GATE	25
+#define LOONGSON2_EMMC_GATE	26
+#define LOONGSON2_PIX0_GATE	27
+#define LOONGSON2_PIX1_GATE	28
+#define LOONGSON2_OUT0_CLK	29
+#define LOONGSON2_RIO_CLK	30
+#define LOONGSON2_EMMC_CLK	31
+#define LOONGSON2_DES_CLK	32
+#define LOONGSON2_I2S_CLK	33
+#define LOONGSON2_MISC_CLK	34
 
 #endif
-- 
cgit v1.2.3


From 8dfd00f7069c54db78463f2a8a8cb677844fdf1f Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 8 Apr 2024 06:38:22 +0000
Subject: soundwire: reconcile dp0_prop and dpn_prop

The definitions for DP0 are missing a set of fields that are required
to reuse the same configuration code as DPn.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Rander Wang <rander.wang@intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20240408063822.421963-1-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 include/linux/soundwire/sdw.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index e5d0aa58e7f6..7bb9119f3069 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -235,6 +235,7 @@ enum sdw_clk_stop_mode {
  * @BRA_flow_controlled: Slave implementation results in an OK_NotReady
  * response
  * @simple_ch_prep_sm: If channel prepare sequence is required
+ * @ch_prep_timeout: Port-specific timeout value, in milliseconds
  * @imp_def_interrupts: If set, each bit corresponds to support for
  * implementation-defined interrupts
  *
@@ -249,6 +250,7 @@ struct sdw_dp0_prop {
 	u32 *words;
 	bool BRA_flow_controlled;
 	bool simple_ch_prep_sm;
+	u32 ch_prep_timeout;
 	bool imp_def_interrupts;
 };
 
-- 
cgit v1.2.3


From 63752ad191f93144c99e848b7dbda050bda16fa6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 5 Apr 2024 21:56:46 +0300
Subject: genirq: Fix trivial typo in the comment CPY ==> COPY

IRQ_SET_MASK_NOCOPY is defined with 'O' letter. Fix the comment.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240405185726.3931703-3-andriy.shevchenko@linux.intel.com
---
 include/linux/irq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 97baa937ab5b..a217e1029c1d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -115,7 +115,7 @@ enum {
  * Return value for chip->irq_set_affinity()
  *
  * IRQ_SET_MASK_OK	- OK, core updates irq_common_data.affinity
- * IRQ_SET_MASK_NOCPY	- OK, chip did update irq_common_data.affinity
+ * IRQ_SET_MASK_NOCOPY	- OK, chip did update irq_common_data.affinity
  * IRQ_SET_MASK_OK_DONE	- Same as IRQ_SET_MASK_OK for core. Special code to
  *			  support stacked irqchips, which indicates skipping
  *			  all descendant irqchips.
-- 
cgit v1.2.3


From bfec4e18f94351d2ff9073c4bffcc6f37bfa3e0b Mon Sep 17 00:00:00 2001
From: Erick Archer <erick.archer@outlook.com>
Date: Sat, 6 Apr 2024 16:23:35 +0200
Subject: net: mana: Add flex array to struct mana_cfg_rx_steer_req_v2

The "struct mana_cfg_rx_steer_req_v2" uses a dynamically sized set of
trailing elements. Specifically, it uses a "mana_handle_t" array. So,
use the preferred way in the kernel declaring a flexible array [1].

At the same time, prepare for the coming implementation by GCC and Clang
of the __counted_by attribute. Flexible array members annotated with
__counted_by can have their accesses bounds-checked at run-time via
CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for
strcpy/memcpy-family functions).

This is a previous step to refactor the two consumers of this structure.

 drivers/infiniband/hw/mana/qp.c
 drivers/net/ethernet/microsoft/mana/mana_en.c

The ultimate goal is to avoid the open-coded arithmetic in the memory
allocator functions [2] using the "struct_size" macro.

Link: https://www.kernel.org/doc/html/next/process/deprecated.html#zero-length-and-one-element-arrays [1]
Link: https://www.kernel.org/doc/html/next/process/deprecated.html#open-coded-arithmetic-in-allocator-arguments [2]
Signed-off-by: Erick Archer <erick.archer@outlook.com>
Link: https://lore.kernel.org/r/AS8PR02MB7237E2900247571C9CB84C678B022@AS8PR02MB7237.eurprd02.prod.outlook.com
Reviewed-by: Long Li <longli@microsoft.com>
Reviewed-by: Justin Stitt <justinstitt@google.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 include/net/mana/mana.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h
index 76147feb0d10..46f741ebce21 100644
--- a/include/net/mana/mana.h
+++ b/include/net/mana/mana.h
@@ -671,6 +671,7 @@ struct mana_cfg_rx_steer_req_v2 {
 	u8 hashkey[MANA_HASH_KEY_SIZE];
 	u8 cqe_coalescing_enable;
 	u8 reserved2[7];
+	mana_handle_t indir_tab[] __counted_by(num_indir_entries);
 }; /* HW DATA */
 
 struct mana_cfg_rx_steer_resp {
-- 
cgit v1.2.3


From 693f75b91a9171e99f84fc193e39f48e21ba4a4f Mon Sep 17 00:00:00 2001
From: Sreenath Vijayan <sreenath.vijayan@sony.com>
Date: Wed, 13 Mar 2024 15:50:52 +0530
Subject: printk: Add function to replay kernel log on consoles

Add a generic function console_replay_all() for replaying
the kernel log on consoles, in any context. It would allow
viewing the logs on an unresponsive terminal via sysrq.

Reuse the existing code from console_flush_on_panic() for
resetting the sequence numbers, by introducing a new helper
function __console_rewind_all(). It is safe to be called
under console_lock().

Try to acquire lock on the console subsystem without waiting.
If successful, reset the sequence number to oldest available
record on all consoles and call console_unlock() which will
automatically flush the messages to the consoles.

Suggested-by: John Ogness <john.ogness@linutronix.de>
Suggested-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Shimoyashiki Taichi <taichi.shimoyashiki@sony.com>
Reviewed-by: John Ogness <john.ogness@linutronix.de>
Signed-off-by: Sreenath Vijayan <sreenath.vijayan@sony.com>
Link: https://lore.kernel.org/r/90ee131c643a5033d117b556c0792de65129d4c3.1710220326.git.sreenath.vijayan@sony.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/printk.h |  4 +++
 kernel/printk/printk.c | 77 ++++++++++++++++++++++++++++++++++----------------
 2 files changed, 57 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 5e038e782256..1c5936ab991f 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -195,6 +195,7 @@ void show_regs_print_info(const char *log_lvl);
 extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
 extern asmlinkage void dump_stack(void) __cold;
 void printk_trigger_flush(void);
+void console_replay_all(void);
 #else
 static inline __printf(1, 0)
 int vprintk(const char *s, va_list args)
@@ -274,6 +275,9 @@ static inline void dump_stack(void)
 static inline void printk_trigger_flush(void)
 {
 }
+static inline void console_replay_all(void)
+{
+}
 #endif
 
 bool this_cpu_in_panic(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1e3b21682547..6206804d275b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3161,6 +3161,40 @@ void console_unblank(void)
 		pr_flush(1000, true);
 }
 
+/*
+ * Rewind all consoles to the oldest available record.
+ *
+ * IMPORTANT: The function is safe only when called under
+ *            console_lock(). It is not enforced because
+ *            it is used as a best effort in panic().
+ */
+static void __console_rewind_all(void)
+{
+	struct console *c;
+	short flags;
+	int cookie;
+	u64 seq;
+
+	seq = prb_first_valid_seq(prb);
+
+	cookie = console_srcu_read_lock();
+	for_each_console_srcu(c) {
+		flags = console_srcu_read_flags(c);
+
+		if (flags & CON_NBCON) {
+			nbcon_seq_force(c, seq);
+		} else {
+			/*
+			 * This assignment is safe only when called under
+			 * console_lock(). On panic, legacy consoles are
+			 * only best effort.
+			 */
+			c->seq = seq;
+		}
+	}
+	console_srcu_read_unlock(cookie);
+}
+
 /**
  * console_flush_on_panic - flush console content on panic
  * @mode: flush all messages in buffer or just the pending ones
@@ -3189,30 +3223,8 @@ void console_flush_on_panic(enum con_flush_mode mode)
 	 */
 	console_may_schedule = 0;
 
-	if (mode == CONSOLE_REPLAY_ALL) {
-		struct console *c;
-		short flags;
-		int cookie;
-		u64 seq;
-
-		seq = prb_first_valid_seq(prb);
-
-		cookie = console_srcu_read_lock();
-		for_each_console_srcu(c) {
-			flags = console_srcu_read_flags(c);
-
-			if (flags & CON_NBCON) {
-				nbcon_seq_force(c, seq);
-			} else {
-				/*
-				 * This is an unsynchronized assignment. On
-				 * panic legacy consoles are only best effort.
-				 */
-				c->seq = seq;
-			}
-		}
-		console_srcu_read_unlock(cookie);
-	}
+	if (mode == CONSOLE_REPLAY_ALL)
+		__console_rewind_all();
 
 	console_flush_all(false, &next_seq, &handover);
 }
@@ -4301,6 +4313,23 @@ void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
 
+/**
+ * console_replay_all - replay kernel log on consoles
+ *
+ * Try to obtain lock on console subsystem and replay all
+ * available records in printk buffer on the consoles.
+ * Does nothing if lock is not obtained.
+ *
+ * Context: Any context.
+ */
+void console_replay_all(void)
+{
+	if (console_trylock()) {
+		__console_rewind_all();
+		/* Consoles are flushed as part of console_unlock(). */
+		console_unlock();
+	}
+}
 #endif
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 1019fa4839c97c4efff9c26af4d74a184921d8da Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Wed, 10 Apr 2024 09:48:03 -0500
Subject: uio: pruss: Remove this driver

This UIO driver was used to control the PRU processors found on various
TI SoCs. It was created before the Remoteproc framework, but now with
that we have a standard way to program and manage the PRU processors.
The proper PRU Remoteproc driver should be used instead of this driver.

This driver only supported the original class of PRUSS (OMAP-L1xx /
AM17xx / AM18xx / TMS320C674x / DA8xx) but when these platforms were
switched to use Device Tree the support for DT was not added to this
driver and so it is now unused/unusable. Support for these platforms
can be added to the proper PRU Remoteproc driver if ever needed.

Remove this driver.

Signed-off-by: Andrew Davis <afd@ti.com>
Link: https://lore.kernel.org/r/20240410144803.126831-1-afd@ti.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/uio/Kconfig                     |  18 ---
 drivers/uio/Makefile                    |   1 -
 drivers/uio/uio_pruss.c                 | 255 --------------------------------
 include/linux/platform_data/uio_pruss.h |  18 ---
 4 files changed, 292 deletions(-)
 delete mode 100644 drivers/uio/uio_pruss.c
 delete mode 100644 include/linux/platform_data/uio_pruss.h

(limited to 'include')

diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
index 2e16c5338e5b..b060dcd7c635 100644
--- a/drivers/uio/Kconfig
+++ b/drivers/uio/Kconfig
@@ -125,24 +125,6 @@ config UIO_FSL_ELBC_GPCM_NETX5152
 	  Information about this hardware can be found at:
 	  http://www.hilscher.com/netx
 
-config UIO_PRUSS
-	tristate "Texas Instruments PRUSS driver"
-	select GENERIC_ALLOCATOR
-	depends on HAS_IOMEM && HAS_DMA
-	help
-	  PRUSS driver for OMAPL138/DA850/AM18XX devices
-	  PRUSS driver requires user space components, examples and user space
-	  driver is available from below SVN repo - you may use anonymous login
-
-	  https://gforge.ti.com/gf/project/pru_sw/
-
-	  More info on API is available at below wiki
-
-	  http://processors.wiki.ti.com/index.php/PRU_Linux_Application_Loader
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called uio_pruss.
-
 config UIO_MF624
 	tristate "Humusoft MF624 DAQ PCI card driver"
 	depends on PCI
diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
index f2f416a14228..1c5f3b5a95cf 100644
--- a/drivers/uio/Makefile
+++ b/drivers/uio/Makefile
@@ -7,7 +7,6 @@ obj-$(CONFIG_UIO_AEC)	+= uio_aec.o
 obj-$(CONFIG_UIO_SERCOS3)	+= uio_sercos3.o
 obj-$(CONFIG_UIO_PCI_GENERIC)	+= uio_pci_generic.o
 obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
-obj-$(CONFIG_UIO_PRUSS)         += uio_pruss.o
 obj-$(CONFIG_UIO_MF624)         += uio_mf624.o
 obj-$(CONFIG_UIO_FSL_ELBC_GPCM)	+= uio_fsl_elbc_gpcm.o
 obj-$(CONFIG_UIO_HV_GENERIC)	+= uio_hv_generic.o
diff --git a/drivers/uio/uio_pruss.c b/drivers/uio/uio_pruss.c
deleted file mode 100644
index f67881cba645..000000000000
--- a/drivers/uio/uio_pruss.c
+++ /dev/null
@@ -1,255 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Programmable Real-Time Unit Sub System (PRUSS) UIO driver (uio_pruss)
- *
- * This driver exports PRUSS host event out interrupts and PRUSS, L3 RAM,
- * and DDR RAM to user space for applications interacting with PRUSS firmware
- *
- * Copyright (C) 2010-11 Texas Instruments Incorporated - http://www.ti.com/
- */
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/platform_device.h>
-#include <linux/uio_driver.h>
-#include <linux/platform_data/uio_pruss.h>
-#include <linux/io.h>
-#include <linux/clk.h>
-#include <linux/dma-mapping.h>
-#include <linux/sizes.h>
-#include <linux/slab.h>
-#include <linux/genalloc.h>
-
-#define DRV_NAME "pruss_uio"
-#define DRV_VERSION "1.0"
-
-static int sram_pool_sz = SZ_16K;
-module_param(sram_pool_sz, int, 0);
-MODULE_PARM_DESC(sram_pool_sz, "sram pool size to allocate ");
-
-static int extram_pool_sz = SZ_256K;
-module_param(extram_pool_sz, int, 0);
-MODULE_PARM_DESC(extram_pool_sz, "external ram pool size to allocate");
-
-/*
- * Host event IRQ numbers from PRUSS - PRUSS can generate up to 8 interrupt
- * events to AINTC of ARM host processor - which can be used for IPC b/w PRUSS
- * firmware and user space application, async notification from PRU firmware
- * to user space application
- * 3	PRU_EVTOUT0
- * 4	PRU_EVTOUT1
- * 5	PRU_EVTOUT2
- * 6	PRU_EVTOUT3
- * 7	PRU_EVTOUT4
- * 8	PRU_EVTOUT5
- * 9	PRU_EVTOUT6
- * 10	PRU_EVTOUT7
-*/
-#define MAX_PRUSS_EVT	8
-
-#define PINTC_HIDISR	0x0038
-#define PINTC_HIPIR	0x0900
-#define HIPIR_NOPEND	0x80000000
-#define PINTC_HIER	0x1500
-
-struct uio_pruss_dev {
-	struct uio_info *info;
-	struct clk *pruss_clk;
-	dma_addr_t sram_paddr;
-	dma_addr_t ddr_paddr;
-	void __iomem *prussio_vaddr;
-	unsigned long sram_vaddr;
-	void *ddr_vaddr;
-	unsigned int hostirq_start;
-	unsigned int pintc_base;
-	struct gen_pool *sram_pool;
-};
-
-static irqreturn_t pruss_handler(int irq, struct uio_info *info)
-{
-	struct uio_pruss_dev *gdev = info->priv;
-	int intr_bit = (irq - gdev->hostirq_start + 2);
-	int val, intr_mask = (1 << intr_bit);
-	void __iomem *base = gdev->prussio_vaddr + gdev->pintc_base;
-	void __iomem *intren_reg = base + PINTC_HIER;
-	void __iomem *intrdis_reg = base + PINTC_HIDISR;
-	void __iomem *intrstat_reg = base + PINTC_HIPIR + (intr_bit << 2);
-
-	val = ioread32(intren_reg);
-	/* Is interrupt enabled and active ? */
-	if (!(val & intr_mask) && (ioread32(intrstat_reg) & HIPIR_NOPEND))
-		return IRQ_NONE;
-	/* Disable interrupt */
-	iowrite32(intr_bit, intrdis_reg);
-	return IRQ_HANDLED;
-}
-
-static void pruss_cleanup(struct device *dev, struct uio_pruss_dev *gdev)
-{
-	int cnt;
-	struct uio_info *p = gdev->info;
-
-	for (cnt = 0; cnt < MAX_PRUSS_EVT; cnt++, p++) {
-		uio_unregister_device(p);
-	}
-	iounmap(gdev->prussio_vaddr);
-	if (gdev->ddr_vaddr) {
-		dma_free_coherent(dev, extram_pool_sz, gdev->ddr_vaddr,
-			gdev->ddr_paddr);
-	}
-	if (gdev->sram_vaddr)
-		gen_pool_free(gdev->sram_pool,
-			      gdev->sram_vaddr,
-			      sram_pool_sz);
-	clk_disable(gdev->pruss_clk);
-}
-
-static int pruss_probe(struct platform_device *pdev)
-{
-	struct uio_info *p;
-	struct uio_pruss_dev *gdev;
-	struct resource *regs_prussio;
-	struct device *dev = &pdev->dev;
-	int ret, cnt, i, len;
-	struct uio_pruss_pdata *pdata = dev_get_platdata(dev);
-
-	gdev = devm_kzalloc(dev, sizeof(struct uio_pruss_dev), GFP_KERNEL);
-	if (!gdev)
-		return -ENOMEM;
-
-	gdev->info = devm_kcalloc(dev, MAX_PRUSS_EVT, sizeof(*p), GFP_KERNEL);
-	if (!gdev->info)
-		return -ENOMEM;
-
-	/* Power on PRU in case its not done as part of boot-loader */
-	gdev->pruss_clk = devm_clk_get(dev, "pruss");
-	if (IS_ERR(gdev->pruss_clk)) {
-		dev_err(dev, "Failed to get clock\n");
-		return PTR_ERR(gdev->pruss_clk);
-	}
-
-	ret = clk_enable(gdev->pruss_clk);
-	if (ret) {
-		dev_err(dev, "Failed to enable clock\n");
-		return ret;
-	}
-
-	regs_prussio = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!regs_prussio) {
-		dev_err(dev, "No PRUSS I/O resource specified\n");
-		ret = -EIO;
-		goto err_clk_disable;
-	}
-
-	if (!regs_prussio->start) {
-		dev_err(dev, "Invalid memory resource\n");
-		ret = -EIO;
-		goto err_clk_disable;
-	}
-
-	if (pdata->sram_pool) {
-		gdev->sram_pool = pdata->sram_pool;
-		gdev->sram_vaddr =
-			(unsigned long)gen_pool_dma_alloc(gdev->sram_pool,
-					sram_pool_sz, &gdev->sram_paddr);
-		if (!gdev->sram_vaddr) {
-			dev_err(dev, "Could not allocate SRAM pool\n");
-			ret = -ENOMEM;
-			goto err_clk_disable;
-		}
-	}
-
-	gdev->ddr_vaddr = dma_alloc_coherent(dev, extram_pool_sz,
-				&(gdev->ddr_paddr), GFP_KERNEL | GFP_DMA);
-	if (!gdev->ddr_vaddr) {
-		dev_err(dev, "Could not allocate external memory\n");
-		ret = -ENOMEM;
-		goto err_free_sram;
-	}
-
-	len = resource_size(regs_prussio);
-	gdev->prussio_vaddr = ioremap(regs_prussio->start, len);
-	if (!gdev->prussio_vaddr) {
-		dev_err(dev, "Can't remap PRUSS I/O  address range\n");
-		ret = -ENOMEM;
-		goto err_free_ddr_vaddr;
-	}
-
-	ret = platform_get_irq(pdev, 0);
-	if (ret < 0)
-		goto err_unmap;
-
-	gdev->hostirq_start = ret;
-	gdev->pintc_base = pdata->pintc_base;
-
-	for (cnt = 0, p = gdev->info; cnt < MAX_PRUSS_EVT; cnt++, p++) {
-		p->mem[0].addr = regs_prussio->start;
-		p->mem[0].size = resource_size(regs_prussio);
-		p->mem[0].memtype = UIO_MEM_PHYS;
-
-		p->mem[1].addr = gdev->sram_paddr;
-		p->mem[1].size = sram_pool_sz;
-		p->mem[1].memtype = UIO_MEM_PHYS;
-
-		p->mem[2].addr = (uintptr_t) gdev->ddr_vaddr;
-		p->mem[2].dma_addr = gdev->ddr_paddr;
-		p->mem[2].size = extram_pool_sz;
-		p->mem[2].memtype = UIO_MEM_DMA_COHERENT;
-		p->mem[2].dma_device = dev;
-
-		p->name = devm_kasprintf(dev, GFP_KERNEL, "pruss_evt%d", cnt);
-		p->version = DRV_VERSION;
-
-		/* Register PRUSS IRQ lines */
-		p->irq = gdev->hostirq_start + cnt;
-		p->handler = pruss_handler;
-		p->priv = gdev;
-
-		ret = uio_register_device(dev, p);
-		if (ret < 0)
-			goto err_unloop;
-	}
-
-	platform_set_drvdata(pdev, gdev);
-	return 0;
-
-err_unloop:
-	for (i = 0, p = gdev->info; i < cnt; i++, p++) {
-		uio_unregister_device(p);
-	}
-err_unmap:
-	iounmap(gdev->prussio_vaddr);
-err_free_ddr_vaddr:
-	dma_free_coherent(dev, extram_pool_sz, gdev->ddr_vaddr,
-			  gdev->ddr_paddr);
-err_free_sram:
-	if (pdata->sram_pool)
-		gen_pool_free(gdev->sram_pool, gdev->sram_vaddr, sram_pool_sz);
-err_clk_disable:
-	clk_disable(gdev->pruss_clk);
-
-	return ret;
-}
-
-static int pruss_remove(struct platform_device *dev)
-{
-	struct uio_pruss_dev *gdev = platform_get_drvdata(dev);
-
-	pruss_cleanup(&dev->dev, gdev);
-	return 0;
-}
-
-static struct platform_driver pruss_driver = {
-	.probe = pruss_probe,
-	.remove = pruss_remove,
-	.driver = {
-		   .name = DRV_NAME,
-		   },
-};
-
-module_platform_driver(pruss_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_VERSION(DRV_VERSION);
-MODULE_AUTHOR("Amit Chatterjee <amit.chatterjee@ti.com>");
-MODULE_AUTHOR("Pratheesh Gangadhar <pratheesh@ti.com>");
diff --git a/include/linux/platform_data/uio_pruss.h b/include/linux/platform_data/uio_pruss.h
deleted file mode 100644
index f76fa393b802..000000000000
--- a/include/linux/platform_data/uio_pruss.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * include/linux/platform_data/uio_pruss.h
- *
- * Platform data for uio_pruss driver
- *
- * Copyright (C) 2010-11 Texas Instruments Incorporated - https://www.ti.com/
- */
-
-#ifndef _UIO_PRUSS_H_
-#define _UIO_PRUSS_H_
-
-/* To configure the PRUSS INTC base offset for UIO driver */
-struct uio_pruss_pdata {
-	u32		pintc_base;
-	struct gen_pool *sram_pool;
-};
-#endif /* _UIO_PRUSS_H_ */
-- 
cgit v1.2.3


From e8c4bd6c6e6b7e7b416c42806981c2a81370001e Mon Sep 17 00:00:00 2001
From: Saurabh Sengar <ssengar@linux.microsoft.com>
Date: Sat, 30 Mar 2024 01:51:57 -0700
Subject: Drivers: hv: vmbus: Add utility function for querying ring size

Add a function to query for the preferred ring buffer size of VMBus
device. This will allow the drivers (eg. UIO) to allocate the most
optimized ring buffer size for devices.

Signed-off-by: Saurabh Sengar <ssengar@linux.microsoft.com>
Reviewed-by: Long Li <longli@microsoft.com>
Link: https://lore.kernel.org/r/1711788723-8593-2-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hv/channel_mgmt.c | 15 ++++++++++++---
 drivers/hv/hyperv_vmbus.h |  5 +++++
 include/linux/hyperv.h    |  2 ++
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 2f4d09ce027a..3c6011a48dab 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -120,7 +120,9 @@ const struct vmbus_device vmbus_devs[] = {
 	},
 
 	/* File copy */
-	{ .dev_type = HV_FCOPY,
+	/* fcopy always uses 16KB ring buffer size and is working well for last many years */
+	{ .pref_ring_size = 0x4000,
+	  .dev_type = HV_FCOPY,
 	  HV_FCOPY_GUID,
 	  .perf_device = false,
 	  .allowed_in_isolated = false,
@@ -140,12 +142,19 @@ const struct vmbus_device vmbus_devs[] = {
 	  .allowed_in_isolated = false,
 	},
 
-	/* Unknown GUID */
-	{ .dev_type = HV_UNKNOWN,
+	/*
+	 * Unknown GUID
+	 * 64 KB ring buffer + 4 KB header should be sufficient size for any Hyper-V device apart
+	 * from HV_NIC and HV_SCSI. This case avoid the fallback for unknown devices to allocate
+	 * much bigger (2 MB) of ring size.
+	 */
+	{ .pref_ring_size = 0x11000,
+	  .dev_type = HV_UNKNOWN,
 	  .perf_device = false,
 	  .allowed_in_isolated = false,
 	},
 };
+EXPORT_SYMBOL_GPL(vmbus_devs);
 
 static const struct {
 	guid_t guid;
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index f6b1e710f805..76ac5185a01a 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -417,6 +417,11 @@ static inline bool hv_is_perf_channel(struct vmbus_channel *channel)
 	return vmbus_devs[channel->device_id].perf_device;
 }
 
+static inline size_t hv_dev_ring_size(struct vmbus_channel *channel)
+{
+	return vmbus_devs[channel->device_id].pref_ring_size;
+}
+
 static inline bool hv_is_allocated_cpu(unsigned int cpu)
 {
 	struct vmbus_channel *channel, *sc;
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 6ef0557b4bff..7de9f90d3f95 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -820,6 +820,8 @@ struct vmbus_requestor {
 #define VMBUS_RQST_RESET (U64_MAX - 3)
 
 struct vmbus_device {
+	/* preferred ring buffer size in KB, 0 means no preferred size for this device */
+	size_t pref_ring_size;
 	u16  dev_type;
 	guid_t guid;
 	bool perf_device;
-- 
cgit v1.2.3


From 043327875298030095f9c5d04bd41def28275175 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 13 Mar 2024 19:19:30 +0100
Subject: misc/pvpanic: use bit macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The macros are easier to read.

Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/lkml/2023110407-unselect-uptight-b96d@gregkh/
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Link: https://lore.kernel.org/r/20240313-pvpanic-shutdown-header-v1-1-7f1970d66366@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/misc/pvpanic.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/misc/pvpanic.h b/include/uapi/misc/pvpanic.h
index 54b7485390d3..9ea6a965ca7a 100644
--- a/include/uapi/misc/pvpanic.h
+++ b/include/uapi/misc/pvpanic.h
@@ -3,7 +3,9 @@
 #ifndef __PVPANIC_H__
 #define __PVPANIC_H__
 
-#define PVPANIC_PANICKED	(1 << 0)
-#define PVPANIC_CRASH_LOADED	(1 << 1)
+#include <linux/const.h>
+
+#define PVPANIC_PANICKED	_BITUL(0)
+#define PVPANIC_CRASH_LOADED	_BITUL(1)
 
 #endif /* __PVPANIC_H__ */
-- 
cgit v1.2.3


From ad76f3e8f57cef368fa98f2d4d8902ad66481a3e Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Wed, 13 Mar 2024 19:19:31 +0100
Subject: misc/pvpanic: add shutdown event definition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Shutdown requests are normally hardware dependent.
By extending pvpanic to also handle shutdown requests, guests can
submit such requests with an easily implementable and cross-platform
mechanism.

The event was added to the specification in qemu commit
73279cecca03 ("docs/specs/pvpanic: document shutdown event").

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20240313-pvpanic-shutdown-header-v1-2-7f1970d66366@weissschuh.net
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/misc/pvpanic.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/uapi/misc/pvpanic.h b/include/uapi/misc/pvpanic.h
index 9ea6a965ca7a..3f1745cd1b52 100644
--- a/include/uapi/misc/pvpanic.h
+++ b/include/uapi/misc/pvpanic.h
@@ -7,5 +7,6 @@
 
 #define PVPANIC_PANICKED	_BITUL(0)
 #define PVPANIC_CRASH_LOADED	_BITUL(1)
+#define PVPANIC_SHUTDOWN	_BITUL(2)
 
 #endif /* __PVPANIC_H__ */
-- 
cgit v1.2.3


From b46271ec40a05580d55f917c9ac52cb93553160a Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Thu, 28 Mar 2024 19:05:53 -0500
Subject: ntsync: Introduce NTSYNC_IOC_CREATE_SEM.

This corresponds to the NT syscall NtCreateSemaphore().

Semaphores are one of three types of object to be implemented in this driver,
the others being mutexes and events.

An NT semaphore contains a 32-bit counter, and is signaled and can be acquired
when the counter is nonzero. The counter has a maximum value which is specified
at creation time. The initial value of the semaphore is also specified at
creation time. There are no restrictions on the maximum and initial value.

Each object is exposed as an file, to which any number of fds may be opened.
When all fds are closed, the object is deleted.

Objects hold a pointer to the ntsync_device that created them. The device's
reference count is driven by struct file.

Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
Link: https://lore.kernel.org/r/20240329000621.148791-3-zfigura@codeweavers.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/userspace-api/ioctl/ioctl-number.rst |   2 +
 drivers/misc/ntsync.c                              | 131 +++++++++++++++++++++
 include/uapi/linux/ntsync.h                        |  21 ++++
 3 files changed, 154 insertions(+)
 create mode 100644 include/uapi/linux/ntsync.h

(limited to 'include')

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst b/Documentation/userspace-api/ioctl/ioctl-number.rst
index c472423412bf..a141e8e65c5d 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -174,6 +174,8 @@ Code  Seq#    Include File                                           Comments
 'M'   00-0F  drivers/video/fsl-diu-fb.h                              conflict!
 'N'   00-1F  drivers/usb/scanner.h
 'N'   40-7F  drivers/block/nvme.c
+'N'   80-8F  uapi/linux/ntsync.h                                     NT synchronization primitives
+                                                                     <mailto:wine-devel@winehq.org>
 'O'   00-06  mtd/ubi-user.h                                          UBI
 'P'   all    linux/soundcard.h                                       conflict!
 'P'   60-6F  sound/sscape_ioctl.h                                    conflict!
diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
index bd76e653d83e..20158ec148bc 100644
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -5,26 +5,157 @@
  * Copyright (C) 2024 Elizabeth Figura <zfigura@codeweavers.com>
  */
 
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+#include <uapi/linux/ntsync.h>
 
 #define NTSYNC_NAME	"ntsync"
 
+enum ntsync_type {
+	NTSYNC_TYPE_SEM,
+};
+
+/*
+ * Individual synchronization primitives are represented by
+ * struct ntsync_obj, and each primitive is backed by a file.
+ *
+ * The whole namespace is represented by a struct ntsync_device also
+ * backed by a file.
+ *
+ * Both rely on struct file for reference counting. Individual
+ * ntsync_obj objects take a reference to the device when created.
+ */
+
+struct ntsync_obj {
+	enum ntsync_type type;
+
+	union {
+		struct {
+			__u32 count;
+			__u32 max;
+		} sem;
+	} u;
+
+	struct file *file;
+	struct ntsync_device *dev;
+};
+
+struct ntsync_device {
+	struct file *file;
+};
+
+static int ntsync_obj_release(struct inode *inode, struct file *file)
+{
+	struct ntsync_obj *obj = file->private_data;
+
+	fput(obj->dev->file);
+	kfree(obj);
+
+	return 0;
+}
+
+static const struct file_operations ntsync_obj_fops = {
+	.owner		= THIS_MODULE,
+	.release	= ntsync_obj_release,
+	.llseek		= no_llseek,
+};
+
+static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
+					   enum ntsync_type type)
+{
+	struct ntsync_obj *obj;
+
+	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return NULL;
+	obj->type = type;
+	obj->dev = dev;
+	get_file(dev->file);
+
+	return obj;
+}
+
+static int ntsync_obj_get_fd(struct ntsync_obj *obj)
+{
+	struct file *file;
+	int fd;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+	file = anon_inode_getfile("ntsync", &ntsync_obj_fops, obj, O_RDWR);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+	obj->file = file;
+	fd_install(fd, file);
+
+	return fd;
+}
+
+static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp)
+{
+	struct ntsync_sem_args __user *user_args = argp;
+	struct ntsync_sem_args args;
+	struct ntsync_obj *sem;
+	int fd;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	if (args.count > args.max)
+		return -EINVAL;
+
+	sem = ntsync_alloc_obj(dev, NTSYNC_TYPE_SEM);
+	if (!sem)
+		return -ENOMEM;
+	sem->u.sem.count = args.count;
+	sem->u.sem.max = args.max;
+	fd = ntsync_obj_get_fd(sem);
+	if (fd < 0) {
+		kfree(sem);
+		return fd;
+	}
+
+	return put_user(fd, &user_args->sem);
+}
+
 static int ntsync_char_open(struct inode *inode, struct file *file)
 {
+	struct ntsync_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	file->private_data = dev;
+	dev->file = file;
 	return nonseekable_open(inode, file);
 }
 
 static int ntsync_char_release(struct inode *inode, struct file *file)
 {
+	struct ntsync_device *dev = file->private_data;
+
+	kfree(dev);
+
 	return 0;
 }
 
 static long ntsync_char_ioctl(struct file *file, unsigned int cmd,
 			      unsigned long parm)
 {
+	struct ntsync_device *dev = file->private_data;
+	void __user *argp = (void __user *)parm;
+
 	switch (cmd) {
+	case NTSYNC_IOC_CREATE_SEM:
+		return ntsync_create_sem(dev, argp);
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h
new file mode 100644
index 000000000000..6a4867a6c97b
--- /dev/null
+++ b/include/uapi/linux/ntsync.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Kernel support for NT synchronization primitive emulation
+ *
+ * Copyright (C) 2021-2022 Elizabeth Figura <zfigura@codeweavers.com>
+ */
+
+#ifndef __LINUX_NTSYNC_H
+#define __LINUX_NTSYNC_H
+
+#include <linux/types.h>
+
+struct ntsync_sem_args {
+	__u32 sem;
+	__u32 count;
+	__u32 max;
+};
+
+#define NTSYNC_IOC_CREATE_SEM		_IOWR('N', 0x80, struct ntsync_sem_args)
+
+#endif
-- 
cgit v1.2.3


From dc806bd48abc1b8a4ae72709a37e65db42a32048 Mon Sep 17 00:00:00 2001
From: Elizabeth Figura <zfigura@codeweavers.com>
Date: Thu, 28 Mar 2024 19:05:54 -0500
Subject: ntsync: Introduce NTSYNC_IOC_SEM_POST.

This corresponds to the NT syscall NtReleaseSemaphore().

This increases the semaphore's internal counter by the given value, and returns
the previous value. If the counter would overflow the defined maximum, the
function instead fails and returns -EOVERFLOW.

Signed-off-by: Elizabeth Figura <zfigura@codeweavers.com>
Link: https://lore.kernel.org/r/20240329000621.148791-4-zfigura@codeweavers.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/misc/ntsync.c       | 72 +++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/ntsync.h |  2 ++
 2 files changed, 71 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c
index 20158ec148bc..3c2f743c58b0 100644
--- a/drivers/misc/ntsync.c
+++ b/drivers/misc/ntsync.c
@@ -10,7 +10,9 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/module.h>
+#include <linux/overflow.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <uapi/linux/ntsync.h>
 
 #define NTSYNC_NAME	"ntsync"
@@ -31,23 +33,70 @@ enum ntsync_type {
  */
 
 struct ntsync_obj {
+	spinlock_t lock;
+
 	enum ntsync_type type;
 
+	struct file *file;
+	struct ntsync_device *dev;
+
+	/* The following fields are protected by the object lock. */
 	union {
 		struct {
 			__u32 count;
 			__u32 max;
 		} sem;
 	} u;
-
-	struct file *file;
-	struct ntsync_device *dev;
 };
 
 struct ntsync_device {
 	struct file *file;
 };
 
+/*
+ * Actually change the semaphore state, returning -EOVERFLOW if it is made
+ * invalid.
+ */
+static int post_sem_state(struct ntsync_obj *sem, __u32 count)
+{
+	__u32 sum;
+
+	lockdep_assert_held(&sem->lock);
+
+	if (check_add_overflow(sem->u.sem.count, count, &sum) ||
+	    sum > sem->u.sem.max)
+		return -EOVERFLOW;
+
+	sem->u.sem.count = sum;
+	return 0;
+}
+
+static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp)
+{
+	__u32 __user *user_args = argp;
+	__u32 prev_count;
+	__u32 args;
+	int ret;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+
+	if (sem->type != NTSYNC_TYPE_SEM)
+		return -EINVAL;
+
+	spin_lock(&sem->lock);
+
+	prev_count = sem->u.sem.count;
+	ret = post_sem_state(sem, args);
+
+	spin_unlock(&sem->lock);
+
+	if (!ret && put_user(prev_count, user_args))
+		ret = -EFAULT;
+
+	return ret;
+}
+
 static int ntsync_obj_release(struct inode *inode, struct file *file)
 {
 	struct ntsync_obj *obj = file->private_data;
@@ -58,9 +107,25 @@ static int ntsync_obj_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static long ntsync_obj_ioctl(struct file *file, unsigned int cmd,
+			     unsigned long parm)
+{
+	struct ntsync_obj *obj = file->private_data;
+	void __user *argp = (void __user *)parm;
+
+	switch (cmd) {
+	case NTSYNC_IOC_SEM_POST:
+		return ntsync_sem_post(obj, argp);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
 static const struct file_operations ntsync_obj_fops = {
 	.owner		= THIS_MODULE,
 	.release	= ntsync_obj_release,
+	.unlocked_ioctl	= ntsync_obj_ioctl,
+	.compat_ioctl	= compat_ptr_ioctl,
 	.llseek		= no_llseek,
 };
 
@@ -75,6 +140,7 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev,
 	obj->type = type;
 	obj->dev = dev;
 	get_file(dev->file);
+	spin_lock_init(&obj->lock);
 
 	return obj;
 }
diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h
index 6a4867a6c97b..dcfa38fdc93c 100644
--- a/include/uapi/linux/ntsync.h
+++ b/include/uapi/linux/ntsync.h
@@ -18,4 +18,6 @@ struct ntsync_sem_args {
 
 #define NTSYNC_IOC_CREATE_SEM		_IOWR('N', 0x80, struct ntsync_sem_args)
 
+#define NTSYNC_IOC_SEM_POST		_IOWR('N', 0x81, __u32)
+
 #endif
-- 
cgit v1.2.3


From a28380f119a918135c6b7155fb4eb95eaabb62dc Mon Sep 17 00:00:00 2001
From: José Roberto de Souza <jose.souza@intel.com>
Date: Tue, 9 Apr 2024 13:02:05 -0700
Subject: devcoredump: Add dev_coredump_put()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is useful for modules that do not want to keep coredump available
after its unload.
Otherwise, the coredump would only be removed after DEVCD_TIMEOUT
seconds.

v2:
- dev_coredump_put() documentation updated (Mukesh)

Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Mukesh Ojha <quic_mojha@quicinc.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Jonathan Cavitt <jonathan.cavitt@intel.com>
Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
Acked-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
Signed-off-by: José Roberto de Souza <jose.souza@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240409200206.108452-1-jose.souza@intel.com
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
---
 drivers/base/devcoredump.c  | 23 +++++++++++++++++++++++
 include/linux/devcoredump.h |  5 +++++
 2 files changed, 28 insertions(+)

(limited to 'include')

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 7e2d1f0d903a..82aeb09b3d1b 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -304,6 +304,29 @@ static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
 				  offset);
 }
 
+/**
+ * dev_coredump_put - remove device coredump
+ * @dev: the struct device for the crashed device
+ *
+ * dev_coredump_put() removes coredump, if exists, for a given device from
+ * the file system and free its associated data otherwise, does nothing.
+ *
+ * It is useful for modules that do not want to keep coredump
+ * available after its unload.
+ */
+void dev_coredump_put(struct device *dev)
+{
+	struct device *existing;
+
+	existing = class_find_device(&devcd_class, NULL, dev,
+				     devcd_match_failing);
+	if (existing) {
+		devcd_free(existing, NULL);
+		put_device(existing);
+	}
+}
+EXPORT_SYMBOL_GPL(dev_coredump_put);
+
 /**
  * dev_coredumpm - create device coredump with read/free methods
  * @dev: the struct device for the crashed device
diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h
index c008169ed2c6..c8f7eb6cc191 100644
--- a/include/linux/devcoredump.h
+++ b/include/linux/devcoredump.h
@@ -63,6 +63,8 @@ void dev_coredumpm(struct device *dev, struct module *owner,
 
 void dev_coredumpsg(struct device *dev, struct scatterlist *table,
 		    size_t datalen, gfp_t gfp);
+
+void dev_coredump_put(struct device *dev);
 #else
 static inline void dev_coredumpv(struct device *dev, void *data,
 				 size_t datalen, gfp_t gfp)
@@ -85,6 +87,9 @@ static inline void dev_coredumpsg(struct device *dev, struct scatterlist *table,
 {
 	_devcd_free_sgtable(table);
 }
+static inline void dev_coredump_put(struct device *dev)
+{
+}
 #endif /* CONFIG_DEV_COREDUMP */
 
 #endif /* __DEVCOREDUMP_H */
-- 
cgit v1.2.3


From d48c03198a92edf41e89477dab4f602df15165ee Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 6 Apr 2024 15:52:01 +0200
Subject: sysfs: Add sysfs_bin_attr_simple_read() helper

When drivers expose a bin_attribute in sysfs which is backed by a buffer
in memory, a common pattern is to set the @private and @size members in
struct bin_attribute to the buffer's location and size.

The ->read() callback then merely consists of a single memcpy() call.
It's not even necessary to perform bounds checks as these are already
handled by sysfs_kf_bin_read().

However each driver is so far providing its own ->read() implementation.
The pattern is sufficiently frequent to merit a public helper, so add
sysfs_bin_attr_simple_read() as well as BIN_ATTR_SIMPLE_RO() and
BIN_ATTR_SIMPLE_ADMIN_RO() macros to ease declaration of such
bin_attributes and reduce LoC and .text section size.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/5ed62b197a442ec6db53d8746d9d806dd0576e2d.1712410202.git.lukas@wunner.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/file.c       | 27 +++++++++++++++++++++++++++
 include/linux/sysfs.h | 15 +++++++++++++++
 2 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6b7652fb8050..289b57d32c89 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -783,3 +783,30 @@ int sysfs_emit_at(char *buf, int at, const char *fmt, ...)
 	return len;
 }
 EXPORT_SYMBOL_GPL(sysfs_emit_at);
+
+/**
+ *	sysfs_bin_attr_simple_read - read callback to simply copy from memory.
+ *	@file:	attribute file which is being read.
+ *	@kobj:	object to which the attribute belongs.
+ *	@attr:	attribute descriptor.
+ *	@buf:	destination buffer.
+ *	@off:	offset in bytes from which to read.
+ *	@count:	maximum number of bytes to read.
+ *
+ * Simple ->read() callback for bin_attributes backed by a buffer in memory.
+ * The @private and @size members in struct bin_attribute must be set to the
+ * buffer's location and size before the bin_attribute is created in sysfs.
+ *
+ * Bounds check for @off and @count is done in sysfs_kf_bin_read().
+ * Negative value check for @off is done in vfs_setpos() and default_llseek().
+ *
+ * Returns number of bytes written to @buf.
+ */
+ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
+				   struct bin_attribute *attr, char *buf,
+				   loff_t off, size_t count)
+{
+	memcpy(buf, attr->private + off, count);
+	return count;
+}
+EXPORT_SYMBOL_GPL(sysfs_bin_attr_simple_read);
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 326341c62385..a7d725fbf739 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -371,6 +371,17 @@ struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RO(_name, _size)
 #define BIN_ATTR_ADMIN_RW(_name, _size)					\
 struct bin_attribute bin_attr_##_name = __BIN_ATTR_ADMIN_RW(_name, _size)
 
+#define __BIN_ATTR_SIMPLE_RO(_name, _mode) {				\
+	.attr	= { .name = __stringify(_name), .mode = _mode },	\
+	.read	= sysfs_bin_attr_simple_read,				\
+}
+
+#define BIN_ATTR_SIMPLE_RO(_name)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0444)
+
+#define BIN_ATTR_SIMPLE_ADMIN_RO(_name)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_SIMPLE_RO(_name, 0400)
+
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *, char *);
 	ssize_t	(*store)(struct kobject *, struct attribute *, const char *, size_t);
@@ -478,6 +489,10 @@ int sysfs_emit(char *buf, const char *fmt, ...);
 __printf(3, 4)
 int sysfs_emit_at(char *buf, int at, const char *fmt, ...);
 
+ssize_t sysfs_bin_attr_simple_read(struct file *file, struct kobject *kobj,
+				   struct bin_attribute *attr, char *buf,
+				   loff_t off, size_t count);
+
 #else /* CONFIG_SYSFS */
 
 static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
-- 
cgit v1.2.3


From 18d82cde743237def9f80171e32be1153f5cadac Mon Sep 17 00:00:00 2001
From: Geliang Tang <tanggeliang@kylinos.cn>
Date: Wed, 10 Apr 2024 11:48:24 +0200
Subject: mptcp: add last time fields in mptcp_info

This patch adds "last time" fields last_data_sent, last_data_recv and
last_ack_recv in struct mptcp_sock to record the last time data_sent,
data_recv and ack_recv happened. They all are initialized as
tcp_jiffies32 in __mptcp_init_sock(), and updated as tcp_jiffies32 too
when data is sent in __subflow_push_pending(), data is received in
__mptcp_move_skbs_from_subflow(), and ack is received in ack_update_msk().

Similar to tcpi_last_data_sent, tcpi_last_data_recv and tcpi_last_ack_recv
exposed with TCP, this patch exposes the last time "an action happened" for
MPTCP in mptcp_info, named mptcpi_last_data_sent, mptcpi_last_data_recv and
mptcpi_last_ack_recv, calculated in mptcp_diag_fill_info() as the time
deltas between now and the newly added last time fields in mptcp_sock.

Since msk->last_ack_recv needs to be protected by mptcp_data_lock/unlock,
and lock_sock_fast can sleep and be quite slow, move the entire
mptcp_data_lock/unlock block after the lock/unlock_sock_fast block.
Then mptcpi_last_data_sent and mptcpi_last_data_recv are set in
lock/unlock_sock_fast block, while mptcpi_last_ack_recv is set in
mptcp_data_lock/unlock block, which is protected by a spinlock and
should not block for too long.

Also add three reserved bytes in struct mptcp_info not to have holes in
this structure exposed to userspace.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/446
Signed-off-by: Geliang Tang <tanggeliang@kylinos.cn>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240410-upstream-net-next-20240405-mptcp-last-time-info-v2-1-f95bd6b33e51@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/mptcp.h |  4 ++++
 net/mptcp/options.c        |  1 +
 net/mptcp/protocol.c       |  7 +++++++
 net/mptcp/protocol.h       |  3 +++
 net/mptcp/sockopt.c        | 16 +++++++++++-----
 5 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 74cfe496891e..67d015df8893 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -58,6 +58,10 @@ struct mptcp_info {
 	__u64	mptcpi_bytes_received;
 	__u64	mptcpi_bytes_acked;
 	__u8	mptcpi_subflows_total;
+	__u8	reserved[3];
+	__u32	mptcpi_last_data_sent;
+	__u32	mptcpi_last_data_recv;
+	__u32	mptcpi_last_ack_recv;
 };
 
 /* MPTCP Reset reason codes, rfc8684 */
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 27ca42c77b02..8e8dcfbc2993 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1068,6 +1068,7 @@ static void ack_update_msk(struct mptcp_sock *msk,
 		__mptcp_snd_una_update(msk, new_snd_una);
 		__mptcp_data_acked(sk);
 	}
+	msk->last_ack_recv = tcp_jiffies32;
 	mptcp_data_unlock(sk);
 
 	trace_ack_update_msk(mp_opt->data_ack,
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 995b53cd021c..f8bc34f0d973 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -706,6 +706,8 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
 		}
 	} while (more_data_avail);
 
+	if (moved > 0)
+		msk->last_data_recv = tcp_jiffies32;
 	*bytes += moved;
 	return done;
 }
@@ -1556,6 +1558,8 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
 	err = copied;
 
 out:
+	if (err > 0)
+		msk->last_data_sent = tcp_jiffies32;
 	return err;
 }
 
@@ -2793,6 +2797,9 @@ static void __mptcp_init_sock(struct sock *sk)
 	WRITE_ONCE(msk->allow_infinite_fallback, true);
 	msk->recovery = false;
 	msk->subflow_id = 1;
+	msk->last_data_sent = tcp_jiffies32;
+	msk->last_data_recv = tcp_jiffies32;
+	msk->last_ack_recv = tcp_jiffies32;
 
 	mptcp_pm_data_init(msk);
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 46f4655b7123..fdfa843e2d88 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -282,6 +282,9 @@ struct mptcp_sock {
 	u64		bytes_acked;
 	u64		snd_una;
 	u64		wnd_end;
+	u32		last_data_sent;
+	u32		last_data_recv;
+	u32		last_ack_recv;
 	unsigned long	timer_ival;
 	u32		token;
 	int		rmem_released;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 9d5d42a77bcc..1fea43f5b6f3 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -898,6 +898,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 	struct sock *sk = (struct sock *)msk;
 	u32 flags = 0;
 	bool slow;
+	u32 now;
 
 	memset(info, 0, sizeof(*info));
 
@@ -926,11 +927,6 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 	if (READ_ONCE(msk->can_ack))
 		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
 	info->mptcpi_flags = flags;
-	mptcp_data_lock(sk);
-	info->mptcpi_snd_una = msk->snd_una;
-	info->mptcpi_rcv_nxt = msk->ack_seq;
-	info->mptcpi_bytes_acked = msk->bytes_acked;
-	mptcp_data_unlock(sk);
 
 	slow = lock_sock_fast(sk);
 	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
@@ -942,7 +938,17 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 	info->mptcpi_bytes_retrans = msk->bytes_retrans;
 	info->mptcpi_subflows_total = info->mptcpi_subflows +
 		__mptcp_has_initial_subflow(msk);
+	now = tcp_jiffies32;
+	info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
+	info->mptcpi_last_data_recv = jiffies_to_msecs(now - msk->last_data_recv);
 	unlock_sock_fast(sk, slow);
+
+	mptcp_data_lock(sk);
+	info->mptcpi_last_ack_recv = jiffies_to_msecs(now - msk->last_ack_recv);
+	info->mptcpi_snd_una = msk->snd_una;
+	info->mptcpi_rcv_nxt = msk->ack_seq;
+	info->mptcpi_bytes_acked = msk->bytes_acked;
+	mptcp_data_unlock(sk);
 }
 EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
 
-- 
cgit v1.2.3


From 9677547d836299497ba2da1ad59851e200109ca1 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Mon, 18 Mar 2024 22:44:07 +0200
Subject: drm: Introduce plane SIZE_HINTS property
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a new immutable plane property by which a plane can advertise
a handful of recommended plane sizes. This would be mostly exposed
by cursor planes as a slightly more capable replacement for
the DRM_CAP_CURSOR_WIDTH/HEIGHT caps, which can only declare
a one size fits all limit for the whole device.

Currently eg. amdgpu/i915/nouveau just advertize the max cursor
size via the cursor size caps. But always using the max sized
cursor can waste a surprising amount of power, so a better
strategy is desirable.

Most other drivers don't specify any cursor size at all, in
which case the ioctl code just claims that 64x64 is a great
choice. Whether that is actually true is debatable.

A poll of various compositor developers informs us that
blindly probing with setcursor/atomic ioctl to determine
suitable cursor sizes is not acceptable, thus the
introduction of the new property to supplant the cursor
size caps. The compositor will now be free to select a
more optimal cursor size from the short list of options.

Note that the reported sizes (either via the property or the
caps) make no claims about things such as plane scaling. So
these things should only really be consulted for simple
"cursor like" use cases.

Userspace consumer in the form of mutter seems ready:
https://gitlab.gnome.org/GNOME/mutter/-/merge_requests/3165

v2: Try to add some docs
v3: Specify that value 0 is reserved for future use (basic idea from Jonas)
    Drop the note about typical hardware (Pekka)
v4: Update the docs to indicate the list is "in order of preference"
    Add a a link to the mutter MR
v5: Limit to cursors only for now (Simon)

Cc: Jonas Ådahl <jadahl@redhat.com>
Cc: Sameer Lattannavar <sameer.lattannavar@intel.com>
Reviewed-by: Sebastian Wick <sebastian.wick@redhat.com>
Reviewed-by: Simon Ser <contact@emersion.fr>
Acked-by: Daniel Stone <daniels@collabora.com>
Acked-by: Harry Wentland <harry.wentland@amd.com>
Acked-by: Pekka Paalanen <pekka.paalanen@collabora.com>
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240318204408.9687-2-ville.syrjala@linux.intel.com
---
 drivers/gpu/drm/drm_mode_config.c |  7 +++++
 drivers/gpu/drm/drm_plane.c       | 56 +++++++++++++++++++++++++++++++++++++++
 include/drm/drm_mode_config.h     |  5 ++++
 include/drm/drm_plane.h           |  4 +++
 include/uapi/drm/drm_mode.h       | 11 ++++++++
 5 files changed, 83 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c
index 48fd2d67f352..568972258222 100644
--- a/drivers/gpu/drm/drm_mode_config.c
+++ b/drivers/gpu/drm/drm_mode_config.c
@@ -372,6 +372,13 @@ static int drm_mode_create_standard_properties(struct drm_device *dev)
 		return -ENOMEM;
 	dev->mode_config.modifiers_property = prop;
 
+	prop = drm_property_create(dev,
+				   DRM_MODE_PROP_IMMUTABLE | DRM_MODE_PROP_BLOB,
+				   "SIZE_HINTS", 0);
+	if (!prop)
+		return -ENOMEM;
+	dev->mode_config.size_hints_property = prop;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index 672c655c7a8e..eecc24c54efd 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -140,6 +140,25 @@
  *     DRM_FORMAT_MOD_LINEAR. Before linux kernel release v5.1 there have been
  *     various bugs in this area with inconsistencies between the capability
  *     flag and per-plane properties.
+ *
+ * SIZE_HINTS:
+ *     Blob property which contains the set of recommended plane size
+ *     which can used for simple "cursor like" use cases (eg. no scaling).
+ *     Using these hints frees userspace from extensive probing of
+ *     supported plane sizes through atomic/setcursor ioctls.
+ *
+ *     The blob contains an array of struct drm_plane_size_hint, in
+ *     order of preference. For optimal usage userspace should pick
+ *     the first size that satisfies its own requirements.
+ *
+ *     Drivers should only attach this property to planes that
+ *     support a very limited set of sizes.
+ *
+ *     Note that property value 0 (ie. no blob) is reserved for potential
+ *     future use. Current userspace is expected to ignore the property
+ *     if the value is 0, and fall back to some other means (eg.
+ *     &DRM_CAP_CURSOR_WIDTH and &DRM_CAP_CURSOR_HEIGHT) to determine
+ *     the appropriate plane size to use.
  */
 
 static unsigned int drm_num_planes(struct drm_device *dev)
@@ -1729,3 +1748,40 @@ int drm_plane_create_scaling_filter_property(struct drm_plane *plane,
 	return 0;
 }
 EXPORT_SYMBOL(drm_plane_create_scaling_filter_property);
+
+/**
+ * drm_plane_add_size_hint_property - create a size hint property
+ *
+ * @plane: drm plane
+ * @hints: size hints
+ * @num_hints: number of size hints
+ *
+ * Create a size hints property for the plane.
+ *
+ * RETURNS:
+ * Zero for success or -errno
+ */
+int drm_plane_add_size_hints_property(struct drm_plane *plane,
+				      const struct drm_plane_size_hint *hints,
+				      int num_hints)
+{
+	struct drm_device *dev = plane->dev;
+	struct drm_mode_config *config = &dev->mode_config;
+	struct drm_property_blob *blob;
+
+	/* extending to other plane types needs actual thought */
+	if (drm_WARN_ON(dev, plane->type != DRM_PLANE_TYPE_CURSOR))
+		return -EINVAL;
+
+	blob = drm_property_create_blob(dev,
+					array_size(sizeof(hints[0]), num_hints),
+					hints);
+	if (IS_ERR(blob))
+		return PTR_ERR(blob);
+
+	drm_object_attach_property(&plane->base, config->size_hints_property,
+				   blob->base.id);
+
+	return 0;
+}
+EXPORT_SYMBOL(drm_plane_add_size_hints_property);
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 973119a9176b..9d8acf7a10eb 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -942,6 +942,11 @@ struct drm_mode_config {
 	 */
 	struct drm_property *modifiers_property;
 
+	/**
+	 * @size_hints_propertty: Plane SIZE_HINTS property.
+	 */
+	struct drm_property *size_hints_property;
+
 	/* cursor size */
 	uint32_t cursor_width, cursor_height;
 
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index 641fe298052d..ec1112208b73 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -32,6 +32,7 @@
 #include <drm/drm_util.h>
 
 struct drm_crtc;
+struct drm_plane_size_hint;
 struct drm_printer;
 struct drm_modeset_acquire_ctx;
 
@@ -976,5 +977,8 @@ drm_plane_get_damage_clips(const struct drm_plane_state *state);
 
 int drm_plane_create_scaling_filter_property(struct drm_plane *plane,
 					     unsigned int supported_filters);
+int drm_plane_add_size_hints_property(struct drm_plane *plane,
+				      const struct drm_plane_size_hint *hints,
+				      int num_hints);
 
 #endif
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 7040e7ea80c7..1ca5c7e418fd 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -865,6 +865,17 @@ struct drm_color_lut {
 	__u16 reserved;
 };
 
+/**
+ * struct drm_plane_size_hint - Plane size hints
+ *
+ * The plane SIZE_HINTS property blob contains an
+ * array of struct drm_plane_size_hint.
+ */
+struct drm_plane_size_hint {
+	__u16 width;
+	__u16 height;
+};
+
 /**
  * struct hdr_metadata_infoframe - HDR Metadata Infoframe Data.
  *
-- 
cgit v1.2.3


From c82389947d90c8b0dd206d9ef082191eb58df8fc Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 11 Apr 2024 12:20:57 +0200
Subject: tracing: Add sched_prepare_exec tracepoint

Add "sched_prepare_exec" tracepoint, which is run right after the point
of no return but before the current task assumes its new exec identity.

Unlike the tracepoint "sched_process_exec", the "sched_prepare_exec"
tracepoint runs before flushing the old exec, i.e. while the task still
has the original state (such as original MM), but when the new exec
either succeeds or crashes (but never returns to the original exec).

Being able to trace this event can be helpful in a number of use cases:

  * allowing tracing eBPF programs access to the original MM on exec,
    before current->mm is replaced;
  * counting exec in the original task (via perf event);
  * profiling flush time ("sched_prepare_exec" to "sched_process_exec").

Example of tracing output:

 $ cat /sys/kernel/debug/tracing/trace_pipe
    <...>-379  [003] .....  179.626921: sched_prepare_exec: interp=/usr/bin/sshd filename=/usr/bin/sshd pid=379 comm=sshd
    <...>-381  [002] .....  180.048580: sched_prepare_exec: interp=/bin/bash filename=/bin/bash pid=381 comm=sshd
    <...>-385  [001] .....  180.068277: sched_prepare_exec: interp=/usr/bin/tty filename=/usr/bin/tty pid=385 comm=bash
    <...>-389  [006] .....  192.020147: sched_prepare_exec: interp=/usr/bin/dmesg filename=/usr/bin/dmesg pid=389 comm=bash

Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20240411102158.1272267-1-elver@google.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 fs/exec.c                    |  8 ++++++++
 include/trace/events/sched.h | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index cf1df7f16e55..b3c40fbb325f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1267,6 +1267,14 @@ int begin_new_exec(struct linux_binprm * bprm)
 	if (retval)
 		return retval;
 
+	/*
+	 * This tracepoint marks the point before flushing the old exec where
+	 * the current task is still unchanged, but errors are fatal (point of
+	 * no return). The later "sched_process_exec" tracepoint is called after
+	 * the current task has successfully switched to the new exec.
+	 */
+	trace_sched_prepare_exec(current, bprm);
+
 	/*
 	 * Ensure all future errors are fatal.
 	 */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dbb01b4b7451..226f47c6939c 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -420,6 +420,41 @@ TRACE_EVENT(sched_process_exec,
 		  __entry->pid, __entry->old_pid)
 );
 
+/**
+ * sched_prepare_exec - called before setting up new exec
+ * @task:	pointer to the current task
+ * @bprm:	pointer to linux_binprm used for new exec
+ *
+ * Called before flushing the old exec, where @task is still unchanged, but at
+ * the point of no return during switching to the new exec. At the point it is
+ * called the exec will either succeed, or on failure terminate the task. Also
+ * see the "sched_process_exec" tracepoint, which is called right after @task
+ * has successfully switched to the new exec.
+ */
+TRACE_EVENT(sched_prepare_exec,
+
+	TP_PROTO(struct task_struct *task, struct linux_binprm *bprm),
+
+	TP_ARGS(task, bprm),
+
+	TP_STRUCT__entry(
+		__string(	interp,		bprm->interp	)
+		__string(	filename,	bprm->filename	)
+		__field(	pid_t,		pid		)
+		__string(	comm,		task->comm	)
+	),
+
+	TP_fast_assign(
+		__assign_str(interp, bprm->interp);
+		__assign_str(filename, bprm->filename);
+		__entry->pid = task->pid;
+		__assign_str(comm, task->comm);
+	),
+
+	TP_printk("interp=%s filename=%s pid=%d comm=%s",
+		  __get_str(interp), __get_str(filename),
+		  __entry->pid, __get_str(comm))
+);
 
 #ifdef CONFIG_SCHEDSTATS
 #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT
-- 
cgit v1.2.3


From 4cd47222e435dec8e3787614924174f53fcfb5ae Mon Sep 17 00:00:00 2001
From: George Stark <gnstark@salutedevices.com>
Date: Thu, 11 Apr 2024 19:10:25 +0300
Subject: locking/mutex: Introduce devm_mutex_init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using of devm API leads to a certain order of releasing resources.
So all dependent resources which are not devm-wrapped should be deleted
with respect to devm-release order. Mutex is one of such objects that
often is bound to other resources and has no own devm wrapping.
Since mutex_destroy() actually does nothing in non-debug builds
frequently calling mutex_destroy() is just ignored which is safe for now
but wrong formally and can lead to a problem if mutex_destroy() will be
extended so introduce devm_mutex_init().

Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: George Stark <gnstark@salutedevices.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Marek Behún <kabel@kernel.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lore.kernel.org/r/20240411161032.609544-2-gnstark@salutedevices.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mutex.h        | 27 +++++++++++++++++++++++++++
 kernel/locking/mutex-debug.c | 12 ++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 67edc4ca2bee..a561c629d89f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -22,6 +22,8 @@
 #include <linux/cleanup.h>
 #include <linux/mutex_types.h>
 
+struct device;
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname)			\
 		, .dep_map = {					\
@@ -117,6 +119,31 @@ do {							\
 } while (0)
 #endif /* CONFIG_PREEMPT_RT */
 
+#ifdef CONFIG_DEBUG_MUTEXES
+
+int __devm_mutex_init(struct device *dev, struct mutex *lock);
+
+#else
+
+static inline int __devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+	/*
+	 * When CONFIG_DEBUG_MUTEXES is off mutex_destroy() is just a nop so
+	 * no really need to register it in the devm subsystem.
+	 */
+	return 0;
+}
+
+#endif
+
+#define devm_mutex_init(dev, mutex)			\
+({							\
+	typeof(mutex) mutex_ = (mutex);			\
+							\
+	mutex_init(mutex_);				\
+	__devm_mutex_init(dev, mutex_);			\
+})
+
 /*
  * See kernel/locking/mutex.c for detailed documentation of these APIs.
  * Also see Documentation/locking/mutex-design.rst.
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index bc8abb8549d2..6e6f6071cfa2 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -12,6 +12,7 @@
  */
 #include <linux/mutex.h>
 #include <linux/delay.h>
+#include <linux/device.h>
 #include <linux/export.h>
 #include <linux/poison.h>
 #include <linux/sched.h>
@@ -89,6 +90,17 @@ void debug_mutex_init(struct mutex *lock, const char *name,
 	lock->magic = lock;
 }
 
+static void devm_mutex_release(void *res)
+{
+	mutex_destroy(res);
+}
+
+int __devm_mutex_init(struct device *dev, struct mutex *lock)
+{
+	return devm_add_action_or_reset(dev, devm_mutex_release, lock);
+}
+EXPORT_SYMBOL_GPL(__devm_mutex_init);
+
 /***
  * mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
-- 
cgit v1.2.3


From f3b65bbaed7c43d10989380d4b95e2a3e9fe5a6b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 5 Apr 2024 07:58:12 -0400
Subject: KVM: delete .change_pte MMU notifier callback

The .change_pte() MMU notifier callback was intended as an
optimization. The original point of it was that KSM could tell KVM to flip
its secondary PTE to a new location without having to first zap it. At
the time there was also an .invalidate_page() callback; both of them were
*not* bracketed by calls to mmu_notifier_invalidate_range_{start,end}(),
and .invalidate_page() also doubled as a fallback implementation of
.change_pte().

Later on, however, both callbacks were changed to occur within an
invalidate_range_start/end() block.

In the case of .change_pte(), commit 6bdb913f0a70 ("mm: wrap calls to
set_pte_at_notify with invalidate_range_start and invalidate_range_end",
2012-10-09) did so to remove the fallback from .invalidate_page() to
.change_pte() and allow sleepable .invalidate_page() hooks.

This however made KVM's usage of the .change_pte() callback completely
moot, because KVM unmaps the sPTEs during .invalidate_range_start()
and therefore .change_pte() has no hope of finding a sPTE to change.
Drop the generic KVM code that dispatches to kvm_set_spte_gfn(), as
well as all the architecture specific implementations.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Anup Patel <anup@brainfault.org>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Reviewed-by: Bibo Mao <maobibo@loongson.cn>
Message-ID: <20240405115815.3226315-2-pbonzini@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/arm64/kvm/mmu.c                  | 34 ------------------
 arch/loongarch/include/asm/kvm_host.h |  1 -
 arch/loongarch/kvm/mmu.c              | 32 -----------------
 arch/mips/kvm/mmu.c                   | 30 ----------------
 arch/powerpc/include/asm/kvm_ppc.h    |  1 -
 arch/powerpc/kvm/book3s.c             |  5 ---
 arch/powerpc/kvm/book3s.h             |  1 -
 arch/powerpc/kvm/book3s_64_mmu_hv.c   | 12 -------
 arch/powerpc/kvm/book3s_hv.c          |  1 -
 arch/powerpc/kvm/book3s_pr.c          |  7 ----
 arch/powerpc/kvm/e500_mmu_host.c      |  6 ----
 arch/riscv/kvm/mmu.c                  | 20 -----------
 arch/x86/kvm/mmu/mmu.c                | 67 ++++-------------------------------
 arch/x86/kvm/mmu/spte.c               | 16 ---------
 arch/x86/kvm/mmu/spte.h               |  2 --
 arch/x86/kvm/mmu/tdp_mmu.c            | 46 ------------------------
 arch/x86/kvm/mmu/tdp_mmu.h            |  1 -
 include/linux/kvm_host.h              |  2 --
 include/trace/events/kvm.h            | 15 --------
 virt/kvm/kvm_main.c                   | 43 ----------------------
 20 files changed, 7 insertions(+), 335 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index dc04bc767865..ff17849be9f4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1768,40 +1768,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 	return false;
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
-	if (!kvm->arch.mmu.pgt)
-		return false;
-
-	WARN_ON(range->end - range->start != 1);
-
-	/*
-	 * If the page isn't tagged, defer to user_mem_abort() for sanitising
-	 * the MTE tags. The S2 pte should have been unmapped by
-	 * mmu_notifier_invalidate_range_end().
-	 */
-	if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
-		return false;
-
-	/*
-	 * We've moved a page around, probably through CoW, so let's treat
-	 * it just like a translation fault and the map handler will clean
-	 * the cache to the PoC.
-	 *
-	 * The MMU notifiers will have unmapped a huge PMD before calling
-	 * ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
-	 * therefore we never need to clear out a huge PMD through this
-	 * calling path and a memcache is not required.
-	 */
-	kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
-			       PAGE_SIZE, __pfn_to_phys(pfn),
-			       KVM_PGTABLE_PROT_R, NULL, 0);
-
-	return false;
-}
-
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	u64 size = (range->end - range->start) << PAGE_SHIFT;
diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..69305441f40d 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -203,7 +203,6 @@ void kvm_flush_tlb_all(void);
 void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa);
 int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write);
 
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable);
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index a556cff35740..98883aa23ab8 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -494,38 +494,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 			range->end << PAGE_SHIFT, &ctx);
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	unsigned long prot_bits;
-	kvm_pte_t *ptep;
-	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-	gpa_t gpa = range->start << PAGE_SHIFT;
-
-	ptep = kvm_populate_gpa(kvm, NULL, gpa, 0);
-	if (!ptep)
-		return false;
-
-	/* Replacing an absent or old page doesn't need flushes */
-	if (!kvm_pte_present(NULL, ptep) || !kvm_pte_young(*ptep)) {
-		kvm_set_pte(ptep, 0);
-		return false;
-	}
-
-	/* Fill new pte if write protected or page migrated */
-	prot_bits = _PAGE_PRESENT | __READABLE;
-	prot_bits |= _CACHE_MASK & pte_val(range->arg.pte);
-
-	/*
-	 * Set _PAGE_WRITE or _PAGE_DIRTY iff old and new pte both support
-	 * _PAGE_WRITE for map_page_fast if next page write fault
-	 * _PAGE_DIRTY since gpa has already recorded as dirty page
-	 */
-	prot_bits |= __WRITEABLE & *ptep & pte_val(range->arg.pte);
-	kvm_set_pte(ptep, kvm_pfn_pte(pfn, __pgprot(prot_bits)));
-
-	return true;
-}
-
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	kvm_ptw_ctx ctx;
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 467ee6b95ae1..c17157e700c0 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -444,36 +444,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 	return true;
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	gpa_t gpa = range->start << PAGE_SHIFT;
-	pte_t hva_pte = range->arg.pte;
-	pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
-	pte_t old_pte;
-
-	if (!gpa_pte)
-		return false;
-
-	/* Mapping may need adjusting depending on memslot flags */
-	old_pte = *gpa_pte;
-	if (range->slot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
-		hva_pte = pte_mkclean(hva_pte);
-	else if (range->slot->flags & KVM_MEM_READONLY)
-		hva_pte = pte_wrprotect(hva_pte);
-
-	set_pte(gpa_pte, hva_pte);
-
-	/* Replacing an absent or old page doesn't need flushes */
-	if (!pte_present(old_pte) || !pte_young(old_pte))
-		return false;
-
-	/* Pages swapped, aged, moved, or cleaned require flushes */
-	return !pte_present(hva_pte) ||
-	       !pte_young(hva_pte) ||
-	       pte_pfn(old_pte) != pte_pfn(hva_pte) ||
-	       (pte_dirty(old_pte) && !pte_dirty(hva_pte));
-}
-
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	return kvm_mips_mkold_gpa_pt(kvm, range->start, range->end);
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 3281215097cc..ca3829d47ab7 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,7 +287,6 @@ struct kvmppc_ops {
 	bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range);
 	bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
 	bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
-	bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range);
 	void (*free_memslot)(struct kvm_memory_slot *slot);
 	int (*init_vm)(struct kvm *kvm);
 	void (*destroy_vm)(struct kvm *kvm);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 8acec144120e..0d0624088e6b 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -899,11 +899,6 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm->arch.kvm_ops->test_age_gfn(kvm, range);
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	return kvm->arch.kvm_ops->set_spte_gfn(kvm, range);
-}
-
 int kvmppc_core_init_vm(struct kvm *kvm)
 {
 
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 58391b4b32ed..4aa2ab89afbc 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -12,7 +12,6 @@ extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 extern bool kvm_unmap_gfn_range_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 extern bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
-extern bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range);
 
 extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu);
 extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 2b1f0cdd8c18..1b51b1c4713b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1010,18 +1010,6 @@ bool kvm_test_age_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
 		return kvm_test_age_rmapp(kvm, range->slot, range->start);
 }
 
-bool kvm_set_spte_gfn_hv(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	WARN_ON(range->start + 1 != range->end);
-
-	if (kvm_is_radix(kvm))
-		kvm_unmap_radix(kvm, range->slot, range->start);
-	else
-		kvm_unmap_rmapp(kvm, range->slot, range->start);
-
-	return false;
-}
-
 static int vcpus_running(struct kvm *kvm)
 {
 	return atomic_read(&kvm->arch.vcpus_running) != 0;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8e86eb577eb8..35cb014a0c51 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6364,7 +6364,6 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.unmap_gfn_range = kvm_unmap_gfn_range_hv,
 	.age_gfn = kvm_age_gfn_hv,
 	.test_age_gfn = kvm_test_age_gfn_hv,
-	.set_spte_gfn = kvm_set_spte_gfn_hv,
 	.free_memslot = kvmppc_core_free_memslot_hv,
 	.init_vm =  kvmppc_core_init_vm_hv,
 	.destroy_vm = kvmppc_core_destroy_vm_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 5b92619a05fd..a7d7137ea0c8 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -461,12 +461,6 @@ static bool kvm_test_age_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
 	return false;
 }
 
-static bool kvm_set_spte_gfn_pr(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	/* The page will get remapped properly on its next fault */
-	return do_kvm_unmap_gfn(kvm, range);
-}
-
 /*****************************************/
 
 static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr)
@@ -2071,7 +2065,6 @@ static struct kvmppc_ops kvm_ops_pr = {
 	.unmap_gfn_range = kvm_unmap_gfn_range_pr,
 	.age_gfn  = kvm_age_gfn_pr,
 	.test_age_gfn = kvm_test_age_gfn_pr,
-	.set_spte_gfn = kvm_set_spte_gfn_pr,
 	.free_memslot = kvmppc_core_free_memslot_pr,
 	.init_vm = kvmppc_core_init_vm_pr,
 	.destroy_vm = kvmppc_core_destroy_vm_pr,
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index ccb8f16ffe41..c664fdec75b1 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -747,12 +747,6 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return false;
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	/* The page will get remapped properly on its next fault */
-	return kvm_e500_mmu_unmap_gfn(kvm, range);
-}
-
 /*****************************************/
 
 int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500)
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index a9e2fd7245e1..b63650f9b966 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -550,26 +550,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 	return false;
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	int ret;
-	kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
-	if (!kvm->arch.pgd)
-		return false;
-
-	WARN_ON(range->end - range->start != 1);
-
-	ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT,
-			      __pfn_to_phys(pfn), PAGE_SIZE, true, true);
-	if (ret) {
-		kvm_debug("Failed to map G-stage page (error %d)\n", ret);
-		return true;
-	}
-
-	return false;
-}
-
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	pte_t *ptep;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 992e651540e8..08900a0563f9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -432,8 +432,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
  * The idea using the light way get the spte on x86_32 guest is from
  * gup_get_pte (mm/gup.c).
  *
- * An spte tlb flush may be pending, because kvm_set_pte_rmap
- * coalesces them and we are running out of the MMU lock.  Therefore
+ * An spte tlb flush may be pending, because they are coalesced and
+ * we are running out of the MMU lock.  Therefore
  * we need to protect against in-progress updates of the spte.
  *
  * Reading the spte while an update is in progress may get the old value
@@ -1448,49 +1448,11 @@ static bool __kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 }
 
 static bool kvm_zap_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			 pte_t unused)
+			 struct kvm_memory_slot *slot, gfn_t gfn, int level)
 {
 	return __kvm_zap_rmap(kvm, rmap_head, slot);
 }
 
-static bool kvm_set_pte_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			     struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			     pte_t pte)
-{
-	u64 *sptep;
-	struct rmap_iterator iter;
-	bool need_flush = false;
-	u64 new_spte;
-	kvm_pfn_t new_pfn;
-
-	WARN_ON_ONCE(pte_huge(pte));
-	new_pfn = pte_pfn(pte);
-
-restart:
-	for_each_rmap_spte(rmap_head, &iter, sptep) {
-		need_flush = true;
-
-		if (pte_write(pte)) {
-			kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
-			goto restart;
-		} else {
-			new_spte = kvm_mmu_changed_pte_notifier_make_spte(
-					*sptep, new_pfn);
-
-			mmu_spte_clear_track_bits(kvm, sptep);
-			mmu_spte_set(sptep, new_spte);
-		}
-	}
-
-	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
-		kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
-		return false;
-	}
-
-	return need_flush;
-}
-
 struct slot_rmap_walk_iterator {
 	/* input fields. */
 	const struct kvm_memory_slot *slot;
@@ -1562,7 +1524,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 
 typedef bool (*rmap_handler_t)(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 			       struct kvm_memory_slot *slot, gfn_t gfn,
-			       int level, pte_t pte);
+			       int level);
 
 static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
 						 struct kvm_gfn_range *range,
@@ -1574,7 +1536,7 @@ static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
 	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
 				 range->start, range->end - 1, &iterator)
 		ret |= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
-			       iterator.level, range->arg.pte);
+			       iterator.level);
 
 	return ret;
 }
@@ -1596,22 +1558,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 	return flush;
 }
 
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	bool flush = false;
-
-	if (kvm_memslots_have_rmaps(kvm))
-		flush = kvm_handle_gfn_range(kvm, range, kvm_set_pte_rmap);
-
-	if (tdp_mmu_enabled)
-		flush |= kvm_tdp_mmu_set_spte_gfn(kvm, range);
-
-	return flush;
-}
-
 static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			 struct kvm_memory_slot *slot, gfn_t gfn, int level,
-			 pte_t unused)
+			 struct kvm_memory_slot *slot, gfn_t gfn, int level)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
@@ -1624,8 +1572,7 @@ static bool kvm_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 }
 
 static bool kvm_test_age_rmap(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
-			      struct kvm_memory_slot *slot, gfn_t gfn,
-			      int level, pte_t unused)
+			      struct kvm_memory_slot *slot, gfn_t gfn, int level)
 {
 	u64 *sptep;
 	struct rmap_iterator iter;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4a599130e9c9..6c7ab3aa6aa7 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -322,22 +322,6 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 	return spte;
 }
 
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
-{
-	u64 new_spte;
-
-	new_spte = old_spte & ~SPTE_BASE_ADDR_MASK;
-	new_spte |= (u64)new_pfn << PAGE_SHIFT;
-
-	new_spte &= ~PT_WRITABLE_MASK;
-	new_spte &= ~shadow_host_writable_mask;
-	new_spte &= ~shadow_mmu_writable_mask;
-
-	new_spte = mark_spte_for_access_track(new_spte);
-
-	return new_spte;
-}
-
 u64 mark_spte_for_access_track(u64 spte)
 {
 	if (spte_ad_enabled(spte))
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index a129951c9a88..f5c600c52f83 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -496,8 +496,6 @@ static inline u64 restore_acc_track_spte(u64 spte)
 	return spte;
 }
 
-u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn);
-
 void __init kvm_mmu_spte_module_init(void);
 void kvm_mmu_reset_all_pte_masks(void);
 
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d078157e62aa..c6192a52bd31 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1258,52 +1258,6 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
-static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
-			 struct kvm_gfn_range *range)
-{
-	u64 new_spte;
-
-	/* Huge pages aren't expected to be modified without first being zapped. */
-	WARN_ON_ONCE(pte_huge(range->arg.pte) || range->start + 1 != range->end);
-
-	if (iter->level != PG_LEVEL_4K ||
-	    !is_shadow_present_pte(iter->old_spte))
-		return false;
-
-	/*
-	 * Note, when changing a read-only SPTE, it's not strictly necessary to
-	 * zero the SPTE before setting the new PFN, but doing so preserves the
-	 * invariant that the PFN of a present * leaf SPTE can never change.
-	 * See handle_changed_spte().
-	 */
-	tdp_mmu_iter_set_spte(kvm, iter, 0);
-
-	if (!pte_write(range->arg.pte)) {
-		new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,
-								  pte_pfn(range->arg.pte));
-
-		tdp_mmu_iter_set_spte(kvm, iter, new_spte);
-	}
-
-	return true;
-}
-
-/*
- * Handle the changed_pte MMU notifier for the TDP MMU.
- * data is a pointer to the new pte_t mapping the HVA specified by the MMU
- * notifier.
- * Returns non-zero if a flush is needed before releasing the MMU lock.
- */
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	/*
-	 * No need to handle the remote TLB flush under RCU protection, the
-	 * target SPTE _must_ be a leaf SPTE, i.e. cannot result in freeing a
-	 * shadow page. See the WARN on pfn_changed in handle_changed_spte().
-	 */
-	return kvm_tdp_mmu_handle_gfn(kvm, range, set_spte_gfn);
-}
-
 /*
  * Remove write access from all SPTEs at or above min_level that map GFNs
  * [start, end). Returns true if an SPTE has been changed and the TLBs need to
diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h
index 6e1ea04ca885..58b55e61bd33 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.h
+++ b/arch/x86/kvm/mmu/tdp_mmu.h
@@ -31,7 +31,6 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
 				 bool flush);
 bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 
 bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 			     const struct kvm_memory_slot *slot, int min_level);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 48f31dcd318a..afbc99264ffa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -259,7 +259,6 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 
 #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
 union kvm_mmu_notifier_arg {
-	pte_t pte;
 	unsigned long attributes;
 };
 
@@ -273,7 +272,6 @@ struct kvm_gfn_range {
 bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
 #endif
 
 enum {
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 011fba6b5552..74e40d5d4af4 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -456,21 +456,6 @@ TRACE_EVENT(kvm_unmap_hva_range,
 		  __entry->start, __entry->end)
 );
 
-TRACE_EVENT(kvm_set_spte_hva,
-	TP_PROTO(unsigned long hva),
-	TP_ARGS(hva),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	hva		)
-	),
-
-	TP_fast_assign(
-		__entry->hva		= hva;
-	),
-
-	TP_printk("mmu notifier set pte hva: %#016lx", __entry->hva)
-);
-
 TRACE_EVENT(kvm_age_hva,
 	TP_PROTO(unsigned long start, unsigned long end),
 	TP_ARGS(start, end),
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb49c2a60200..fb0f922d3109 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -705,48 +705,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
 	return __kvm_handle_hva_range(kvm, &range).ret;
 }
 
-static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
-{
-	/*
-	 * Skipping invalid memslots is correct if and only change_pte() is
-	 * surrounded by invalidate_range_{start,end}(), which is currently
-	 * guaranteed by the primary MMU.  If that ever changes, KVM needs to
-	 * unmap the memslot instead of skipping the memslot to ensure that KVM
-	 * doesn't hold references to the old PFN.
-	 */
-	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
-
-	if (range->slot->flags & KVM_MEMSLOT_INVALID)
-		return false;
-
-	return kvm_set_spte_gfn(kvm, range);
-}
-
-static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
-					struct mm_struct *mm,
-					unsigned long address,
-					pte_t pte)
-{
-	struct kvm *kvm = mmu_notifier_to_kvm(mn);
-	const union kvm_mmu_notifier_arg arg = { .pte = pte };
-
-	trace_kvm_set_spte_hva(address);
-
-	/*
-	 * .change_pte() must be surrounded by .invalidate_range_{start,end}().
-	 * If mmu_invalidate_in_progress is zero, then no in-progress
-	 * invalidations, including this one, found a relevant memslot at
-	 * start(); rechecking memslots here is unnecessary.  Note, a false
-	 * positive (count elevated by a different invalidation) is sub-optimal
-	 * but functionally ok.
-	 */
-	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
-	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
-		return;
-
-	kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn);
-}
-
 void kvm_mmu_invalidate_begin(struct kvm *kvm)
 {
 	lockdep_assert_held_write(&kvm->mmu_lock);
@@ -964,7 +922,6 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
 	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
-	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };
 
-- 
cgit v1.2.3


From 997308f9ae72783bf370849ad309c9e42b5d0aa8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 5 Apr 2024 07:58:14 -0400
Subject: mmu_notifier: remove the .change_pte() callback

The scope of set_pte_at_notify() has reduced more and more through the
years.  Initially, it was meant for when the change to the PTE was
not bracketed by mmu_notifier_invalidate_range_{start,end}().  However,
that has not been so for over ten years.  During all this period
the only implementation of .change_pte() was KVM and it
had no actual functionality, because it was called after
mmu_notifier_invalidate_range_start() zapped the secondary PTE.

Now that this (nonfunctional) user of the .change_pte() callback is
gone, the whole callback can be removed.  For now, leave in place
set_pte_at_notify() even though it is just a synonym for set_pte_at().

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Message-ID: <20240405115815.3226315-4-pbonzini@redhat.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/mmu_notifier.h | 46 ++------------------------------------------
 mm/mmu_notifier.c            | 17 ----------------
 2 files changed, 2 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index f349e08a9dfe..8c72bf651606 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -122,15 +122,6 @@ struct mmu_notifier_ops {
 			  struct mm_struct *mm,
 			  unsigned long address);
 
-	/*
-	 * change_pte is called in cases that pte mapping to page is changed:
-	 * for example, when ksm remaps pte to point to a new shared page.
-	 */
-	void (*change_pte)(struct mmu_notifier *subscription,
-			   struct mm_struct *mm,
-			   unsigned long address,
-			   pte_t pte);
-
 	/*
 	 * invalidate_range_start() and invalidate_range_end() must be
 	 * paired and are called only when the mmap_lock and/or the
@@ -392,8 +383,6 @@ extern int __mmu_notifier_clear_young(struct mm_struct *mm,
 				      unsigned long end);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
-extern void __mmu_notifier_change_pte(struct mm_struct *mm,
-				      unsigned long address, pte_t pte);
 extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
 extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
 extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
@@ -439,13 +428,6 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
-static inline void mmu_notifier_change_pte(struct mm_struct *mm,
-					   unsigned long address, pte_t pte)
-{
-	if (mm_has_notifiers(mm))
-		__mmu_notifier_change_pte(mm, address, pte);
-}
-
 static inline void
 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
@@ -581,26 +563,6 @@ static inline void mmu_notifier_range_init_owner(
 	__young;							\
 })
 
-/*
- * set_pte_at_notify() sets the pte _after_ running the notifier.
- * This is safe to start by updating the secondary MMUs, because the primary MMU
- * pte invalidate must have already happened with a ptep_clear_flush() before
- * set_pte_at_notify() has been invoked.  Updating the secondary MMUs first is
- * required when we change both the protection of the mapping from read-only to
- * read-write and the pfn (like during copy on write page faults). Otherwise the
- * old page would remain mapped readonly in the secondary MMUs after the new
- * page is already writable by some CPU through the primary MMU.
- */
-#define set_pte_at_notify(__mm, __address, __ptep, __pte)		\
-({									\
-	struct mm_struct *___mm = __mm;					\
-	unsigned long ___address = __address;				\
-	pte_t ___pte = __pte;						\
-									\
-	mmu_notifier_change_pte(___mm, ___address, ___pte);		\
-	set_pte_at(___mm, ___address, __ptep, ___pte);			\
-})
-
 #else /* CONFIG_MMU_NOTIFIER */
 
 struct mmu_notifier_range {
@@ -650,11 +612,6 @@ static inline int mmu_notifier_test_young(struct mm_struct *mm,
 	return 0;
 }
 
-static inline void mmu_notifier_change_pte(struct mm_struct *mm,
-					   unsigned long address, pte_t pte)
-{
-}
-
 static inline void
 mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
 {
@@ -693,7 +650,6 @@ static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
 #define	ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
 #define pudp_huge_clear_flush_notify pudp_huge_clear_flush
-#define set_pte_at_notify set_pte_at
 
 static inline void mmu_notifier_synchronize(void)
 {
@@ -701,4 +657,6 @@ static inline void mmu_notifier_synchronize(void)
 
 #endif /* CONFIG_MMU_NOTIFIER */
 
+#define set_pte_at_notify set_pte_at
+
 #endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index ec3b068cbbe6..8982e6139d07 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -424,23 +424,6 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
 	return young;
 }
 
-void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
-			       pte_t pte)
-{
-	struct mmu_notifier *subscription;
-	int id;
-
-	id = srcu_read_lock(&srcu);
-	hlist_for_each_entry_rcu(subscription,
-				 &mm->notifier_subscriptions->list, hlist,
-				 srcu_read_lock_held(&srcu)) {
-		if (subscription->ops->change_pte)
-			subscription->ops->change_pte(subscription, mm, address,
-						      pte);
-	}
-	srcu_read_unlock(&srcu, id);
-}
-
 static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
 			       const struct mmu_notifier_range *range)
 {
-- 
cgit v1.2.3


From 293066264fb45df4290897c22e69833bea5fe171 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:26 +0200
Subject: scsi: block: Add a helper to cancel atomic queue limit updates

Drivers might have to perform complex actions to determine queue limits,
and those might fail.  Add a helper to cancel a queue limit update that can
be called in those cases.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-2-hch@lst.de
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/linux/blkdev.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3e8f7cf96be..ded7f66dc4b9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -892,6 +892,19 @@ int queue_limits_commit_update(struct request_queue *q,
 		struct queue_limits *lim);
 int queue_limits_set(struct request_queue *q, struct queue_limits *lim);
 
+/**
+ * queue_limits_cancel_update - cancel an atomic update of queue limits
+ * @q:		queue to update
+ *
+ * This functions cancels an atomic update of the queue limits started by
+ * queue_limits_start_update() and should be used when an error occurs after
+ * starting update.
+ */
+static inline void queue_limits_cancel_update(struct request_queue *q)
+{
+	mutex_unlock(&q->limits_lock);
+}
+
 /*
  * Access functions for manipulating queue properties
  */
-- 
cgit v1.2.3


From 4373d2ecca7fa7ad04aa9c371c80049bafec2610 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:27 +0200
Subject: scsi: bsg: Pass queue_limits to bsg_setup_queue()

This allows bsg_setup_queue() to pass them to blk_mq_alloc_queue() and thus
set up the limits at queue allocation time.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-3-hch@lst.de
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/bsg-lib.c                     | 6 ++++--
 drivers/scsi/mpi3mr/mpi3mr_app.c    | 2 +-
 drivers/scsi/scsi_transport_fc.c    | 6 +++---
 drivers/scsi/scsi_transport_iscsi.c | 3 ++-
 drivers/scsi/scsi_transport_sas.c   | 4 ++--
 drivers/ufs/core/ufs_bsg.c          | 3 ++-
 include/linux/bsg-lib.h             | 3 ++-
 7 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index bcc7dee6abce..ee738d129a9f 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -354,12 +354,14 @@ static const struct blk_mq_ops bsg_mq_ops = {
  * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
  * @dev: device to attach bsg device to
  * @name: device to give bsg device
+ * @lim: queue limits for the bsg queue
  * @job_fn: bsg job handler
  * @timeout: timeout handler function pointer
  * @dd_job_size: size of LLD data needed for each job
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size)
+		struct queue_limits *lim, bsg_job_fn *job_fn,
+		bsg_timeout_fn *timeout, int dd_job_size)
 {
 	struct bsg_set *bset;
 	struct blk_mq_tag_set *set;
@@ -383,7 +385,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	if (blk_mq_alloc_tag_set(set))
 		goto out_tag_set;
 
-	q = blk_mq_alloc_queue(set, NULL, NULL);
+	q = blk_mq_alloc_queue(set, lim, NULL);
 	if (IS_ERR(q)) {
 		ret = PTR_ERR(q);
 		goto out_queue;
diff --git a/drivers/scsi/mpi3mr/mpi3mr_app.c b/drivers/scsi/mpi3mr/mpi3mr_app.c
index 38f63bc7ef3b..a45406fae262 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_app.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_app.c
@@ -1860,7 +1860,7 @@ void mpi3mr_bsg_init(struct mpi3mr_ioc *mrioc)
 		return;
 	}
 
-	mrioc->bsg_queue = bsg_setup_queue(bsg_dev, dev_name(bsg_dev),
+	mrioc->bsg_queue = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), NULL,
 			mpi3mr_bsg_request, NULL, 0);
 	if (IS_ERR(mrioc->bsg_queue)) {
 		ioc_err(mrioc, "%s: bsg registration failed\n",
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index b04075f19445..87b2235b8ece 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -4287,8 +4287,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
 
-	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout,
-				i->f->dd_bsg_size);
+	q = bsg_setup_queue(dev, bsg_name, NULL, fc_bsg_dispatch,
+			fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev,
 			"fc_host%d: bsg interface failed to initialize - setup queue\n",
@@ -4318,7 +4318,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 	if (!i->f->bsg_request)
 		return -ENOTSUPP;
 
-	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch_prep,
+	q = bsg_setup_queue(dev, dev_name(dev), NULL, fc_bsg_dispatch_prep,
 				fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index af3ac6346796..c131746bf207 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,8 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
 		return -ENOTSUPP;
 
 	snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
-	q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0);
+	q = bsg_setup_queue(dev, bsg_name, NULL, iscsi_bsg_host_dispatch, NULL,
+			0);
 	if (IS_ERR(q)) {
 		shost_printk(KERN_ERR, shost, "bsg interface failed to "
 			     "initialize - no request queue\n");
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index d704c484a251..424a89513814 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -197,7 +197,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 	}
 
 	if (rphy) {
-		q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
+		q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev), NULL,
 				sas_smp_dispatch, NULL, 0);
 		if (IS_ERR(q))
 			return PTR_ERR(q);
@@ -206,7 +206,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 		char name[20];
 
 		snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
-		q = bsg_setup_queue(&shost->shost_gendev, name,
+		q = bsg_setup_queue(&shost->shost_gendev, name, NULL,
 				sas_smp_dispatch, NULL, 0);
 		if (IS_ERR(q))
 			return PTR_ERR(q);
diff --git a/drivers/ufs/core/ufs_bsg.c b/drivers/ufs/core/ufs_bsg.c
index 374e5aae4e7e..433d0480391e 100644
--- a/drivers/ufs/core/ufs_bsg.c
+++ b/drivers/ufs/core/ufs_bsg.c
@@ -253,7 +253,8 @@ int ufs_bsg_probe(struct ufs_hba *hba)
 	if (ret)
 		goto out;
 
-	q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, NULL, 0);
+	q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), NULL, ufs_bsg_request,
+			NULL, 0);
 	if (IS_ERR(q)) {
 		ret = PTR_ERR(q);
 		goto out;
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 9e97ced2896d..14fa93268630 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -65,7 +65,8 @@ struct bsg_job {
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size);
+		struct queue_limits *lim, bsg_job_fn *job_fn,
+		bsg_timeout_fn *timeout, int dd_job_size);
 void bsg_remove_queue(struct request_queue *q);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
-- 
cgit v1.2.3


From afd53a3d852808bfeb5bc3ae3cd1caa9389bcc94 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:29 +0200
Subject: scsi: core: Initialize scsi midlayer limits before allocating the
 queue

Turn __scsi_init_queue() into scsi_init_limits() which initializes
queue_limits structure that can be passed to blk_mq_alloc_queue().

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-5-hch@lst.de
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_lib.c             | 32 +++++++++++++++-----------------
 drivers/scsi/scsi_scan.c            |  5 +++--
 drivers/scsi/scsi_transport_fc.c    | 11 ++++++-----
 drivers/scsi/scsi_transport_iscsi.c |  5 +++--
 include/scsi/scsi_transport.h       |  2 +-
 5 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2e28e2360c85..1deca84914e8 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -32,7 +32,7 @@
 #include <scsi/scsi_driver.h>
 #include <scsi/scsi_eh.h>
 #include <scsi/scsi_host.h>
-#include <scsi/scsi_transport.h> /* __scsi_init_queue() */
+#include <scsi/scsi_transport.h> /* scsi_init_limits() */
 #include <scsi/scsi_dh.h>
 
 #include <trace/events/scsi.h>
@@ -1965,31 +1965,26 @@ static void scsi_map_queues(struct blk_mq_tag_set *set)
 	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
 }
 
-void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
+void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
 {
 	struct device *dev = shost->dma_dev;
 
-	/*
-	 * this limit is imposed by hardware restrictions
-	 */
-	blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
-					SG_MAX_SEGMENTS));
+	memset(lim, 0, sizeof(*lim));
+	lim->max_segments =
+		min_t(unsigned short, shost->sg_tablesize, SG_MAX_SEGMENTS);
 
 	if (scsi_host_prot_dma(shost)) {
 		shost->sg_prot_tablesize =
 			min_not_zero(shost->sg_prot_tablesize,
 				     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
 		BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
-		blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
+		lim->max_integrity_segments = shost->sg_prot_tablesize;
 	}
 
-	blk_queue_max_hw_sectors(q, shost->max_sectors);
-	blk_queue_segment_boundary(q, shost->dma_boundary);
-	dma_set_seg_boundary(dev, shost->dma_boundary);
-
-	blk_queue_max_segment_size(q, shost->max_segment_size);
-	blk_queue_virt_boundary(q, shost->virt_boundary_mask);
-	dma_set_max_seg_size(dev, queue_max_segment_size(q));
+	lim->max_hw_sectors = shost->max_sectors;
+	lim->seg_boundary_mask = shost->dma_boundary;
+	lim->max_segment_size = shost->max_segment_size;
+	lim->virt_boundary_mask = shost->virt_boundary_mask;
 
 	/*
 	 * Set a reasonable default alignment:  The larger of 32-byte (dword),
@@ -1998,9 +1993,12 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 	 *
 	 * Devices that require a bigger alignment can increase it later.
 	 */
-	blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment()) - 1);
+	lim->dma_alignment = max(4, dma_get_cache_alignment()) - 1;
+
+	dma_set_seg_boundary(dev, shost->dma_boundary);
+	dma_set_max_seg_size(dev, shost->max_segment_size);
 }
-EXPORT_SYMBOL_GPL(__scsi_init_queue);
+EXPORT_SYMBOL_GPL(scsi_init_limits);
 
 static const struct blk_mq_ops scsi_mq_ops_no_commit = {
 	.get_budget	= scsi_mq_get_budget,
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 8d06475de17a..205ab3b3ea89 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -283,6 +283,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	struct request_queue *q;
 	int display_failure_msg = 1, ret;
 	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
+	struct queue_limits lim;
 
 	sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
 		       GFP_KERNEL);
@@ -332,7 +333,8 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 
 	sdev->sg_reserved_size = INT_MAX;
 
-	q = blk_mq_alloc_queue(&sdev->host->tag_set, NULL, NULL);
+	scsi_init_limits(shost, &lim);
+	q = blk_mq_alloc_queue(&sdev->host->tag_set, &lim, NULL);
 	if (IS_ERR(q)) {
 		/* release fn is set up in scsi_sysfs_device_initialise, so
 		 * have to free and put manually here */
@@ -343,7 +345,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	kref_get(&sdev->host->tagset_refcnt);
 	sdev->request_queue = q;
 	q->queuedata = sdev;
-	__scsi_init_queue(sdev->host, q);
 
 	depth = sdev->host->cmd_per_lun ?: 1;
 
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 87b2235b8ece..0799700b0fca 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -4276,6 +4276,7 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 {
 	struct device *dev = &shost->shost_gendev;
 	struct fc_internal *i = to_fc_internal(shost->transportt);
+	struct queue_limits lim;
 	struct request_queue *q;
 	char bsg_name[20];
 
@@ -4286,8 +4287,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
-
-	q = bsg_setup_queue(dev, bsg_name, NULL, fc_bsg_dispatch,
+	scsi_init_limits(shost, &lim);
+	q = bsg_setup_queue(dev, bsg_name, &lim, fc_bsg_dispatch,
 			fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev,
@@ -4295,7 +4296,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 			shost->host_no);
 		return PTR_ERR(q);
 	}
-	__scsi_init_queue(shost, q);
 	blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT);
 	fc_host->rqst_q = q;
 	return 0;
@@ -4311,6 +4311,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 {
 	struct device *dev = &rport->dev;
 	struct fc_internal *i = to_fc_internal(shost->transportt);
+	struct queue_limits lim;
 	struct request_queue *q;
 
 	rport->rqst_q = NULL;
@@ -4318,13 +4319,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 	if (!i->f->bsg_request)
 		return -ENOTSUPP;
 
-	q = bsg_setup_queue(dev, dev_name(dev), NULL, fc_bsg_dispatch_prep,
+	scsi_init_limits(shost, &lim);
+	q = bsg_setup_queue(dev, dev_name(dev), &lim, fc_bsg_dispatch_prep,
 				fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
 		return PTR_ERR(q);
 	}
-	__scsi_init_queue(shost, q);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 	rport->rqst_q = q;
 	return 0;
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index c131746bf207..5e1bb488da15 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1535,6 +1535,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
 {
 	struct device *dev = &shost->shost_gendev;
 	struct iscsi_internal *i = to_iscsi_internal(shost->transportt);
+	struct queue_limits lim;
 	struct request_queue *q;
 	char bsg_name[20];
 
@@ -1542,14 +1543,14 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
 		return -ENOTSUPP;
 
 	snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
-	q = bsg_setup_queue(dev, bsg_name, NULL, iscsi_bsg_host_dispatch, NULL,
+	scsi_init_limits(shost, &lim);
+	q = bsg_setup_queue(dev, bsg_name, &lim, iscsi_bsg_host_dispatch, NULL,
 			0);
 	if (IS_ERR(q)) {
 		shost_printk(KERN_ERR, shost, "bsg interface failed to "
 			     "initialize - no request queue\n");
 		return PTR_ERR(q);
 	}
-	__scsi_init_queue(shost, q);
 
 	ihost->bsg_q = q;
 	return 0;
diff --git a/include/scsi/scsi_transport.h b/include/scsi/scsi_transport.h
index a0458bda3148..1394cf313bb3 100644
--- a/include/scsi/scsi_transport.h
+++ b/include/scsi/scsi_transport.h
@@ -83,6 +83,6 @@ scsi_transport_device_data(struct scsi_device *sdev)
 		+ shost->transportt->device_private_offset;
 }
 
-void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q);
+void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim);
 
 #endif /* SCSI_TRANSPORT_H */
-- 
cgit v1.2.3


From 7eaae991c30d46a4ea45aa00f50eba939d7f951b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:30 +0200
Subject: scsi: scsi_transport_fc: Add a max_bsg_segments field to struct
 fc_function_template

ibmvfc only supports a single segment for BSG FC passthrough.  Instead of
having it set a queue limits after creating the BSG queues, add a field so
that the FC transport can set it before allocating the queue.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-6-hch@lst.de
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c   | 5 +----
 drivers/scsi/scsi_transport_fc.c | 2 ++
 include/scsi/scsi_transport_fc.h | 1 +
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 05b126bfd18b..a3d1013c8307 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -5541,8 +5541,6 @@ static void ibmvfc_tgt_add_rport(struct ibmvfc_target *tgt)
 			rport->supported_classes |= FC_COS_CLASS2;
 		if (be32_to_cpu(tgt->service_parms.class3_parms[0]) & 0x80000000)
 			rport->supported_classes |= FC_COS_CLASS3;
-		if (rport->rqst_q)
-			blk_queue_max_segments(rport->rqst_q, 1);
 	} else
 		tgt_dbg(tgt, "rport add failed\n");
 	spin_unlock_irqrestore(vhost->host->host_lock, flags);
@@ -6391,8 +6389,6 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 
 	ibmvfc_init_sub_crqs(vhost);
 
-	if (shost_to_fc_host(shost)->rqst_q)
-		blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1);
 	dev_set_drvdata(dev, vhost);
 	spin_lock(&ibmvfc_driver_lock);
 	list_add_tail(&vhost->queue, &ibmvfc_head);
@@ -6547,6 +6543,7 @@ static struct fc_function_template ibmvfc_transport_functions = {
 	.get_starget_port_id = ibmvfc_get_starget_port_id,
 	.show_starget_port_id = 1,
 
+	.max_bsg_segments = 1,
 	.bsg_request = ibmvfc_bsg_request,
 	.bsg_timeout = ibmvfc_bsg_timeout,
 };
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 0799700b0fca..7d088b8da075 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -4288,6 +4288,7 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
 	scsi_init_limits(shost, &lim);
+	lim.max_segments = min_not_zero(lim.max_segments, i->f->max_bsg_segments);
 	q = bsg_setup_queue(dev, bsg_name, &lim, fc_bsg_dispatch,
 			fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
@@ -4320,6 +4321,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 		return -ENOTSUPP;
 
 	scsi_init_limits(shost, &lim);
+	lim.max_segments = min_not_zero(lim.max_segments, i->f->max_bsg_segments);
 	q = bsg_setup_queue(dev, dev_name(dev), &lim, fc_bsg_dispatch_prep,
 				fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index 483513c57597..fd039306ffbb 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -709,6 +709,7 @@ struct fc_function_template {
 	int  	(*vport_delete)(struct fc_vport *);
 
 	/* bsg support */
+	u32				max_bsg_segments;
 	int	(*bsg_request)(struct bsg_job *);
 	int	(*bsg_timeout)(struct bsg_job *);
 
-- 
cgit v1.2.3


From 6248d7f7714f018f2c02f356582784e74596f8e8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:31 +0200
Subject: scsi: core: Add a no_highmem flag to struct Scsi_Host

While we really should be killing the block layer bounce buffering ASAP, I
even more urgently need to stop the drivers to fiddle with the limits from
->slave_configure.  Add a no_highmem flag to the Scsi_Host to centralize
this setting and switch the remaining four drivers that use block layer
bounce buffering to it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-7-hch@lst.de
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/aha152x.c         |  8 +-------
 drivers/scsi/imm.c             | 12 +-----------
 drivers/scsi/ppa.c             |  8 +-------
 drivers/scsi/scsi_lib.c        |  3 +++
 drivers/usb/storage/scsiglue.c | 10 ----------
 drivers/usb/storage/usb.c      | 10 ++++++++++
 include/scsi/scsi_host.h       |  2 ++
 7 files changed, 18 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c
index 055adb349b0e..83f16fc14d96 100644
--- a/drivers/scsi/aha152x.c
+++ b/drivers/scsi/aha152x.c
@@ -746,6 +746,7 @@ struct Scsi_Host *aha152x_probe_one(struct aha152x_setup *setup)
 	/* need to have host registered before triggering any interrupt */
 	list_add_tail(&HOSTDATA(shpnt)->host_list, &aha152x_host_list);
 
+	shpnt->no_highmem = true;
 	shpnt->io_port   = setup->io_port;
 	shpnt->n_io_port = IO_RANGE;
 	shpnt->irq       = setup->irq;
@@ -2940,12 +2941,6 @@ static int aha152x_show_info(struct seq_file *m, struct Scsi_Host *shpnt)
 	return 0;
 }
 
-static int aha152x_adjust_queue(struct scsi_device *device)
-{
-	blk_queue_bounce_limit(device->request_queue, BLK_BOUNCE_HIGH);
-	return 0;
-}
-
 static const struct scsi_host_template aha152x_driver_template = {
 	.module				= THIS_MODULE,
 	.name				= AHA152X_REVID,
@@ -2961,7 +2956,6 @@ static const struct scsi_host_template aha152x_driver_template = {
 	.this_id			= 7,
 	.sg_tablesize			= SG_ALL,
 	.dma_boundary			= PAGE_SIZE - 1,
-	.slave_alloc			= aha152x_adjust_queue,
 	.cmd_size			= sizeof(struct aha152x_cmd_priv),
 };
 
diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c
index 180a5ddedb2c..21339da505f1 100644
--- a/drivers/scsi/imm.c
+++ b/drivers/scsi/imm.c
@@ -1100,16 +1100,6 @@ static int device_check(imm_struct *dev, bool autodetect)
 	return -ENODEV;
 }
 
-/*
- * imm cannot deal with highmem, so this causes all IO pages for this host
- * to reside in low memory (hence mapped)
- */
-static int imm_adjust_queue(struct scsi_device *device)
-{
-	blk_queue_bounce_limit(device->request_queue, BLK_BOUNCE_HIGH);
-	return 0;
-}
-
 static const struct scsi_host_template imm_template = {
 	.module			= THIS_MODULE,
 	.proc_name		= "imm",
@@ -1123,7 +1113,6 @@ static const struct scsi_host_template imm_template = {
 	.this_id		= 7,
 	.sg_tablesize		= SG_ALL,
 	.can_queue		= 1,
-	.slave_alloc		= imm_adjust_queue,
 	.cmd_size		= sizeof(struct scsi_pointer),
 };
 
@@ -1235,6 +1224,7 @@ static int __imm_attach(struct parport *pb)
 	host = scsi_host_alloc(&imm_template, sizeof(imm_struct *));
 	if (!host)
 		goto out1;
+	host->no_highmem = true;
 	host->io_port = pb->base;
 	host->n_io_port = ports;
 	host->dma_channel = -1;
diff --git a/drivers/scsi/ppa.c b/drivers/scsi/ppa.c
index d592ee9170c1..8300f0bdddb3 100644
--- a/drivers/scsi/ppa.c
+++ b/drivers/scsi/ppa.c
@@ -986,12 +986,6 @@ second_pass:
 	return -ENODEV;
 }
 
-static int ppa_adjust_queue(struct scsi_device *device)
-{
-	blk_queue_bounce_limit(device->request_queue, BLK_BOUNCE_HIGH);
-	return 0;
-}
-
 static const struct scsi_host_template ppa_template = {
 	.module			= THIS_MODULE,
 	.proc_name		= "ppa",
@@ -1005,7 +999,6 @@ static const struct scsi_host_template ppa_template = {
 	.this_id		= -1,
 	.sg_tablesize		= SG_ALL,
 	.can_queue		= 1,
-	.slave_alloc		= ppa_adjust_queue,
 	.cmd_size		= sizeof(struct scsi_pointer),
 };
 
@@ -1111,6 +1104,7 @@ static int __ppa_attach(struct parport *pb)
 	host = scsi_host_alloc(&ppa_template, sizeof(ppa_struct *));
 	if (!host)
 		goto out1;
+	host->no_highmem = true;
 	host->io_port = pb->base;
 	host->n_io_port = ports;
 	host->dma_channel = -1;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1deca84914e8..f1936f98abe3 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1995,6 +1995,9 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
 	 */
 	lim->dma_alignment = max(4, dma_get_cache_alignment()) - 1;
 
+	if (shost->no_highmem)
+		lim->bounce = BLK_BOUNCE_HIGH;
+
 	dma_set_seg_boundary(dev, shost->dma_boundary);
 	dma_set_max_seg_size(dev, shost->max_segment_size);
 }
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 12cf9940e5b6..1d14c678f3d3 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -40,7 +40,6 @@
 #include <scsi/scsi_eh.h>
 
 #include "usb.h"
-#include <linux/usb/hcd.h>
 #include "scsiglue.h"
 #include "debug.h"
 #include "transport.h"
@@ -130,15 +129,6 @@ static int slave_configure(struct scsi_device *sdev)
 		min_t(size_t, queue_max_hw_sectors(sdev->request_queue),
 		      dma_max_mapping_size(dev) >> SECTOR_SHIFT));
 
-	/*
-	 * Some USB host controllers can't do DMA; they have to use PIO.
-	 * For such controllers we need to make sure the block layer sets
-	 * up bounce buffers in addressable memory.
-	 */
-	if (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
-			(bus_to_hcd(us->pusb_dev->bus)->localmem_pool != NULL))
-		blk_queue_bounce_limit(sdev->request_queue, BLK_BOUNCE_HIGH);
-
 	/*
 	 * We can't put these settings in slave_alloc() because that gets
 	 * called before the device type is known.  Consequently these
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index d1ad6a2509ab..a49a31639f6f 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -47,6 +47,7 @@
 #include <scsi/scsi_device.h>
 
 #include "usb.h"
+#include <linux/usb/hcd.h>
 #include "scsiglue.h"
 #include "transport.h"
 #include "protocol.h"
@@ -961,6 +962,15 @@ int usb_stor_probe1(struct us_data **pus,
 	if (result)
 		goto BadDevice;
 
+	/*
+	 * Some USB host controllers can't do DMA; they have to use PIO.
+	 * For such controllers we need to make sure the block layer sets
+	 * up bounce buffers in addressable memory.
+	 */
+	if (!hcd_uses_dma(bus_to_hcd(us->pusb_dev->bus)) ||
+	    bus_to_hcd(us->pusb_dev->bus)->localmem_pool)
+		host->no_highmem = true;
+
 	/* Get the unusual_devs entries and the descriptors */
 	result = get_device_info(us, id, unusual_dev);
 	if (result)
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b259d42a1e1a..6d77c48e8311 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -665,6 +665,8 @@ struct Scsi_Host {
 	/* The transport requires the LUN bits NOT to be stored in CDB[1] */
 	unsigned no_scsi2_lun_in_cdb:1;
 
+	unsigned no_highmem:1;
+
 	/*
 	 * Optional work queue to be utilized by the transport
 	 */
-- 
cgit v1.2.3


From 5b7dfbeff92a4a00b55b2be580f057d533b65cd5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:32 +0200
Subject: scsi: core: Add a dma_alignment field to the host and host template

Get drivers out of the business of having to call the block layer DMA
alignment limits helpers themselves.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-8-hch@lst.de
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/firewire/sbp2.c           |  6 ------
 drivers/message/fusion/mptfc.c    |  1 +
 drivers/message/fusion/mptsas.c   |  1 +
 drivers/message/fusion/mptscsih.c |  2 --
 drivers/message/fusion/mptspi.c   |  1 +
 drivers/scsi/hosts.c              |  6 ++++++
 drivers/scsi/iscsi_tcp.c          |  2 +-
 drivers/scsi/qla2xxx/qla_os.c     |  6 +++---
 drivers/scsi/scsi_lib.c           | 11 ++---------
 drivers/staging/rts5208/rtsx.c    | 24 ++++++++++++------------
 drivers/usb/image/microtek.c      |  8 +-------
 drivers/usb/storage/scsiglue.c    | 11 +++++------
 drivers/usb/storage/uas.c         | 13 ++++++-------
 include/scsi/scsi_host.h          |  3 +++
 14 files changed, 42 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c
index e779d866022b..8f7810b2a4c1 100644
--- a/drivers/firewire/sbp2.c
+++ b/drivers/firewire/sbp2.c
@@ -1500,12 +1500,6 @@ static int sbp2_scsi_slave_alloc(struct scsi_device *sdev)
 
 	sdev->allow_restart = 1;
 
-	/*
-	 * SBP-2 does not require any alignment, but we set it anyway
-	 * for compatibility with earlier versions of this driver.
-	 */
-	blk_queue_update_dma_alignment(sdev->request_queue, 4 - 1);
-
 	if (lu->tgt->workarounds & SBP2_WORKAROUND_INQUIRY_36)
 		sdev->inquiry_len = 36;
 
diff --git a/drivers/message/fusion/mptfc.c b/drivers/message/fusion/mptfc.c
index c459f709107b..a3c17c4fe69c 100644
--- a/drivers/message/fusion/mptfc.c
+++ b/drivers/message/fusion/mptfc.c
@@ -129,6 +129,7 @@ static const struct scsi_host_template mptfc_driver_template = {
 	.sg_tablesize			= MPT_SCSI_SG_DEPTH,
 	.max_sectors			= 8192,
 	.cmd_per_lun			= 7,
+	.dma_alignment			= 511,
 	.shost_groups			= mptscsih_host_attr_groups,
 };
 
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 300f8e955a53..30cb4f64e770 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -2020,6 +2020,7 @@ static const struct scsi_host_template mptsas_driver_template = {
 	.sg_tablesize			= MPT_SCSI_SG_DEPTH,
 	.max_sectors			= 8192,
 	.cmd_per_lun			= 7,
+	.dma_alignment			= 511,
 	.shost_groups			= mptscsih_host_attr_groups,
 	.no_write_same			= 1,
 };
diff --git a/drivers/message/fusion/mptscsih.c b/drivers/message/fusion/mptscsih.c
index 9080a73b4ea6..6c3f25cc33ff 100644
--- a/drivers/message/fusion/mptscsih.c
+++ b/drivers/message/fusion/mptscsih.c
@@ -2438,8 +2438,6 @@ mptscsih_slave_configure(struct scsi_device *sdev)
 		"tagged %d, simple %d\n",
 		ioc->name,sdev->tagged_supported, sdev->simple_tags));
 
-	blk_queue_dma_alignment (sdev->request_queue, 512 - 1);
-
 	return 0;
 }
 
diff --git a/drivers/message/fusion/mptspi.c b/drivers/message/fusion/mptspi.c
index 6c5920db1e9d..574b882c9a85 100644
--- a/drivers/message/fusion/mptspi.c
+++ b/drivers/message/fusion/mptspi.c
@@ -843,6 +843,7 @@ static const struct scsi_host_template mptspi_driver_template = {
 	.sg_tablesize			= MPT_SCSI_SG_DEPTH,
 	.max_sectors			= 8192,
 	.cmd_per_lun			= 7,
+	.dma_alignment			= 511,
 	.shost_groups			= mptscsih_host_attr_groups,
 };
 
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 4f495a41ec4a..a67a98bd7ae2 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -478,6 +478,12 @@ struct Scsi_Host *scsi_host_alloc(const struct scsi_host_template *sht, int priv
 	else
 		shost->max_segment_size = BLK_MAX_SEGMENT_SIZE;
 
+	/* 32-byte (dword) is a common minimum for HBAs. */
+	if (sht->dma_alignment)
+		shost->dma_alignment = sht->dma_alignment;
+	else
+		shost->dma_alignment = 3;
+
 	/*
 	 * assume a 4GB boundary, if not set
 	 */
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 8e14cea15f98..60688f18fac6 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -943,6 +943,7 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max,
 	shost->max_id = 0;
 	shost->max_channel = 0;
 	shost->max_cmd_len = SCSI_MAX_VARLEN_CDB_SIZE;
+	shost->dma_alignment = 0;
 
 	rc = iscsi_host_get_max_scsi_cmds(shost, cmds_max);
 	if (rc < 0)
@@ -1065,7 +1066,6 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev)
 	if (conn->datadgst_en)
 		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES,
 				   sdev->request_queue);
-	blk_queue_dma_alignment(sdev->request_queue, 0);
 	return 0;
 }
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 6a1900e96a5a..fcb06df2ce4e 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1957,9 +1957,6 @@ qla2xxx_slave_configure(struct scsi_device *sdev)
 	scsi_qla_host_t *vha = shost_priv(sdev->host);
 	struct req_que *req = vha->req;
 
-	if (IS_T10_PI_CAPABLE(vha->hw))
-		blk_queue_update_dma_alignment(sdev->request_queue, 0x7);
-
 	scsi_change_queue_depth(sdev, req->max_q_depth);
 	return 0;
 }
@@ -3575,6 +3572,9 @@ skip_dpc:
 		    QLA_SG_ALL : 128;
 	}
 
+	if (IS_T10_PI_CAPABLE(base_vha->hw))
+		host->dma_alignment = 0x7;
+
 	ret = scsi_add_host(host, &pdev->dev);
 	if (ret)
 		goto probe_failed;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f1936f98abe3..26b51406c477 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1985,15 +1985,8 @@ void scsi_init_limits(struct Scsi_Host *shost, struct queue_limits *lim)
 	lim->seg_boundary_mask = shost->dma_boundary;
 	lim->max_segment_size = shost->max_segment_size;
 	lim->virt_boundary_mask = shost->virt_boundary_mask;
-
-	/*
-	 * Set a reasonable default alignment:  The larger of 32-byte (dword),
-	 * which is a common minimum for HBAs, and the minimum DMA alignment,
-	 * which is set by the platform.
-	 *
-	 * Devices that require a bigger alignment can increase it later.
-	 */
-	lim->dma_alignment = max(4, dma_get_cache_alignment()) - 1;
+	lim->dma_alignment = max_t(unsigned int,
+		shost->dma_alignment, dma_get_cache_alignment() - 1);
 
 	if (shost->no_highmem)
 		lim->bounce = BLK_BOUNCE_HIGH;
diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c
index 86d32e3b3282..c4f54c311d05 100644
--- a/drivers/staging/rts5208/rtsx.c
+++ b/drivers/staging/rts5208/rtsx.c
@@ -70,18 +70,6 @@ static int slave_alloc(struct scsi_device *sdev)
 
 static int slave_configure(struct scsi_device *sdev)
 {
-	/*
-	 * Scatter-gather buffers (all but the last) must have a length
-	 * divisible by the bulk maxpacket size.  Otherwise a data packet
-	 * would end up being short, causing a premature end to the data
-	 * transfer.  Since high-speed bulk pipes have a maxpacket size
-	 * of 512, we'll use that as the scsi device queue's DMA alignment
-	 * mask.  Guaranteeing proper alignment of the first buffer will
-	 * have the desired effect because, except at the beginning and
-	 * the end, scatter-gather buffers follow page boundaries.
-	 */
-	blk_queue_dma_alignment(sdev->request_queue, (512 - 1));
-
 	/* Set the SCSI level to at least 2.  We'll leave it at 3 if that's
 	 * what is originally reported.  We need this to avoid confusing
 	 * the SCSI layer with devices that report 0 or 1, but need 10-byte
@@ -219,6 +207,18 @@ static const struct scsi_host_template rtsx_host_template = {
 	/* limit the total size of a transfer to 120 KB */
 	.max_sectors =                  240,
 
+	/*
+	 * Scatter-gather buffers (all but the last) must have a length
+	 * divisible by the bulk maxpacket size.  Otherwise a data packet
+	 * would end up being short, causing a premature end to the data
+	 * transfer.  Since high-speed bulk pipes have a maxpacket size
+	 * of 512, we'll use that as the scsi device queue's DMA alignment
+	 * mask.  Guaranteeing proper alignment of the first buffer will
+	 * have the desired effect because, except at the beginning and
+	 * the end, scatter-gather buffers follow page boundaries.
+	 */
+	.dma_alignment =		511,
+
 	/* emulated HBA */
 	.emulated =			1,
 
diff --git a/drivers/usb/image/microtek.c b/drivers/usb/image/microtek.c
index 8c8fa71c69c4..9f758241d9d3 100644
--- a/drivers/usb/image/microtek.c
+++ b/drivers/usb/image/microtek.c
@@ -328,12 +328,6 @@ static int mts_slave_alloc (struct scsi_device *s)
 	return 0;
 }
 
-static int mts_slave_configure (struct scsi_device *s)
-{
-	blk_queue_dma_alignment(s->request_queue, (512 - 1));
-	return 0;
-}
-
 static int mts_scsi_abort(struct scsi_cmnd *srb)
 {
 	struct mts_desc* desc = (struct mts_desc*)(srb->device->host->hostdata[0]);
@@ -631,8 +625,8 @@ static const struct scsi_host_template mts_scsi_host_template = {
 	.can_queue =		1,
 	.this_id =		-1,
 	.emulated =		1,
+	.dma_alignment =	511,
 	.slave_alloc =		mts_slave_alloc,
-	.slave_configure =	mts_slave_configure,
 	.max_sectors=		256, /* 128 K */
 };
 
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 1d14c678f3d3..eb4ba03e082d 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -75,12 +75,6 @@ static int slave_alloc (struct scsi_device *sdev)
 	 */
 	sdev->inquiry_len = 36;
 
-	/*
-	 * Some host controllers may have alignment requirements.
-	 * We'll play it safe by requiring 512-byte alignment always.
-	 */
-	blk_queue_update_dma_alignment(sdev->request_queue, (512 - 1));
-
 	/* Tell the SCSI layer if we know there is more than one LUN */
 	if (us->protocol == USB_PR_BULK && us->max_lun > 0)
 		sdev->sdev_bflags |= BLIST_FORCELUN;
@@ -638,6 +632,11 @@ static const struct scsi_host_template usb_stor_host_template = {
 	/* lots of sg segments can be handled */
 	.sg_tablesize =			SG_MAX_SEGMENTS,
 
+	/*
+	 * Some host controllers may have alignment requirements.
+	 * We'll play it safe by requiring 512-byte alignment always.
+	 */
+	.dma_alignment =		511,
 
 	/*
 	 * Limit the total size of a transfer to 120 KB.
diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c
index 71ace274761f..0668182e1c46 100644
--- a/drivers/usb/storage/uas.c
+++ b/drivers/usb/storage/uas.c
@@ -824,13 +824,6 @@ static int uas_slave_alloc(struct scsi_device *sdev)
 
 	sdev->hostdata = devinfo;
 
-	/*
-	 * The protocol has no requirements on alignment in the strict sense.
-	 * Controllers may or may not have alignment restrictions.
-	 * As this is not exported, we use an extremely conservative guess.
-	 */
-	blk_queue_update_dma_alignment(sdev->request_queue, (512 - 1));
-
 	if (devinfo->flags & US_FL_MAX_SECTORS_64)
 		blk_queue_max_hw_sectors(sdev->request_queue, 64);
 	else if (devinfo->flags & US_FL_MAX_SECTORS_240)
@@ -912,6 +905,12 @@ static const struct scsi_host_template uas_host_template = {
 	.eh_device_reset_handler = uas_eh_device_reset_handler,
 	.this_id = -1,
 	.skip_settle_delay = 1,
+	/*
+	 * The protocol has no requirements on alignment in the strict sense.
+	 * Controllers may or may not have alignment restrictions.
+	 * As this is not exported, we use an extremely conservative guess.
+	 */
+	.dma_alignment = 511,
 	.dma_boundary = PAGE_SIZE - 1,
 	.cmd_size = sizeof(struct uas_cmd_info),
 };
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 6d77c48e8311..b0948ab69e0f 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -405,6 +405,8 @@ struct scsi_host_template {
 	 */
 	unsigned int max_segment_size;
 
+	unsigned int dma_alignment;
+
 	/*
 	 * DMA scatter gather segment boundary limit. A segment crossing this
 	 * boundary will be split in two.
@@ -614,6 +616,7 @@ struct Scsi_Host {
 	unsigned int max_sectors;
 	unsigned int opt_sectors;
 	unsigned int max_segment_size;
+	unsigned int dma_alignment;
 	unsigned long dma_boundary;
 	unsigned long virt_boundary_mask;
 	/*
-- 
cgit v1.2.3


From 67144d3c58b4256cc15ebad2c90c0a283e707aa1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:33 +0200
Subject: scsi: ufs: ufs-exynos: Move setting the the DMA alignment to the init
 method

Use the SCSI host's dma_alignment field and set it in ->init and remove the
now unused config_scsi_dev method.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-9-hch@lst.de
Reviewed-by: Alim Akhtar <alim.akhtar@samsung.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c     | 3 ---
 drivers/ufs/host/ufs-exynos.c | 8 ++------
 include/ufs/ufshcd.h          | 1 -
 3 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 62c8575f2c67..83227c870859 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -5274,9 +5274,6 @@ static int ufshcd_slave_configure(struct scsi_device *sdev)
 	 */
 	sdev->silence_suspend = 1;
 
-	if (hba->vops && hba->vops->config_scsi_dev)
-		hba->vops->config_scsi_dev(sdev);
-
 	ufshcd_crypto_register(hba, q);
 
 	return 0;
diff --git a/drivers/ufs/host/ufs-exynos.c b/drivers/ufs/host/ufs-exynos.c
index 734d40f99e31..7ffc28e22784 100644
--- a/drivers/ufs/host/ufs-exynos.c
+++ b/drivers/ufs/host/ufs-exynos.c
@@ -1187,6 +1187,8 @@ static int exynos_ufs_init(struct ufs_hba *hba)
 		goto out;
 	exynos_ufs_specify_phy_time_attr(ufs);
 	exynos_ufs_config_smu(ufs);
+
+	hba->host->dma_alignment = SZ_4K - 1;
 	return 0;
 
 out:
@@ -1510,11 +1512,6 @@ static int fsd_ufs_pre_link(struct exynos_ufs *ufs)
 	return 0;
 }
 
-static void exynos_ufs_config_scsi_dev(struct scsi_device *sdev)
-{
-	blk_queue_update_dma_alignment(sdev->request_queue, SZ_4K - 1);
-}
-
 static int fsd_ufs_post_link(struct exynos_ufs *ufs)
 {
 	int i;
@@ -1583,7 +1580,6 @@ static const struct ufs_hba_variant_ops ufs_hba_exynos_ops = {
 	.hibern8_notify			= exynos_ufs_hibern8_notify,
 	.suspend			= exynos_ufs_suspend,
 	.resume				= exynos_ufs_resume,
-	.config_scsi_dev		= exynos_ufs_config_scsi_dev,
 };
 
 static struct ufs_hba_variant_ops ufs_hba_exynosauto_vh_ops = {
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index a35e12f8e68b..93afb726213f 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -374,7 +374,6 @@ struct ufs_hba_variant_ops {
 	int	(*get_outstanding_cqs)(struct ufs_hba *hba,
 				       unsigned long *ocqs);
 	int	(*config_esi)(struct ufs_hba *hba);
-	void	(*config_scsi_dev)(struct scsi_device *sdev);
 };
 
 /* clock gating state  */
-- 
cgit v1.2.3


From b7eefcf11f3fb1703ddfa075d4f4bc30b9c50eb4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:35 +0200
Subject: scsi: core: Add a device_configure method to the host template

This is a version of ->slave_configure that also takes a queue_limits
structure that the caller applies, and thus allows drivers to reconfigure
the queue using the atomic queue limits API.

In the long run it should also replace ->slave_configure entirely as there
is no need to have two different methods here, and the slave name in
addition to being politically charged also has no basis in the SCSI
standards or the kernel code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-11-hch@lst.de
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/scsi/scsi_scan.c | 47 ++++++++++++++++++++++++++---------------------
 include/scsi/scsi_host.h |  4 ++++
 2 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 1ec4b309af72..463ce6e23dc6 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -227,7 +227,7 @@ static int scsi_realloc_sdev_budget_map(struct scsi_device *sdev,
 
 	/*
 	 * realloc if new shift is calculated, which is caused by setting
-	 * up one new default queue depth after calling ->slave_configure
+	 * up one new default queue depth after calling ->device_configure
 	 */
 	if (!need_alloc && new_shift != sdev->budget_map.shift)
 		need_alloc = need_free = true;
@@ -874,6 +874,7 @@ static int scsi_probe_lun(struct scsi_device *sdev, unsigned char *inq_result,
 static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 		blist_flags_t *bflags, int async)
 {
+	const struct scsi_host_template *hostt = sdev->host->hostt;
 	struct queue_limits lim;
 	int ret;
 
@@ -1073,33 +1074,37 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result,
 		lim.max_hw_sectors = 512;
 	else if (*bflags & BLIST_MAX_1024)
 		lim.max_hw_sectors = 1024;
+
+	if (hostt->device_configure)
+		ret = hostt->device_configure(sdev, &lim);
+	else if (hostt->slave_configure)
+		ret = hostt->slave_configure(sdev);
+	if (ret) {
+		queue_limits_cancel_update(sdev->request_queue);
+		/*
+		 * If the LLDD reports device not present, don't clutter the
+		 * console with failure messages.
+		 */
+		if (ret != -ENXIO)
+			sdev_printk(KERN_ERR, sdev,
+				"failed to configure device\n");
+		return SCSI_SCAN_NO_RESPONSE;
+	}
+
 	ret = queue_limits_commit_update(sdev->request_queue, &lim);
 	if (ret) {
 		sdev_printk(KERN_ERR, sdev, "failed to apply queue limits.\n");
 		return SCSI_SCAN_NO_RESPONSE;
 	}
 
-	if (sdev->host->hostt->slave_configure) {
-		ret = sdev->host->hostt->slave_configure(sdev);
-		if (ret) {
-			/*
-			 * if LLDD reports slave not present, don't clutter
-			 * console with alloc failure messages
-			 */
-			if (ret != -ENXIO) {
-				sdev_printk(KERN_ERR, sdev,
-					"failed to configure device\n");
-			}
-			return SCSI_SCAN_NO_RESPONSE;
-		}
-
-		/*
-		 * The queue_depth is often changed in ->slave_configure.
-		 * Set up budget map again since memory consumption of
-		 * the map depends on actual queue depth.
-		 */
+	/*
+	 * The queue_depth is often changed in ->device_configure.
+	 *
+	 * Set up budget map again since memory consumption of the map depends
+	 * on actual queue depth.
+	 */
+	if (hostt->device_configure || hostt->slave_configure)
 		scsi_realloc_sdev_budget_map(sdev, sdev->queue_depth);
-	}
 
 	if (sdev->scsi_level >= SCSI_3)
 		scsi_attach_vpd(sdev);
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index b0948ab69e0f..92c8c3b08dfc 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -211,7 +211,11 @@ struct scsi_host_template {
 	 *     up after yourself before returning non-0
 	 *
 	 * Status: OPTIONAL
+	 *
+	 * Note: slave_configure is the legacy version, use device_configure for
+	 * all new code.  A driver must never define both.
 	 */
+	int (* device_configure)(struct scsi_device *, struct queue_limits *lim);
 	int (* slave_configure)(struct scsi_device *);
 
 	/*
-- 
cgit v1.2.3


From a25a9c85d17fd2f19bd5a2bb25b8361d72336bc7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:45 +0200
Subject: scsi: libata: Switch to using ->device_configure

Switch to the ->device_configure method instead of ->slave_configure and
update the block limits on the passed in queue_limits instead of using the
per-limit accessors.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-21-hch@lst.de
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Acked-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ata/ahci.h                     |  2 +-
 drivers/ata/libata-sata.c              | 11 +++++++----
 drivers/ata/libata-scsi.c              | 19 +++++++++++--------
 drivers/ata/libata.h                   |  3 ++-
 drivers/ata/pata_macio.c               |  2 +-
 drivers/ata/sata_mv.c                  |  2 +-
 drivers/ata/sata_nv.c                  |  4 ++--
 drivers/ata/sata_sil24.c               |  2 +-
 drivers/scsi/hisi_sas/hisi_sas.h       |  3 ++-
 drivers/scsi/hisi_sas/hisi_sas_main.c  |  7 ++++---
 drivers/scsi/hisi_sas/hisi_sas_v1_hw.c |  2 +-
 drivers/scsi/hisi_sas/hisi_sas_v2_hw.c |  2 +-
 drivers/scsi/hisi_sas/hisi_sas_v3_hw.c |  7 ++++---
 drivers/scsi/libsas/sas_scsi_host.c    |  7 ++++---
 include/linux/libata.h                 | 10 ++++++----
 include/scsi/libsas.h                  |  5 +++--
 16 files changed, 51 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/ata/ahci.h b/drivers/ata/ahci.h
index 344c87210d8f..8f40f75ba08c 100644
--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -397,7 +397,7 @@ extern const struct attribute_group *ahci_sdev_groups[];
 	.sdev_groups		= ahci_sdev_groups,			\
 	.change_queue_depth     = ata_scsi_change_queue_depth,		\
 	.tag_alloc_policy       = BLK_TAG_ALLOC_RR,             	\
-	.slave_configure        = ata_scsi_slave_config
+	.device_configure	= ata_scsi_device_configure
 
 extern struct ata_port_operations ahci_ops;
 extern struct ata_port_operations ahci_platform_ops;
diff --git a/drivers/ata/libata-sata.c b/drivers/ata/libata-sata.c
index a8d773003d74..9e047bf912b1 100644
--- a/drivers/ata/libata-sata.c
+++ b/drivers/ata/libata-sata.c
@@ -1254,21 +1254,24 @@ void ata_sas_tport_delete(struct ata_port *ap)
 EXPORT_SYMBOL_GPL(ata_sas_tport_delete);
 
 /**
- *	ata_sas_slave_configure - Default slave_config routine for libata devices
+ *	ata_sas_device_configure - Default device_configure routine for libata
+ *				   devices
  *	@sdev: SCSI device to configure
+ *	@lim: queue limits
  *	@ap: ATA port to which SCSI device is attached
  *
  *	RETURNS:
  *	Zero.
  */
 
-int ata_sas_slave_configure(struct scsi_device *sdev, struct ata_port *ap)
+int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim,
+		struct ata_port *ap)
 {
 	ata_scsi_sdev_config(sdev);
 
-	return ata_scsi_dev_config(sdev, ap->link.device);
+	return ata_scsi_dev_config(sdev, lim, ap->link.device);
 }
-EXPORT_SYMBOL_GPL(ata_sas_slave_configure);
+EXPORT_SYMBOL_GPL(ata_sas_device_configure);
 
 /**
  *	ata_sas_queuecmd - Issue SCSI cdb to libata-managed device
diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 0a0f483124c3..db441815d633 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1021,7 +1021,8 @@ bool ata_scsi_dma_need_drain(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(ata_scsi_dma_need_drain);
 
-int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
+int ata_scsi_dev_config(struct scsi_device *sdev, struct queue_limits *lim,
+		struct ata_device *dev)
 {
 	struct request_queue *q = sdev->request_queue;
 	int depth = 1;
@@ -1031,7 +1032,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 
 	/* configure max sectors */
 	dev->max_sectors = min(dev->max_sectors, sdev->host->max_sectors);
-	blk_queue_max_hw_sectors(q, dev->max_sectors);
+	lim->max_hw_sectors = dev->max_sectors;
 
 	if (dev->class == ATA_DEV_ATAPI) {
 		sdev->sector_size = ATA_SECT_SIZE;
@@ -1040,7 +1041,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 		blk_queue_update_dma_pad(q, ATA_DMA_PAD_SZ - 1);
 
 		/* make room for appending the drain */
-		blk_queue_max_segments(q, queue_max_segments(q) - 1);
+		lim->max_segments--;
 
 		sdev->dma_drain_len = ATAPI_MAX_DRAIN;
 		sdev->dma_drain_buf = kmalloc(sdev->dma_drain_len, GFP_NOIO);
@@ -1077,7 +1078,7 @@ int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev)
 			"sector_size=%u > PAGE_SIZE, PIO may malfunction\n",
 			sdev->sector_size);
 
-	blk_queue_update_dma_alignment(q, sdev->sector_size - 1);
+	lim->dma_alignment = sdev->sector_size - 1;
 
 	if (dev->flags & ATA_DFLAG_AN)
 		set_bit(SDEV_EVT_MEDIA_CHANGE, sdev->supported_events);
@@ -1131,8 +1132,9 @@ int ata_scsi_slave_alloc(struct scsi_device *sdev)
 EXPORT_SYMBOL_GPL(ata_scsi_slave_alloc);
 
 /**
- *	ata_scsi_slave_config - Set SCSI device attributes
+ *	ata_scsi_device_configure - Set SCSI device attributes
  *	@sdev: SCSI device to examine
+ *	@lim: queue limits
  *
  *	This is called before we actually start reading
  *	and writing to the device, to configure certain
@@ -1142,17 +1144,18 @@ EXPORT_SYMBOL_GPL(ata_scsi_slave_alloc);
  *	Defined by SCSI layer.  We don't really care.
  */
 
-int ata_scsi_slave_config(struct scsi_device *sdev)
+int ata_scsi_device_configure(struct scsi_device *sdev,
+		struct queue_limits *lim)
 {
 	struct ata_port *ap = ata_shost_to_port(sdev->host);
 	struct ata_device *dev = __ata_scsi_find_dev(ap, sdev);
 
 	if (dev)
-		return ata_scsi_dev_config(sdev, dev);
+		return ata_scsi_dev_config(sdev, lim, dev);
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(ata_scsi_slave_config);
+EXPORT_SYMBOL_GPL(ata_scsi_device_configure);
 
 /**
  *	ata_scsi_slave_destroy - SCSI device is about to be destroyed
diff --git a/drivers/ata/libata.h b/drivers/ata/libata.h
index 5c685bb1939e..8aab0df549ea 100644
--- a/drivers/ata/libata.h
+++ b/drivers/ata/libata.h
@@ -131,7 +131,8 @@ extern void ata_scsi_dev_rescan(struct work_struct *work);
 extern int ata_scsi_user_scan(struct Scsi_Host *shost, unsigned int channel,
 			      unsigned int id, u64 lun);
 void ata_scsi_sdev_config(struct scsi_device *sdev);
-int ata_scsi_dev_config(struct scsi_device *sdev, struct ata_device *dev);
+int ata_scsi_dev_config(struct scsi_device *sdev, struct queue_limits *lim,
+		struct ata_device *dev);
 int __ata_scsi_queuecmd(struct scsi_cmnd *scmd, struct ata_device *dev);
 
 /* libata-eh.c */
diff --git a/drivers/ata/pata_macio.c b/drivers/ata/pata_macio.c
index f046bfa5c6e7..3dbcd737ebb7 100644
--- a/drivers/ata/pata_macio.c
+++ b/drivers/ata/pata_macio.c
@@ -806,7 +806,7 @@ static int pata_macio_device_configure(struct scsi_device *sdev,
 	int rc;
 
 	/* First call original */
-	rc = ata_scsi_slave_config(sdev);
+	rc = ata_scsi_device_configure(sdev, lim);
 	if (rc)
 		return rc;
 
diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
index e82786c63fbd..0bb900869ee0 100644
--- a/drivers/ata/sata_mv.c
+++ b/drivers/ata/sata_mv.c
@@ -673,7 +673,7 @@ static const struct scsi_host_template mv6_sht = {
 	.sdev_groups		= ata_ncq_sdev_groups,
 	.change_queue_depth	= ata_scsi_change_queue_depth,
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,
-	.slave_configure	= ata_scsi_slave_config
+	.device_configure	= ata_scsi_device_configure
 };
 
 static struct ata_port_operations mv5_ops = {
diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index d0a8eb0e8011..36d99043ef50 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -676,7 +676,7 @@ static int nv_adma_device_configure(struct scsi_device *sdev,
 	int adma_enable;
 	u32 current_reg, new_reg, config_mask;
 
-	rc = ata_scsi_slave_config(sdev);
+	rc = ata_scsi_device_configure(sdev, lim);
 
 	if (sdev->id >= ATA_MAX_DEVICES || sdev->channel || sdev->lun)
 		/* Not a proper libata device, ignore */
@@ -1882,7 +1882,7 @@ static int nv_swncq_device_configure(struct scsi_device *sdev,
 	u8 check_maxtor = 0;
 	unsigned char model_num[ATA_ID_PROD_LEN + 1];
 
-	rc = ata_scsi_slave_config(sdev);
+	rc = ata_scsi_device_configure(sdev, lim);
 	if (sdev->id >= ATA_MAX_DEVICES || sdev->channel || sdev->lun)
 		/* Not a proper libata device, ignore */
 		return rc;
diff --git a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
index 142e70bfc498..72c03cbdaff4 100644
--- a/drivers/ata/sata_sil24.c
+++ b/drivers/ata/sata_sil24.c
@@ -381,7 +381,7 @@ static const struct scsi_host_template sil24_sht = {
 	.tag_alloc_policy	= BLK_TAG_ALLOC_FIFO,
 	.sdev_groups		= ata_ncq_sdev_groups,
 	.change_queue_depth	= ata_scsi_change_queue_depth,
-	.slave_configure	= ata_scsi_slave_config
+	.device_configure	= ata_scsi_device_configure
 };
 
 static struct ata_port_operations sil24_ops = {
diff --git a/drivers/scsi/hisi_sas/hisi_sas.h b/drivers/scsi/hisi_sas/hisi_sas.h
index 1e4550156b73..d223f482488f 100644
--- a/drivers/scsi/hisi_sas/hisi_sas.h
+++ b/drivers/scsi/hisi_sas/hisi_sas.h
@@ -643,7 +643,8 @@ extern int hisi_sas_probe(struct platform_device *pdev,
 			  const struct hisi_sas_hw *ops);
 extern void hisi_sas_remove(struct platform_device *pdev);
 
-extern int hisi_sas_slave_configure(struct scsi_device *sdev);
+int hisi_sas_device_configure(struct scsi_device *sdev,
+		struct queue_limits *lim);
 extern int hisi_sas_slave_alloc(struct scsi_device *sdev);
 extern int hisi_sas_scan_finished(struct Scsi_Host *shost, unsigned long time);
 extern void hisi_sas_scan_start(struct Scsi_Host *shost);
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c
index 097dfe4b620d..11228ba51ded 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_main.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_main.c
@@ -868,10 +868,11 @@ err_out:
 	return rc;
 }
 
-int hisi_sas_slave_configure(struct scsi_device *sdev)
+int hisi_sas_device_configure(struct scsi_device *sdev,
+		struct queue_limits *lim)
 {
 	struct domain_device *dev = sdev_to_domain_dev(sdev);
-	int ret = sas_slave_configure(sdev);
+	int ret = sas_device_configure(sdev, lim);
 
 	if (ret)
 		return ret;
@@ -880,7 +881,7 @@ int hisi_sas_slave_configure(struct scsi_device *sdev)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(hisi_sas_slave_configure);
+EXPORT_SYMBOL_GPL(hisi_sas_device_configure);
 
 void hisi_sas_scan_start(struct Scsi_Host *shost)
 {
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c
index 161feae3acab..71b5008c3552 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c
@@ -1736,7 +1736,7 @@ ATTRIBUTE_GROUPS(host_v1_hw);
 
 static const struct scsi_host_template sht_v1_hw = {
 	LIBSAS_SHT_BASE_NO_SLAVE_INIT
-	.slave_configure	= hisi_sas_slave_configure,
+	.device_configure	= hisi_sas_device_configure,
 	.scan_finished		= hisi_sas_scan_finished,
 	.scan_start		= hisi_sas_scan_start,
 	.sg_tablesize		= HISI_SAS_SGE_PAGE_CNT,
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
index d89e97e8f5c2..342d75f12051 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
@@ -3568,7 +3568,7 @@ static void map_queues_v2_hw(struct Scsi_Host *shost)
 
 static const struct scsi_host_template sht_v2_hw = {
 	LIBSAS_SHT_BASE_NO_SLAVE_INIT
-	.slave_configure	= hisi_sas_slave_configure,
+	.device_configure	= hisi_sas_device_configure,
 	.scan_finished		= hisi_sas_scan_finished,
 	.scan_start		= hisi_sas_scan_start,
 	.sg_tablesize		= HISI_SAS_SGE_PAGE_CNT,
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
index 756660588a1e..a38165995e86 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c
@@ -2894,11 +2894,12 @@ static ssize_t iopoll_q_cnt_v3_hw_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(iopoll_q_cnt_v3_hw);
 
-static int slave_configure_v3_hw(struct scsi_device *sdev)
+static int device_configure_v3_hw(struct scsi_device *sdev,
+		struct queue_limits *lim)
 {
 	struct Scsi_Host *shost = dev_to_shost(&sdev->sdev_gendev);
 	struct hisi_hba *hisi_hba = shost_priv(shost);
-	int ret = hisi_sas_slave_configure(sdev);
+	int ret = hisi_sas_device_configure(sdev, lim);
 	struct device *dev = hisi_hba->dev;
 
 	if (ret)
@@ -3321,7 +3322,7 @@ static void hisi_sas_map_queues(struct Scsi_Host *shost)
 
 static const struct scsi_host_template sht_v3_hw = {
 	LIBSAS_SHT_BASE_NO_SLAVE_INIT
-	.slave_configure	= slave_configure_v3_hw,
+	.device_configure	= device_configure_v3_hw,
 	.scan_finished		= hisi_sas_scan_finished,
 	.scan_start		= hisi_sas_scan_start,
 	.map_queues		= hisi_sas_map_queues,
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 9047cfcd1072..da11d32840e2 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -804,14 +804,15 @@ EXPORT_SYMBOL_GPL(sas_target_alloc);
 
 #define SAS_DEF_QD 256
 
-int sas_slave_configure(struct scsi_device *scsi_dev)
+int sas_device_configure(struct scsi_device *scsi_dev,
+		struct queue_limits *lim)
 {
 	struct domain_device *dev = sdev_to_domain_dev(scsi_dev);
 
 	BUG_ON(dev->rphy->identify.device_type != SAS_END_DEVICE);
 
 	if (dev_is_sata(dev)) {
-		ata_sas_slave_configure(scsi_dev, dev->sata_dev.ap);
+		ata_sas_device_configure(scsi_dev, lim, dev->sata_dev.ap);
 		return 0;
 	}
 
@@ -829,7 +830,7 @@ int sas_slave_configure(struct scsi_device *scsi_dev)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(sas_slave_configure);
+EXPORT_SYMBOL_GPL(sas_device_configure);
 
 int sas_change_queue_depth(struct scsi_device *sdev, int depth)
 {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 6dd9a4f9ca7c..f9c892f8940d 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -1151,7 +1151,8 @@ extern int ata_std_bios_param(struct scsi_device *sdev,
 			      sector_t capacity, int geom[]);
 extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev);
 extern int ata_scsi_slave_alloc(struct scsi_device *sdev);
-extern int ata_scsi_slave_config(struct scsi_device *sdev);
+int ata_scsi_device_configure(struct scsi_device *sdev,
+		struct queue_limits *lim);
 extern void ata_scsi_slave_destroy(struct scsi_device *sdev);
 extern int ata_scsi_change_queue_depth(struct scsi_device *sdev,
 				       int queue_depth);
@@ -1249,7 +1250,8 @@ extern struct ata_port *ata_sas_port_alloc(struct ata_host *,
 extern void ata_port_probe(struct ata_port *ap);
 extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap);
 extern void ata_sas_tport_delete(struct ata_port *ap);
-extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *);
+int ata_sas_device_configure(struct scsi_device *sdev, struct queue_limits *lim,
+		struct ata_port *ap);
 extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap);
 extern void ata_tf_to_fis(const struct ata_taskfile *tf,
 			  u8 pmp, int is_cmd, u8 *fis);
@@ -1415,13 +1417,13 @@ extern const struct attribute_group *ata_common_sdev_groups[];
 	__ATA_BASE_SHT(drv_name),				\
 	.can_queue		= ATA_DEF_QUEUE,		\
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
-	.slave_configure	= ata_scsi_slave_config
+	.device_configure	= ata_scsi_device_configure
 
 #define ATA_SUBBASE_SHT_QD(drv_name, drv_qd)			\
 	__ATA_BASE_SHT(drv_name),				\
 	.can_queue		= drv_qd,			\
 	.tag_alloc_policy	= BLK_TAG_ALLOC_RR,		\
-	.slave_configure	= ata_scsi_slave_config
+	.device_configure	= ata_scsi_device_configure
 
 #define ATA_BASE_SHT(drv_name)					\
 	ATA_SUBBASE_SHT(drv_name),				\
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index d06a0570f4c5..1324068dd950 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -683,7 +683,8 @@ int sas_phy_reset(struct sas_phy *phy, int hard_reset);
 int sas_phy_enable(struct sas_phy *phy, int enable);
 extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *);
 extern int sas_target_alloc(struct scsi_target *);
-extern int sas_slave_configure(struct scsi_device *);
+int sas_device_configure(struct scsi_device *dev,
+		struct queue_limits *lim);
 extern int sas_change_queue_depth(struct scsi_device *, int new_depth);
 extern int sas_bios_param(struct scsi_device *, struct block_device *,
 			  sector_t capacity, int *hsc);
@@ -749,7 +750,7 @@ void sas_notify_phy_event(struct asd_sas_phy *phy, enum phy_event event,
 #endif
 
 #define LIBSAS_SHT_BASE			_LIBSAS_SHT_BASE		\
-	.slave_configure		= sas_slave_configure,		\
+	.device_configure		= sas_device_configure,		\
 	.slave_alloc			= sas_slave_alloc,		\
 
 #define LIBSAS_SHT_BASE_NO_SLAVE_INIT	_LIBSAS_SHT_BASE
-- 
cgit v1.2.3


From a33a502d50fddc85e8d5072cf39ff8fe19ba3025 Mon Sep 17 00:00:00 2001
From: Avri Altman <avri.altman@wdc.com>
Date: Wed, 10 Apr 2024 21:37:19 +0300
Subject: scsi: ufs: Remove support for old UFSHCI versions

UFS spec version 2.1 was published more than 10 years ago. It is
vanishingly unlikely that even there are out there platforms that uses
earlier host controllers, let alone that those ancient platforms will ever
run a V6.10 kernel.  To be extra cautious, leave out removal of UFSHCI 2.0
support from this patch, and just remove support of host controllers prior
to UFS2.0.

This patch removes some legacy tuning calls that no longer apply.

Signed-off-by: Avri Altman <avri.altman@wdc.com>
Link: https://lore.kernel.org/r/20240410183720.908-2-avri.altman@wdc.com
Acked-by: Bean Huo <beanhuo@micron.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 drivers/ufs/core/ufshcd.c   | 158 ++++----------------------------------------
 drivers/ufs/host/ufs-qcom.c |   3 +-
 include/ufs/ufshcd.h        |   2 -
 include/ufs/ufshci.h        |  13 +---
 4 files changed, 15 insertions(+), 161 deletions(-)

(limited to 'include')

diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 62c8575f2c67..c72ef87ea867 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -748,8 +748,6 @@ static int ufshcd_wait_for_register(struct ufs_hba *hba, u32 reg, u32 mask,
  */
 static inline u32 ufshcd_get_intr_mask(struct ufs_hba *hba)
 {
-	if (hba->ufs_version == ufshci_version(1, 0))
-		return INTERRUPT_MASK_ALL_VER_10;
 	if (hba->ufs_version <= ufshci_version(2, 0))
 		return INTERRUPT_MASK_ALL_VER_11;
 
@@ -990,30 +988,6 @@ bool ufshcd_is_hba_active(struct ufs_hba *hba)
 }
 EXPORT_SYMBOL_GPL(ufshcd_is_hba_active);
 
-u32 ufshcd_get_local_unipro_ver(struct ufs_hba *hba)
-{
-	/* HCI version 1.0 and 1.1 supports UniPro 1.41 */
-	if (hba->ufs_version <= ufshci_version(1, 1))
-		return UFS_UNIPRO_VER_1_41;
-	else
-		return UFS_UNIPRO_VER_1_6;
-}
-EXPORT_SYMBOL(ufshcd_get_local_unipro_ver);
-
-static bool ufshcd_is_unipro_pa_params_tuning_req(struct ufs_hba *hba)
-{
-	/*
-	 * If both host and device support UniPro ver1.6 or later, PA layer
-	 * parameters tuning happens during link startup itself.
-	 *
-	 * We can manually tune PA layer parameters if either host or device
-	 * doesn't support UniPro ver 1.6 or later. But to keep manual tuning
-	 * logic simple, we will only do manual tuning if local unipro version
-	 * doesn't support ver1.6 or later.
-	 */
-	return ufshcd_get_local_unipro_ver(hba) < UFS_UNIPRO_VER_1_6;
-}
-
 /**
  * ufshcd_pm_qos_init - initialize PM QoS request
  * @hba: per adapter instance
@@ -2674,14 +2648,7 @@ static void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs)
 {
 	u32 set = ufshcd_readl(hba, REG_INTERRUPT_ENABLE);
 
-	if (hba->ufs_version == ufshci_version(1, 0)) {
-		u32 rw;
-		rw = set & INTERRUPT_MASK_RW_VER_10;
-		set = rw | ((set ^ intrs) & intrs);
-	} else {
-		set |= intrs;
-	}
-
+	set |= intrs;
 	ufshcd_writel(hba, set, REG_INTERRUPT_ENABLE);
 }
 
@@ -2694,16 +2661,7 @@ static void ufshcd_disable_intr(struct ufs_hba *hba, u32 intrs)
 {
 	u32 set = ufshcd_readl(hba, REG_INTERRUPT_ENABLE);
 
-	if (hba->ufs_version == ufshci_version(1, 0)) {
-		u32 rw;
-		rw = (set & INTERRUPT_MASK_RW_VER_10) &
-			~(intrs & INTERRUPT_MASK_RW_VER_10);
-		set = rw | ((set & intrs) & ~INTERRUPT_MASK_RW_VER_10);
-
-	} else {
-		set &= ~intrs;
-	}
-
+	set &= ~intrs;
 	ufshcd_writel(hba, set, REG_INTERRUPT_ENABLE);
 }
 
@@ -2715,21 +2673,17 @@ static void ufshcd_disable_intr(struct ufs_hba *hba, u32 intrs)
  * @upiu_flags: flags required in the header
  * @cmd_dir: requests data direction
  * @ehs_length: Total EHS Length (in 32‐bytes units of all Extra Header Segments)
- * @legacy_type: UTP_CMD_TYPE_SCSI or UTP_CMD_TYPE_DEV_MANAGE
  */
 static void
 ufshcd_prepare_req_desc_hdr(struct ufs_hba *hba, struct ufshcd_lrb *lrbp,
 			    u8 *upiu_flags, enum dma_data_direction cmd_dir,
-			    int ehs_length, enum utp_cmd_type legacy_type)
+			    int ehs_length)
 {
 	struct utp_transfer_req_desc *req_desc = lrbp->utr_descriptor_ptr;
 	struct request_desc_header *h = &req_desc->header;
 	enum utp_data_direction data_direction;
 
-	if (hba->ufs_version <= ufshci_version(1, 1))
-		lrbp->command_type = legacy_type;
-	else
-		lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
+	lrbp->command_type = UTP_CMD_TYPE_UFS_STORAGE;
 
 	*h = (typeof(*h)){ };
 
@@ -2863,7 +2817,7 @@ static int ufshcd_compose_devman_upiu(struct ufs_hba *hba,
 	u8 upiu_flags;
 	int ret = 0;
 
-	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0, UTP_CMD_TYPE_DEV_MANAGE);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0);
 
 	if (hba->dev_cmd.type == DEV_CMD_TYPE_QUERY)
 		ufshcd_prepare_utp_query_req_upiu(hba, lrbp, upiu_flags);
@@ -2887,8 +2841,7 @@ static void ufshcd_comp_scsi_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
 	unsigned int ioprio_class = IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
 	u8 upiu_flags;
 
-	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags,
-				    lrbp->cmd->sc_data_direction, 0, UTP_CMD_TYPE_SCSI);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, lrbp->cmd->sc_data_direction, 0);
 	if (ioprio_class == IOPRIO_CLASS_RT)
 		upiu_flags |= UPIU_CMD_FLAGS_CP;
 	ufshcd_prepare_utp_scsi_cmd_upiu(lrbp, upiu_flags);
@@ -5559,15 +5512,12 @@ void ufshcd_compl_one_cqe(struct ufs_hba *hba, int task_tag,
 		ufshcd_release_scsi_cmd(hba, lrbp);
 		/* Do not touch lrbp after scsi done */
 		scsi_done(cmd);
-	} else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
-		   lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
-		if (hba->dev_cmd.complete) {
-			if (cqe) {
-				ocs = le32_to_cpu(cqe->status) & MASK_OCS;
-				lrbp->utr_descriptor_ptr->header.ocs = ocs;
-			}
-			complete(hba->dev_cmd.complete);
+	} else if (hba->dev_cmd.complete) {
+		if (cqe) {
+			ocs = le32_to_cpu(cqe->status) & MASK_OCS;
+			lrbp->utr_descriptor_ptr->header.ocs = ocs;
 		}
+		complete(hba->dev_cmd.complete);
 	}
 }
 
@@ -7220,7 +7170,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct ufs_hba *hba,
 
 	ufshcd_setup_dev_cmd(hba, lrbp, cmd_type, 0, tag);
 
-	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0, UTP_CMD_TYPE_DEV_MANAGE);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, 0);
 
 	/* update the task tag in the request upiu */
 	req_upiu->header.task_tag = tag;
@@ -7372,7 +7322,7 @@ int ufshcd_advanced_rpmb_req_handler(struct ufs_hba *hba, struct utp_upiu_req *r
 
 	ufshcd_setup_dev_cmd(hba, lrbp, DEV_CMD_TYPE_RPMB, UFS_UPIU_RPMB_WLUN, tag);
 
-	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, ehs, UTP_CMD_TYPE_DEV_MANAGE);
+	ufshcd_prepare_req_desc_hdr(hba, lrbp, &upiu_flags, DMA_NONE, ehs);
 
 	/* update the task tag */
 	req_upiu->header.task_tag = tag;
@@ -8359,83 +8309,6 @@ static void ufs_put_device_desc(struct ufs_hba *hba)
 	dev_info->model = NULL;
 }
 
-/**
- * ufshcd_tune_pa_tactivate - Tunes PA_TActivate of local UniPro
- * @hba: per-adapter instance
- *
- * PA_TActivate parameter can be tuned manually if UniPro version is less than
- * 1.61. PA_TActivate needs to be greater than or equal to peerM-PHY's
- * RX_MIN_ACTIVATETIME_CAPABILITY attribute. This optimal value can help reduce
- * the hibern8 exit latency.
- *
- * Return: zero on success, non-zero error value on failure.
- */
-static int ufshcd_tune_pa_tactivate(struct ufs_hba *hba)
-{
-	int ret = 0;
-	u32 peer_rx_min_activatetime = 0, tuned_pa_tactivate;
-
-	ret = ufshcd_dme_peer_get(hba,
-				  UIC_ARG_MIB_SEL(
-					RX_MIN_ACTIVATETIME_CAPABILITY,
-					UIC_ARG_MPHY_RX_GEN_SEL_INDEX(0)),
-				  &peer_rx_min_activatetime);
-	if (ret)
-		goto out;
-
-	/* make sure proper unit conversion is applied */
-	tuned_pa_tactivate =
-		((peer_rx_min_activatetime * RX_MIN_ACTIVATETIME_UNIT_US)
-		 / PA_TACTIVATE_TIME_UNIT_US);
-	ret = ufshcd_dme_set(hba, UIC_ARG_MIB(PA_TACTIVATE),
-			     tuned_pa_tactivate);
-
-out:
-	return ret;
-}
-
-/**
- * ufshcd_tune_pa_hibern8time - Tunes PA_Hibern8Time of local UniPro
- * @hba: per-adapter instance
- *
- * PA_Hibern8Time parameter can be tuned manually if UniPro version is less than
- * 1.61. PA_Hibern8Time needs to be maximum of local M-PHY's
- * TX_HIBERN8TIME_CAPABILITY & peer M-PHY's RX_HIBERN8TIME_CAPABILITY.
- * This optimal value can help reduce the hibern8 exit latency.
- *
- * Return: zero on success, non-zero error value on failure.
- */
-static int ufshcd_tune_pa_hibern8time(struct ufs_hba *hba)
-{
-	int ret = 0;
-	u32 local_tx_hibern8_time_cap = 0, peer_rx_hibern8_time_cap = 0;
-	u32 max_hibern8_time, tuned_pa_hibern8time;
-
-	ret = ufshcd_dme_get(hba,
-			     UIC_ARG_MIB_SEL(TX_HIBERN8TIME_CAPABILITY,
-					UIC_ARG_MPHY_TX_GEN_SEL_INDEX(0)),
-				  &local_tx_hibern8_time_cap);
-	if (ret)
-		goto out;
-
-	ret = ufshcd_dme_peer_get(hba,
-				  UIC_ARG_MIB_SEL(RX_HIBERN8TIME_CAPABILITY,
-					UIC_ARG_MPHY_RX_GEN_SEL_INDEX(0)),
-				  &peer_rx_hibern8_time_cap);
-	if (ret)
-		goto out;
-
-	max_hibern8_time = max(local_tx_hibern8_time_cap,
-			       peer_rx_hibern8_time_cap);
-	/* make sure proper unit conversion is applied */
-	tuned_pa_hibern8time = ((max_hibern8_time * HIBERN8TIME_UNIT_US)
-				/ PA_HIBERN8_TIME_UNIT_US);
-	ret = ufshcd_dme_set(hba, UIC_ARG_MIB(PA_HIBERN8TIME),
-			     tuned_pa_hibern8time);
-out:
-	return ret;
-}
-
 /**
  * ufshcd_quirk_tune_host_pa_tactivate - Ensures that host PA_TACTIVATE is
  * less than device PA_TACTIVATE time.
@@ -8508,11 +8381,6 @@ out:
 
 static void ufshcd_tune_unipro_params(struct ufs_hba *hba)
 {
-	if (ufshcd_is_unipro_pa_params_tuning_req(hba)) {
-		ufshcd_tune_pa_tactivate(hba);
-		ufshcd_tune_pa_hibern8time(hba);
-	}
-
 	ufshcd_vops_apply_dev_quirks(hba);
 
 	if (hba->dev_quirks & UFS_DEVICE_QUIRK_PA_TACTIVATE)
diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c
index 3a91255036dd..0b02e697ea5b 100644
--- a/drivers/ufs/host/ufs-qcom.c
+++ b/drivers/ufs/host/ufs-qcom.c
@@ -528,8 +528,7 @@ static int ufs_qcom_link_startup_notify(struct ufs_hba *hba,
 		 * and device TX LCC are disabled once link startup is
 		 * completed.
 		 */
-		if (ufshcd_get_local_unipro_ver(hba) != UFS_UNIPRO_VER_1_41)
-			err = ufshcd_disable_host_tx_lcc(hba);
+		err = ufshcd_disable_host_tx_lcc(hba);
 
 		break;
 	default:
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index a35e12f8e68b..431af365cf5e 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1390,8 +1390,6 @@ void ufshcd_release(struct ufs_hba *hba);
 
 void ufshcd_clkgate_delay_set(struct device *dev, unsigned long value);
 
-u32 ufshcd_get_local_unipro_ver(struct ufs_hba *hba);
-
 int ufshcd_get_vreg(struct device *dev, struct ufs_vreg *vreg);
 
 int ufshcd_send_uic_cmd(struct ufs_hba *hba, struct uic_command *uic_cmd);
diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h
index 88193f5540e5..385e1c6b8d60 100644
--- a/include/ufs/ufshci.h
+++ b/include/ufs/ufshci.h
@@ -355,12 +355,8 @@ enum {
 
 /* Interrupt disable masks */
 enum {
-	/* Interrupt disable mask for UFSHCI v1.0 */
-	INTERRUPT_MASK_ALL_VER_10	= 0x30FFF,
-	INTERRUPT_MASK_RW_VER_10	= 0x30000,
-
 	/* Interrupt disable mask for UFSHCI v1.1 */
-	INTERRUPT_MASK_ALL_VER_11	= 0x31FFF,
+	INTERRUPT_MASK_ALL_VER_11       = 0x31FFF,
 
 	/* Interrupt disable mask for UFSHCI v2.1 */
 	INTERRUPT_MASK_ALL_VER_21	= 0x71FFF,
@@ -425,13 +421,6 @@ union ufs_crypto_cfg_entry {
  * Request Descriptor Definitions
  */
 
-/* Transfer request command type */
-enum utp_cmd_type {
-	UTP_CMD_TYPE_SCSI		= 0x0,
-	UTP_CMD_TYPE_UFS		= 0x1,
-	UTP_CMD_TYPE_DEV_MANAGE		= 0x2,
-};
-
 /* To accommodate UFS2.0 required Command type */
 enum {
 	UTP_CMD_TYPE_UFS_STORAGE	= 0x1,
-- 
cgit v1.2.3


From f6d827b180bda01f8805bf5e85307419b0d6f890 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Wed, 10 Apr 2024 12:05:01 -0700
Subject: net: move skb ref helpers to new header

Add a new header, linux/skbuff_ref.h, which contains all the skb_*_ref()
helpers. Many of the consumers of skbuff.h do not actually use any of
the skb ref helpers, and we can speed up compilation a bit by minimizing
this header file.

Additionally in the later patch in the series we add page_pool support
to skb_frag_ref(), which requires some page_pool dependencies. We can
now add these dependencies to skbuff_ref.h instead of a very ubiquitous
skbuff.h

Signed-off-by: Mina Almasry <almasrymina@google.com>
Link: https://lore.kernel.org/r/20240410190505.1225848-2-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c      |  1 +
 drivers/net/ethernet/marvell/sky2.c                |  1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |  1 +
 drivers/net/ethernet/sun/cassini.c                 |  1 +
 drivers/net/veth.c                                 |  1 +
 drivers/net/xen-netback/netback.c                  |  1 +
 include/linux/skbuff.h                             | 63 ------------------
 include/linux/skbuff_ref.h                         | 75 ++++++++++++++++++++++
 net/core/gro.c                                     |  1 +
 net/core/skbuff.c                                  |  1 +
 net/ipv4/esp4.c                                    |  1 +
 net/ipv4/tcp_output.c                              |  1 +
 net/ipv6/esp6.c                                    |  1 +
 net/tls/tls_device.c                               |  1 +
 net/tls/tls_device_fallback.c                      |  1 +
 net/tls/tls_strp.c                                 |  1 +
 16 files changed, 89 insertions(+), 63 deletions(-)
 create mode 100644 include/linux/skbuff_ref.h

(limited to 'include')

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
index 6482728794dd..e8e460a92e0e 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
@@ -10,6 +10,7 @@
 #include <net/ipv6.h>
 #include <linux/netdevice.h>
 #include <crypto/aes.h>
+#include <linux/skbuff_ref.h>
 #include "chcr_ktls.h"
 
 static LIST_HEAD(uld_ctx_list);
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 07720841a8d7..f3f7f4cc27b3 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -34,6 +34,7 @@
 #include <linux/mii.h>
 #include <linux/of_net.h>
 #include <linux/dmi.h>
+#include <linux/skbuff_ref.h>
 
 #include <asm/irq.h>
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index eac49657bd07..8328df8645d5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -42,6 +42,7 @@
 #include <linux/if_vlan.h>
 #include <linux/vmalloc.h>
 #include <linux/irq.h>
+#include <linux/skbuff_ref.h>
 
 #include <net/ip.h>
 #if IS_ENABLED(CONFIG_IPV6)
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index bfb903506367..8f1f43dbb76d 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -73,6 +73,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
 #include <linux/ethtool.h>
 #include <linux/crc32.h>
 #include <linux/random.h>
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index bcdfbf61eb66..426e68a95067 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -26,6 +26,7 @@
 #include <linux/ptr_ring.h>
 #include <linux/bpf_trace.h>
 #include <linux/net_tstamp.h>
+#include <linux/skbuff_ref.h>
 #include <net/page_pool/helpers.h>
 
 #define DRV_NAME	"veth"
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index ef76850d9bcd..48254fc07d64 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -38,6 +38,7 @@
 #include <linux/if_vlan.h>
 #include <linux/udp.h>
 #include <linux/highmem.h>
+#include <linux/skbuff_ref.h>
 
 #include <net/tcp.h>
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 7135a3e94afd..4072a7ee3859 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3492,73 +3492,10 @@ static inline struct page *skb_frag_page(const skb_frag_t *frag)
 	return netmem_to_page(frag->netmem);
 }
 
-/**
- * __skb_frag_ref - take an addition reference on a paged fragment.
- * @frag: the paged fragment
- *
- * Takes an additional reference on the paged fragment @frag.
- */
-static inline void __skb_frag_ref(skb_frag_t *frag)
-{
-	get_page(skb_frag_page(frag));
-}
-
-/**
- * skb_frag_ref - take an addition reference on a paged fragment of an skb.
- * @skb: the buffer
- * @f: the fragment offset.
- *
- * Takes an additional reference on the @f'th paged fragment of @skb.
- */
-static inline void skb_frag_ref(struct sk_buff *skb, int f)
-{
-	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
-}
-
 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 		    unsigned int headroom);
 int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
 			 struct bpf_prog *prog);
-bool napi_pp_put_page(struct page *page);
-
-static inline void
-skb_page_unref(struct page *page, bool recycle)
-{
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_put_page(page))
-		return;
-#endif
-	put_page(page);
-}
-
-/**
- * __skb_frag_unref - release a reference on a paged fragment.
- * @frag: the paged fragment
- * @recycle: recycle the page if allocated via page_pool
- *
- * Releases a reference on the paged fragment @frag
- * or recycles the page via the page_pool API.
- */
-static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
-{
-	skb_page_unref(skb_frag_page(frag), recycle);
-}
-
-/**
- * skb_frag_unref - release a reference on a paged fragment of an skb.
- * @skb: the buffer
- * @f: the fragment offset
- *
- * Releases a reference on the @f'th paged fragment of @skb.
- */
-static inline void skb_frag_unref(struct sk_buff *skb, int f)
-{
-	struct skb_shared_info *shinfo = skb_shinfo(skb);
-
-	if (!skb_zcopy_managed(skb))
-		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
-}
-
 /**
  * skb_frag_address - gets the address of the data contained in a paged fragment
  * @frag: the paged fragment buffer
diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
new file mode 100644
index 000000000000..11f0a4063403
--- /dev/null
+++ b/include/linux/skbuff_ref.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ *	Skb ref helpers.
+ *
+ */
+
+#ifndef _LINUX_SKBUFF_REF_H
+#define _LINUX_SKBUFF_REF_H
+
+#include <linux/skbuff.h>
+
+/**
+ * __skb_frag_ref - take an addition reference on a paged fragment.
+ * @frag: the paged fragment
+ *
+ * Takes an additional reference on the paged fragment @frag.
+ */
+static inline void __skb_frag_ref(skb_frag_t *frag)
+{
+	get_page(skb_frag_page(frag));
+}
+
+/**
+ * skb_frag_ref - take an addition reference on a paged fragment of an skb.
+ * @skb: the buffer
+ * @f: the fragment offset.
+ *
+ * Takes an additional reference on the @f'th paged fragment of @skb.
+ */
+static inline void skb_frag_ref(struct sk_buff *skb, int f)
+{
+	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
+}
+
+bool napi_pp_put_page(struct page *page);
+
+static inline void
+skb_page_unref(struct page *page, bool recycle)
+{
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && napi_pp_put_page(page))
+		return;
+#endif
+	put_page(page);
+}
+
+/**
+ * __skb_frag_unref - release a reference on a paged fragment.
+ * @frag: the paged fragment
+ * @recycle: recycle the page if allocated via page_pool
+ *
+ * Releases a reference on the paged fragment @frag
+ * or recycles the page via the page_pool API.
+ */
+static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
+{
+	skb_page_unref(skb_frag_page(frag), recycle);
+}
+
+/**
+ * skb_frag_unref - release a reference on a paged fragment of an skb.
+ * @skb: the buffer
+ * @f: the fragment offset
+ *
+ * Releases a reference on the @f'th paged fragment of @skb.
+ */
+static inline void skb_frag_unref(struct sk_buff *skb, int f)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (!skb_zcopy_managed(skb))
+		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
+}
+
+#endif	/* _LINUX_SKBUFF_REF_H */
diff --git a/net/core/gro.c b/net/core/gro.c
index 83f35d99a682..2459ab697f7f 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -3,6 +3,7 @@
 #include <net/dst_metadata.h>
 #include <net/busy_poll.h>
 #include <trace/events/net.h>
+#include <linux/skbuff_ref.h>
 
 #define MAX_GRO_SKBS 8
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ab970ded8a7b..2554a6f5f386 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -51,6 +51,7 @@
 #endif
 #include <linux/string.h>
 #include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
 #include <linux/splice.h>
 #include <linux/cache.h>
 #include <linux/rtnetlink.h>
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 40330253f076..dff04580318f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -20,6 +20,7 @@
 #include <net/udp.h>
 #include <net/tcp.h>
 #include <net/espintcp.h>
+#include <linux/skbuff_ref.h>
 
 #include <linux/highmem.h>
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9282fafc0e61..61119d42b0fd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -44,6 +44,7 @@
 #include <linux/gfp.h>
 #include <linux/module.h>
 #include <linux/static_key.h>
+#include <linux/skbuff_ref.h>
 
 #include <trace/events/tcp.h>
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index fb431d0a3475..6bc0a84c8d05 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -36,6 +36,7 @@
 #include <net/tcp.h>
 #include <net/espintcp.h>
 #include <net/inet6_hashtables.h>
+#include <linux/skbuff_ref.h>
 
 #include <linux/highmem.h>
 
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index bf8ed36b1ad6..ab6e694f7bc2 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -37,6 +37,7 @@
 #include <net/inet_connection_sock.h>
 #include <net/tcp.h>
 #include <net/tls.h>
+#include <linux/skbuff_ref.h>
 
 #include "tls.h"
 #include "trace.h"
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 4e7228f275fa..f9e3d3d90dcf 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -33,6 +33,7 @@
 #include <crypto/aead.h>
 #include <crypto/scatterwalk.h>
 #include <net/ip6_checksum.h>
+#include <linux/skbuff_ref.h>
 
 #include "tls.h"
 
diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c
index ca1e0e198ceb..58c4b06f4f0c 100644
--- a/net/tls/tls_strp.c
+++ b/net/tls/tls_strp.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2016 Tom Herbert <tom@herbertland.com> */
 
 #include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
 #include <linux/workqueue.h>
 #include <net/strparser.h>
 #include <net/tcp.h>
-- 
cgit v1.2.3


From a580ea994fd37f4105028f5a85c38ff6508a2b25 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Wed, 10 Apr 2024 12:05:02 -0700
Subject: net: mirror skb frag ref/unref helpers

Refactor some of the skb frag ref/unref helpers for improved clarity.

Implement napi_pp_get_page() to be the mirror counterpart of
napi_pp_put_page().

Implement skb_page_ref() to be the mirror of skb_page_unref().

Improve __skb_frag_ref() to become a mirror counterpart of
__skb_frag_unref(). Previously unref could handle pp & non-pp pages,
while the ref could only handle non-pp pages. Now both the ref & unref
helpers can correctly handle both pp & non-pp pages.

Now that __skb_frag_ref() can handle both pp & non-pp pages, remove
skb_pp_frag_ref(), and use __skb_frag_ref() instead.  This lets us
remove pp specific handling from skb_try_coalesce.

Additionally, since __skb_frag_ref() can now handle both pp & non-pp
pages, a latent issue in skb_shift() should now be fixed. Previously
this function would do a non-pp ref & pp unref on potential pp frags
(fragfrom). After this patch, skb_shift() should correctly do a pp
ref/unref on pp frags.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
Link: https://lore.kernel.org/r/20240410190505.1225848-3-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c      |  2 +-
 drivers/net/ethernet/sun/cassini.c                 |  4 +-
 drivers/net/veth.c                                 |  2 +-
 include/linux/skbuff_ref.h                         | 39 ++++++++++++++++--
 net/core/skbuff.c                                  | 46 ++--------------------
 net/tls/tls_device_fallback.c                      |  2 +-
 6 files changed, 44 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
index e8e460a92e0e..3832c2e8ea5a 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
@@ -1659,7 +1659,7 @@ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb,
 	for (i = 0; i < record->num_frags; i++) {
 		skb_shinfo(nskb)->frags[i] = record->frags[i];
 		/* increase the frag ref count */
-		__skb_frag_ref(&skb_shinfo(nskb)->frags[i]);
+		__skb_frag_ref(&skb_shinfo(nskb)->frags[i], nskb->pp_recycle);
 	}
 
 	skb_shinfo(nskb)->nr_frags = record->num_frags;
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index 8f1f43dbb76d..f058e154a3bc 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -2000,7 +2000,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
 		skb->len      += hlen - swivel;
 
 		skb_frag_fill_page_desc(frag, page->buffer, off, hlen - swivel);
-		__skb_frag_ref(frag);
+		__skb_frag_ref(frag, skb->pp_recycle);
 
 		/* any more data? */
 		if ((words[0] & RX_COMP1_SPLIT_PKT) && ((dlen -= hlen) > 0)) {
@@ -2024,7 +2024,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
 			frag++;
 
 			skb_frag_fill_page_desc(frag, page->buffer, 0, hlen);
-			__skb_frag_ref(frag);
+			__skb_frag_ref(frag, skb->pp_recycle);
 			RX_USED_ADD(page, hlen + cp->crc_size);
 		}
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 426e68a95067..0b0293629329 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -717,7 +717,7 @@ static void veth_xdp_get(struct xdp_buff *xdp)
 		return;
 
 	for (i = 0; i < sinfo->nr_frags; i++)
-		__skb_frag_ref(&sinfo->frags[i]);
+		__skb_frag_ref(&sinfo->frags[i], false);
 }
 
 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
index 11f0a4063403..4dcdbe9fbc5f 100644
--- a/include/linux/skbuff_ref.h
+++ b/include/linux/skbuff_ref.h
@@ -8,16 +8,47 @@
 #define _LINUX_SKBUFF_REF_H
 
 #include <linux/skbuff.h>
+#include <net/page_pool/helpers.h>
+
+#ifdef CONFIG_PAGE_POOL
+static inline bool is_pp_page(struct page *page)
+{
+	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+}
+
+static inline bool napi_pp_get_page(struct page *page)
+{
+	page = compound_head(page);
+
+	if (!is_pp_page(page))
+		return false;
+
+	page_pool_ref_page(page);
+	return true;
+}
+#endif
+
+static inline void skb_page_ref(struct page *page, bool recycle)
+{
+#ifdef CONFIG_PAGE_POOL
+	if (recycle && napi_pp_get_page(page))
+		return;
+#endif
+	get_page(page);
+}
 
 /**
  * __skb_frag_ref - take an addition reference on a paged fragment.
  * @frag: the paged fragment
+ * @recycle: skb->pp_recycle param of the parent skb. False if no parent skb.
  *
- * Takes an additional reference on the paged fragment @frag.
+ * Takes an additional reference on the paged fragment @frag. Obtains the
+ * correct reference count depending on whether skb->pp_recycle is set and
+ * whether the frag is a page pool frag.
  */
-static inline void __skb_frag_ref(skb_frag_t *frag)
+static inline void __skb_frag_ref(skb_frag_t *frag, bool recycle)
 {
-	get_page(skb_frag_page(frag));
+	skb_page_ref(skb_frag_page(frag), recycle);
 }
 
 /**
@@ -29,7 +60,7 @@ static inline void __skb_frag_ref(skb_frag_t *frag)
  */
 static inline void skb_frag_ref(struct sk_buff *skb, int f)
 {
-	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
+	__skb_frag_ref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
 }
 
 bool napi_pp_put_page(struct page *page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2554a6f5f386..ea052fa710d8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -907,11 +907,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
-static bool is_pp_page(struct page *page)
-{
-	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
-}
-
 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 		    unsigned int headroom)
 {
@@ -1033,37 +1028,6 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 	return napi_pp_put_page(virt_to_page(data));
 }
 
-/**
- * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
- * @skb:	page pool aware skb
- *
- * Increase the fragment reference count (pp_ref_count) of a skb. This is
- * intended to gain fragment references only for page pool aware skbs,
- * i.e. when skb->pp_recycle is true, and not for fragments in a
- * non-pp-recycling skb. It has a fallback to increase references on normal
- * pages, as page pool aware skbs may also have normal page fragments.
- */
-static int skb_pp_frag_ref(struct sk_buff *skb)
-{
-	struct skb_shared_info *shinfo;
-	struct page *head_page;
-	int i;
-
-	if (!skb->pp_recycle)
-		return -EINVAL;
-
-	shinfo = skb_shinfo(skb);
-
-	for (i = 0; i < shinfo->nr_frags; i++) {
-		head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
-		if (likely(is_pp_page(head_page)))
-			page_pool_ref_page(head_page);
-		else
-			page_ref_inc(head_page);
-	}
-	return 0;
-}
-
 static void skb_kfree_head(void *head, unsigned int end_offset)
 {
 	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
@@ -4176,7 +4140,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 			to++;
 
 		} else {
-			__skb_frag_ref(fragfrom);
+			__skb_frag_ref(fragfrom, skb->pp_recycle);
 			skb_frag_page_copy(fragto, fragfrom);
 			skb_frag_off_copy(fragto, fragfrom);
 			skb_frag_size_set(fragto, todo);
@@ -4826,7 +4790,7 @@ normal:
 			}
 
 			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
-			__skb_frag_ref(nskb_frag);
+			__skb_frag_ref(nskb_frag, nskb->pp_recycle);
 			size = skb_frag_size(nskb_frag);
 
 			if (pos < offset) {
@@ -5957,10 +5921,8 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	/* if the skb is not cloned this does nothing
 	 * since we set nr_frags to 0.
 	 */
-	if (skb_pp_frag_ref(from)) {
-		for (i = 0; i < from_shinfo->nr_frags; i++)
-			__skb_frag_ref(&from_shinfo->frags[i]);
-	}
+	for (i = 0; i < from_shinfo->nr_frags; i++)
+		__skb_frag_ref(&from_shinfo->frags[i], from->pp_recycle);
 
 	to->truesize += delta;
 	to->len += len;
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index f9e3d3d90dcf..9237dded4467 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -278,7 +278,7 @@ static int fill_sg_in(struct scatterlist *sg_in,
 	for (i = 0; remaining > 0; i++) {
 		skb_frag_t *frag = &record->frags[i];
 
-		__skb_frag_ref(frag);
+		__skb_frag_ref(frag, false);
 		sg_set_page(sg_in + i, skb_frag_page(frag),
 			    skb_frag_size(frag), skb_frag_off(frag));
 
-- 
cgit v1.2.3


From e1eb10f8ef1068e565b24daa3e95bd8bc95d8cdc Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Wed, 10 Apr 2024 11:47:17 +0000
Subject: flow_offload: fix flow_offload_has_one_action() kdoc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

include/net/flow_offload.h:351: warning:
  No description found for return value of 'flow_offload_has_one_action'

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansen-van-vuuren@amd.com>
Link: https://lore.kernel.org/r/20240410114718.15145-1-ast@fiberby.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/flow_offload.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 314087a5e181..7726b4a97050 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -345,7 +345,7 @@ static inline bool flow_action_has_entries(const struct flow_action *action)
  * flow_offload_has_one_action() - check if exactly one action is present
  * @action: tc filter flow offload action
  *
- * Returns true if exactly one action is present.
+ * Return: true if exactly one action is present.
  */
 static inline bool flow_offload_has_one_action(const struct flow_action *action)
 {
-- 
cgit v1.2.3


From 5618603f5d063c8d769d2e64cd440f787567fd13 Mon Sep 17 00:00:00 2001
From: Guillaume Nault <gnault@redhat.com>
Date: Wed, 10 Apr 2024 15:14:29 +0200
Subject: ipv4: Remove RTO_ONLINK.

RTO_ONLINK was a flag used in ->flowi4_tos that allowed to alter the
scope of an IPv4 route lookup. Setting this flag was equivalent to
specifying RT_SCOPE_LINK in ->flowi4_scope.

With commit ec20b2830093 ("ipv4: Set scope explicitly in
ip_route_output()."), the last users of RTO_ONLINK have been removed.
Therefore, we can now drop the code that checked this bit and stop
modifying ->flowi4_scope in ip_route_output_key_hash().

Signed-off-by: Guillaume Nault <gnault@redhat.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/57de760565cab55df7b129f523530ac6475865b2.1712754146.git.gnault@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/route.h |  2 --
 net/ipv4/route.c    | 14 +-------------
 2 files changed, 1 insertion(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/net/route.h b/include/net/route.h
index 315a8acee6c6..630d1ef6868a 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -35,8 +35,6 @@
 #include <linux/cache.h>
 #include <linux/security.h>
 
-#define RTO_ONLINK	0x01
-
 static inline __u8 ip_sock_rt_scope(const struct sock *sk)
 {
 	if (sock_flag(sk, SOCK_LOCALROUTE))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index d36ace160d42..6133f1fd848e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -106,9 +106,6 @@
 
 #include "fib_lookup.h"
 
-#define RT_FL_TOS(oldflp4) \
-	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
-
 #define RT_GC_TIMEOUT (300*HZ)
 
 #define DEFAULT_MIN_PMTU (512 + 20 + 20)
@@ -498,15 +495,6 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 }
 EXPORT_SYMBOL(__ip_select_ident);
 
-static void ip_rt_fix_tos(struct flowi4 *fl4)
-{
-	__u8 tos = RT_FL_TOS(fl4);
-
-	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
-	if (tos & RTO_ONLINK)
-		fl4->flowi4_scope = RT_SCOPE_LINK;
-}
-
 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 			     const struct sock *sk, const struct iphdr *iph,
 			     int oif, __u8 tos, u8 prot, u32 mark,
@@ -2636,7 +2624,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 	struct rtable *rth;
 
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
-	ip_rt_fix_tos(fl4);
+	fl4->flowi4_tos &= IPTOS_RT_MASK;
 
 	rcu_read_lock();
 	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
-- 
cgit v1.2.3


From dd0c9855b41310470086500c9963bbb64bb90dd0 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 10 Apr 2024 20:42:38 +0100
Subject: net: dsa: introduce dsa_phylink_to_port()

We convert from a phylink_config struct to a dsa_port struct in many
places, let's provide a helper for this.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Link: https://lore.kernel.org/r/E1rudqA-006K9B-85@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h |  6 ++++++
 net/dsa/port.c    | 12 ++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 7c0da9effe4e..f228b479a5fd 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -327,6 +327,12 @@ struct dsa_port {
 	};
 };
 
+static inline struct dsa_port *
+dsa_phylink_to_port(struct phylink_config *config)
+{
+	return container_of(config, struct dsa_port, pl_config);
+}
+
 /* TODO: ideally DSA ports would have a single dp->link_dp member,
  * and no dst->rtable nor this struct dsa_link would be needed,
  * but this would require some more complex tree walking,
diff --git a/net/dsa/port.c b/net/dsa/port.c
index c42dac87671b..02bf1c306bdc 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1558,7 +1558,7 @@ static struct phylink_pcs *
 dsa_port_phylink_mac_select_pcs(struct phylink_config *config,
 				phy_interface_t interface)
 {
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct phylink_pcs *pcs = ERR_PTR(-EOPNOTSUPP);
 	struct dsa_switch *ds = dp->ds;
 
@@ -1572,7 +1572,7 @@ static int dsa_port_phylink_mac_prepare(struct phylink_config *config,
 					unsigned int mode,
 					phy_interface_t interface)
 {
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct dsa_switch *ds = dp->ds;
 	int err = 0;
 
@@ -1587,7 +1587,7 @@ static void dsa_port_phylink_mac_config(struct phylink_config *config,
 					unsigned int mode,
 					const struct phylink_link_state *state)
 {
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_config)
@@ -1600,7 +1600,7 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config,
 				       unsigned int mode,
 				       phy_interface_t interface)
 {
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct dsa_switch *ds = dp->ds;
 	int err = 0;
 
@@ -1615,7 +1615,7 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
 					   unsigned int mode,
 					   phy_interface_t interface)
 {
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct phy_device *phydev = NULL;
 	struct dsa_switch *ds = dp->ds;
 
@@ -1638,7 +1638,7 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
 					 int speed, int duplex,
 					 bool tx_pause, bool rx_pause)
 {
-	struct dsa_port *dp = container_of(config, struct dsa_port, pl_config);
+	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct dsa_switch *ds = dp->ds;
 
 	if (!ds->ops->phylink_mac_link_up) {
-- 
cgit v1.2.3


From cae425cb43feddd9fd62fc1b25567f9463da4915 Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Wed, 10 Apr 2024 20:42:43 +0100
Subject: net: dsa: allow DSA switch drivers to provide their own phylink mac
 ops

Rather than having a shim for each and every phylink MAC operation,
allow DSA switch drivers to provide their own ops structure. When a
DSA driver provides the phylink MAC operations, the shimmed ops must
not be provided, so fail an attempt to register a switch with both
the phylink_mac_ops in struct dsa_switch and the phylink_mac_*
operations populated in dsa_switch_ops populated.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Link: https://lore.kernel.org/r/E1rudqF-006K9H-Cc@rmk-PC.armlinux.org.uk
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h |  5 +++++
 net/dsa/dsa.c     | 11 +++++++++++
 net/dsa/port.c    | 26 ++++++++++++++++++++------
 3 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index f228b479a5fd..7edfd8de8882 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -457,6 +457,11 @@ struct dsa_switch {
 	 */
 	const struct dsa_switch_ops	*ops;
 
+	/*
+	 * Allow a DSA switch driver to override the phylink MAC ops
+	 */
+	const struct phylink_mac_ops	*phylink_mac_ops;
+
 	/*
 	 * User mii_bus and devices for the individual ports.
 	 */
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 09d2f5d4b3dd..2f347cd37316 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1505,6 +1505,17 @@ static int dsa_switch_probe(struct dsa_switch *ds)
 	if (!ds->num_ports)
 		return -EINVAL;
 
+	if (ds->phylink_mac_ops) {
+		if (ds->ops->phylink_mac_select_pcs ||
+		    ds->ops->phylink_mac_prepare ||
+		    ds->ops->phylink_mac_config ||
+		    ds->ops->phylink_mac_finish ||
+		    ds->ops->phylink_mac_link_down ||
+		    ds->ops->phylink_mac_link_up ||
+		    ds->ops->adjust_link)
+			return -EINVAL;
+	}
+
 	if (np) {
 		err = dsa_switch_parse_of(ds, np);
 		if (err)
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 02bf1c306bdc..c6febc3d96d9 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1662,6 +1662,7 @@ static const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
 
 int dsa_port_phylink_create(struct dsa_port *dp)
 {
+	const struct phylink_mac_ops *mac_ops;
 	struct dsa_switch *ds = dp->ds;
 	phy_interface_t mode;
 	struct phylink *pl;
@@ -1685,8 +1686,12 @@ int dsa_port_phylink_create(struct dsa_port *dp)
 		}
 	}
 
-	pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn),
-			    mode, &dsa_port_phylink_mac_ops);
+	mac_ops = &dsa_port_phylink_mac_ops;
+	if (ds->phylink_mac_ops)
+		mac_ops = ds->phylink_mac_ops;
+
+	pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode,
+			    mac_ops);
 	if (IS_ERR(pl)) {
 		pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl));
 		return PTR_ERR(pl);
@@ -1952,12 +1957,23 @@ static void dsa_shared_port_validate_of(struct dsa_port *dp,
 		dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
 }
 
+static void dsa_shared_port_link_down(struct dsa_port *dp)
+{
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down)
+		ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED,
+						   PHY_INTERFACE_MODE_NA);
+	else if (ds->ops->phylink_mac_link_down)
+		ds->ops->phylink_mac_link_down(ds, dp->index, MLO_AN_FIXED,
+					       PHY_INTERFACE_MODE_NA);
+}
+
 int dsa_shared_port_link_register_of(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
 	bool missing_link_description;
 	bool missing_phy_mode;
-	int port = dp->index;
 
 	dsa_shared_port_validate_of(dp, &missing_phy_mode,
 				    &missing_link_description);
@@ -1973,9 +1989,7 @@ int dsa_shared_port_link_register_of(struct dsa_port *dp)
 				 "Skipping phylink registration for %s port %d\n",
 				 dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
 		} else {
-			if (ds->ops->phylink_mac_link_down)
-				ds->ops->phylink_mac_link_down(ds, port,
-					MLO_AN_FIXED, PHY_INTERFACE_MODE_NA);
+			dsa_shared_port_link_down(dp);
 
 			return dsa_shared_port_phylink_register(dp);
 		}
-- 
cgit v1.2.3


From df5df1257c9f2af6621976f1e18a088182230d06 Mon Sep 17 00:00:00 2001
From: Gabriel Fernandez <gabriel.fernandez@foss.st.com>
Date: Thu, 11 Apr 2024 11:24:51 +0200
Subject: dt-bindings: clocks: stm32mp25: add description of all parents

RCC driver uses '.index' to define all parent clocks instead '.names'
because the use of a name to define a parent clock is discouraged. This
is an ABI change, but the RCC driver has not yet merged, unlike all
others drivers besides Linux.

Fixes: b5be49db3d47 ("dt-bindings: stm32: add clocks and reset binding for stm32mp25 platform")
Signed-off-by: Gabriel Fernandez <gabriel.fernandez@foss.st.com>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240411092453.243633-3-gabriel.fernandez@foss.st.com
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 .../bindings/clock/st,stm32mp25-rcc.yaml           | 170 +++++++++++++++++++--
 include/dt-bindings/reset/st,stm32mp25-rcc.h       |   2 +-
 2 files changed, 156 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/st,stm32mp25-rcc.yaml b/Documentation/devicetree/bindings/clock/st,stm32mp25-rcc.yaml
index 7732e79a42b9..0929fa7e271b 100644
--- a/Documentation/devicetree/bindings/clock/st,stm32mp25-rcc.yaml
+++ b/Documentation/devicetree/bindings/clock/st,stm32mp25-rcc.yaml
@@ -38,14 +38,81 @@ properties:
       - description: CK_SCMI_MSI Low Power Internal oscillator (~ 4 MHz or ~ 16 MHz)
       - description: CK_SCMI_LSE Low Speed External oscillator (32 KHz)
       - description: CK_SCMI_LSI Low Speed Internal oscillator (~ 32 KHz)
-
-  clock-names:
-    items:
-      - const: hse
-      - const: hsi
-      - const: msi
-      - const: lse
-      - const: lsi
+      - description: CK_SCMI_HSE_DIV2 CK_SCMI_HSE divided by 2 (coud be gated)
+      - description: CK_SCMI_ICN_HS_MCU High Speed interconnect bus clock
+      - description: CK_SCMI_ICN_LS_MCU Low Speed interconnect bus clock
+      - description: CK_SCMI_ICN_SDMMC SDMMC interconnect bus clock
+      - description: CK_SCMI_ICN_DDR DDR interconnect bus clock
+      - description: CK_SCMI_ICN_DISPLAY Display interconnect bus clock
+      - description: CK_SCMI_ICN_HSL HSL interconnect bus clock
+      - description: CK_SCMI_ICN_NIC NIC interconnect bus clock
+      - description: CK_SCMI_ICN_VID Video interconnect bus clock
+      - description: CK_SCMI_FLEXGEN_07 flexgen clock 7
+      - description: CK_SCMI_FLEXGEN_08 flexgen clock 8
+      - description: CK_SCMI_FLEXGEN_09 flexgen clock 9
+      - description: CK_SCMI_FLEXGEN_10 flexgen clock 10
+      - description: CK_SCMI_FLEXGEN_11 flexgen clock 11
+      - description: CK_SCMI_FLEXGEN_12 flexgen clock 12
+      - description: CK_SCMI_FLEXGEN_13 flexgen clock 13
+      - description: CK_SCMI_FLEXGEN_14 flexgen clock 14
+      - description: CK_SCMI_FLEXGEN_15 flexgen clock 15
+      - description: CK_SCMI_FLEXGEN_16 flexgen clock 16
+      - description: CK_SCMI_FLEXGEN_17 flexgen clock 17
+      - description: CK_SCMI_FLEXGEN_18 flexgen clock 18
+      - description: CK_SCMI_FLEXGEN_19 flexgen clock 19
+      - description: CK_SCMI_FLEXGEN_20 flexgen clock 20
+      - description: CK_SCMI_FLEXGEN_21 flexgen clock 21
+      - description: CK_SCMI_FLEXGEN_22 flexgen clock 22
+      - description: CK_SCMI_FLEXGEN_23 flexgen clock 23
+      - description: CK_SCMI_FLEXGEN_24 flexgen clock 24
+      - description: CK_SCMI_FLEXGEN_25 flexgen clock 25
+      - description: CK_SCMI_FLEXGEN_26 flexgen clock 26
+      - description: CK_SCMI_FLEXGEN_27 flexgen clock 27
+      - description: CK_SCMI_FLEXGEN_28 flexgen clock 28
+      - description: CK_SCMI_FLEXGEN_29 flexgen clock 29
+      - description: CK_SCMI_FLEXGEN_30 flexgen clock 30
+      - description: CK_SCMI_FLEXGEN_31 flexgen clock 31
+      - description: CK_SCMI_FLEXGEN_32 flexgen clock 32
+      - description: CK_SCMI_FLEXGEN_33 flexgen clock 33
+      - description: CK_SCMI_FLEXGEN_34 flexgen clock 34
+      - description: CK_SCMI_FLEXGEN_35 flexgen clock 35
+      - description: CK_SCMI_FLEXGEN_36 flexgen clock 36
+      - description: CK_SCMI_FLEXGEN_37 flexgen clock 37
+      - description: CK_SCMI_FLEXGEN_38 flexgen clock 38
+      - description: CK_SCMI_FLEXGEN_39 flexgen clock 39
+      - description: CK_SCMI_FLEXGEN_40 flexgen clock 40
+      - description: CK_SCMI_FLEXGEN_41 flexgen clock 41
+      - description: CK_SCMI_FLEXGEN_42 flexgen clock 42
+      - description: CK_SCMI_FLEXGEN_43 flexgen clock 43
+      - description: CK_SCMI_FLEXGEN_44 flexgen clock 44
+      - description: CK_SCMI_FLEXGEN_45 flexgen clock 45
+      - description: CK_SCMI_FLEXGEN_46 flexgen clock 46
+      - description: CK_SCMI_FLEXGEN_47 flexgen clock 47
+      - description: CK_SCMI_FLEXGEN_48 flexgen clock 48
+      - description: CK_SCMI_FLEXGEN_49 flexgen clock 49
+      - description: CK_SCMI_FLEXGEN_50 flexgen clock 50
+      - description: CK_SCMI_FLEXGEN_51 flexgen clock 51
+      - description: CK_SCMI_FLEXGEN_52 flexgen clock 52
+      - description: CK_SCMI_FLEXGEN_53 flexgen clock 53
+      - description: CK_SCMI_FLEXGEN_54 flexgen clock 54
+      - description: CK_SCMI_FLEXGEN_55 flexgen clock 55
+      - description: CK_SCMI_FLEXGEN_56 flexgen clock 56
+      - description: CK_SCMI_FLEXGEN_57 flexgen clock 57
+      - description: CK_SCMI_FLEXGEN_58 flexgen clock 58
+      - description: CK_SCMI_FLEXGEN_59 flexgen clock 59
+      - description: CK_SCMI_FLEXGEN_60 flexgen clock 60
+      - description: CK_SCMI_FLEXGEN_61 flexgen clock 61
+      - description: CK_SCMI_FLEXGEN_62 flexgen clock 62
+      - description: CK_SCMI_FLEXGEN_63 flexgen clock 63
+      - description: CK_SCMI_ICN_APB1 Peripheral bridge 1
+      - description: CK_SCMI_ICN_APB2 Peripheral bridge 2
+      - description: CK_SCMI_ICN_APB3 Peripheral bridge 3
+      - description: CK_SCMI_ICN_APB4 Peripheral bridge 4
+      - description: CK_SCMI_ICN_APBDBG Peripheral bridge for degub
+      - description: CK_SCMI_TIMG1 Peripheral bridge for timer1
+      - description: CK_SCMI_TIMG2 Peripheral bridge for timer2
+      - description: CK_SCMI_PLL3 PLL3 clock
+      - description: clk_dsi_txbyte DSI byte clock
 
 required:
   - compatible
@@ -53,7 +120,6 @@ required:
   - '#clock-cells'
   - '#reset-cells'
   - clocks
-  - clock-names
 
 additionalProperties: false
 
@@ -66,11 +132,85 @@ examples:
         reg = <0x44200000 0x10000>;
         #clock-cells = <1>;
         #reset-cells = <1>;
-        clock-names = "hse", "hsi", "msi", "lse", "lsi";
-        clocks = <&scmi_clk CK_SCMI_HSE>,
-                 <&scmi_clk CK_SCMI_HSI>,
-                 <&scmi_clk CK_SCMI_MSI>,
-                 <&scmi_clk CK_SCMI_LSE>,
-                 <&scmi_clk CK_SCMI_LSI>;
+        clocks =  <&scmi_clk CK_SCMI_HSE>,
+                  <&scmi_clk CK_SCMI_HSI>,
+                  <&scmi_clk CK_SCMI_MSI>,
+                  <&scmi_clk CK_SCMI_LSE>,
+                  <&scmi_clk CK_SCMI_LSI>,
+                  <&scmi_clk CK_SCMI_HSE_DIV2>,
+                  <&scmi_clk CK_SCMI_ICN_HS_MCU>,
+                  <&scmi_clk CK_SCMI_ICN_LS_MCU>,
+                  <&scmi_clk CK_SCMI_ICN_SDMMC>,
+                  <&scmi_clk CK_SCMI_ICN_DDR>,
+                  <&scmi_clk CK_SCMI_ICN_DISPLAY>,
+                  <&scmi_clk CK_SCMI_ICN_HSL>,
+                  <&scmi_clk CK_SCMI_ICN_NIC>,
+                  <&scmi_clk CK_SCMI_ICN_VID>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_07>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_08>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_09>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_10>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_11>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_12>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_13>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_14>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_15>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_16>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_17>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_18>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_19>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_20>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_21>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_22>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_23>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_24>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_25>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_26>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_27>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_28>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_29>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_30>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_31>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_32>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_33>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_34>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_35>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_36>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_37>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_38>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_39>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_40>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_41>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_42>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_43>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_44>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_45>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_46>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_47>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_48>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_49>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_50>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_51>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_52>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_53>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_54>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_55>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_56>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_57>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_58>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_59>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_60>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_61>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_62>,
+                  <&scmi_clk CK_SCMI_FLEXGEN_63>,
+                  <&scmi_clk CK_SCMI_ICN_APB1>,
+                  <&scmi_clk CK_SCMI_ICN_APB2>,
+                  <&scmi_clk CK_SCMI_ICN_APB3>,
+                  <&scmi_clk CK_SCMI_ICN_APB4>,
+                  <&scmi_clk CK_SCMI_ICN_APBDBG>,
+                  <&scmi_clk CK_SCMI_TIMG1>,
+                  <&scmi_clk CK_SCMI_TIMG2>,
+                  <&scmi_clk CK_SCMI_PLL3>,
+                  <&clk_dsi_txbyte>;
     };
 ...
diff --git a/include/dt-bindings/reset/st,stm32mp25-rcc.h b/include/dt-bindings/reset/st,stm32mp25-rcc.h
index d5615930bcc8..748e78ae20bd 100644
--- a/include/dt-bindings/reset/st,stm32mp25-rcc.h
+++ b/include/dt-bindings/reset/st,stm32mp25-rcc.h
@@ -69,7 +69,7 @@
 #define ADC3_R		59
 #define ETH1_R		60
 #define ETH2_R		61
-#define USB2_R		62
+#define USBH_R		62
 #define USB2PHY1_R	63
 #define USB2PHY2_R	64
 #define USB3DR_R	65
-- 
cgit v1.2.3


From d67c96fb97b5811e15c881d5cb72e293faa5f8e1 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 4 Apr 2024 10:18:45 -0400
Subject: crypto: ecdsa - Convert byte arrays with key coordinates to digits

For NIST P192/256/384 the public key's x and y parameters could be copied
directly from a given array since both parameters filled 'ndigits' of
digits (a 'digit' is a u64). For support of NIST P521 the key parameters
need to have leading zeros prepended to the most significant digit since
only 2 bytes of the most significant digit are provided.

Therefore, implement ecc_digits_from_bytes to convert a byte array into an
array of digits and use this function in ecdsa_set_pub_key where an input
byte array needs to be converted into digits.

Suggested-by: Lukas Wunner <lukas@wunner.de>
Tested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecdsa.c                | 14 +++++++++-----
 include/crypto/internal/ecc.h | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/crypto/ecdsa.c b/crypto/ecdsa.c
index 3f9ec273a121..e819d0983bd3 100644
--- a/crypto/ecdsa.c
+++ b/crypto/ecdsa.c
@@ -222,9 +222,8 @@ static int ecdsa_ecc_ctx_reset(struct ecc_ctx *ctx)
 static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsigned int keylen)
 {
 	struct ecc_ctx *ctx = akcipher_tfm_ctx(tfm);
+	unsigned int digitlen, ndigits;
 	const unsigned char *d = key;
-	const u64 *digits = (const u64 *)&d[1];
-	unsigned int ndigits;
 	int ret;
 
 	ret = ecdsa_ecc_ctx_reset(ctx);
@@ -238,12 +237,17 @@ static int ecdsa_set_pub_key(struct crypto_akcipher *tfm, const void *key, unsig
 		return -EINVAL;
 
 	keylen--;
-	ndigits = (keylen >> 1) / sizeof(u64);
+	digitlen = keylen >> 1;
+
+	ndigits = DIV_ROUND_UP(digitlen, sizeof(u64));
 	if (ndigits != ctx->curve->g.ndigits)
 		return -EINVAL;
 
-	ecc_swap_digits(digits, ctx->pub_key.x, ndigits);
-	ecc_swap_digits(&digits[ndigits], ctx->pub_key.y, ndigits);
+	d++;
+
+	ecc_digits_from_bytes(d, digitlen, ctx->pub_key.x, ndigits);
+	ecc_digits_from_bytes(&d[digitlen], digitlen, ctx->pub_key.y, ndigits);
+
 	ret = ecc_is_pubkey_valid_full(ctx->curve, &ctx->pub_key);
 
 	ctx->pub_key_set = ret == 0;
diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h
index 4f6c1a68882f..ab722a8986b7 100644
--- a/include/crypto/internal/ecc.h
+++ b/include/crypto/internal/ecc.h
@@ -56,6 +56,27 @@ static inline void ecc_swap_digits(const void *in, u64 *out, unsigned int ndigit
 		out[i] = get_unaligned_be64(&src[ndigits - 1 - i]);
 }
 
+/**
+ * ecc_digits_from_bytes() - Create ndigits-sized digits array from byte array
+ * @in:       Input byte array
+ * @nbytes    Size of input byte array
+ * @out       Output digits array
+ * @ndigits:  Number of digits to create from byte array
+ */
+static inline void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes,
+					 u64 *out, unsigned int ndigits)
+{
+	unsigned int o = nbytes & 7;
+	__be64 msd = 0;
+
+	if (o) {
+		memcpy((u8 *)&msd + sizeof(msd) - o, in, o);
+		out[--ndigits] = be64_to_cpu(msd);
+		in += o;
+	}
+	ecc_swap_digits(in, out, ndigits);
+}
+
 /**
  * ecc_is_key_valid() - Validate a given ECDH private key
  *
-- 
cgit v1.2.3


From c0d6bd1fd367a5374bff7e3f3bdf47beb84893c8 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 4 Apr 2024 10:18:48 -0400
Subject: crypto: ecc - Add nbits field to ecc_curve structure

Add the number of bits a curve has to the ecc_curve definition to be able
to derive the number of bytes a curve requires for its coordinates from it.
It also allows one to identify a curve by its particular size. Set the
number of bits on all curve definitions.

Tested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Vitaly Chikunov <vt@altlinux.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc_curve_defs.h    | 4 ++++
 crypto/ecrdsa_defs.h       | 5 +++++
 include/crypto/ecc_curve.h | 2 ++
 3 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/crypto/ecc_curve_defs.h b/crypto/ecc_curve_defs.h
index 9719934c9428..ab1ef3d94be5 100644
--- a/crypto/ecc_curve_defs.h
+++ b/crypto/ecc_curve_defs.h
@@ -17,6 +17,7 @@ static u64 nist_p192_b[] = { 0xFEB8DEECC146B9B1ull, 0x0FA7E9AB72243049ull,
 				0x64210519E59C80E7ull };
 static struct ecc_curve nist_p192 = {
 	.name = "nist_192",
+	.nbits = 192,
 	.g = {
 		.x = nist_p192_g_x,
 		.y = nist_p192_g_y,
@@ -43,6 +44,7 @@ static u64 nist_p256_b[] = { 0x3BCE3C3E27D2604Bull, 0x651D06B0CC53B0F6ull,
 				0xB3EBBD55769886BCull, 0x5AC635D8AA3A93E7ull };
 static struct ecc_curve nist_p256 = {
 	.name = "nist_256",
+	.nbits = 256,
 	.g = {
 		.x = nist_p256_g_x,
 		.y = nist_p256_g_y,
@@ -75,6 +77,7 @@ static u64 nist_p384_b[] = { 0x2a85c8edd3ec2aefull, 0xc656398d8a2ed19dull,
 				0x988e056be3f82d19ull, 0xb3312fa7e23ee7e4ull };
 static struct ecc_curve nist_p384 = {
 	.name = "nist_384",
+	.nbits = 384,
 	.g = {
 		.x = nist_p384_g_x,
 		.y = nist_p384_g_y,
@@ -95,6 +98,7 @@ static u64 curve25519_a[] = { 0x000000000001DB41, 0x0000000000000000,
 				0x0000000000000000, 0x0000000000000000 };
 static const struct ecc_curve ecc_25519 = {
 	.name = "curve25519",
+	.nbits = 255,
 	.g = {
 		.x = curve25519_g_x,
 		.ndigits = 4,
diff --git a/crypto/ecrdsa_defs.h b/crypto/ecrdsa_defs.h
index 0056335b9d03..1c2c2449e331 100644
--- a/crypto/ecrdsa_defs.h
+++ b/crypto/ecrdsa_defs.h
@@ -47,6 +47,7 @@ static u64 cp256a_b[] = {
 
 static struct ecc_curve gost_cp256a = {
 	.name = "cp256a",
+	.nbits = 256,
 	.g = {
 		.x = cp256a_g_x,
 		.y = cp256a_g_y,
@@ -80,6 +81,7 @@ static u64 cp256b_b[] = {
 
 static struct ecc_curve gost_cp256b = {
 	.name = "cp256b",
+	.nbits = 256,
 	.g = {
 		.x = cp256b_g_x,
 		.y = cp256b_g_y,
@@ -117,6 +119,7 @@ static u64 cp256c_b[] = {
 
 static struct ecc_curve gost_cp256c = {
 	.name = "cp256c",
+	.nbits = 256,
 	.g = {
 		.x = cp256c_g_x,
 		.y = cp256c_g_y,
@@ -166,6 +169,7 @@ static u64 tc512a_b[] = {
 
 static struct ecc_curve gost_tc512a = {
 	.name = "tc512a",
+	.nbits = 512,
 	.g = {
 		.x = tc512a_g_x,
 		.y = tc512a_g_y,
@@ -211,6 +215,7 @@ static u64 tc512b_b[] = {
 
 static struct ecc_curve gost_tc512b = {
 	.name = "tc512b",
+	.nbits = 512,
 	.g = {
 		.x = tc512b_g_x,
 		.y = tc512b_g_y,
diff --git a/include/crypto/ecc_curve.h b/include/crypto/ecc_curve.h
index 70964781eb68..7d90c5e82266 100644
--- a/include/crypto/ecc_curve.h
+++ b/include/crypto/ecc_curve.h
@@ -23,6 +23,7 @@ struct ecc_point {
  * struct ecc_curve - definition of elliptic curve
  *
  * @name:	Short name of the curve.
+ * @nbits:	The number of bits of a curve.
  * @g:		Generator point of the curve.
  * @p:		Prime number, if Barrett's reduction is used for this curve
  *		pre-calculated value 'mu' is appended to the @p after ndigits.
@@ -34,6 +35,7 @@ struct ecc_point {
  */
 struct ecc_curve {
 	char *name;
+	u32 nbits;
 	struct ecc_point g;
 	u64 *p;
 	u64 *n;
-- 
cgit v1.2.3


From e7fb062754ef9f656ee004f2be8f59ce8a79bffb Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 4 Apr 2024 10:18:49 -0400
Subject: crypto: ecc - Implement vli_mmod_fast_521 for NIST p521

Implement vli_mmod_fast_521 following the description for how to calculate
the modulus for NIST P521 in the NIST publication "Recommendations for
Discrete Logarithm-Based Cryptography: Elliptic Curve Domain Parameters"
section G.1.4.

NIST p521 requires 9 64bit digits, so increase the ECC_MAX_DIGITS so that
the vli digit array provides enough elements to fit the larger integers
required by this curve.

Tested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc.c                  | 25 +++++++++++++++++++++++++
 include/crypto/internal/ecc.h |  3 ++-
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/crypto/ecc.c b/crypto/ecc.c
index e94183f6c418..8914e437f94b 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -902,6 +902,28 @@ static void vli_mmod_fast_384(u64 *result, const u64 *product,
 #undef AND64H
 #undef AND64L
 
+/*
+ * Computes result = product % curve_prime
+ * from "Recommendations for Discrete Logarithm-Based Cryptography:
+ *       Elliptic Curve Domain Parameters" section G.1.4
+ */
+static void vli_mmod_fast_521(u64 *result, const u64 *product,
+			      const u64 *curve_prime, u64 *tmp)
+{
+	const unsigned int ndigits = ECC_CURVE_NIST_P521_DIGITS;
+	size_t i;
+
+	/* Initialize result with lowest 521 bits from product */
+	vli_set(result, product, ndigits);
+	result[8] &= 0x1ff;
+
+	for (i = 0; i < ndigits; i++)
+		tmp[i] = (product[8 + i] >> 9) | (product[9 + i] << 55);
+	tmp[8] &= 0x1ff;
+
+	vli_mod_add(result, result, tmp, curve_prime, ndigits);
+}
+
 /* Computes result = product % curve_prime for different curve_primes.
  *
  * Note that curve_primes are distinguished just by heuristic check and
@@ -941,6 +963,9 @@ static bool vli_mmod_fast(u64 *result, u64 *product,
 	case ECC_CURVE_NIST_P384_DIGITS:
 		vli_mmod_fast_384(result, product, curve_prime, tmp);
 		break;
+	case ECC_CURVE_NIST_P521_DIGITS:
+		vli_mmod_fast_521(result, product, curve_prime, tmp);
+		break;
 	default:
 		pr_err_ratelimited("ecc: unsupported digits size!\n");
 		return false;
diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h
index ab722a8986b7..4e2f5f938e91 100644
--- a/include/crypto/internal/ecc.h
+++ b/include/crypto/internal/ecc.h
@@ -33,7 +33,8 @@
 #define ECC_CURVE_NIST_P192_DIGITS  3
 #define ECC_CURVE_NIST_P256_DIGITS  4
 #define ECC_CURVE_NIST_P384_DIGITS  6
-#define ECC_MAX_DIGITS              (512 / 64) /* due to ecrdsa */
+#define ECC_CURVE_NIST_P521_DIGITS  9
+#define ECC_MAX_DIGITS              DIV_ROUND_UP(521, 64) /* NIST P521 */
 
 #define ECC_DIGITS_TO_BYTES_SHIFT 3
 
-- 
cgit v1.2.3


From 288b46c57c658d3c798de0c9154e1215bdde8476 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 4 Apr 2024 10:18:51 -0400
Subject: crypto: ecc - Add NIST P521 curve parameters

Add the parameters for the NIST P521 curve and define a new curve ID
for it. Make the curve available in ecc_get_curve.

Tested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc.c            |  2 ++
 crypto/ecc_curve_defs.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/crypto/ecdh.h   |  1 +
 3 files changed, 48 insertions(+)

(limited to 'include')

diff --git a/crypto/ecc.c b/crypto/ecc.c
index d15ef0747a76..2e05387b9499 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -60,6 +60,8 @@ const struct ecc_curve *ecc_get_curve(unsigned int curve_id)
 		return &nist_p256;
 	case ECC_CURVE_NIST_P384:
 		return &nist_p384;
+	case ECC_CURVE_NIST_P521:
+		return &nist_p521;
 	default:
 		return NULL;
 	}
diff --git a/crypto/ecc_curve_defs.h b/crypto/ecc_curve_defs.h
index ab1ef3d94be5..0ecade7d02f5 100644
--- a/crypto/ecc_curve_defs.h
+++ b/crypto/ecc_curve_defs.h
@@ -89,6 +89,51 @@ static struct ecc_curve nist_p384 = {
 	.b = nist_p384_b
 };
 
+/* NIST P-521 */
+static u64 nist_p521_g_x[] = { 0xf97e7e31c2e5bd66ull, 0x3348b3c1856a429bull,
+				0xfe1dc127a2ffa8deull, 0xa14b5e77efe75928ull,
+				0xf828af606b4d3dbaull, 0x9c648139053fb521ull,
+				0x9e3ecb662395b442ull, 0x858e06b70404e9cdull,
+				0xc6ull };
+static u64 nist_p521_g_y[] = { 0x88be94769fd16650ull, 0x353c7086a272c240ull,
+				0xc550b9013fad0761ull, 0x97ee72995ef42640ull,
+				0x17afbd17273e662cull, 0x98f54449579b4468ull,
+				0x5c8a5fb42c7d1bd9ull, 0x39296a789a3bc004ull,
+				0x118ull };
+static u64 nist_p521_p[] = { 0xffffffffffffffffull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0x1ffull };
+static u64 nist_p521_n[] = { 0xbb6fb71e91386409ull, 0x3bb5c9b8899c47aeull,
+				0x7fcc0148f709a5d0ull, 0x51868783bf2f966bull,
+				0xfffffffffffffffaull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0x1ffull };
+static u64 nist_p521_a[] = { 0xfffffffffffffffcull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0xffffffffffffffffull, 0xffffffffffffffffull,
+				0x1ffull };
+static u64 nist_p521_b[] = { 0xef451fd46b503f00ull, 0x3573df883d2c34f1ull,
+				0x1652c0bd3bb1bf07ull, 0x56193951ec7e937bull,
+				0xb8b489918ef109e1ull, 0xa2da725b99b315f3ull,
+				0x929a21a0b68540eeull, 0x953eb9618e1c9a1full,
+				0x051ull };
+static struct ecc_curve nist_p521 = {
+	.name = "nist_521",
+	.nbits = 521,
+	.g = {
+		.x = nist_p521_g_x,
+		.y = nist_p521_g_y,
+		.ndigits = 9,
+	},
+	.p = nist_p521_p,
+	.n = nist_p521_n,
+	.a = nist_p521_a,
+	.b = nist_p521_b
+};
+
 /* curve25519 */
 static u64 curve25519_g_x[] = { 0x0000000000000009, 0x0000000000000000,
 				0x0000000000000000, 0x0000000000000000 };
diff --git a/include/crypto/ecdh.h b/include/crypto/ecdh.h
index a9f98078d29c..9784ecdd2fb4 100644
--- a/include/crypto/ecdh.h
+++ b/include/crypto/ecdh.h
@@ -26,6 +26,7 @@
 #define ECC_CURVE_NIST_P192	0x0001
 #define ECC_CURVE_NIST_P256	0x0002
 #define ECC_CURVE_NIST_P384	0x0003
+#define ECC_CURVE_NIST_P521	0x0004
 
 /**
  * struct ecdh - define an ECDH private key
-- 
cgit v1.2.3


From 3ba2ae361402f28754adf873954a22bf97b856a9 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 4 Apr 2024 10:18:56 -0400
Subject: crypto: x509 - Add OID for NIST P521 and extend parser for it

Enable the x509 parser to accept NIST P521 certificates and add the
OID for ansip521r1, which is the identifier for NIST P521.

Cc: David Howells <dhowells@redhat.com>
Tested-by: Lukas Wunner <lukas@wunner.de>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 3 +++
 include/linux/oid_registry.h              | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index bb0bffa271b5..964208d1a35f 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -546,6 +546,9 @@ int x509_extract_key_data(void *context, size_t hdrlen,
 		case OID_id_ansip384r1:
 			ctx->cert->pub->pkey_algo = "ecdsa-nist-p384";
 			break;
+		case OID_id_ansip521r1:
+			ctx->cert->pub->pkey_algo = "ecdsa-nist-p521";
+			break;
 		default:
 			return -ENOPKG;
 		}
diff --git a/include/linux/oid_registry.h b/include/linux/oid_registry.h
index 51421fdbb0ba..6f9242259edc 100644
--- a/include/linux/oid_registry.h
+++ b/include/linux/oid_registry.h
@@ -69,6 +69,7 @@ enum OID {
 	OID_certAuthInfoAccess,		/* 1.3.6.1.5.5.7.1.1 */
 	OID_sha1,			/* 1.3.14.3.2.26 */
 	OID_id_ansip384r1,		/* 1.3.132.0.34 */
+	OID_id_ansip521r1,		/* 1.3.132.0.35 */
 	OID_sha256,			/* 2.16.840.1.101.3.4.2.1 */
 	OID_sha384,			/* 2.16.840.1.101.3.4.2.2 */
 	OID_sha512,			/* 2.16.840.1.101.3.4.2.3 */
-- 
cgit v1.2.3


From 5c6ca9d936654a135b459c846885e08966e5e5bf Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sun, 7 Apr 2024 19:57:40 +0200
Subject: X.509: Introduce scope-based x509_certificate allocation

Add a DEFINE_FREE() clause for x509_certificate structs and use it in
x509_cert_parse() and x509_key_preparse().  These are the only functions
where scope-based x509_certificate allocation currently makes sense.
A third user will be introduced with the forthcoming SPDM library
(Security Protocol and Data Model) for PCI device authentication.

Unlike most other DEFINE_FREE() clauses, this one checks for IS_ERR()
instead of NULL before calling x509_free_certificate() at end of scope.
That's because the "constructor" of x509_certificate structs,
x509_cert_parse(), returns a valid pointer or an ERR_PTR(), but never
NULL.

Comparing the Assembler output before/after has shown they are identical,
save for the fact that gcc-12 always generates two return paths when
__cleanup() is used, one for the success case and one for the error case.

In x509_cert_parse(), add a hint for the compiler that kzalloc() never
returns an ERR_PTR().  Otherwise the compiler adds a gratuitous IS_ERR()
check on return.  Introduce an assume() macro for this which can be
re-used elsewhere in the kernel to provide hints for the compiler.

Suggested-by: Jonathan Cameron <Jonathan.Cameron@Huawei.com>
Link: https://lore.kernel.org/all/20231003153937.000034ca@Huawei.com/
Link: https://lwn.net/Articles/934679/
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/asymmetric_keys/x509_cert_parser.c | 43 ++++++++++++-------------------
 crypto/asymmetric_keys/x509_parser.h      |  3 +++
 crypto/asymmetric_keys/x509_public_key.c  | 31 +++++++---------------
 include/linux/compiler.h                  |  2 ++
 4 files changed, 30 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c
index 964208d1a35f..a814e5f136e2 100644
--- a/crypto/asymmetric_keys/x509_cert_parser.c
+++ b/crypto/asymmetric_keys/x509_cert_parser.c
@@ -60,24 +60,24 @@ EXPORT_SYMBOL_GPL(x509_free_certificate);
  */
 struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
 {
-	struct x509_certificate *cert;
-	struct x509_parse_context *ctx;
+	struct x509_certificate *cert __free(x509_free_certificate);
+	struct x509_parse_context *ctx __free(kfree) = NULL;
 	struct asymmetric_key_id *kid;
 	long ret;
 
-	ret = -ENOMEM;
 	cert = kzalloc(sizeof(struct x509_certificate), GFP_KERNEL);
+	assume(!IS_ERR(cert)); /* Avoid gratuitous IS_ERR() check on return */
 	if (!cert)
-		goto error_no_cert;
+		return ERR_PTR(-ENOMEM);
 	cert->pub = kzalloc(sizeof(struct public_key), GFP_KERNEL);
 	if (!cert->pub)
-		goto error_no_ctx;
+		return ERR_PTR(-ENOMEM);
 	cert->sig = kzalloc(sizeof(struct public_key_signature), GFP_KERNEL);
 	if (!cert->sig)
-		goto error_no_ctx;
+		return ERR_PTR(-ENOMEM);
 	ctx = kzalloc(sizeof(struct x509_parse_context), GFP_KERNEL);
 	if (!ctx)
-		goto error_no_ctx;
+		return ERR_PTR(-ENOMEM);
 
 	ctx->cert = cert;
 	ctx->data = (unsigned long)data;
@@ -85,7 +85,7 @@ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
 	/* Attempt to decode the certificate */
 	ret = asn1_ber_decoder(&x509_decoder, ctx, data, datalen);
 	if (ret < 0)
-		goto error_decode;
+		return ERR_PTR(ret);
 
 	/* Decode the AuthorityKeyIdentifier */
 	if (ctx->raw_akid) {
@@ -95,20 +95,19 @@ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
 				       ctx->raw_akid, ctx->raw_akid_size);
 		if (ret < 0) {
 			pr_warn("Couldn't decode AuthKeyIdentifier\n");
-			goto error_decode;
+			return ERR_PTR(ret);
 		}
 	}
 
-	ret = -ENOMEM;
 	cert->pub->key = kmemdup(ctx->key, ctx->key_size, GFP_KERNEL);
 	if (!cert->pub->key)
-		goto error_decode;
+		return ERR_PTR(-ENOMEM);
 
 	cert->pub->keylen = ctx->key_size;
 
 	cert->pub->params = kmemdup(ctx->params, ctx->params_size, GFP_KERNEL);
 	if (!cert->pub->params)
-		goto error_decode;
+		return ERR_PTR(-ENOMEM);
 
 	cert->pub->paramlen = ctx->params_size;
 	cert->pub->algo = ctx->key_algo;
@@ -116,33 +115,23 @@ struct x509_certificate *x509_cert_parse(const void *data, size_t datalen)
 	/* Grab the signature bits */
 	ret = x509_get_sig_params(cert);
 	if (ret < 0)
-		goto error_decode;
+		return ERR_PTR(ret);
 
 	/* Generate cert issuer + serial number key ID */
 	kid = asymmetric_key_generate_id(cert->raw_serial,
 					 cert->raw_serial_size,
 					 cert->raw_issuer,
 					 cert->raw_issuer_size);
-	if (IS_ERR(kid)) {
-		ret = PTR_ERR(kid);
-		goto error_decode;
-	}
+	if (IS_ERR(kid))
+		return ERR_CAST(kid);
 	cert->id = kid;
 
 	/* Detect self-signed certificates */
 	ret = x509_check_for_self_signed(cert);
 	if (ret < 0)
-		goto error_decode;
-
-	kfree(ctx);
-	return cert;
+		return ERR_PTR(ret);
 
-error_decode:
-	kfree(ctx);
-error_no_ctx:
-	x509_free_certificate(cert);
-error_no_cert:
-	return ERR_PTR(ret);
+	return_ptr(cert);
 }
 EXPORT_SYMBOL_GPL(x509_cert_parse);
 
diff --git a/crypto/asymmetric_keys/x509_parser.h b/crypto/asymmetric_keys/x509_parser.h
index 97a886cbe01c..0688c222806b 100644
--- a/crypto/asymmetric_keys/x509_parser.h
+++ b/crypto/asymmetric_keys/x509_parser.h
@@ -5,6 +5,7 @@
  * Written by David Howells (dhowells@redhat.com)
  */
 
+#include <linux/cleanup.h>
 #include <linux/time.h>
 #include <crypto/public_key.h>
 #include <keys/asymmetric-type.h>
@@ -44,6 +45,8 @@ struct x509_certificate {
  * x509_cert_parser.c
  */
 extern void x509_free_certificate(struct x509_certificate *cert);
+DEFINE_FREE(x509_free_certificate, struct x509_certificate *,
+	    if (!IS_ERR(_T)) x509_free_certificate(_T))
 extern struct x509_certificate *x509_cert_parse(const void *data, size_t datalen);
 extern int x509_decode_time(time64_t *_t,  size_t hdrlen,
 			    unsigned char tag,
diff --git a/crypto/asymmetric_keys/x509_public_key.c b/crypto/asymmetric_keys/x509_public_key.c
index 6a4f00be22fc..00ac7159fba2 100644
--- a/crypto/asymmetric_keys/x509_public_key.c
+++ b/crypto/asymmetric_keys/x509_public_key.c
@@ -161,12 +161,11 @@ not_self_signed:
  */
 static int x509_key_preparse(struct key_preparsed_payload *prep)
 {
-	struct asymmetric_key_ids *kids;
-	struct x509_certificate *cert;
+	struct x509_certificate *cert __free(x509_free_certificate);
+	struct asymmetric_key_ids *kids __free(kfree) = NULL;
+	char *p, *desc __free(kfree) = NULL;
 	const char *q;
 	size_t srlen, sulen;
-	char *desc = NULL, *p;
-	int ret;
 
 	cert = x509_cert_parse(prep->data, prep->datalen);
 	if (IS_ERR(cert))
@@ -188,9 +187,8 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
 	}
 
 	/* Don't permit addition of blacklisted keys */
-	ret = -EKEYREJECTED;
 	if (cert->blacklisted)
-		goto error_free_cert;
+		return -EKEYREJECTED;
 
 	/* Propose a description */
 	sulen = strlen(cert->subject);
@@ -202,10 +200,9 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
 		q = cert->raw_serial;
 	}
 
-	ret = -ENOMEM;
 	desc = kmalloc(sulen + 2 + srlen * 2 + 1, GFP_KERNEL);
 	if (!desc)
-		goto error_free_cert;
+		return -ENOMEM;
 	p = memcpy(desc, cert->subject, sulen);
 	p += sulen;
 	*p++ = ':';
@@ -215,16 +212,14 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
 
 	kids = kmalloc(sizeof(struct asymmetric_key_ids), GFP_KERNEL);
 	if (!kids)
-		goto error_free_desc;
+		return -ENOMEM;
 	kids->id[0] = cert->id;
 	kids->id[1] = cert->skid;
 	kids->id[2] = asymmetric_key_generate_id(cert->raw_subject,
 						 cert->raw_subject_size,
 						 "", 0);
-	if (IS_ERR(kids->id[2])) {
-		ret = PTR_ERR(kids->id[2]);
-		goto error_free_kids;
-	}
+	if (IS_ERR(kids->id[2]))
+		return PTR_ERR(kids->id[2]);
 
 	/* We're pinning the module by being linked against it */
 	__module_get(public_key_subtype.owner);
@@ -242,15 +237,7 @@ static int x509_key_preparse(struct key_preparsed_payload *prep)
 	cert->sig = NULL;
 	desc = NULL;
 	kids = NULL;
-	ret = 0;
-
-error_free_kids:
-	kfree(kids);
-error_free_desc:
-	kfree(desc);
-error_free_cert:
-	x509_free_certificate(cert);
-	return ret;
+	return 0;
 }
 
 static struct asymmetric_key_parser x509_key_parser = {
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c00cc6c0878a..53666eb19909 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -148,6 +148,8 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 } while (0)
 #endif
 
+#define assume(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
+
 /*
  * KENTRY - kernel entry point
  * This can be used to annotate symbols (functions or data) that are used
-- 
cgit v1.2.3


From f7842747d13d9f4eedba9edec5b834c9d9237717 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 5 Apr 2024 07:58:15 -0400
Subject: mm: replace set_pte_at_notify() with just set_pte_at()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the demise of the .change_pte() MMU notifier callback, there is no
notification happening in set_pte_at_notify().  It is a synonym of
set_pte_at() and can be replaced with it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Message-ID: <20240405115815.3226315-5-pbonzini@redhat.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/mmu_notifier.h | 2 --
 kernel/events/uprobes.c      | 6 +++---
 mm/ksm.c                     | 4 ++--
 mm/memory.c                  | 7 +------
 mm/migrate_device.c          | 8 ++------
 5 files changed, 8 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 8c72bf651606..d39ebb10caeb 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -657,6 +657,4 @@ static inline void mmu_notifier_synchronize(void)
 
 #endif /* CONFIG_MMU_NOTIFIER */
 
-#define set_pte_at_notify set_pte_at
-
 #endif /* _LINUX_MMU_NOTIFIER_H */
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e4834d23e1d1..1215bc299390 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -18,7 +18,7 @@
 #include <linux/sched/coredump.h>
 #include <linux/export.h>
 #include <linux/rmap.h>		/* anon_vma_prepare */
-#include <linux/mmu_notifier.h>	/* set_pte_at_notify */
+#include <linux/mmu_notifier.h>
 #include <linux/swap.h>		/* folio_free_swap */
 #include <linux/ptrace.h>	/* user_enable_single_step */
 #include <linux/kdebug.h>	/* notifier mechanism */
@@ -195,8 +195,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte)));
 	ptep_clear_flush(vma, addr, pvmw.pte);
 	if (new_page)
-		set_pte_at_notify(mm, addr, pvmw.pte,
-				  mk_pte(new_page, vma->vm_page_prot));
+		set_pte_at(mm, addr, pvmw.pte,
+			   mk_pte(new_page, vma->vm_page_prot));
 
 	folio_remove_rmap_pte(old_folio, old_page, vma);
 	if (!folio_mapped(old_folio))
diff --git a/mm/ksm.c b/mm/ksm.c
index 8c001819cf10..108a4d167824 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1345,7 +1345,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
 		if (pte_write(entry))
 			entry = pte_wrprotect(entry);
 
-		set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry);
+		set_pte_at(mm, pvmw.address, pvmw.pte, entry);
 	}
 	*orig_pte = entry;
 	err = 0;
@@ -1447,7 +1447,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	 * See Documentation/mm/mmu_notifier.rst
 	 */
 	ptep_clear_flush(vma, addr, ptep);
-	set_pte_at_notify(mm, addr, ptep, newpte);
+	set_pte_at(mm, addr, ptep, newpte);
 
 	folio = page_folio(page);
 	folio_remove_rmap_pte(folio, page, vma);
diff --git a/mm/memory.c b/mm/memory.c
index f2bc6dd15eb8..9a6f4d8aa379 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3327,13 +3327,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		ptep_clear_flush(vma, vmf->address, vmf->pte);
 		folio_add_new_anon_rmap(new_folio, vma, vmf->address);
 		folio_add_lru_vma(new_folio, vma);
-		/*
-		 * We call the notify macro here because, when using secondary
-		 * mmu page tables (such as kvm shadow page tables), we want the
-		 * new page to be mapped directly into the secondary page table.
-		 */
 		BUG_ON(unshare && pte_write(entry));
-		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
+		set_pte_at(mm, vmf->address, vmf->pte, entry);
 		update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
 		if (old_folio) {
 			/*
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index b6c27c76e1a0..66206734b1b9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -664,13 +664,9 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
 	if (flush) {
 		flush_cache_page(vma, addr, pte_pfn(orig_pte));
 		ptep_clear_flush(vma, addr, ptep);
-		set_pte_at_notify(mm, addr, ptep, entry);
-		update_mmu_cache(vma, addr, ptep);
-	} else {
-		/* No need to invalidate - it was non-present before */
-		set_pte_at(mm, addr, ptep, entry);
-		update_mmu_cache(vma, addr, ptep);
 	}
+	set_pte_at(mm, addr, ptep, entry);
+	update_mmu_cache(vma, addr, ptep);
 
 	pte_unmap_unlock(ptep, ptl);
 	*src = MIGRATE_PFN_MIGRATE;
-- 
cgit v1.2.3


From 7d36c3573391dcf0da089298a4b5a25c39f7289d Mon Sep 17 00:00:00 2001
From: INAGAKI Hiroshi <musashino.open@gmail.com>
Date: Sat, 23 Mar 2024 16:36:09 +0900
Subject: dt-bindings: leds: Add LED_FUNCTION_MOBILE for mobile network

Add LED_FUNCTION_MOBILE for LEDs that indicate status of mobile network
connection. This is useful to distinguish those LEDs from LEDs that
indicates status of wired "wan" connection.

example (on stock fw):

IIJ SA-W2 has "Mobile" LEDs that indicate status (no signal, too low,
low, good) of mobile network connection via dongle connected to USB
port.

- no signal: (none, turned off)
-   too low: green:mobile & red:mobile (amber, blink)
-       low: green:mobile & red:mobile (amber, turned on)
-      good: green:mobile (turned on)

Suggested-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: INAGAKI Hiroshi <musashino.open@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240323074326.1428-2-musashino.open@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/dt-bindings/leds/common.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h
index ecea167930d9..6216ecdb06c7 100644
--- a/include/dt-bindings/leds/common.h
+++ b/include/dt-bindings/leds/common.h
@@ -90,6 +90,7 @@
 #define LED_FUNCTION_INDICATOR "indicator"
 #define LED_FUNCTION_LAN "lan"
 #define LED_FUNCTION_MAIL "mail"
+#define LED_FUNCTION_MOBILE "mobile"
 #define LED_FUNCTION_MTD "mtd"
 #define LED_FUNCTION_PANIC "panic"
 #define LED_FUNCTION_PROGRAMMING "programming"
-- 
cgit v1.2.3


From 77b9f2d6fd9bf34ec810de6bdad42d7d0a47d31b Mon Sep 17 00:00:00 2001
From: INAGAKI Hiroshi <musashino.open@gmail.com>
Date: Sat, 23 Mar 2024 16:36:10 +0900
Subject: dt-bindings: leds: Add LED_FUNCTION_SPEED_* for link speed on LAN/WAN

Add LED_FUNCTION_SPEED_LAN and LED_FUNCTION_SPEED_WAN for LEDs that
indicate link speed of ethernet ports on LAN/WAN. This is useful to
distinguish those LEDs from LEDs that indicate link status (up/down).

example:

Fortinet FortiGate 30E/50E have LEDs that indicate link speed on each
of the ethernet ports in addition to LEDs that indicate link status
(up/down).

- 1000 Mbps: green:speed-(lan|wan)-N
-  100 Mbps: amber:speed-(lan|wan)-N
-   10 Mbps: (none, turned off)

Signed-off-by: INAGAKI Hiroshi <musashino.open@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20240323074326.1428-3-musashino.open@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/dt-bindings/leds/common.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h
index 6216ecdb06c7..82a5769725ce 100644
--- a/include/dt-bindings/leds/common.h
+++ b/include/dt-bindings/leds/common.h
@@ -96,6 +96,8 @@
 #define LED_FUNCTION_PROGRAMMING "programming"
 #define LED_FUNCTION_RX "rx"
 #define LED_FUNCTION_SD "sd"
+#define LED_FUNCTION_SPEED_LAN "speed-lan"
+#define LED_FUNCTION_SPEED_WAN "speed-wan"
 #define LED_FUNCTION_STANDBY "standby"
 #define LED_FUNCTION_TORCH "torch"
 #define LED_FUNCTION_TX "tx"
-- 
cgit v1.2.3


From 697a6c8cec03c2299f850fa50322641a8bf6b915 Mon Sep 17 00:00:00 2001
From: Hechao Li <hli@netflix.com>
Date: Tue, 9 Apr 2024 09:43:55 -0700
Subject: tcp: increase the default TCP scaling ratio

After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
we noticed an application-level timeout due to reduced throughput.

Before the commit, for a client that sets SO_RCVBUF to 65k, it takes
around 22 seconds to transfer 10M data. After the commit, it takes 40
seconds. Because our application has a 30-second timeout, this
regression broke the application.

The reason that it takes longer to transfer data is that
tp->scaling_ratio is initialized to a value that results in ~0.25 of
rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
initial receive window.

Later, even though the scaling_ratio is updated to a more accurate
skb->len/skb->truesize, which is ~0.66 in our environment, the window
stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not
change together with the tp->scaling_ratio update when autotuning is
disabled due to SO_RCVBUF. As a result, the window size is capped at the
initial window_clamp, which is also ~0.25 * rcvbuf, and never grows
bigger.

Most modern applications let the kernel do autotuning, and benefit from
the increased scaling_ratio. But there are applications such as kafka
that has a default setting of SO_RCVBUF=64k.

This patch increases the initial scaling_ratio from ~25% to 50% in order
to make it backward compatible with the original default
sysctl_tcp_adv_win_scale for applications setting SO_RCVBUF.

Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale")
Signed-off-by: Hechao Li <hli@netflix.com>
Reviewed-by: Tycho Andersen <tycho@tycho.pizza>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/20240402215405.432863-1-hli@netflix.com/
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ba6c5ae86e22..b935e1ae4caf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1539,11 +1539,10 @@ static inline int tcp_space_from_win(const struct sock *sk, int win)
 	return __tcp_space_from_win(tcp_sk(sk)->scaling_ratio, win);
 }
 
-/* Assume a conservative default of 1200 bytes of payload per 4K page.
+/* Assume a 50% default for skb->len/skb->truesize ratio.
  * This may be adjusted later in tcp_measure_rcv_mss().
  */
-#define TCP_DEFAULT_SCALING_RATIO ((1200 << TCP_RMEM_TO_WIN_SCALE) / \
-				   SKB_TRUESIZE(4096))
+#define TCP_DEFAULT_SCALING_RATIO (1 << (TCP_RMEM_TO_WIN_SCALE - 1))
 
 static inline void tcp_scaling_ratio_init(struct sock *sk)
 {
-- 
cgit v1.2.3


From d2f85a263883b679f87ed8f911746105658e9c47 Mon Sep 17 00:00:00 2001
From: Yi Liu <yi.l.liu@intel.com>
Date: Thu, 28 Mar 2024 05:29:58 -0700
Subject: iommu: Pass domain to remove_dev_pasid() op

Existing remove_dev_pasid() callbacks of the underlying iommu drivers
get the attached domain from the group->pasid_array. However, the domain
stored in group->pasid_array is not always correct in all scenarios.
A wrong domain may result in failure in remove_dev_pasid() callback.
To avoid such problems, it is more reliable to pass the domain to the
remove_dev_pasid() op.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Yi Liu <yi.l.liu@intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20240328122958.83332-3-yi.l.liu@intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  9 ++-------
 drivers/iommu/intel/iommu.c                 | 11 +++--------
 drivers/iommu/iommu.c                       |  9 +++++----
 include/linux/iommu.h                       |  3 ++-
 4 files changed, 12 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 41f93c3ab160..2e1988c861f1 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3053,14 +3053,9 @@ static int arm_smmu_def_domain_type(struct device *dev)
 	return 0;
 }
 
-static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
+static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
+				      struct iommu_domain *domain)
 {
-	struct iommu_domain *domain;
-
-	domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA);
-	if (WARN_ON(IS_ERR(domain)) || !domain)
-		return;
-
 	arm_smmu_sva_remove_dev_pasid(domain, dev, pasid);
 }
 
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 50eb9aed47cc..45c75a8a0ef5 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4587,19 +4587,15 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
 	return 0;
 }
 
-static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
+static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
+					 struct iommu_domain *domain)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 	struct dev_pasid_info *curr, *dev_pasid = NULL;
 	struct intel_iommu *iommu = info->iommu;
-	struct dmar_domain *dmar_domain;
-	struct iommu_domain *domain;
 	unsigned long flags;
 
-	domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
-	if (WARN_ON_ONCE(!domain))
-		goto out_tear_down;
-
 	/*
 	 * The SVA implementation needs to handle its own stuffs like the mm
 	 * notification. Before consolidating that code into iommu core, let
@@ -4610,7 +4606,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
 		goto out_tear_down;
 	}
 
-	dmar_domain = to_dmar_domain(domain);
 	spin_lock_irqsave(&dmar_domain->lock, flags);
 	list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
 		if (curr->dev == dev && curr->pasid == pasid) {
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 659a77f7bb83..3183b0ed4cdb 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -3335,20 +3335,21 @@ err_revert:
 
 		if (device == last_gdev)
 			break;
-		ops->remove_dev_pasid(device->dev, pasid);
+		ops->remove_dev_pasid(device->dev, pasid, domain);
 	}
 	return ret;
 }
 
 static void __iommu_remove_group_pasid(struct iommu_group *group,
-				       ioasid_t pasid)
+				       ioasid_t pasid,
+				       struct iommu_domain *domain)
 {
 	struct group_device *device;
 	const struct iommu_ops *ops;
 
 	for_each_group_device(group, device) {
 		ops = dev_iommu_ops(device->dev);
-		ops->remove_dev_pasid(device->dev, pasid);
+		ops->remove_dev_pasid(device->dev, pasid, domain);
 	}
 }
 
@@ -3418,7 +3419,7 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev,
 	struct iommu_group *group = dev->iommu_group;
 
 	mutex_lock(&group->mutex);
-	__iommu_remove_group_pasid(group, pasid);
+	__iommu_remove_group_pasid(group, pasid, domain);
 	WARN_ON(xa_erase(&group->pasid_array, pasid) != domain);
 	mutex_unlock(&group->mutex);
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2e925b5eba53..40dd439307e8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -578,7 +578,8 @@ struct iommu_ops {
 			      struct iommu_page_response *msg);
 
 	int (*def_domain_type)(struct device *dev);
-	void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid);
+	void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid,
+				 struct iommu_domain *domain);
 
 	const struct iommu_domain_ops *default_domain_ops;
 	unsigned long pgsize_bitmap;
-- 
cgit v1.2.3


From ec84ca4025c0b90c6b7173a230f78ec00cad44f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 9 Apr 2024 16:37:48 +0200
Subject: scsi: block: Remove now unused queue limits helpers

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240409143748.980206-24-hch@lst.de
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 block/blk-settings.c           | 245 -----------------------------------------
 drivers/s390/block/dasd_eckd.c |   6 +-
 include/linux/blkdev.h         |  13 ---
 include/linux/mmc/host.h       |   4 +-
 4 files changed, 5 insertions(+), 263 deletions(-)

(limited to 'include')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 3c7d8d638ab5..292aadf8d807 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -284,72 +284,6 @@ int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
 }
 EXPORT_SYMBOL_GPL(queue_limits_set);
 
-/**
- * blk_queue_bounce_limit - set bounce buffer limit for queue
- * @q: the request queue for the device
- * @bounce: bounce limit to enforce
- *
- * Description:
- *    Force bouncing for ISA DMA ranges or highmem.
- *
- *    DEPRECATED, don't use in new code.
- **/
-void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
-{
-	q->limits.bounce = bounce;
-}
-EXPORT_SYMBOL(blk_queue_bounce_limit);
-
-/**
- * blk_queue_max_hw_sectors - set max sectors for a request for this queue
- * @q:  the request queue for the device
- * @max_hw_sectors:  max hardware sectors in the usual 512b unit
- *
- * Description:
- *    Enables a low level driver to set a hard upper limit,
- *    max_hw_sectors, on the size of requests.  max_hw_sectors is set by
- *    the device driver based upon the capabilities of the I/O
- *    controller.
- *
- *    max_dev_sectors is a hard limit imposed by the storage device for
- *    READ/WRITE requests. It is set by the disk driver.
- *
- *    max_sectors is a soft limit imposed by the block layer for
- *    filesystem type requests.  This value can be overridden on a
- *    per-device basis in /sys/block/<device>/queue/max_sectors_kb.
- *    The soft limit can not exceed max_hw_sectors.
- **/
-void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
-{
-	struct queue_limits *limits = &q->limits;
-	unsigned int max_sectors;
-
-	if ((max_hw_sectors << 9) < PAGE_SIZE) {
-		max_hw_sectors = 1 << (PAGE_SHIFT - 9);
-		pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
-	}
-
-	max_hw_sectors = round_down(max_hw_sectors,
-				    limits->logical_block_size >> SECTOR_SHIFT);
-	limits->max_hw_sectors = max_hw_sectors;
-
-	max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
-
-	if (limits->max_user_sectors)
-		max_sectors = min(max_sectors, limits->max_user_sectors);
-	else
-		max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
-
-	max_sectors = round_down(max_sectors,
-				 limits->logical_block_size >> SECTOR_SHIFT);
-	limits->max_sectors = max_sectors;
-
-	if (!q->disk)
-		return;
-	q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9);
-}
-EXPORT_SYMBOL(blk_queue_max_hw_sectors);
-
 /**
  * blk_queue_chunk_sectors - set size of the chunk for this queue
  * @q:  the request queue for the device
@@ -436,65 +370,6 @@ void blk_queue_max_zone_append_sectors(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
 
-/**
- * blk_queue_max_segments - set max hw segments for a request for this queue
- * @q:  the request queue for the device
- * @max_segments:  max number of segments
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the number of
- *    hw data segments in a request.
- **/
-void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
-{
-	if (!max_segments) {
-		max_segments = 1;
-		pr_info("%s: set to minimum %u\n", __func__, max_segments);
-	}
-
-	q->limits.max_segments = max_segments;
-}
-EXPORT_SYMBOL(blk_queue_max_segments);
-
-/**
- * blk_queue_max_discard_segments - set max segments for discard requests
- * @q:  the request queue for the device
- * @max_segments:  max number of segments
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the number of
- *    segments in a discard request.
- **/
-void blk_queue_max_discard_segments(struct request_queue *q,
-		unsigned short max_segments)
-{
-	q->limits.max_discard_segments = max_segments;
-}
-EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments);
-
-/**
- * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
- * @q:  the request queue for the device
- * @max_size:  max size of segment in bytes
- *
- * Description:
- *    Enables a low level driver to set an upper limit on the size of a
- *    coalesced segment
- **/
-void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
-{
-	if (max_size < PAGE_SIZE) {
-		max_size = PAGE_SIZE;
-		pr_info("%s: set to minimum %u\n", __func__, max_size);
-	}
-
-	/* see blk_queue_virt_boundary() for the explanation */
-	WARN_ON_ONCE(q->limits.virt_boundary_mask);
-
-	q->limits.max_segment_size = max_size;
-}
-EXPORT_SYMBOL(blk_queue_max_segment_size);
-
 /**
  * blk_queue_logical_block_size - set logical block size for the queue
  * @q:  the request queue for the device
@@ -661,29 +536,6 @@ void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
 }
 EXPORT_SYMBOL(blk_limits_io_opt);
 
-/**
- * blk_queue_io_opt - set optimal request size for the queue
- * @q:	the request queue for the device
- * @opt:  optimal request size in bytes
- *
- * Description:
- *   Storage devices may report an optimal I/O size, which is the
- *   device's preferred unit for sustained I/O.  This is rarely reported
- *   for disk drives.  For RAID arrays it is usually the stripe width or
- *   the internal track size.  A properly aligned multiple of
- *   optimal_io_size is the preferred request size for workloads where
- *   sustained throughput is desired.
- */
-void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
-{
-	blk_limits_io_opt(&q->limits, opt);
-	if (!q->disk)
-		return;
-	q->disk->bdi->ra_pages =
-		max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
-}
-EXPORT_SYMBOL(blk_queue_io_opt);
-
 static int queue_limit_alignment_offset(const struct queue_limits *lim,
 		sector_t sector)
 {
@@ -933,81 +785,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_pad);
 
-/**
- * blk_queue_segment_boundary - set boundary rules for segment merging
- * @q:  the request queue for the device
- * @mask:  the memory boundary mask
- **/
-void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
-{
-	if (mask < PAGE_SIZE - 1) {
-		mask = PAGE_SIZE - 1;
-		pr_info("%s: set to minimum %lx\n", __func__, mask);
-	}
-
-	q->limits.seg_boundary_mask = mask;
-}
-EXPORT_SYMBOL(blk_queue_segment_boundary);
-
-/**
- * blk_queue_virt_boundary - set boundary rules for bio merging
- * @q:  the request queue for the device
- * @mask:  the memory boundary mask
- **/
-void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
-{
-	q->limits.virt_boundary_mask = mask;
-
-	/*
-	 * Devices that require a virtual boundary do not support scatter/gather
-	 * I/O natively, but instead require a descriptor list entry for each
-	 * page (which might not be idential to the Linux PAGE_SIZE).  Because
-	 * of that they are not limited by our notion of "segment size".
-	 */
-	if (mask)
-		q->limits.max_segment_size = UINT_MAX;
-}
-EXPORT_SYMBOL(blk_queue_virt_boundary);
-
-/**
- * blk_queue_dma_alignment - set dma length and memory alignment
- * @q:     the request queue for the device
- * @mask:  alignment mask
- *
- * description:
- *    set required memory and length alignment for direct dma transactions.
- *    this is used when building direct io requests for the queue.
- *
- **/
-void blk_queue_dma_alignment(struct request_queue *q, int mask)
-{
-	q->limits.dma_alignment = mask;
-}
-EXPORT_SYMBOL(blk_queue_dma_alignment);
-
-/**
- * blk_queue_update_dma_alignment - update dma length and memory alignment
- * @q:     the request queue for the device
- * @mask:  alignment mask
- *
- * description:
- *    update required memory and length alignment for direct dma transactions.
- *    If the requested alignment is larger than the current alignment, then
- *    the current queue alignment is updated to the new value, otherwise it
- *    is left alone.  The design of this is to allow multiple objects
- *    (driver, device, transport etc) to set their respective
- *    alignments without having them interfere.
- *
- **/
-void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
-{
-	BUG_ON(mask > PAGE_SIZE);
-
-	if (mask > q->limits.dma_alignment)
-		q->limits.dma_alignment = mask;
-}
-EXPORT_SYMBOL(blk_queue_update_dma_alignment);
-
 /**
  * blk_set_queue_depth - tell the block layer about the device queue depth
  * @q:		the request queue for the device
@@ -1061,28 +838,6 @@ void blk_queue_required_elevator_features(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
 
-/**
- * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
- * @q:		the request queue for the device
- * @dev:	the device pointer for dma
- *
- * Tell the block layer about merging the segments by dma map of @q.
- */
-bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
-				       struct device *dev)
-{
-	unsigned long boundary = dma_get_merge_boundary(dev);
-
-	if (!boundary)
-		return false;
-
-	/* No need to update max_segment_size. see blk_queue_virt_boundary() */
-	blk_queue_virt_boundary(q, boundary);
-
-	return true;
-}
-EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
-
 /**
  * disk_set_zoned - inidicate a zoned device
  * @disk:	gendisk to configure
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 180a008d38ea..2f16f543079b 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -4561,9 +4561,9 @@ static struct dasd_ccw_req *dasd_eckd_build_cp_tpm_track(
 	len_to_track_end = 0;
 	/*
 	 * A tidaw can address 4k of memory, but must not cross page boundaries
-	 * We can let the block layer handle this by setting
-	 * blk_queue_segment_boundary to page boundaries and
-	 * blk_max_segment_size to page size when setting up the request queue.
+	 * We can let the block layer handle this by setting seg_boundary_mask
+	 * to page boundaries and max_segment_size to page size when setting up
+	 * the request queue.
 	 * For write requests, a TIDAW must not cross track boundaries, because
 	 * we have to set the CBC flag on the last tidaw for each track.
 	 */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ded7f66dc4b9..e3c7082efa39 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -908,15 +908,9 @@ static inline void queue_limits_cancel_update(struct request_queue *q)
 /*
  * Access functions for manipulating queue properties
  */
-void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce limit);
-extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int);
-extern void blk_queue_max_segments(struct request_queue *, unsigned short);
-extern void blk_queue_max_discard_segments(struct request_queue *,
-		unsigned short);
 void blk_queue_max_secure_erase_sectors(struct request_queue *q,
 		unsigned int max_sectors);
-extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
 extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
@@ -933,7 +927,6 @@ void disk_update_readahead(struct gendisk *disk);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
-extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
@@ -941,10 +934,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
 		sector_t offset, const char *pfx);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
-extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
-extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
-extern void blk_queue_dma_alignment(struct request_queue *, int);
-extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);
 
@@ -961,8 +950,6 @@ void disk_set_independent_access_ranges(struct gendisk *disk,
 
 extern void blk_queue_required_elevator_features(struct request_queue *q,
 						 unsigned int features);
-extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
-					      struct device *dev);
 
 bool __must_check blk_get_queue(struct request_queue *);
 extern void blk_put_queue(struct request_queue *);
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 5894bf912f7b..88c6a76042ee 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -433,8 +433,8 @@ struct mmc_host {
 	mmc_pm_flag_t		pm_caps;	/* supported pm features */
 
 	/* host specific block data */
-	unsigned int		max_seg_size;	/* see blk_queue_max_segment_size */
-	unsigned short		max_segs;	/* see blk_queue_max_segments */
+	unsigned int		max_seg_size;	/* lim->max_segment_size */
+	unsigned short		max_segs;	/* lim->max_segments */
 	unsigned short		unused;
 	unsigned int		max_req_size;	/* maximum number of bytes in one req */
 	unsigned int		max_blk_size;	/* maximum size of one mmc block */
-- 
cgit v1.2.3


From 3bb946c9d323f4b47b87bd73134125d562f72448 Mon Sep 17 00:00:00 2001
From: Fei Qin <fei.qin@corigine.com>
Date: Wed, 10 Apr 2024 13:26:35 +0200
Subject: devlink: add a new info version tag

Add definition and documentation for the new generic
info "board.part_number".

The new one is for part number specific use, and board.id
is modified to match the documentation in devlink-info.

Signed-off-by: Fei Qin <fei.qin@corigine.com>
Signed-off-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink/devlink-info.rst | 5 +++++
 include/net/devlink.h                             | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst
index 1242b0e6826b..23073bc219d8 100644
--- a/Documentation/networking/devlink/devlink-info.rst
+++ b/Documentation/networking/devlink/devlink-info.rst
@@ -146,6 +146,11 @@ board.manufacture
 
 An identifier of the company or the facility which produced the part.
 
+board.part_number
+-----------------
+
+Part number of the board and its components.
+
 fw
 --
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index bb1af599d101..d31769a116ce 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -599,12 +599,14 @@ enum devlink_param_generic_id {
 	.validate = _validate,						\
 }
 
-/* Part number, identifier of board design */
+/* Identifier of board design */
 #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID	"board.id"
 /* Revision of board design */
 #define DEVLINK_INFO_VERSION_GENERIC_BOARD_REV	"board.rev"
 /* Maker of the board */
 #define DEVLINK_INFO_VERSION_GENERIC_BOARD_MANUFACTURE	"board.manufacture"
+/* Part number of the board and its components */
+#define DEVLINK_INFO_VERSION_GENERIC_BOARD_PART_NUMBER	"board.part_number"
 
 /* Part number, identifier of asic design */
 #define DEVLINK_INFO_VERSION_GENERIC_ASIC_ID	"asic.id"
-- 
cgit v1.2.3


From cd1b30824ff270201c43c41b9b829a369cb4e944 Mon Sep 17 00:00:00 2001
From: Ben Cheatham <Benjamin.Cheatham@amd.com>
Date: Thu, 20 Jul 2023 14:46:14 -0500
Subject: ACPICA: actbl1.h: Add EINJ CXL error types

ACPICA commit c7171588a9f684afafc83c6c18ed0bab9274e5e6

Add EINJ CXL error types added in ACPI v6.5.

Link: https://github.com/acpica/acpica/commit/c7171588
Signed-off-by: Ben Cheatham <Benjamin.Cheatham@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl1.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index a33375e055ad..1f58c5d86869 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -1096,6 +1096,12 @@ enum acpi_einj_command_status {
 #define ACPI_EINJ_PLATFORM_CORRECTABLE      (1<<9)
 #define ACPI_EINJ_PLATFORM_UNCORRECTABLE    (1<<10)
 #define ACPI_EINJ_PLATFORM_FATAL            (1<<11)
+#define ACPI_EINJ_CXL_CACHE_CORRECTABLE     (1<<12)
+#define ACPI_EINJ_CXL_CACHE_UNCORRECTABLE   (1<<13)
+#define ACPI_EINJ_CXL_CACHE_FATAL           (1<<14)
+#define ACPI_EINJ_CXL_MEM_CORRECTABLE       (1<<15)
+#define ACPI_EINJ_CXL_MEM_UNCORRECTABLE     (1<<16)
+#define ACPI_EINJ_CXL_MEM_FATAL             (1<<17)
 #define ACPI_EINJ_VENDOR_DEFINED            (1<<31)
 
 /*******************************************************************************
-- 
cgit v1.2.3


From 2e94dc11898042eb528eda3e09db242529722916 Mon Sep 17 00:00:00 2001
From: Shiju Jose <shiju.jose@huawei.com>
Date: Wed, 27 Sep 2023 17:41:52 +0100
Subject: ACPICA: ACPI 6.5: RAS2: Add support for RAS2 table

ACPICA commit c581606cf49b7574d29c02b1a3bc144650375e32

Add support for ACPI RAS2 feature table(RAS2) defined in the ACPI 6.5
Specification & upwards revision, section 5.2.21.

The RAS2 table provides interfaces for platform RAS features. RAS2 offers
the same services as RASF, but is more scalable than the latter.
RAS2 supports independent RAS controls and capabilities for a given RAS
feature for multiple instances of the same component in a given system.
The platform can support either RAS2 or RASF but not both.

Link: https://github.com/acpica/acpica/commit/c581606c
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 9775384d61c6..64a3f9ae2641 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -47,6 +47,7 @@
 #define ACPI_SIG_PPTT           "PPTT"	/* Processor Properties Topology Table */
 #define ACPI_SIG_PRMT           "PRMT"	/* Platform Runtime Mechanism Table */
 #define ACPI_SIG_RASF           "RASF"	/* RAS Feature table */
+#define ACPI_SIG_RAS2           "RAS2"	/* RAS2 Feature table */
 #define ACPI_SIG_RGRT           "RGRT"	/* Regulatory Graphics Resource Table */
 #define ACPI_SIG_RHCT           "RHCT"	/* RISC-V Hart Capabilities Table */
 #define ACPI_SIG_SBST           "SBST"	/* Smart Battery Specification Table */
@@ -2751,6 +2752,134 @@ enum acpi_rasf_status {
 #define ACPI_RASF_ERROR                 (1<<2)
 #define ACPI_RASF_STATUS                (0x1F<<3)
 
+/*******************************************************************************
+ *
+ * RAS2 - RAS2 Feature Table (ACPI 6.5)
+ *        Version 1
+ *
+ *
+ ******************************************************************************/
+
+struct acpi_table_ras2 {
+	struct acpi_table_header header;	/* Common ACPI table header */
+	u16 reserved;
+	u16 num_pcc_descs;
+};
+
+/* RAS2 Platform Communication Channel Descriptor */
+
+struct acpi_ras2_pcc_desc {
+	u8 channel_id;
+	u16 reserved;
+	u8 feature_type;
+	u32 instance;
+};
+
+/* RAS2 Platform Communication Channel Shared Memory Region */
+
+struct acpi_ras2_shared_memory {
+	u32 signature;
+	u16 command;
+	u16 status;
+	u16 version;
+	u8 features[16];
+	u8 set_capabilities[16];
+	u16 num_parameter_blocks;
+	u32 set_capabilities_status;
+};
+
+/* RAS2 Parameter Block Structure for PATROL_SCRUB */
+
+struct acpi_ras2_parameter_block {
+	u16 type;
+	u16 version;
+	u16 length;
+};
+
+/* RAS2 Parameter Block Structure for PATROL_SCRUB */
+
+struct acpi_ras2_patrol_scrub_parameter {
+	struct acpi_ras2_parameter_block header;
+	u16 patrol_scrub_command;
+	u64 requested_address_range[2];
+	u64 actual_address_range[2];
+	u32 flags;
+	u32 scrub_params_out;
+	u32 scrub_params_in;
+};
+
+/* Masks for Flags field above */
+
+#define ACPI_RAS2_SCRUBBER_RUNNING      1
+
+/* RAS2 Parameter Block Structure for LA2PA_TRANSLATION */
+
+struct acpi_ras2_la2pa_translation_parameter {
+	struct acpi_ras2_parameter_block header;
+	u16 addr_translation_command;
+	u64 sub_inst_id;
+	u64 logical_address;
+	u64 physical_address;
+	u32 status;
+};
+
+/* Channel Commands */
+
+enum acpi_ras2_commands {
+	ACPI_RAS2_EXECUTE_RAS2_COMMAND = 1
+};
+
+/* Platform RAS2 Features */
+
+enum acpi_ras2_features {
+	ACPI_RAS2_PATROL_SCRUB_SUPPORTED = 0,
+	ACPI_RAS2_LA2PA_TRANSLATION = 1
+};
+
+/* RAS2 Patrol Scrub Commands */
+
+enum acpi_ras2_patrol_scrub_commands {
+	ACPI_RAS2_GET_PATROL_PARAMETERS = 1,
+	ACPI_RAS2_START_PATROL_SCRUBBER = 2,
+	ACPI_RAS2_STOP_PATROL_SCRUBBER = 3
+};
+
+/* RAS2 LA2PA Translation Commands */
+
+enum acpi_ras2_la2_pa_translation_commands {
+	ACPI_RAS2_GET_LA2PA_TRANSLATION = 1,
+};
+
+/* RAS2 LA2PA Translation Status values */
+
+enum acpi_ras2_la2_pa_translation_status {
+	ACPI_RAS2_LA2PA_TRANSLATION_SUCCESS = 0,
+	ACPI_RAS2_LA2PA_TRANSLATION_FAIL = 1,
+};
+
+/* Channel Command flags */
+
+#define ACPI_RAS2_GENERATE_SCI          (1<<15)
+
+/* Status values */
+
+enum acpi_ras2_status {
+	ACPI_RAS2_SUCCESS = 0,
+	ACPI_RAS2_NOT_VALID = 1,
+	ACPI_RAS2_NOT_SUPPORTED = 2,
+	ACPI_RAS2_BUSY = 3,
+	ACPI_RAS2_FAILED = 4,
+	ACPI_RAS2_ABORTED = 5,
+	ACPI_RAS2_INVALID_DATA = 6
+};
+
+/* Status flags */
+
+#define ACPI_RAS2_COMMAND_COMPLETE      (1)
+#define ACPI_RAS2_SCI_DOORBELL          (1<<1)
+#define ACPI_RAS2_ERROR                 (1<<2)
+#define ACPI_RAS2_STATUS                (0x1F<<3)
+
 /*******************************************************************************
  *
  * RGRT - Regulatory Graphics Resource Table
-- 
cgit v1.2.3


From fe1c408d50604f6013ca273d14b0ffeb845f23b1 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Wed, 17 Jan 2024 21:06:43 +0800
Subject: ACPICA: SRAT: Add RISC-V RINTC affinity structure

ACPICA commit 93caddbf2f620769052c59ec471f018281dc3a24

Add definition of RISC-V Interrupt Controller(RINTC)
affinity structure which was approved by UEFI forum
and will be part of next ACPI spec version(6.6).

Link: https://github.com/acpica/acpica/commit/93caddbf
Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl3.h | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h
index c080d579a546..e32149d605dc 100644
--- a/include/acpi/actbl3.h
+++ b/include/acpi/actbl3.h
@@ -192,7 +192,8 @@ enum acpi_srat_type {
 	ACPI_SRAT_TYPE_GIC_ITS_AFFINITY = 4,	/* ACPI 6.2 */
 	ACPI_SRAT_TYPE_GENERIC_AFFINITY = 5,	/* ACPI 6.3 */
 	ACPI_SRAT_TYPE_GENERIC_PORT_AFFINITY = 6,	/* ACPI 6.4 */
-	ACPI_SRAT_TYPE_RESERVED = 7	/* 7 and greater are reserved */
+	ACPI_SRAT_TYPE_RINTC_AFFINITY = 7,	/* ACPI 6.6 */
+	ACPI_SRAT_TYPE_RESERVED = 8	/* 8 and greater are reserved */
 };
 
 /*
@@ -296,6 +297,21 @@ struct acpi_srat_generic_affinity {
 #define ACPI_SRAT_GENERIC_AFFINITY_ENABLED     (1)	/* 00: Use affinity structure */
 #define ACPI_SRAT_ARCHITECTURAL_TRANSACTIONS   (1<<1)	/* ACPI 6.4 */
 
+/* 7: RINTC Affinity Structure(ACPI 6.6) */
+
+struct acpi_srat_rintc_affinity {
+	struct acpi_subtable_header header;
+	u16 reserved;
+	u32 proximity_domain;
+	u32 acpi_processor_uid;
+	u32 flags;
+	u32 clock_domain;
+};
+
+/* Flags for ACPI_SRAT_RINTC_AFFINITY */
+
+#define ACPI_SRAT_RINTC_ENABLED     (1)	/* 00: Use affinity structure */
+
 /*******************************************************************************
  *
  * STAO - Status Override Table (_STA override) - ACPI 6.0
-- 
cgit v1.2.3


From e19481071d0afe3b05cbb6602d7a53599e3e8506 Mon Sep 17 00:00:00 2001
From: Haibo Xu <haibo1.xu@intel.com>
Date: Wed, 17 Jan 2024 21:17:35 +0800
Subject: ACPICA: SRAT: Add dump and compiler support for RINTC affinity
 structure

ACPICA commit b9423c1d35b072c8f2acf97a5842b9f144449eaa

After adding RISC-V RINTC affinity structure definition,
enable corresponding dump and compiler support.

Link: https://github.com/acpica/acpica/commit/b9423c1d
Signed-off-by: Haibo Xu <haibo1.xu@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl3.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/actbl3.h b/include/acpi/actbl3.h
index e32149d605dc..8f775e3a08fd 100644
--- a/include/acpi/actbl3.h
+++ b/include/acpi/actbl3.h
@@ -308,7 +308,7 @@ struct acpi_srat_rintc_affinity {
 	u32 clock_domain;
 };
 
-/* Flags for ACPI_SRAT_RINTC_AFFINITY */
+/* Flags for struct acpi_srat_rintc_affinity */
 
 #define ACPI_SRAT_RINTC_ENABLED     (1)	/* 00: Use affinity structure */
 
-- 
cgit v1.2.3


From 7f35712c2da2b92e03baf618fe469c64591d96fd Mon Sep 17 00:00:00 2001
From: Hojin Nam <hj96.nam@samsung.com>
Date: Mon, 19 Feb 2024 10:06:52 +0900
Subject: ACPICA: Fix CXL 3.0 structure (RDPAS) in the CEDT table

ACPICA commit a0ad1ed5105fb8a15f6f8384b8ab0a2157efaf23

struct acpi_cedt_rdpas does not match with CXL r3.0 9.17.1.5
Table 9-24. reserved1 and length fields are already added by
struct acpi_cedt_header.

Link: https://github.com/acpica/acpica/commit/a0ad1ed5
Signed-off-by: Hojin Nam <hj96.nam@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl1.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 1f58c5d86869..841ef9f22795 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -571,8 +571,6 @@ struct acpi_cedt_cxims {
 
 struct acpi_cedt_rdpas {
 	struct acpi_cedt_header header;
-	u8 reserved1;
-	u16 length;
 	u16 segment;
 	u16 bdf;
 	u8 protocol;
-- 
cgit v1.2.3


From e1d3f9d46f17e762d2dd4027456386509a6e6774 Mon Sep 17 00:00:00 2001
From: Saket Dumbre <saket.dumbre@intel.com>
Date: Thu, 21 Mar 2024 22:00:09 -0700
Subject: ACPICA: Update acpixf.h for new ACPICA release 20240322

ACPICA commit 718374cd1bc21d08960b61069c8ac62b0cf67c0c

Link: https://github.com/acpica/acpica/commit/718374cd
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/acpixf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h
index 3d90716f9522..94d0fc3bd412 100644
--- a/include/acpi/acpixf.h
+++ b/include/acpi/acpixf.h
@@ -12,7 +12,7 @@
 
 /* Current ACPICA subsystem version in YYYYMMDD format */
 
-#define ACPI_CA_VERSION                 0x20230628
+#define ACPI_CA_VERSION                 0x20240322
 
 #include <acpi/acconfig.h>
 #include <acpi/actypes.h>
-- 
cgit v1.2.3


From 86d2a2f51fbada84e377665df06b5a479a1edc99 Mon Sep 17 00:00:00 2001
From: Bitao Hu <yaoma@linux.alibaba.com>
Date: Thu, 11 Apr 2024 15:41:30 +0800
Subject: genirq: Convert kstat_irqs to a struct

The irq_desc::kstat_irqs member is a per-CPU variable of type int, which is
only capable of counting. A snapshot mechanism for interrupt statistics
will be added soon, which requires an additional variable to store the
snapshot.

To facilitate expansion, convert kstat_irqs here to a struct containing
only the count.

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240411074134.30922-2-yaoma@linux.alibaba.com
---
 arch/mips/dec/setup.c                |  2 +-
 arch/parisc/kernel/smp.c             |  2 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c |  2 +-
 include/linux/irqdesc.h              | 12 ++++++++++--
 kernel/irq/internals.h               |  2 +-
 kernel/irq/irqdesc.c                 |  9 ++++-----
 kernel/irq/proc.c                    |  5 ++---
 scripts/gdb/linux/interrupts.py      |  6 +++---
 8 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c
index 6c3704f51d0d..87f0a1436bf9 100644
--- a/arch/mips/dec/setup.c
+++ b/arch/mips/dec/setup.c
@@ -756,7 +756,7 @@ void __init arch_init_irq(void)
 				NULL))
 			pr_err("Failed to register fpu interrupt\n");
 		desc_fpu = irq_to_desc(irq_fpu);
-		fpu_kstat_irq = this_cpu_ptr(desc_fpu->kstat_irqs);
+		fpu_kstat_irq = this_cpu_ptr(&desc_fpu->kstat_irqs->cnt);
 	}
 	if (dec_interrupt[DEC_IRQ_CASCADE] >= 0) {
 		if (request_irq(dec_interrupt[DEC_IRQ_CASCADE], no_action,
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 444154271f23..800eb64e91ad 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -344,7 +344,7 @@ static int smp_boot_one_cpu(int cpuid, struct task_struct *idle)
 		struct irq_desc *desc = irq_to_desc(i);
 
 		if (desc && desc->kstat_irqs)
-			*per_cpu_ptr(desc->kstat_irqs, cpuid) = 0;
+			*per_cpu_ptr(desc->kstat_irqs, cpuid) = (struct irqstat) { };
 	}
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e42984878503..f2636414d82a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -837,7 +837,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
  */
 static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
 {
-	this_cpu_inc_rm(desc->kstat_irqs);
+	this_cpu_inc_rm(&desc->kstat_irqs->cnt);
 	__this_cpu_inc(kstat.irqs_sum);
 }
 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d9451d456a73..c28612674acb 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -17,6 +17,14 @@ struct irq_desc;
 struct irq_domain;
 struct pt_regs;
 
+/**
+ * struct irqstat - interrupt statistics
+ * @cnt:	real-time interrupt count
+ */
+struct irqstat {
+	unsigned int	cnt;
+};
+
 /**
  * struct irq_desc - interrupt descriptor
  * @irq_common_data:	per irq and chip data passed down to chip functions
@@ -55,7 +63,7 @@ struct pt_regs;
 struct irq_desc {
 	struct irq_common_data	irq_common_data;
 	struct irq_data		irq_data;
-	unsigned int __percpu	*kstat_irqs;
+	struct irqstat __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
 	struct irqaction	*action;	/* IRQ action list */
 	unsigned int		status_use_accessors;
@@ -119,7 +127,7 @@ extern struct irq_desc irq_desc[NR_IRQS];
 static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
 					      unsigned int cpu)
 {
-	return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+	return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
 }
 
 static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index bcc7f21db9ee..1d92532c2aae 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -258,7 +258,7 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
 
 static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
 {
-	__this_cpu_inc(*desc->kstat_irqs);
+	__this_cpu_inc(desc->kstat_irqs->cnt);
 	__this_cpu_inc(kstat.irqs_sum);
 }
 
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c6b32318ce3..b59b79200ad7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -134,7 +134,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 	desc->name = NULL;
 	desc->owner = owner;
 	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
+		*per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
 	desc_smp_init(desc, node, affinity);
 }
 
@@ -186,7 +186,7 @@ static int init_desc(struct irq_desc *desc, int irq, int node,
 		     const struct cpumask *affinity,
 		     struct module *owner)
 {
-	desc->kstat_irqs = alloc_percpu(unsigned int);
+	desc->kstat_irqs = alloc_percpu(struct irqstat);
 	if (!desc->kstat_irqs)
 		return -ENOMEM;
 
@@ -968,8 +968,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	return desc && desc->kstat_irqs ?
-			*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+	return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
 }
 
 static bool irq_is_nmi(struct irq_desc *desc)
@@ -991,7 +990,7 @@ static unsigned int kstat_irqs(unsigned int irq)
 		return data_race(desc->tot_count);
 
 	for_each_possible_cpu(cpu)
-		sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
+		sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
 	return sum;
 }
 
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 623b8136e9af..6954e0a02047 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -490,7 +490,7 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	if (desc->kstat_irqs) {
 		for_each_online_cpu(j)
-			any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j));
+			any_count |= data_race(per_cpu(desc->kstat_irqs->cnt, j));
 	}
 
 	if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
@@ -498,8 +498,7 @@ int show_interrupts(struct seq_file *p, void *v)
 
 	seq_printf(p, "%*d: ", prec, i);
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", desc->kstat_irqs ?
-					*per_cpu_ptr(desc->kstat_irqs, j) : 0);
+		seq_printf(p, "%10u ", desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, j) : 0);
 
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	if (desc->irq_data.chip) {
diff --git a/scripts/gdb/linux/interrupts.py b/scripts/gdb/linux/interrupts.py
index 66ae5c7690cf..616a5f26377a 100644
--- a/scripts/gdb/linux/interrupts.py
+++ b/scripts/gdb/linux/interrupts.py
@@ -37,7 +37,7 @@ def show_irq_desc(prec, irq):
     any_count = 0
     if desc['kstat_irqs']:
         for cpu in cpus.each_online_cpu():
-            any_count += cpus.per_cpu(desc['kstat_irqs'], cpu)
+            any_count += cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt']
 
     if (desc['action'] == 0 or irq_desc_is_chained(desc)) and any_count == 0:
         return text;
@@ -45,7 +45,7 @@ def show_irq_desc(prec, irq):
     text += "%*d: " % (prec, irq)
     for cpu in cpus.each_online_cpu():
         if desc['kstat_irqs']:
-            count = cpus.per_cpu(desc['kstat_irqs'], cpu)
+            count = cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt']
         else:
             count = 0
         text += "%10u" % (count)
@@ -177,7 +177,7 @@ def arm_common_show_interrupts(prec):
         if desc == 0:
             continue
         for cpu in cpus.each_online_cpu():
-            text += "%10u" % (cpus.per_cpu(desc['kstat_irqs'], cpu))
+            text += "%10u" % (cpus.per_cpu(desc['kstat_irqs'], cpu)['cnt'])
         text += "      %s" % (ipi_types[ipi].string())
         text += "\n"
     return text
-- 
cgit v1.2.3


From 99cf63c56661be0a0c42f79b56f37a4aa34b4779 Mon Sep 17 00:00:00 2001
From: Bitao Hu <yaoma@linux.alibaba.com>
Date: Thu, 11 Apr 2024 15:41:31 +0800
Subject: genirq: Provide a snapshot mechanism for interrupt statistics

The soft lockup detector lacks a mechanism to identify interrupt storms as
root cause of a lockup. To enable this the detector needs a mechanism to
snapshot the interrupt count statistics on a CPU when the detector observes
a potential lockup scenario and compare that against the interrupt count
when it warns about the lockup later on. The number of interrupts in that
period give a hint whether the lockup might have been caused by an interrupt
storm.

Instead of having extra storage in the lockup detector and accessing the
internals of the interrupt descriptor directly, add a snapshot member to
the per CPU irq_desc::kstat_irq structure and provide interfaces to take a
snapshot of all interrupts on the current CPU and to retrieve the delta of
a specific interrupt later on.

Originally-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Bitao Hu <yaoma@linux.alibaba.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240411074134.30922-3-yaoma@linux.alibaba.com
---
 include/linux/irqdesc.h     |  4 ++++
 include/linux/kernel_stat.h |  8 ++++++++
 kernel/irq/Kconfig          |  4 ++++
 kernel/irq/irqdesc.c        | 25 +++++++++++++++++++++++++
 4 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index c28612674acb..fd091c35d572 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -20,9 +20,13 @@ struct pt_regs;
 /**
  * struct irqstat - interrupt statistics
  * @cnt:	real-time interrupt count
+ * @ref:	snapshot of interrupt count
  */
 struct irqstat {
 	unsigned int	cnt;
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+	unsigned int	ref;
+#endif
 };
 
 /**
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 9935f7ecbfb9..9c042c6384bb 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,14 @@ static inline unsigned int kstat_cpu_softirqs_sum(int cpu)
 	return sum;
 }
 
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+extern void kstat_snapshot_irqs(void);
+extern unsigned int kstat_get_irq_since_snapshot(unsigned int irq);
+#else
+static inline void kstat_snapshot_irqs(void) { }
+static inline unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { return 0; }
+#endif
+
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 2531f3496ab6..529adb1f5859 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -108,6 +108,10 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR
 config GENERIC_IRQ_RESERVATION_MODE
 	bool
 
+# Snapshot for interrupt statistics
+config GENERIC_IRQ_STAT_SNAPSHOT
+	bool
+
 # Support forced irq threading
 config IRQ_FORCED_THREADING
        bool
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index b59b79200ad7..f348faffa7b4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -994,6 +994,31 @@ static unsigned int kstat_irqs(unsigned int irq)
 	return sum;
 }
 
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+
+void kstat_snapshot_irqs(void)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+
+	for_each_irq_desc(irq, desc) {
+		if (!desc->kstat_irqs)
+			continue;
+		this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt));
+	}
+}
+
+unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (!desc || !desc->kstat_irqs)
+		return 0;
+	return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref);
+}
+
+#endif
+
 /**
  * kstat_irqs_usr - Get the statistics for an interrupt from thread context
  * @irq:	The interrupt number
-- 
cgit v1.2.3


From cda30c6542c8bb445bc84f6616cac8d012547f0a Mon Sep 17 00:00:00 2001
From: Tim Schumacher <timschumi@gmx.de>
Date: Thu, 28 Mar 2024 21:50:33 +0100
Subject: efi: Clear up misconceptions about a maximum variable name size

The UEFI specification does not make any mention of a maximum variable
name size, so the headers and implementation shouldn't claim that one
exists either.

Comments referring to this limit have been removed or rewritten, as this
is an implementation detail local to the Linux kernel.

Where appropriate, the magic value of 1024 has been replaced with
EFI_VAR_NAME_LEN, as this is used for the efi_variable struct
definition. This in itself does not change any behavior, but should
serve as points of interest when making future changes in the same area.

A related build-time check has been added to ensure that the special
512 byte sized buffer will not overflow with a potentially decreased
EFI_VAR_NAME_LEN.

Signed-off-by: Tim Schumacher <timschumi@gmx.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 drivers/firmware/efi/vars.c | 2 +-
 fs/efivarfs/vars.c          | 5 +++--
 include/linux/efi.h         | 9 ++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c
index f654e6f6af87..4056ba7f3440 100644
--- a/drivers/firmware/efi/vars.c
+++ b/drivers/firmware/efi/vars.c
@@ -215,7 +215,7 @@ efi_status_t efivar_set_variable_locked(efi_char16_t *name, efi_guid_t *vendor,
 
 	if (data_size > 0) {
 		status = check_var_size(nonblocking, attr,
-					data_size + ucs2_strsize(name, 1024));
+					data_size + ucs2_strsize(name, EFI_VAR_NAME_LEN));
 		if (status != EFI_SUCCESS)
 			return status;
 	}
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index 4d722af1014f..3cc89bb624f0 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -295,9 +295,9 @@ static bool variable_is_present(efi_char16_t *variable_name, efi_guid_t *vendor,
 	unsigned long strsize1, strsize2;
 	bool found = false;
 
-	strsize1 = ucs2_strsize(variable_name, 1024);
+	strsize1 = ucs2_strsize(variable_name, EFI_VAR_NAME_LEN);
 	list_for_each_entry_safe(entry, n, head, list) {
-		strsize2 = ucs2_strsize(entry->var.VariableName, 1024);
+		strsize2 = ucs2_strsize(entry->var.VariableName, EFI_VAR_NAME_LEN);
 		if (strsize1 == strsize2 &&
 			!memcmp(variable_name, &(entry->var.VariableName),
 				strsize2) &&
@@ -396,6 +396,7 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
 
 	do {
 		variable_name_size = 512;
+		BUILD_BUG_ON(EFI_VAR_NAME_LEN < 512);
 
 		status = efivar_get_next_variable(&variable_name_size,
 						  variable_name,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index d59b0947fba0..418e555459da 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -1072,12 +1072,11 @@ static inline u64 efivar_reserved_space(void) { return 0; }
 #endif
 
 /*
- * The maximum size of VariableName + Data = 1024
- * Therefore, it's reasonable to save that much
- * space in each part of the structure,
- * and we use a page for reading/writing.
+ * There is no actual upper limit specified for the variable name size.
+ *
+ * This limit exists only for practical purposes, since name conversions
+ * are bounds-checked and name data is occasionally stored in-line.
  */
-
 #define EFI_VAR_NAME_LEN	1024
 
 int efivars_register(struct efivars *efivars,
-- 
cgit v1.2.3


From 42bd2af5950456d46fdaa91c3a8fb02e680f19f5 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 10 Apr 2024 17:10:12 -0700
Subject: vfs: relax linkat() AT_EMPTY_PATH - aka flink() - requirements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

   "The definition of insanity is doing the same thing over and over
    again and expecting different results”

We've tried to do this before, most recently with commit bb2314b47996
("fs: Allow unprivileged linkat(..., AT_EMPTY_PATH) aka flink") about a
decade ago.

But the effort goes back even further than that, eg this thread back
from 1998 that is so old that we don't even have it archived in lore:

    https://lkml.org/lkml/1998/3/10/108

which also points out some of the reasons why it's dangerous.

Or, how about then in 2003:

    https://lkml.org/lkml/2003/4/6/112

where we went through some of the same arguments, just wirh different
people involved.

In particular, having access to a file descriptor does not necessarily
mean that you have access to the path that was used for lookup, and
there may be very good reasons why you absolutely must not have access
to a path to said file.

For example, if we were passed a file descriptor from the outside into
some limited environment (think chroot, but also user namespaces etc) a
'flink()' system call could now make that file visible inside a context
where it's not supposed to be visible.

In the process the user may also be able to re-open it with permissions
that the original file descriptor did not have (eg a read-only file
descriptor may be associated with an underlying file that is writable).

Another variation on this is if somebody else (typically root) opens a
file in a directory that is not accessible to others, and passes the
file descriptor on as a read-only file.  Again, the access to the file
descriptor does not imply that you should have access to a path to the
file in the filesystem.

So while we have tried this several times in the past, it never works.

The last time we did this, that commit bb2314b47996 quickly got reverted
again in commit f0cc6ffb8ce8 (Revert "fs: Allow unprivileged linkat(...,
AT_EMPTY_PATH) aka flink"), with a note saying "We may re-do this once
the whole discussion about the interface is done".

Well, the discussion is long done, and didn't come to any resolution.
There's no question that 'flink()' would be a useful operation, but it's
a dangerous one.

However, it does turn out that since 2008 (commit d76b0d9b2d87: "CRED:
Use creds in file structs") we have had a fairly straightforward way to
check whether the file descriptor was opened by the same credentials as
the credentials of the flink().

That allows the most common patterns that people want to use, which tend
to be to either open the source carefully (ie using the openat2()
RESOLVE_xyz flags, and/or checking ownership with fstat() before
linking), or to use O_TMPFILE and fill in the file contents before it's
exposed to the world with linkat().

But it also means that if the file descriptor was opened by somebody
else, or we've gone through a credentials change since, the operation no
longer works (unless we have CAP_DAC_READ_SEARCH capabilities in the
opener's user namespace, as before).

Note that the credential equality check is done by using pointer
equality, which means that it's not enough that you have effectively the
same user - they have to be literally identical, since our credentials
are using copy-on-write semantics.

So you can't change your credentials to something else and try to change
it back to the same ones between the open() and the linkat().  This is
not meant to be some kind of generic permission check, this is literally
meant as a "the open and link calls are 'atomic' wrt user credentials"
check.

It also means that you can't just move things between namespaces,
because the credentials aren't just a list of uid's and gid's: they
includes the pointer to the user_ns that the capabilities are relative
to.

So let's try this one more time and see if maybe this approach ends up
being workable after all.

Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240411001012.12513-1-torvalds@linux-foundation.org
[brauner: relax capability check to opener of the file]
Link: https://lore.kernel.org/all/20231113-undenkbar-gediegen-efde5f1c34bc@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c            | 19 +++++++++++++------
 include/linux/namei.h |  1 +
 2 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index ceb9ddf8dfdd..8c1ff0b388e2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2422,6 +2422,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		if (!f.file)
 			return ERR_PTR(-EBADF);
 
+		if (flags & LOOKUP_LINKAT_EMPTY) {
+			if (f.file->f_cred != current_cred() &&
+			    !ns_capable(f.file->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
+				fdput(f);
+				return ERR_PTR(-ENOENT);
+			}
+		}
+
 		dentry = f.file->f_path.dentry;
 
 		if (*s && unlikely(!d_can_lookup(dentry))) {
@@ -4644,14 +4652,13 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
 		goto out_putnames;
 	}
 	/*
-	 * To use null names we require CAP_DAC_READ_SEARCH
+	 * To use null names we require CAP_DAC_READ_SEARCH or
+	 * that the open-time creds of the dfd matches current.
 	 * This ensures that not everyone will be able to create
-	 * handlink using the passed filedescriptor.
+	 * a hardlink using the passed file descriptor.
 	 */
-	if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) {
-		error = -ENOENT;
-		goto out_putnames;
-	}
+	if (flags & AT_EMPTY_PATH)
+		how |= LOOKUP_LINKAT_EMPTY;
 
 	if (flags & AT_SYMLINK_FOLLOW)
 		how |= LOOKUP_FOLLOW;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 74e0cc14ebf8..967aa9ea9f96 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -44,6 +44,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
 #define LOOKUP_BENEATH		0x080000 /* No escaping from starting point. */
 #define LOOKUP_IN_ROOT		0x100000 /* Treat dirfd as fs root. */
 #define LOOKUP_CACHED		0x200000 /* Only do cached lookup */
+#define LOOKUP_LINKAT_EMPTY	0x400000 /* Linkat request with empty path. */
 /* LOOKUP_* flags which do scope-related checks based on the dirfd. */
 #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
 
-- 
cgit v1.2.3


From 058e87782c91696020bdb0aa28ddf77d89aed266 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Wed, 14 Feb 2024 14:45:53 -0800
Subject: rcu: Update lockdep while in RCU read-side critical section

With Ankur's lazy-/auto-preemption patches applied and with a
lazy-preemptible kernel in combination with a non-preemptible RCU,
lockdep sometimes complains about context switches within RCU read-side
critical sections.  This is a false positive due to rcu_read_unlock()
updating lockdep state too late:

	__release(RCU);
	__rcu_read_unlock();
	// Context switch here results in lockdep false positive!!!
	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */

Although this complaint could also happen with preemptible RCU
in a preemptible kernel, the odds of that happening aer quite low.
In constrast, with non-preemptible RCU, a long critical section has a
high probability of performing a context switch from the preempt_enable()
in __rcu_read_unlock().

The fix is straightforward, just move the rcu_lock_release()
within rcu_read_unlock() to obtain the reverse order from that of
rcu_read_lock():

	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
	__release(RCU);
	__rcu_read_unlock();

This commit makes this change.

Co-developed-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Ankur Arora <ankur.a.arora@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/rcupdate.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 17d7ed5f3ae6..2c54750e36a0 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -809,9 +809,9 @@ static inline void rcu_read_unlock(void)
 {
 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
 			 "rcu_read_unlock() used illegally while idle");
+	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 	__release(RCU);
 	__rcu_read_unlock();
-	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 }
 
 /**
-- 
cgit v1.2.3


From 65b4a59557f6fb5d4355093ad5a2c1bd5598ee41 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 15 Feb 2024 09:04:30 -0800
Subject: srcu: Make Tiny SRCU explicitly disable preemption

Because Tiny SRCU is used only in kernels built with either
CONFIG_PREEMPT_NONE=y or CONFIG_PREEMPT_VOLUNTARY=y, there has not
been any need for TINY SRCU to explicitly disable preemption.  However,
the prospect of lazy preemption changes that, and the lazy-preemption
patches do result in rcutorture runs finding both too-short grace periods
and grace-period hangs for Tiny SRCU.

This commit therefore adds the needed preempt_disable() and
preempt_enable() calls to Tiny SRCU.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Ankur Arora <ankur.a.arora@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/srcutiny.h |  2 ++
 kernel/rcu/srcutiny.c    | 31 ++++++++++++++++++++++++++-----
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 447133171d95..4d96bbdb45f0 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -64,8 +64,10 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
 {
 	int idx;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1);
+	preempt_enable();
 	return idx;
 }
 
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index c38e5933a5d6..5afd5cf494db 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,9 +96,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
  */
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
-	int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+	int newval;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+	preempt_enable();
 	if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
 		swake_up_one(&ssp->srcu_wq);
 }
@@ -117,8 +120,11 @@ void srcu_drive_gp(struct work_struct *wp)
 	struct srcu_struct *ssp;
 
 	ssp = container_of(wp, struct srcu_struct, srcu_work);
-	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
 		return; /* Already running or nothing to do. */
+		preempt_enable();
+	}
 
 	/* Remove recently arrived callbacks and wait for readers. */
 	WRITE_ONCE(ssp->srcu_gp_running, true);
@@ -130,9 +136,12 @@ void srcu_drive_gp(struct work_struct *wp)
 	idx = (ssp->srcu_idx & 0x2) / 2;
 	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
 	WRITE_ONCE(ssp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
+	preempt_enable();
 	swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
 	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+	preempt_enable();
 
 	/* Invoke the callbacks we removed above. */
 	while (lh) {
@@ -150,8 +159,11 @@ void srcu_drive_gp(struct work_struct *wp)
 	 * at interrupt level, but the ->srcu_gp_running checks will
 	 * straighten that out.
 	 */
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	WRITE_ONCE(ssp->srcu_gp_running, false);
-	if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+	preempt_enable();
+	if (idx)
 		schedule_work(&ssp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -160,9 +172,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 {
 	unsigned long cookie;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	cookie = get_state_synchronize_srcu(ssp);
-	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+		preempt_enable();
 		return;
+	}
 	WRITE_ONCE(ssp->srcu_idx_max, cookie);
 	if (!READ_ONCE(ssp->srcu_gp_running)) {
 		if (likely(srcu_init_done))
@@ -170,6 +185,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 		else if (list_empty(&ssp->srcu_work.entry))
 			list_add(&ssp->srcu_work.entry, &srcu_boot_list);
 	}
+	preempt_enable();
 }
 
 /*
@@ -183,11 +199,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 
 	rhp->func = func;
 	rhp->next = NULL;
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	local_irq_save(flags);
 	*ssp->srcu_cb_tail = rhp;
 	ssp->srcu_cb_tail = &rhp->next;
 	local_irq_restore(flags);
 	srcu_gp_start_if_needed(ssp);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(call_srcu);
 
@@ -241,9 +259,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
  */
 unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
 {
-	unsigned long ret = get_state_synchronize_srcu(ssp);
+	unsigned long ret;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	ret = get_state_synchronize_srcu(ssp);
 	srcu_gp_start_if_needed(ssp);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
-- 
cgit v1.2.3


From d11e63119432bdb55065d094cb6fd37e9147c70d Mon Sep 17 00:00:00 2001
From: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Date: Thu, 11 Apr 2024 10:52:54 +0000
Subject: flow_offload: add control flag checking helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These helpers aim to help drivers, with checking
for the presence of unsupported control flags.

For drivers supporting at least one control flag:
  flow_rule_is_supp_control_flags()

For drivers using flow_rule_match_control(), but not using flags:
  flow_rule_has_control_flags()

For drivers not using flow_rule_match_control():
  flow_rule_match_has_control_flags()

While primarily aimed at FLOW_DISSECTOR_KEY_CONTROL
and flow_rule_match_control(), then the first two
can also be used with FLOW_DISSECTOR_KEY_ENC_CONTROL
and flow_rule_match_enc_control().

These helpers mirrors the existing check done in sfc:
  drivers/net/ethernet/sfc/tc.c +276

Only compile-tested.

Signed-off-by: Asbjørn Sloth Tønnesen <ast@fiberby.net>
Reviewed-by: Louis Peens <louis.peens@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow_offload.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

(limited to 'include')

diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
index 7726b4a97050..ec9f80509f60 100644
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -449,6 +449,61 @@ static inline bool flow_rule_match_key(const struct flow_rule *rule,
 	return dissector_uses_key(rule->match.dissector, key);
 }
 
+/**
+ * flow_rule_is_supp_control_flags() - check for supported control flags
+ * @supp_flags: control flags supported by driver
+ * @ctrl_flags: control flags present in rule
+ * @extack: The netlink extended ACK for reporting errors.
+ *
+ * Return: true if only supported control flags are set, false otherwise.
+ */
+static inline bool flow_rule_is_supp_control_flags(const u32 supp_flags,
+						   const u32 ctrl_flags,
+						   struct netlink_ext_ack *extack)
+{
+	if (likely((ctrl_flags & ~supp_flags) == 0))
+		return true;
+
+	NL_SET_ERR_MSG_FMT_MOD(extack,
+			       "Unsupported match on control.flags %#x",
+			       ctrl_flags);
+
+	return false;
+}
+
+/**
+ * flow_rule_has_control_flags() - check for presence of any control flags
+ * @ctrl_flags: control flags present in rule
+ * @extack: The netlink extended ACK for reporting errors.
+ *
+ * Return: true if control flags are set, false otherwise.
+ */
+static inline bool flow_rule_has_control_flags(const u32 ctrl_flags,
+					       struct netlink_ext_ack *extack)
+{
+	return !flow_rule_is_supp_control_flags(0, ctrl_flags, extack);
+}
+
+/**
+ * flow_rule_match_has_control_flags() - match and check for any control flags
+ * @rule: The flow_rule under evaluation.
+ * @extack: The netlink extended ACK for reporting errors.
+ *
+ * Return: true if control flags are set, false otherwise.
+ */
+static inline bool flow_rule_match_has_control_flags(struct flow_rule *rule,
+						     struct netlink_ext_ack *extack)
+{
+	struct flow_match_control match;
+
+	if (!flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL))
+		return false;
+
+	flow_rule_match_control(rule, &match);
+
+	return flow_rule_has_control_flags(match.mask->flags, extack);
+}
+
 struct flow_stats {
 	u64	pkts;
 	u64	bytes;
-- 
cgit v1.2.3


From 2abcd952e1999d0f1006068c455a524bec37883d Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Wed, 10 Apr 2024 12:24:38 +0000
Subject: media: dvb: Fix dtvs_stats packing.

The structure is packed, which requires that all its fields need to be
also packed.

./include/uapi/linux/dvb/frontend.h:854:2: warning: field  within 'struct dtv_stats' is less aligned than 'union dtv_stats::(anonymous at ./include/uapi/linux/dvb/frontend.h:854:2)' and is usually due to 'struct dtv_stats' being packed, which can lead to unaligned accesses [-Wunaligned-access]

Explicitly set the inner union as packed.

Marking the inner union as 'packed' does not change the layout, since the
whole struct is already packed, it just silences the clang warning. See
also this llvm discussion:

https://github.com/llvm/llvm-project/issues/55520

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/uapi/linux/dvb/frontend.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/dvb/frontend.h b/include/uapi/linux/dvb/frontend.h
index 7e0983b987c2..8d38c6befda8 100644
--- a/include/uapi/linux/dvb/frontend.h
+++ b/include/uapi/linux/dvb/frontend.h
@@ -854,7 +854,7 @@ struct dtv_stats {
 	union {
 		__u64 uvalue;	/* for counters and relative scales */
 		__s64 svalue;	/* for 0.001 dB measures */
-	};
+	}  __attribute__ ((packed));
 } __attribute__ ((packed));
 
 
-- 
cgit v1.2.3


From 81741e804ef4692e8a6741380352625397a3288b Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Wed, 10 Apr 2024 12:24:39 +0000
Subject: media: videodev2: Fix v4l2_ext_control packing.

The structure is packed, which requires that all its fields need to be
also packed.

./include/uapi/linux/videodev2.h:1810:2: warning: field  within 'struct v4l2_ext_control' is less aligned than 'union v4l2_ext_control::(anonymous at ./include/uapi/linux/videodev2.h:1810:2)' and is usually due to 'struct v4l2_ext_control' being packed, which can lead to unaligned accesses [-Wunaligned-access]

Explicitly set the inner union as packed.

Marking the inner union as 'packed' does not change the layout, since the
whole struct is already packed, it just silences the clang warning. See
also this llvm discussion:

https://github.com/llvm/llvm-project/issues/55520

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/uapi/linux/videodev2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 2663213b76a4..bf12860d570a 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -1842,7 +1842,7 @@ struct v4l2_ext_control {
 		struct v4l2_ctrl_hdr10_cll_info __user *p_hdr10_cll_info;
 		struct v4l2_ctrl_hdr10_mastering_display __user *p_hdr10_mastering_display;
 		void __user *ptr;
-	};
+	} __attribute__ ((packed));
 } __attribute__ ((packed));
 
 struct v4l2_ext_controls {
-- 
cgit v1.2.3


From bd3520a93a84cd8c3897283e5891a9106fcf5acc Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Sat, 13 Apr 2024 00:25:21 +0000
Subject: iommu: observability of the IOMMU allocations

Add NR_IOMMU_PAGES into node_stat_item that counts number of pages
that are allocated by the IOMMU subsystem.

The allocations can be view per-node via:
/sys/devices/system/node/nodeN/vmstat.

For example:

$ grep iommu /sys/devices/system/node/node*/vmstat
/sys/devices/system/node/node0/vmstat:nr_iommu_pages 106025
/sys/devices/system/node/node1/vmstat:nr_iommu_pages 3464

The value is in page-count, therefore, in the above example
the iommu allocations amount to ~428M.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://lore.kernel.org/r/20240413002522.1101315-11-pasha.tatashin@soleen.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu-pages.h | 30 ++++++++++++++++++++++++++++++
 include/linux/mmzone.h      |  3 +++
 mm/vmstat.c                 |  3 +++
 3 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index 5a222d0ad25c..1264b0f6b6c3 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -20,6 +20,30 @@
  * large, i.e. multiple gigabytes in size.
  */
 
+/**
+ * __iommu_alloc_account - account for newly allocated page.
+ * @page: head struct page of the page.
+ * @order: order of the page
+ */
+static inline void __iommu_alloc_account(struct page *page, int order)
+{
+	const long pgcnt = 1l << order;
+
+	mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt);
+}
+
+/**
+ * __iommu_free_account - account a page that is about to be freed.
+ * @page: head struct page of the page.
+ * @order: order of the page
+ */
+static inline void __iommu_free_account(struct page *page, int order)
+{
+	const long pgcnt = 1l << order;
+
+	mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt);
+}
+
 /**
  * __iommu_alloc_pages - allocate a zeroed page of a given order.
  * @gfp: buddy allocator flags
@@ -35,6 +59,8 @@ static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
 	if (unlikely(!page))
 		return NULL;
 
+	__iommu_alloc_account(page, order);
+
 	return page;
 }
 
@@ -48,6 +74,7 @@ static inline void __iommu_free_pages(struct page *page, int order)
 	if (!page)
 		return;
 
+	__iommu_free_account(page, order);
 	__free_pages(page, order);
 }
 
@@ -67,6 +94,8 @@ static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order)
 	if (unlikely(!page))
 		return NULL;
 
+	__iommu_alloc_account(page, order);
+
 	return page_address(page);
 }
 
@@ -147,6 +176,7 @@ static inline void iommu_put_pages_list(struct list_head *page)
 		struct page *p = list_entry(page->prev, struct page, lru);
 
 		list_del(&p->lru);
+		__iommu_free_account(p, 0);
 		put_page(p);
 	}
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..be17195f6f86 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,6 +206,9 @@ enum node_stat_item {
 #endif
 	NR_PAGETABLE,		/* used for pagetables */
 	NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */
+#ifdef CONFIG_IOMMU_SUPPORT
+	NR_IOMMU_PAGES,		/* # of pages allocated by IOMMU */
+#endif
 #ifdef CONFIG_SWAP
 	NR_SWAPCACHE,
 #endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db79935e4a54..8507c497218b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = {
 #endif
 	"nr_page_table_pages",
 	"nr_sec_page_table_pages",
+#ifdef CONFIG_IOMMU_SUPPORT
+	"nr_iommu_pages",
+#endif
 #ifdef CONFIG_SWAP
 	"nr_swapcached",
 #endif
-- 
cgit v1.2.3


From 212c5c078d83d780cf2873ca931df135771e8bb7 Mon Sep 17 00:00:00 2001
From: Pasha Tatashin <pasha.tatashin@soleen.com>
Date: Sat, 13 Apr 2024 00:25:22 +0000
Subject: iommu: account IOMMU allocated memory

In order to be able to limit the amount of memory that is allocated
by IOMMU subsystem, the memory must be accounted.

Account IOMMU as part of the secondary pagetables as it was discussed
at LPC.

The value of SecPageTables now contains mmeory allocation by IOMMU
and KVM.

There is a difference between GFP_ACCOUNT and what NR_IOMMU_PAGES shows.
GFP_ACCOUNT is set only where it makes sense to charge to user
processes, i.e. IOMMU Page Tables, but there more IOMMU shared data
that should not really be charged to a specific process.

Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Acked-by: David Rientjes <rientjes@google.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Link: https://lore.kernel.org/r/20240413002522.1101315-12-pasha.tatashin@soleen.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 Documentation/admin-guide/cgroup-v2.rst | 2 +-
 Documentation/filesystems/proc.rst      | 4 ++--
 drivers/iommu/iommu-pages.h             | 2 ++
 include/linux/mmzone.h                  | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 17e6e9565156..15f80fea8df7 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1432,7 +1432,7 @@ PAGE_SIZE multiple when read back.
 	  sec_pagetables
 		Amount of memory allocated for secondary page tables,
 		this currently includes KVM mmu allocations on x86
-		and arm64.
+		and arm64 and IOMMU page tables.
 
 	  percpu (npn)
 		Amount of memory used for storing per-cpu kernel
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index c6a6b9df2104..707e39280a9a 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1110,8 +1110,8 @@ KernelStack
 PageTables
               Memory consumed by userspace page tables
 SecPageTables
-              Memory consumed by secondary page tables, this currently
-              currently includes KVM mmu allocations on x86 and arm64.
+              Memory consumed by secondary page tables, this currently includes
+              KVM mmu and IOMMU allocations on x86 and arm64.
 NFS_Unstable
               Always zero. Previous counted pages which had been written to
               the server, but has not been committed to stable storage.
diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
index 1264b0f6b6c3..82ebf0033081 100644
--- a/drivers/iommu/iommu-pages.h
+++ b/drivers/iommu/iommu-pages.h
@@ -30,6 +30,7 @@ static inline void __iommu_alloc_account(struct page *page, int order)
 	const long pgcnt = 1l << order;
 
 	mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt);
+	mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, pgcnt);
 }
 
 /**
@@ -42,6 +43,7 @@ static inline void __iommu_free_account(struct page *page, int order)
 	const long pgcnt = 1l << order;
 
 	mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt);
+	mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -pgcnt);
 }
 
 /**
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index be17195f6f86..8f9c9590a42c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -205,7 +205,7 @@ enum node_stat_item {
 	NR_KERNEL_SCS_KB,	/* measured in KiB */
 #endif
 	NR_PAGETABLE,		/* used for pagetables */
-	NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */
+	NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */
 #ifdef CONFIG_IOMMU_SUPPORT
 	NR_IOMMU_PAGES,		/* # of pages allocated by IOMMU */
 #endif
-- 
cgit v1.2.3


From 3ddbd345539eb89cb1ccceb79ef0a3c150aeebbf Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 9 Apr 2024 12:46:09 +0300
Subject: drm/edid: add drm_edid_get_product_id()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a struct drm_edid based function to get the vendor and product ID
from an EDID. Add a separate struct for defining this part of the EDID,
with defined byte order for manufacturer name, product code and serial
number.

v2: Define manufacturer_name as __be16 instead of u8[2] (Ville)

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Acked-by: Melissa Wen <mwen@igalia.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/df0e7dedbf7f2c190039d6e6eae3e126eba113c9.1712655867.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/drm_edid.c | 15 +++++++++++++++
 include/drm/drm_edid.h     | 25 ++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index ea77577a3786..626a0e24e66a 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -2756,6 +2756,21 @@ const struct drm_edid *drm_edid_read(struct drm_connector *connector)
 }
 EXPORT_SYMBOL(drm_edid_read);
 
+/**
+ * drm_edid_get_product_id - Get the vendor and product identification
+ * @drm_edid: EDID
+ * @id: Where to place the product id
+ */
+void drm_edid_get_product_id(const struct drm_edid *drm_edid,
+			     struct drm_edid_product_id *id)
+{
+	if (drm_edid && drm_edid->edid && drm_edid->size >= EDID_LENGTH)
+		memcpy(id, &drm_edid->edid->product_id, sizeof(*id));
+	else
+		memset(id, 0, sizeof(*id));
+}
+EXPORT_SYMBOL(drm_edid_get_product_id);
+
 /**
  * drm_edid_get_panel_id - Get a panel's ID from EDID
  * @drm_edid: EDID that contains panel ID.
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 6f65bbf655a1..8bdd8d54815d 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -272,14 +272,27 @@ struct detailed_timing {
 #define DRM_EDID_DSC_MAX_SLICES			0xf
 #define DRM_EDID_DSC_TOTAL_CHUNK_KBYTES		0x3f
 
+struct drm_edid_product_id {
+	__be16 manufacturer_name;
+	__le16 product_code;
+	__le32 serial_number;
+	u8 week_of_manufacture;
+	u8 year_of_manufacture;
+} __packed;
+
 struct edid {
 	u8 header[8];
 	/* Vendor & product info */
-	u8 mfg_id[2];
-	u8 prod_code[2];
-	u32 serial; /* FIXME: byte order */
-	u8 mfg_week;
-	u8 mfg_year;
+	union {
+		struct drm_edid_product_id product_id;
+		struct {
+			u8 mfg_id[2];
+			u8 prod_code[2];
+			u32 serial; /* FIXME: byte order */
+			u8 mfg_week;
+			u8 mfg_year;
+		} __packed;
+	} __packed;
 	/* EDID version */
 	u8 version;
 	u8 revision;
@@ -466,6 +479,8 @@ int drm_edid_connector_update(struct drm_connector *connector,
 			      const struct drm_edid *edid);
 int drm_edid_connector_add_modes(struct drm_connector *connector);
 bool drm_edid_is_digital(const struct drm_edid *drm_edid);
+void drm_edid_get_product_id(const struct drm_edid *drm_edid,
+			     struct drm_edid_product_id *id);
 
 const u8 *drm_find_edid_extension(const struct drm_edid *drm_edid,
 				  int ext_id, int *ext_index);
-- 
cgit v1.2.3


From 3f56e5514bfd99aaba649e4af6b11a11b731d3a2 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 9 Apr 2024 12:46:10 +0300
Subject: drm/edid: add drm_edid_print_product_id()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a function to print a decoded EDID vendor and product id to a drm
printer, optionally with the raw data.

v2:
- refactor date printing
- use seq_buf to avoid kasprintf() (Ville)
- handle week == 0 (Ville)
- use be16_to_cpu() on manufacturer_name

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Acked-by: Melissa Wen <mwen@igalia.com> # v1
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/32bbc83ee6557809ef6d7a5edb1bc8ef4d56d10f.1712655867.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/drm_edid.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 include/drm/drm_edid.h     |  3 +++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 626a0e24e66a..1400722ae3fe 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -29,12 +29,14 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/byteorder/generic.h>
 #include <linux/cec.h>
 #include <linux/hdmi.h>
 #include <linux/i2c.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/seq_buf.h>
 #include <linux/slab.h>
 #include <linux/vga_switcheroo.h>
 
@@ -2771,6 +2773,48 @@ void drm_edid_get_product_id(const struct drm_edid *drm_edid,
 }
 EXPORT_SYMBOL(drm_edid_get_product_id);
 
+static void decode_date(struct seq_buf *s, const struct drm_edid_product_id *id)
+{
+	int week = id->week_of_manufacture;
+	int year = id->year_of_manufacture + 1990;
+
+	if (week == 0xff)
+		seq_buf_printf(s, "model year: %d", year);
+	else if (!week)
+		seq_buf_printf(s, "year of manufacture: %d", year);
+	else
+		seq_buf_printf(s, "week/year of manufacture: %d/%d", week, year);
+}
+
+/**
+ * drm_edid_print_product_id - Print decoded product id to printer
+ * @p: drm printer
+ * @id: EDID product id
+ * @raw: If true, also print the raw hex
+ *
+ * See VESA E-EDID 1.4 section 3.4.
+ */
+void drm_edid_print_product_id(struct drm_printer *p,
+			       const struct drm_edid_product_id *id, bool raw)
+{
+	DECLARE_SEQ_BUF(date, 40);
+	char vend[4];
+
+	drm_edid_decode_mfg_id(be16_to_cpu(id->manufacturer_name), vend);
+
+	decode_date(&date, id);
+
+	drm_printf(p, "manufacturer name: %s, product code: %u, serial number: %u, %s\n",
+		   vend, le16_to_cpu(id->product_code),
+		   le32_to_cpu(id->serial_number), seq_buf_str(&date));
+
+	if (raw)
+		drm_printf(p, "raw product id: %*ph\n", (int)sizeof(*id), id);
+
+	WARN_ON(seq_buf_has_overflowed(&date));
+}
+EXPORT_SYMBOL(drm_edid_print_product_id);
+
 /**
  * drm_edid_get_panel_id - Get a panel's ID from EDID
  * @drm_edid: EDID that contains panel ID.
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 8bdd8d54815d..8e0e32349332 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -30,6 +30,7 @@ struct drm_connector;
 struct drm_device;
 struct drm_display_mode;
 struct drm_edid;
+struct drm_printer;
 struct hdmi_avi_infoframe;
 struct hdmi_vendor_infoframe;
 struct i2c_adapter;
@@ -481,6 +482,8 @@ int drm_edid_connector_add_modes(struct drm_connector *connector);
 bool drm_edid_is_digital(const struct drm_edid *drm_edid);
 void drm_edid_get_product_id(const struct drm_edid *drm_edid,
 			     struct drm_edid_product_id *id);
+void drm_edid_print_product_id(struct drm_printer *p,
+			       const struct drm_edid_product_id *id, bool raw);
 
 const u8 *drm_find_edid_extension(const struct drm_edid *drm_edid,
 				  int ext_id, int *ext_index);
-- 
cgit v1.2.3


From 57b60ec4b30df188ca31bdd95971a6ef5dfc5094 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Thu, 11 Apr 2024 20:22:32 +0300
Subject: gpiolib: acpi: Pass con_id instead of property into
 acpi_dev_gpio_irq_get_by()

Pass the con_id instead of property so that callers won't repeat
the GPIO suffixes to try.

Acked-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
---
 drivers/gpio/gpio-pca953x.c                                |  2 +-
 drivers/gpio/gpiolib-acpi.c                                | 11 +++++------
 drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c |  2 +-
 drivers/pinctrl/pinctrl-cy8c95x0.c                         |  2 +-
 include/linux/acpi.h                                       |  8 ++++----
 5 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c
index 00ffa168e405..77a2812f2974 100644
--- a/drivers/gpio/gpio-pca953x.c
+++ b/drivers/gpio/gpio-pca953x.c
@@ -144,7 +144,7 @@ static int pca953x_acpi_get_irq(struct device *dev)
 	if (ret)
 		dev_warn(dev, "can't add GPIO ACPI mapping\n");
 
-	ret = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(dev), "irq-gpios", 0);
+	ret = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(dev), "irq", 0);
 	if (ret < 0)
 		return ret;
 
diff --git a/drivers/gpio/gpiolib-acpi.c b/drivers/gpio/gpiolib-acpi.c
index 2b3fd43b13fc..909113312a1b 100644
--- a/drivers/gpio/gpiolib-acpi.c
+++ b/drivers/gpio/gpiolib-acpi.c
@@ -1013,7 +1013,7 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode,
 /**
  * acpi_dev_gpio_irq_wake_get_by() - Find GpioInt and translate it to Linux IRQ number
  * @adev: pointer to a ACPI device to get IRQ from
- * @name: optional name of GpioInt resource
+ * @con_id: optional name of GpioInt resource
  * @index: index of GpioInt resource (starting from %0)
  * @wake_capable: Set to true if the IRQ is wake capable
  *
@@ -1024,15 +1024,15 @@ struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode,
  * The function is idempotent, though each time it runs it will configure GPIO
  * pin direction according to the flags in GpioInt resource.
  *
- * The function takes optional @name parameter. If the resource has a property
- * name, then only those will be taken into account.
+ * The function takes optional @con_id parameter. If the resource has
+ * a @con_id in a property, then only those will be taken into account.
  *
  * The GPIO is considered wake capable if the GpioInt resource specifies
  * SharedAndWake or ExclusiveAndWake.
  *
  * Return: Linux IRQ number (> %0) on success, negative errno on failure.
  */
-int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index,
+int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index,
 				  bool *wake_capable)
 {
 	int idx, i;
@@ -1043,9 +1043,8 @@ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, in
 		struct acpi_gpio_info info;
 		struct gpio_desc *desc;
 
-		desc = acpi_get_gpiod_by_index(adev, name, i, &info);
-
 		/* Ignore -EPROBE_DEFER, it only matters if idx matches */
+		desc = __acpi_find_gpio(acpi_fwnode_handle(adev), con_id, i, true, &info);
 		if (IS_ERR(desc) && PTR_ERR(desc) != -EPROBE_DEFER)
 			return PTR_ERR(desc);
 
diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
index 3d09fa54598f..28a0016f1a55 100644
--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
+++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c
@@ -428,7 +428,7 @@ static int mlxbf_gige_probe(struct platform_device *pdev)
 	priv->rx_irq = platform_get_irq(pdev, MLXBF_GIGE_RECEIVE_PKT_INTR_IDX);
 	priv->llu_plu_irq = platform_get_irq(pdev, MLXBF_GIGE_LLU_PLU_INTR_IDX);
 
-	phy_irq = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(&pdev->dev), "phy-gpios", 0);
+	phy_irq = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(&pdev->dev), "phy", 0);
 	if (phy_irq < 0) {
 		dev_err(&pdev->dev, "Error getting PHY irq. Use polling instead");
 		phy_irq = PHY_POLL;
diff --git a/drivers/pinctrl/pinctrl-cy8c95x0.c b/drivers/pinctrl/pinctrl-cy8c95x0.c
index 67b5d160c027..981c569bd671 100644
--- a/drivers/pinctrl/pinctrl-cy8c95x0.c
+++ b/drivers/pinctrl/pinctrl-cy8c95x0.c
@@ -95,7 +95,7 @@ static int cy8c95x0_acpi_get_irq(struct device *dev)
 	if (ret)
 		dev_warn(dev, "can't add GPIO ACPI mapping\n");
 
-	ret = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(dev), "irq-gpios", 0);
+	ret = acpi_dev_gpio_irq_get_by(ACPI_COMPANION(dev), "irq", 0);
 	if (ret < 0)
 		return ret;
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 34829f2c517a..35aff121f418 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -1233,7 +1233,7 @@ bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
 				struct acpi_resource_gpio **agpio);
 bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
 			       struct acpi_resource_gpio **agpio);
-int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index,
+int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id, int index,
 				  bool *wake_capable);
 #else
 static inline bool acpi_gpio_get_irq_resource(struct acpi_resource *ares,
@@ -1246,7 +1246,7 @@ static inline bool acpi_gpio_get_io_resource(struct acpi_resource *ares,
 {
 	return false;
 }
-static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name,
+static inline int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *con_id,
 						int index, bool *wake_capable)
 {
 	return -ENXIO;
@@ -1259,10 +1259,10 @@ static inline int acpi_dev_gpio_irq_wake_get(struct acpi_device *adev, int index
 	return acpi_dev_gpio_irq_wake_get_by(adev, NULL, index, wake_capable);
 }
 
-static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *name,
+static inline int acpi_dev_gpio_irq_get_by(struct acpi_device *adev, const char *con_id,
 					   int index)
 {
-	return acpi_dev_gpio_irq_wake_get_by(adev, name, index, NULL);
+	return acpi_dev_gpio_irq_wake_get_by(adev, con_id, index, NULL);
 }
 
 static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index)
-- 
cgit v1.2.3


From 7ab6c64663a001723e7febbf444afce93910d26b Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Tue, 2 Apr 2024 15:21:00 +0200
Subject: dt-bindings: leds: Add LED_FUNCTION_FNLOCK

Newer laptops have FnLock LED.

Add a define for this very common function.

Signed-off-by: Gergo Koteles <soyer@irl.hu>
Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Acked-by: Lee Jones <lee@kernel.org>
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/8ac95e85a53dc0b8cce1e27fc1cab6d19221543b.1712063200.git.soyer@irl.hu
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/dt-bindings/leds/common.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/dt-bindings/leds/common.h b/include/dt-bindings/leds/common.h
index ecea167930d9..d7c980bdf383 100644
--- a/include/dt-bindings/leds/common.h
+++ b/include/dt-bindings/leds/common.h
@@ -46,6 +46,7 @@
 #define LED_FUNCTION_CAPSLOCK "capslock"
 #define LED_FUNCTION_SCROLLLOCK "scrolllock"
 #define LED_FUNCTION_NUMLOCK "numlock"
+#define LED_FUNCTION_FNLOCK "fnlock"
 /*   Obsolete equivalents: "tpacpi::thinklight" (IBM/Lenovo Thinkpads),
      "lp5523:kb{1,2,3,4,5,6}" (Nokia N900) */
 #define LED_FUNCTION_KBD_BACKLIGHT "kbd_backlight"
-- 
cgit v1.2.3


From ba95eb44676d68049490af617049a7bf68946527 Mon Sep 17 00:00:00 2001
From: Gergo Koteles <soyer@irl.hu>
Date: Mon, 8 Apr 2024 19:35:10 +0200
Subject: ACPI: platform-profile: add platform_profile_cycle()

Some laptops have a key to switch platform profiles.

Add a platform_profile_cycle() function to cycle between the enabled
profiles.

Signed-off-by: Gergo Koteles <soyer@irl.hu>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/5a97deddf72aa5e764d881eb39a7ba35c01a903e.1712597199.git.soyer@irl.hu
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/acpi/platform_profile.c  | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/platform_profile.h |  1 +
 2 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c
index d418462ab791..4a9704730224 100644
--- a/drivers/acpi/platform_profile.c
+++ b/drivers/acpi/platform_profile.c
@@ -136,6 +136,45 @@ void platform_profile_notify(void)
 }
 EXPORT_SYMBOL_GPL(platform_profile_notify);
 
+int platform_profile_cycle(void)
+{
+	enum platform_profile_option profile;
+	enum platform_profile_option next;
+	int err;
+
+	err = mutex_lock_interruptible(&profile_lock);
+	if (err)
+		return err;
+
+	if (!cur_profile) {
+		mutex_unlock(&profile_lock);
+		return -ENODEV;
+	}
+
+	err = cur_profile->profile_get(cur_profile, &profile);
+	if (err) {
+		mutex_unlock(&profile_lock);
+		return err;
+	}
+
+	next = find_next_bit_wrap(cur_profile->choices, PLATFORM_PROFILE_LAST,
+				  profile + 1);
+
+	if (WARN_ON(next == PLATFORM_PROFILE_LAST)) {
+		mutex_unlock(&profile_lock);
+		return -EINVAL;
+	}
+
+	err = cur_profile->profile_set(cur_profile, next);
+	mutex_unlock(&profile_lock);
+
+	if (!err)
+		sysfs_notify(acpi_kobj, NULL, "platform_profile");
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(platform_profile_cycle);
+
 int platform_profile_register(struct platform_profile_handler *pprof)
 {
 	int err;
diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h
index e5cbb6841f3a..f5492ed413f3 100644
--- a/include/linux/platform_profile.h
+++ b/include/linux/platform_profile.h
@@ -36,6 +36,7 @@ struct platform_profile_handler {
 
 int platform_profile_register(struct platform_profile_handler *pprof);
 int platform_profile_remove(void);
+int platform_profile_cycle(void);
 void platform_profile_notify(void);
 
 #endif  /*_PLATFORM_PROFILE_H_*/
-- 
cgit v1.2.3


From 36a005b9c66eade68f4235e612d733510a8d52ff Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 18 Mar 2024 22:00:26 +0000
Subject: io_uring/cmd: document some uring_cmd related helpers

Add comments warning users that they're only allowed to pass issue_flags
that were given from io_uring.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/82ff8a45f2c3eb5f3a04a33f0692e5e4a1320455.1710799188.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index e453a997c060..01b95505538a 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -26,12 +26,25 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
 #if defined(CONFIG_IO_URING)
 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd);
+
+/*
+ * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd
+ * and the corresponding io_uring request.
+ *
+ * Note: the caller should never hard code @issue_flags and is only allowed
+ * to pass the mask provided by the core io_uring code.
+ */
 void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
 			unsigned issue_flags);
+
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
 			    void (*task_work_cb)(struct io_uring_cmd *, unsigned),
 			    unsigned flags);
 
+/*
+ * Note: the caller should never hard code @issue_flags and only use the
+ * mask provided by the core io_uring code.
+ */
 void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 		unsigned int issue_flags);
 
-- 
cgit v1.2.3


From 1afdb76038e27a3a4dd4cf522f6457868051db84 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 19 Mar 2024 17:10:50 -0600
Subject: nvme/io_uring: use helper for polled completions

NVMe is making up issue_flags, which is a no-no in general, and to make
matters worse, they are completely the wrong ones. For a pure polled
request, which it does check for, we're already inside the
ctx->uring_lock when the completions are run off io_do_iopoll(). Hence
the correct flag would be '0' rather than IO_URING_F_UNLOCKED.

Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/ioctl.c    | 15 +++++++++++----
 include/linux/io_uring/cmd.h | 11 +++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 3dfd5ae99ae0..499a8bb7cac7 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -423,13 +423,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
 	pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
 
 	/*
-	 * For iopoll, complete it directly.
+	 * For iopoll, complete it directly. Note that using the uring_cmd
+	 * helper for this is safe only because we check blk_rq_is_poll().
+	 * As that returns false if we're NOT on a polled queue, then it's
+	 * safe to use the polled completion helper.
+	 *
 	 * Otherwise, move the completion to task work.
 	 */
-	if (blk_rq_is_poll(req))
-		nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
-	else
+	if (blk_rq_is_poll(req)) {
+		if (pdu->bio)
+			blk_rq_unmap_user(pdu->bio);
+		io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status);
+	} else {
 		io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+	}
 
 	return RQ_END_IO_FREE;
 }
diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index 01b95505538a..447fbfd32215 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -69,6 +69,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 }
 #endif
 
+/*
+ * Polled completions must ensure they are coming from a poll queue, and
+ * hence are completed inside the usual poll handling loops.
+ */
+static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd,
+					    ssize_t ret, ssize_t res2)
+{
+	lockdep_assert(in_task());
+	io_uring_cmd_done(ioucmd, ret, res2, 0);
+}
+
 /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
 static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
 			void (*task_work_cb)(struct io_uring_cmd *, unsigned))
-- 
cgit v1.2.3


From 8e5b3b89ecaf6d9295e561c225b35c574a5e0fe7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 18 Mar 2024 22:00:30 +0000
Subject: io_uring: remove struct io_tw_state::locked

ctx is always locked for task_work now, so get rid of struct
io_tw_state::locked. Note I'm stopping one step before removing
io_tw_state altogether, which is not empty, because it still serves the
purpose of indicating which function is a tw callback and forcing users
not to invoke them carelessly out of a wrong context. The removal can
always be done later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/e95e1ea116d0bfa54b656076e6a977bc221392a4.1710799188.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 --
 io_uring/io_uring.c            | 31 ++++++++-----------------------
 io_uring/io_uring.h            |  5 +----
 io_uring/poll.c                |  2 +-
 io_uring/rw.c                  |  6 ++----
 io_uring/timeout.c             |  8 ++------
 io_uring/uring_cmd.c           |  8 ++------
 io_uring/waitid.c              |  2 +-
 8 files changed, 17 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index ac333ea81d31..90e5af401800 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -438,8 +438,6 @@ struct io_ring_ctx {
 };
 
 struct io_tw_state {
-	/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
-	bool locked;
 };
 
 enum {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 13fb5958454f..ecb59bb2418b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -247,14 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work)
 						fallback_work.work);
 	struct llist_node *node = llist_del_all(&ctx->fallback_llist);
 	struct io_kiocb *req, *tmp;
-	struct io_tw_state ts = { .locked = true, };
+	struct io_tw_state ts = {};
 
 	percpu_ref_get(&ctx->refs);
 	mutex_lock(&ctx->uring_lock);
 	llist_for_each_entry_safe(req, tmp, node, io_task_work.node)
 		req->io_task_work.func(req, &ts);
-	if (WARN_ON_ONCE(!ts.locked))
-		return;
 	io_submit_flush_completions(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
@@ -1157,11 +1155,9 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 		return;
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
 		atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
-	if (ts->locked) {
-		io_submit_flush_completions(ctx);
-		mutex_unlock(&ctx->uring_lock);
-		ts->locked = false;
-	}
+
+	io_submit_flush_completions(ctx);
+	mutex_unlock(&ctx->uring_lock);
 	percpu_ref_put(&ctx->refs);
 }
 
@@ -1185,8 +1181,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node,
 		if (req->ctx != ctx) {
 			ctx_flush_and_put(ctx, &ts);
 			ctx = req->ctx;
-
-			ts.locked = true;
 			mutex_lock(&ctx->uring_lock);
 			percpu_ref_get(&ctx->refs);
 		}
@@ -1459,22 +1453,16 @@ again:
 static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
 					   int min_events)
 {
-	struct io_tw_state ts = { .locked = true, };
-	int ret;
+	struct io_tw_state ts = {};
 
 	if (llist_empty(&ctx->work_llist))
 		return 0;
-
-	ret = __io_run_local_work(ctx, &ts, min_events);
-	/* shouldn't happen! */
-	if (WARN_ON_ONCE(!ts.locked))
-		mutex_lock(&ctx->uring_lock);
-	return ret;
+	return __io_run_local_work(ctx, &ts, min_events);
 }
 
 static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
 {
-	struct io_tw_state ts = { .locked = true };
+	struct io_tw_state ts = {};
 	int ret;
 
 	mutex_lock(&ctx->uring_lock);
@@ -1702,10 +1690,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 
 void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	if (ts->locked)
-		io_req_complete_defer(req);
-	else
-		io_req_complete_post(req, IO_URING_F_UNLOCKED);
+	io_req_complete_defer(req);
 }
 
 /*
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 0861d49e83de..7f921daae9c3 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -351,10 +351,7 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
 
 static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
 {
-	if (!ts->locked) {
-		mutex_lock(&ctx->uring_lock);
-		ts->locked = true;
-	}
+	lockdep_assert_held(&ctx->uring_lock);
 }
 
 /*
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 6db1dcb2c797..8901dd118e50 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -322,7 +322,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
 			__poll_t mask = mangle_poll(req->cqe.res &
 						    req->apoll_events);
 
-			if (!io_fill_cqe_req_aux(req, ts->locked, mask,
+			if (!io_fill_cqe_req_aux(req, true, mask,
 						 IORING_CQE_F_MORE)) {
 				io_req_set_res(req, mask, 0);
 				return IOU_POLL_REMOVE_POLL_USE_RES;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0afe4f9e0e3f..674581bda8d7 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -305,11 +305,9 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 
 	io_req_io_end(req);
 
-	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
-		unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
+	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
+		req->cqe.flags |= io_put_kbuf(req, 0);
 
-		req->cqe.flags |= io_put_kbuf(req, issue_flags);
-	}
 	io_req_task_complete(req, ts);
 }
 
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 7fd7dbb211d6..0a48e6acd0b2 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -72,10 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (!io_timeout_finish(timeout, data)) {
-		bool filled;
-		filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME,
-					     IORING_CQE_F_MORE);
-		if (filled) {
+		if (io_fill_cqe_req_aux(req, true, -ETIME, IORING_CQE_F_MORE)) {
 			/* re-arm timer */
 			spin_lock_irq(&ctx->timeout_lock);
 			list_add(&timeout->list, ctx->timeout_list.prev);
@@ -301,7 +298,6 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 
 static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts)
 {
-	unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
 	int ret = -ENOENT;
@@ -313,7 +309,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t
 				.data		= prev->cqe.user_data,
 			};
 
-			ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
+			ret = io_try_cancel(req->task->io_uring, &cd, 0);
 		}
 		io_req_set_res(req, ret ?: -ETIME, 0);
 		io_req_task_complete(req, ts);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 759f919b14a9..4614ce734fee 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -87,13 +87,9 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable);
 static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-	unsigned issue_flags = IO_URING_F_UNLOCKED;
 
-	/* locked task_work executor checks the deffered list completion */
-	if (ts->locked)
-		issue_flags = IO_URING_F_COMPLETE_DEFER;
-
-	ioucmd->task_work_cb(ioucmd, issue_flags);
+	/* task_work executor checks the deffered list completion */
+	ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER);
 }
 
 void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 77d340666cb9..6362ec20abc0 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -118,7 +118,7 @@ static int io_waitid_finish(struct io_kiocb *req, int ret)
 static void io_waitid_complete(struct io_kiocb *req, int ret)
 {
 	struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid);
-	struct io_tw_state ts = { .locked = true };
+	struct io_tw_state ts = {};
 
 	/* anyone completing better be holding a reference */
 	WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK));
-- 
cgit v1.2.3


From 902ce82c2aa130bea5e3feca2d4ae62781865da7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 18 Mar 2024 22:00:32 +0000
Subject: io_uring: get rid of intermediate aux cqe caches

io_post_aux_cqe(), which is used for multishot requests, delays
completions by putting CQEs into a temporary array for the purpose
completion lock/flush batching.

DEFER_TASKRUN doesn't need any locking, so for it we can put completions
directly into the CQ and defer post completion handling with a flag.
That leaves !DEFER_TASKRUN, which is not that interesting / hot for
multishot requests, so have conditional locking with deferred flush
for them.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/b1d05a81fd27aaa2a07f9860af13059e7ad7a890.1710799188.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +-
 io_uring/io_uring.c            | 62 +++++++++---------------------------------
 io_uring/io_uring.h            |  2 +-
 3 files changed, 15 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 90e5af401800..75b46119d4c8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -205,6 +205,7 @@ struct io_submit_state {
 
 	bool			plug_started;
 	bool			need_plug;
+	bool			cq_flush;
 	unsigned short		submit_nr;
 	unsigned int		cqes_count;
 	struct blk_plug		plug;
@@ -341,8 +342,6 @@ struct io_ring_ctx {
 		unsigned		cq_last_tm_flush;
 	} ____cacheline_aligned_in_smp;
 
-	struct io_uring_cqe	completion_cqes[16];
-
 	spinlock_t		completion_lock;
 
 	/* IRQ completion list, under ->completion_lock */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 9ae5e7db7c6a..77e26b037fdd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -630,6 +630,12 @@ static inline void __io_cq_lock(struct io_ring_ctx *ctx)
 		spin_lock(&ctx->completion_lock);
 }
 
+static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
+{
+	if (!ctx->lockless_cq)
+		spin_unlock(&ctx->completion_lock);
+}
+
 static inline void io_cq_lock(struct io_ring_ctx *ctx)
 	__acquires(ctx->completion_lock)
 {
@@ -882,31 +888,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	return false;
 }
 
-static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
-	__must_hold(&ctx->uring_lock)
-{
-	struct io_submit_state *state = &ctx->submit_state;
-	unsigned int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-	for (i = 0; i < state->cqes_count; i++) {
-		struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
-
-		if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
-			if (ctx->lockless_cq) {
-				spin_lock(&ctx->completion_lock);
-				io_cqring_event_overflow(ctx, cqe->user_data,
-							cqe->res, cqe->flags, 0, 0);
-				spin_unlock(&ctx->completion_lock);
-			} else {
-				io_cqring_event_overflow(ctx, cqe->user_data,
-							cqe->res, cqe->flags, 0, 0);
-			}
-		}
-	}
-	state->cqes_count = 0;
-}
-
 bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 {
 	bool filled;
@@ -927,31 +908,16 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	u64 user_data = req->cqe.user_data;
-	struct io_uring_cqe *cqe;
+	bool posted;
 
 	lockdep_assert(!io_wq_current_is_worker());
 	lockdep_assert_held(&ctx->uring_lock);
 
-	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
-		__io_cq_lock(ctx);
-		__io_flush_post_cqes(ctx);
-		/* no need to flush - flush is deferred */
-		__io_cq_unlock_post(ctx);
-	}
-
-	/* For defered completions this is not as strict as it is otherwise,
-	 * however it's main job is to prevent unbounded posted completions,
-	 * and in that it works just as well.
-	 */
-	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
-		return false;
-
-	cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
-	cqe->user_data = user_data;
-	cqe->res = res;
-	cqe->flags = cflags;
-	return true;
+	__io_cq_lock(ctx);
+	posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags);
+	ctx->submit_state.cq_flush = true;
+	__io_cq_unlock_post(ctx);
+	return posted;
 }
 
 static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
@@ -1545,9 +1511,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 	struct io_wq_work_node *node;
 
 	__io_cq_lock(ctx);
-	/* must come first to preserve CQE ordering in failure cases */
-	if (state->cqes_count)
-		__io_flush_post_cqes(ctx);
 	__wq_list_for_each(node, &state->compl_reqs) {
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 					    comp_list);
@@ -1569,6 +1532,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		io_free_batch_list(ctx, state->compl_reqs.first);
 		INIT_WQ_LIST(&state->compl_reqs);
 	}
+	ctx->submit_state.cq_flush = false;
 }
 
 static unsigned io_cqring_events(struct io_ring_ctx *ctx)
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 460290e1bdec..5119265a11c2 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -156,7 +156,7 @@ static inline void io_req_task_work_add(struct io_kiocb *req)
 static inline void io_submit_flush_completions(struct io_ring_ctx *ctx)
 {
 	if (!wq_list_empty(&ctx->submit_state.compl_reqs) ||
-	    ctx->submit_state.cqes_count)
+	    ctx->submit_state.cq_flush)
 		__io_submit_flush_completions(ctx);
 }
 
-- 
cgit v1.2.3


From a9165b83c1937eeed1f0c731468216d6371d647f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Mar 2024 16:13:01 -0600
Subject: io_uring/rw: always setup io_async_rw for read/write requests

read/write requests try to put everything on the stack, and then alloc
and copy if a retry is needed. This necessitates a bunch of nasty code
that deals with intermediate state.

Get rid of this, and have the prep side setup everything that is needed
upfront, which greatly simplifies the opcode handlers.

This includes adding an alloc cache for io_async_rw, to make it cheap
to handle.

In terms of cost, this should be basically free and transparent. For
the worst case of {READ,WRITE}_FIXED which didn't need it before,
performance is unaffected in the normal peak workload that is being
used to test that. Still runs at 122M IOPS.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   1 +
 io_uring/io_uring.c            |   3 +
 io_uring/opdef.c               |  15 +-
 io_uring/rw.c                  | 538 +++++++++++++++++++----------------------
 io_uring/rw.h                  |  19 +-
 5 files changed, 278 insertions(+), 298 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 75b46119d4c8..60d7e35fc303 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -300,6 +300,7 @@ struct io_ring_ctx {
 		struct io_hash_table	cancel_table_locked;
 		struct io_alloc_cache	apoll_cache;
 		struct io_alloc_cache	netmsg_cache;
+		struct io_alloc_cache	rw_cache;
 
 		/*
 		 * Any cancelable uring_cmd is added to this list in
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index e6e7794d497f..b0554d122931 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -311,6 +311,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 			    sizeof(struct async_poll));
 	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
+	io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
+			    sizeof(struct io_async_rw));
 	io_futex_cache_init(ctx);
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
@@ -2823,6 +2825,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
 	mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index dd4a1e1425e1..fcae75a08f2c 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -67,7 +67,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
 		.vectored		= 1,
-		.prep			= io_prep_rwv,
+		.prep			= io_prep_readv,
 		.issue			= io_read,
 	},
 	[IORING_OP_WRITEV] = {
@@ -81,7 +81,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
 		.vectored		= 1,
-		.prep			= io_prep_rwv,
+		.prep			= io_prep_writev,
 		.issue			= io_write,
 	},
 	[IORING_OP_FSYNC] = {
@@ -99,7 +99,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
-		.prep			= io_prep_rw_fixed,
+		.prep			= io_prep_read_fixed,
 		.issue			= io_read,
 	},
 	[IORING_OP_WRITE_FIXED] = {
@@ -112,7 +112,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
-		.prep			= io_prep_rw_fixed,
+		.prep			= io_prep_write_fixed,
 		.issue			= io_write,
 	},
 	[IORING_OP_POLL_ADD] = {
@@ -239,7 +239,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
-		.prep			= io_prep_rw,
+		.prep			= io_prep_read,
 		.issue			= io_read,
 	},
 	[IORING_OP_WRITE] = {
@@ -252,7 +252,7 @@ const struct io_issue_def io_issue_defs[] = {
 		.ioprio			= 1,
 		.iopoll			= 1,
 		.iopoll_queue		= 1,
-		.prep			= io_prep_rw,
+		.prep			= io_prep_write,
 		.issue			= io_write,
 	},
 	[IORING_OP_FADVISE] = {
@@ -490,14 +490,12 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_READV] = {
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "READV",
-		.prep_async		= io_readv_prep_async,
 		.cleanup		= io_readv_writev_cleanup,
 		.fail			= io_rw_fail,
 	},
 	[IORING_OP_WRITEV] = {
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "WRITEV",
-		.prep_async		= io_writev_prep_async,
 		.cleanup		= io_readv_writev_cleanup,
 		.fail			= io_rw_fail,
 	},
@@ -699,6 +697,7 @@ const struct io_cold_def io_cold_defs[] = {
 #endif
 	},
 	[IORING_OP_READ_MULTISHOT] = {
+		.async_size		= sizeof(struct io_async_rw),
 		.name			= "READ_MULTISHOT",
 	},
 	[IORING_OP_WAITID] = {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index b5165d1c9686..2625e177c9bc 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -75,7 +75,153 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req)
 	return 0;
 }
 
-int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int __io_import_iovec(int ddir, struct io_kiocb *req,
+			     struct io_async_rw *io,
+			     unsigned int issue_flags)
+{
+	const struct io_issue_def *def = &io_issue_defs[req->opcode];
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+	void __user *buf;
+	size_t sqe_len;
+
+	buf = u64_to_user_ptr(rw->addr);
+	sqe_len = rw->len;
+
+	if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) {
+		if (io_do_buffer_select(req)) {
+			buf = io_buffer_select(req, &sqe_len, issue_flags);
+			if (!buf)
+				return -ENOBUFS;
+			rw->addr = (unsigned long) buf;
+			rw->len = sqe_len;
+		}
+
+		return import_ubuf(ddir, buf, sqe_len, &io->s.iter);
+	}
+
+	io->free_iovec = io->s.fast_iov;
+	return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec,
+				&io->s.iter, req->ctx->compat);
+}
+
+static inline int io_import_iovec(int rw, struct io_kiocb *req,
+				  struct io_async_rw *io,
+				  unsigned int issue_flags)
+{
+	int ret;
+
+	ret = __io_import_iovec(rw, req, io, issue_flags);
+	if (unlikely(ret < 0))
+		return ret;
+
+	iov_iter_save_state(&io->s.iter, &io->s.iter_state);
+	return 0;
+}
+
+static void io_rw_iovec_free(struct io_async_rw *rw)
+{
+	if (rw->free_iovec) {
+		kfree(rw->free_iovec);
+		rw->free_iovec = NULL;
+	}
+}
+
+static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_async_rw *rw = req->async_data;
+
+	if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
+		io_rw_iovec_free(rw);
+		return;
+	}
+	if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) {
+		req->async_data = NULL;
+		req->flags &= ~REQ_F_ASYNC_DATA;
+	}
+}
+
+static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
+{
+	/*
+	 * Disable quick recycling for anything that's gone through io-wq.
+	 * In theory, this should be fine to cleanup. However, some read or
+	 * write iter handling touches the iovec AFTER having called into the
+	 * handler, eg to reexpand or revert. This means we can have:
+	 *
+	 * task			io-wq
+	 *   issue
+	 *     punt to io-wq
+	 *			issue
+	 *			  blkdev_write_iter()
+	 *			    ->ki_complete()
+	 *			      io_complete_rw()
+	 *			        queue tw complete
+	 *  run tw
+	 *    req_rw_cleanup
+	 *			iov_iter_count() <- look at iov_iter again
+	 *
+	 * which can lead to a UAF. This is only possible for io-wq offload
+	 * as the cleanup can run in parallel. As io-wq is not the fast path,
+	 * just leave cleanup to the end.
+	 *
+	 * This is really a bug in the core code that does this, any issue
+	 * path should assume that a successful (or -EIOCBQUEUED) return can
+	 * mean that the underlying data can be gone at any time. But that
+	 * should be fixed seperately, and then this check could be killed.
+	 */
+	if (!(req->flags & REQ_F_REFCOUNT)) {
+		req->flags &= ~REQ_F_NEED_CLEANUP;
+		io_rw_recycle(req, issue_flags);
+	}
+}
+
+static int io_rw_alloc_async(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_cache_entry *entry;
+	struct io_async_rw *rw;
+
+	entry = io_alloc_cache_get(&ctx->rw_cache);
+	if (entry) {
+		rw = container_of(entry, struct io_async_rw, cache);
+		req->flags |= REQ_F_ASYNC_DATA;
+		req->async_data = rw;
+		goto done;
+	}
+
+	if (!io_alloc_async_data(req)) {
+		rw = req->async_data;
+done:
+		rw->free_iovec = NULL;
+		rw->bytes_done = 0;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+
+static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
+{
+	struct io_async_rw *rw;
+	int ret;
+
+	if (io_rw_alloc_async(req))
+		return -ENOMEM;
+
+	if (!do_import || io_do_buffer_select(req))
+		return 0;
+
+	rw = req->async_data;
+	ret = io_import_iovec(ddir, req, rw, 0);
+	if (unlikely(ret < 0))
+		return ret;
+
+	iov_iter_save_state(&rw->s.iter, &rw->s.iter_state);
+	return 0;
+}
+
+static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      int ddir, bool do_import)
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	unsigned ioprio;
@@ -100,34 +246,58 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
 	rw->flags = READ_ONCE(sqe->rw_flags);
-	return 0;
+	return io_prep_rw_setup(req, ddir, do_import);
 }
 
-int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
+	return io_prep_rw(req, sqe, ITER_DEST, true);
+}
+
+int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	return io_prep_rw(req, sqe, ITER_SOURCE, true);
+}
+
+static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		       int ddir)
+{
+	const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT);
 	int ret;
 
-	ret = io_prep_rw(req, sqe);
+	ret = io_prep_rw(req, sqe, ddir, do_import);
 	if (unlikely(ret))
 		return ret;
+	if (do_import)
+		return 0;
 
 	/*
 	 * Have to do this validation here, as this is in io_read() rw->len
 	 * might have chanaged due to buffer selection
 	 */
-	if (req->flags & REQ_F_BUFFER_SELECT)
-		return io_iov_buffer_select_prep(req);
+	return io_iov_buffer_select_prep(req);
+}
 
-	return 0;
+int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	return io_prep_rwv(req, sqe, ITER_DEST);
+}
+
+int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	return io_prep_rwv(req, sqe, ITER_SOURCE);
 }
 
-int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			    int ddir)
 {
+	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_async_rw *io;
 	u16 index;
 	int ret;
 
-	ret = io_prep_rw(req, sqe);
+	ret = io_prep_rw(req, sqe, ddir, false);
 	if (unlikely(ret))
 		return ret;
 
@@ -136,7 +306,21 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
 	req->imu = ctx->user_bufs[index];
 	io_req_set_rsrc_node(req, ctx, 0);
-	return 0;
+
+	io = req->async_data;
+	ret = io_import_fixed(ddir, &io->s.iter, req->imu, rw->addr, rw->len);
+	iov_iter_save_state(&io->s.iter, &io->s.iter_state);
+	return ret;
+}
+
+int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	return io_prep_rw_fixed(req, sqe, ITER_DEST);
+}
+
+int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	return io_prep_rw_fixed(req, sqe, ITER_SOURCE);
 }
 
 /*
@@ -152,7 +336,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!(req->flags & REQ_F_BUFFER_SELECT))
 		return -EINVAL;
 
-	ret = io_prep_rw(req, sqe);
+	ret = io_prep_rw(req, sqe, ITER_DEST, false);
 	if (unlikely(ret))
 		return ret;
 
@@ -165,9 +349,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 void io_readv_writev_cleanup(struct io_kiocb *req)
 {
-	struct io_async_rw *io = req->async_data;
-
-	kfree(io->free_iovec);
+	io_rw_iovec_free(req->async_data);
 }
 
 static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
@@ -188,14 +370,11 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
 }
 
 #ifdef CONFIG_BLOCK
-static bool io_resubmit_prep(struct io_kiocb *req)
+static void io_resubmit_prep(struct io_kiocb *req)
 {
 	struct io_async_rw *io = req->async_data;
 
-	if (!req_has_async_data(req))
-		return !io_req_prep_async(req);
 	iov_iter_restore(&io->s.iter, &io->s.iter_state);
-	return true;
 }
 
 static bool io_rw_should_reissue(struct io_kiocb *req)
@@ -224,9 +403,8 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	return true;
 }
 #else
-static bool io_resubmit_prep(struct io_kiocb *req)
+static void io_resubmit_prep(struct io_kiocb *req)
 {
-	return false;
 }
 static bool io_rw_should_reissue(struct io_kiocb *req)
 {
@@ -308,6 +486,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
 		req->cqe.flags |= io_put_kbuf(req, 0);
 
+	io_req_rw_cleanup(req, 0);
 	io_req_task_complete(req, ts);
 }
 
@@ -388,6 +567,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 			io_req_io_end(req);
 			io_req_set_res(req, final_ret,
 				       io_put_kbuf(req, issue_flags));
+			io_req_rw_cleanup(req, issue_flags);
 			return IOU_OK;
 		}
 	} else {
@@ -396,71 +576,12 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 
 	if (req->flags & REQ_F_REISSUE) {
 		req->flags &= ~REQ_F_REISSUE;
-		if (io_resubmit_prep(req))
-			return -EAGAIN;
-		else
-			io_req_task_queue_fail(req, final_ret);
+		io_resubmit_prep(req);
+		return -EAGAIN;
 	}
 	return IOU_ISSUE_SKIP_COMPLETE;
 }
 
-static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
-				       struct io_rw_state *s,
-				       unsigned int issue_flags)
-{
-	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
-	struct iov_iter *iter = &s->iter;
-	u8 opcode = req->opcode;
-	struct iovec *iovec;
-	void __user *buf;
-	size_t sqe_len;
-	ssize_t ret;
-
-	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
-		if (ret)
-			return ERR_PTR(ret);
-		return NULL;
-	}
-
-	buf = u64_to_user_ptr(rw->addr);
-	sqe_len = rw->len;
-
-	if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) {
-		if (io_do_buffer_select(req)) {
-			buf = io_buffer_select(req, &sqe_len, issue_flags);
-			if (!buf)
-				return ERR_PTR(-ENOBUFS);
-			rw->addr = (unsigned long) buf;
-			rw->len = sqe_len;
-		}
-
-		ret = import_ubuf(ddir, buf, sqe_len, iter);
-		if (ret)
-			return ERR_PTR(ret);
-		return NULL;
-	}
-
-	iovec = s->fast_iov;
-	ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter,
-			      req->ctx->compat);
-	if (unlikely(ret < 0))
-		return ERR_PTR(ret);
-	return iovec;
-}
-
-static inline int io_import_iovec(int rw, struct io_kiocb *req,
-				  struct iovec **iovec, struct io_rw_state *s,
-				  unsigned int issue_flags)
-{
-	*iovec = __io_import_iovec(rw, req, s, issue_flags);
-	if (IS_ERR(*iovec))
-		return PTR_ERR(*iovec);
-
-	iov_iter_save_state(&s->iter, &s->iter_state);
-	return 0;
-}
-
 static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb)
 {
 	return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos;
@@ -532,89 +653,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
 	return ret;
 }
 
-static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
-			  const struct iovec *fast_iov, struct iov_iter *iter)
-{
-	struct io_async_rw *io = req->async_data;
-
-	memcpy(&io->s.iter, iter, sizeof(*iter));
-	io->free_iovec = iovec;
-	io->bytes_done = 0;
-	/* can only be fixed buffers, no need to do anything */
-	if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
-		return;
-	if (!iovec) {
-		unsigned iov_off = 0;
-
-		io->s.iter.__iov = io->s.fast_iov;
-		if (iter->__iov != fast_iov) {
-			iov_off = iter_iov(iter) - fast_iov;
-			io->s.iter.__iov += iov_off;
-		}
-		if (io->s.fast_iov != fast_iov)
-			memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
-			       sizeof(struct iovec) * iter->nr_segs);
-	} else {
-		req->flags |= REQ_F_NEED_CLEANUP;
-	}
-}
-
-static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
-			     struct io_rw_state *s, bool force)
-{
-	if (!force && !io_cold_defs[req->opcode].prep_async)
-		return 0;
-	/* opcode type doesn't need async data */
-	if (!io_cold_defs[req->opcode].async_size)
-		return 0;
-	if (!req_has_async_data(req)) {
-		struct io_async_rw *iorw;
-
-		if (io_alloc_async_data(req)) {
-			kfree(iovec);
-			return -ENOMEM;
-		}
-
-		io_req_map_rw(req, iovec, s->fast_iov, &s->iter);
-		iorw = req->async_data;
-		/* we've copied and mapped the iter, ensure state is saved */
-		iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state);
-	}
-	return 0;
-}
-
-static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
-{
-	struct io_async_rw *iorw = req->async_data;
-	struct iovec *iov;
-	int ret;
-
-	iorw->bytes_done = 0;
-	iorw->free_iovec = NULL;
-
-	/* submission path, ->uring_lock should already be taken */
-	ret = io_import_iovec(rw, req, &iov, &iorw->s, 0);
-	if (unlikely(ret < 0))
-		return ret;
-
-	if (iov) {
-		iorw->free_iovec = iov;
-		req->flags |= REQ_F_NEED_CLEANUP;
-	}
-
-	return 0;
-}
-
-int io_readv_prep_async(struct io_kiocb *req)
-{
-	return io_rw_prep_async(req, ITER_DEST);
-}
-
-int io_writev_prep_async(struct io_kiocb *req)
-{
-	return io_rw_prep_async(req, ITER_SOURCE);
-}
-
 /*
  * This is our waitqueue callback handler, registered through __folio_lock_async()
  * when we initially tried to do the IO with the iocb armed our waitqueue.
@@ -754,54 +792,28 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
 
 static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
-	struct io_rw_state __s, *s = &__s;
-	struct iovec *iovec;
+	struct io_async_rw *io = req->async_data;
 	struct kiocb *kiocb = &rw->kiocb;
-	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-	struct io_async_rw *io;
-	ssize_t ret, ret2;
+	ssize_t ret;
 	loff_t *ppos;
 
-	if (!req_has_async_data(req)) {
-		ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
+	if (io_do_buffer_select(req)) {
+		ret = io_import_iovec(ITER_DEST, req, io, issue_flags);
 		if (unlikely(ret < 0))
 			return ret;
-	} else {
-		io = req->async_data;
-		s = &io->s;
-
-		/*
-		 * Safe and required to re-import if we're using provided
-		 * buffers, as we dropped the selected one before retry.
-		 */
-		if (io_do_buffer_select(req)) {
-			ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
-			if (unlikely(ret < 0))
-				return ret;
-		}
-
-		/*
-		 * We come here from an earlier attempt, restore our state to
-		 * match in case it doesn't. It's cheap enough that we don't
-		 * need to make this conditional.
-		 */
-		iov_iter_restore(&s->iter, &s->iter_state);
-		iovec = NULL;
 	}
+
 	ret = io_rw_init_file(req, FMODE_READ);
-	if (unlikely(ret)) {
-		kfree(iovec);
+	if (unlikely(ret))
 		return ret;
-	}
-	req->cqe.res = iov_iter_count(&s->iter);
+	req->cqe.res = iov_iter_count(&io->s.iter);
 
 	if (force_nonblock) {
 		/* If the file doesn't support async, just async punt */
-		if (unlikely(!io_file_supports_nowait(req))) {
-			ret = io_setup_async_rw(req, iovec, s, true);
-			return ret ?: -EAGAIN;
-		}
+		if (unlikely(!io_file_supports_nowait(req)))
+			return -EAGAIN;
 		kiocb->ki_flags |= IOCB_NOWAIT;
 	} else {
 		/* Ensure we clear previously set non-block flag */
@@ -811,20 +823,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	ppos = io_kiocb_update_pos(req);
 
 	ret = rw_verify_area(READ, req->file, ppos, req->cqe.res);
-	if (unlikely(ret)) {
-		kfree(iovec);
+	if (unlikely(ret))
 		return ret;
-	}
 
-	ret = io_iter_do_read(rw, &s->iter);
+	ret = io_iter_do_read(rw, &io->s.iter);
 
 	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
 		req->flags &= ~REQ_F_REISSUE;
-		/*
-		 * If we can poll, just do that. For a vectored read, we'll
-		 * need to copy state first.
-		 */
-		if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored)
+		/* If we can poll, just do that. */
+		if (io_file_can_poll(req))
 			return -EAGAIN;
 		/* IOPOLL retry should happen for io-wq threads */
 		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
@@ -834,8 +841,6 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 			goto done;
 		ret = 0;
 	} else if (ret == -EIOCBQUEUED) {
-		if (iovec)
-			kfree(iovec);
 		return IOU_ISSUE_SKIP_COMPLETE;
 	} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
 		   (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
@@ -848,21 +853,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 	 * untouched in case of error. Restore it and we'll advance it
 	 * manually if we need to.
 	 */
-	iov_iter_restore(&s->iter, &s->iter_state);
-
-	ret2 = io_setup_async_rw(req, iovec, s, true);
-	iovec = NULL;
-	if (ret2) {
-		ret = ret > 0 ? ret : ret2;
-		goto done;
-	}
-
-	io = req->async_data;
-	s = &io->s;
-	/*
-	 * Now use our persistent iterator and state, if we aren't already.
-	 * We've restored and mapped the iter to match.
-	 */
+	iov_iter_restore(&io->s.iter, &io->s.iter_state);
 
 	do {
 		/*
@@ -870,11 +861,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 		 * above or inside this loop. Advance the iter by the bytes
 		 * that were consumed.
 		 */
-		iov_iter_advance(&s->iter, ret);
-		if (!iov_iter_count(&s->iter))
+		iov_iter_advance(&io->s.iter, ret);
+		if (!iov_iter_count(&io->s.iter))
 			break;
 		io->bytes_done += ret;
-		iov_iter_save_state(&s->iter, &s->iter_state);
+		iov_iter_save_state(&io->s.iter, &io->s.iter_state);
 
 		/* if we can retry, do so with the callbacks armed */
 		if (!io_rw_should_retry(req)) {
@@ -882,24 +873,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
 			return -EAGAIN;
 		}
 
-		req->cqe.res = iov_iter_count(&s->iter);
+		req->cqe.res = iov_iter_count(&io->s.iter);
 		/*
 		 * Now retry read with the IOCB_WAITQ parts set in the iocb. If
 		 * we get -EIOCBQUEUED, then we'll get a notification when the
 		 * desired page gets unlocked. We can also get a partial read
 		 * here, and if we do, then just retry at the new offset.
 		 */
-		ret = io_iter_do_read(rw, &s->iter);
+		ret = io_iter_do_read(rw, &io->s.iter);
 		if (ret == -EIOCBQUEUED)
 			return IOU_ISSUE_SKIP_COMPLETE;
 		/* we got some bytes, but not all. retry. */
 		kiocb->ki_flags &= ~IOCB_WAITQ;
-		iov_iter_restore(&s->iter, &s->iter_state);
+		iov_iter_restore(&io->s.iter, &io->s.iter_state);
 	} while (ret > 0);
 done:
 	/* it's faster to check here then delegate to kfree */
-	if (iovec)
-		kfree(iovec);
 	return ret;
 }
 
@@ -908,8 +897,9 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 
 	ret = __io_read(req, issue_flags);
-	if (ret >= 0)
-		return kiocb_done(req, ret, issue_flags);
+	if (ret >= 0) {
+		ret = kiocb_done(req, ret, issue_flags);
+	}
 
 	return ret;
 }
@@ -981,6 +971,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
 	 * multishot request, hitting overflow will terminate it.
 	 */
 	io_req_set_res(req, ret, cflags);
+	io_req_rw_cleanup(req, issue_flags);
 	if (issue_flags & IO_URING_F_MULTISHOT)
 		return IOU_STOP_MULTISHOT;
 	return IOU_OK;
@@ -988,42 +979,28 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
 
 int io_write(struct io_kiocb *req, unsigned int issue_flags)
 {
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
-	struct io_rw_state __s, *s = &__s;
-	struct iovec *iovec;
+	struct io_async_rw *io = req->async_data;
 	struct kiocb *kiocb = &rw->kiocb;
-	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 	ssize_t ret, ret2;
 	loff_t *ppos;
 
-	if (!req_has_async_data(req)) {
-		ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags);
-		if (unlikely(ret < 0))
-			return ret;
-	} else {
-		struct io_async_rw *io = req->async_data;
-
-		s = &io->s;
-		iov_iter_restore(&s->iter, &s->iter_state);
-		iovec = NULL;
-	}
 	ret = io_rw_init_file(req, FMODE_WRITE);
-	if (unlikely(ret)) {
-		kfree(iovec);
+	if (unlikely(ret))
 		return ret;
-	}
-	req->cqe.res = iov_iter_count(&s->iter);
+	req->cqe.res = iov_iter_count(&io->s.iter);
 
 	if (force_nonblock) {
 		/* If the file doesn't support async, just async punt */
 		if (unlikely(!io_file_supports_nowait(req)))
-			goto copy_iov;
+			goto ret_eagain;
 
 		/* File path supports NOWAIT for non-direct_IO only for block devices. */
 		if (!(kiocb->ki_flags & IOCB_DIRECT) &&
 			!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
 			(req->flags & REQ_F_ISREG))
-			goto copy_iov;
+			goto ret_eagain;
 
 		kiocb->ki_flags |= IOCB_NOWAIT;
 	} else {
@@ -1034,19 +1011,17 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	ppos = io_kiocb_update_pos(req);
 
 	ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res);
-	if (unlikely(ret)) {
-		kfree(iovec);
+	if (unlikely(ret))
 		return ret;
-	}
 
 	if (req->flags & REQ_F_ISREG)
 		kiocb_start_write(kiocb);
 	kiocb->ki_flags |= IOCB_WRITE;
 
 	if (likely(req->file->f_op->write_iter))
-		ret2 = call_write_iter(req->file, kiocb, &s->iter);
+		ret2 = call_write_iter(req->file, kiocb, &io->s.iter);
 	else if (req->file->f_op->write)
-		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
+		ret2 = loop_rw_iter(WRITE, rw, &io->s.iter);
 	else
 		ret2 = -EINVAL;
 
@@ -1067,11 +1042,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	if (!force_nonblock || ret2 != -EAGAIN) {
 		/* IOPOLL retry should happen for io-wq threads */
 		if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
-			goto copy_iov;
+			goto ret_eagain;
 
 		if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
-			struct io_async_rw *io;
-
 			trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
 						req->cqe.res, ret2);
 
@@ -1080,33 +1053,22 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 			 * in the worker. Also update bytes_done to account for
 			 * the bytes already written.
 			 */
-			iov_iter_save_state(&s->iter, &s->iter_state);
-			ret = io_setup_async_rw(req, iovec, s, true);
-
-			io = req->async_data;
-			if (io)
-				io->bytes_done += ret2;
+			iov_iter_save_state(&io->s.iter, &io->s.iter_state);
+			io->bytes_done += ret2;
 
 			if (kiocb->ki_flags & IOCB_WRITE)
 				io_req_end_write(req);
-			return ret ? ret : -EAGAIN;
+			return -EAGAIN;
 		}
 done:
 		ret = kiocb_done(req, ret2, issue_flags);
 	} else {
-copy_iov:
-		iov_iter_restore(&s->iter, &s->iter_state);
-		ret = io_setup_async_rw(req, iovec, s, false);
-		if (!ret) {
-			if (kiocb->ki_flags & IOCB_WRITE)
-				io_req_end_write(req);
-			return -EAGAIN;
-		}
-		return ret;
+ret_eagain:
+		iov_iter_restore(&io->s.iter, &io->s.iter_state);
+		if (kiocb->ki_flags & IOCB_WRITE)
+			io_req_end_write(req);
+		return -EAGAIN;
 	}
-	/* it's reportedly faster than delegating the null check to kfree() */
-	if (iovec)
-		kfree(iovec);
 	return ret;
 }
 
@@ -1181,6 +1143,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 			break;
 		nr_events++;
 		req->cqe.flags = io_put_kbuf(req, 0);
+		if (req->opcode != IORING_OP_URING_CMD)
+			io_req_rw_cleanup(req, 0);
 	}
 	if (unlikely(!nr_events))
 		return 0;
@@ -1194,3 +1158,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	__io_submit_flush_completions(ctx);
 	return nr_events;
 }
+
+void io_rw_cache_free(struct io_cache_entry *entry)
+{
+	struct io_async_rw *rw;
+
+	rw = container_of(entry, struct io_async_rw, cache);
+	kfree(rw);
+}
diff --git a/io_uring/rw.h b/io_uring/rw.h
index f9e89b4fe4da..f7905070d10b 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -9,21 +9,26 @@ struct io_rw_state {
 };
 
 struct io_async_rw {
+	union {
+		size_t			bytes_done;
+		struct io_cache_entry	cache;
+	};
 	struct io_rw_state		s;
-	const struct iovec		*free_iovec;
-	size_t				bytes_done;
+	struct iovec			*free_iovec;
 	struct wait_page_queue		wpq;
 };
 
-int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_read(struct io_kiocb *req, unsigned int issue_flags);
-int io_readv_prep_async(struct io_kiocb *req);
 int io_write(struct io_kiocb *req, unsigned int issue_flags);
-int io_writev_prep_async(struct io_kiocb *req);
 void io_readv_writev_cleanup(struct io_kiocb *req);
 void io_rw_fail(struct io_kiocb *req);
 void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
+void io_rw_cache_free(struct io_cache_entry *entry);
-- 
cgit v1.2.3


From d10f19dff56eac5ae44dc270336b18071a8bd51c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 18 Mar 2024 20:41:58 -0600
Subject: io_uring/uring_cmd: switch to always allocating async data

Basic conversion ensuring async_data is allocated off the prep path. Adds
a basic alloc cache as well, as passthrough IO can be quite high in rate.

Tested-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 +
 io_uring/io_uring.c            |  3 ++
 io_uring/opdef.c               |  1 -
 io_uring/uring_cmd.c           | 77 ++++++++++++++++++++++++++++++------------
 io_uring/uring_cmd.h           | 10 +++++-
 5 files changed, 69 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 60d7e35fc303..0f24fdad19c2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -301,6 +301,7 @@ struct io_ring_ctx {
 		struct io_alloc_cache	apoll_cache;
 		struct io_alloc_cache	netmsg_cache;
 		struct io_alloc_cache	rw_cache;
+		struct io_alloc_cache	uring_cache;
 
 		/*
 		 * Any cancelable uring_cmd is added to this list in
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b0554d122931..a868c0a253a6 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -313,6 +313,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 			    sizeof(struct io_async_msghdr));
 	io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_rw));
+	io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
+			    sizeof(struct uring_cache));
 	io_futex_cache_init(ctx);
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
@@ -2826,6 +2828,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
+	io_alloc_cache_free(&ctx->uring_cache, io_uring_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
 	mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 1951107210d4..745246086c23 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -677,7 +677,6 @@ const struct io_cold_def io_cold_defs[] = {
 	[IORING_OP_URING_CMD] = {
 		.name			= "URING_CMD",
 		.async_size		= 2 * sizeof(struct io_uring_sqe),
-		.prep_async		= io_uring_cmd_prep_async,
 	},
 	[IORING_OP_SEND_ZC] = {
 		.name			= "SEND_ZC",
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 4614ce734fee..9bd0ba87553f 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -14,6 +14,38 @@
 #include "rsrc.h"
 #include "uring_cmd.h"
 
+static struct uring_cache *io_uring_async_get(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_cache_entry *entry;
+	struct uring_cache *cache;
+
+	entry = io_alloc_cache_get(&ctx->uring_cache);
+	if (entry) {
+		cache = container_of(entry, struct uring_cache, cache);
+		req->flags |= REQ_F_ASYNC_DATA;
+		req->async_data = cache;
+		return cache;
+	}
+	if (!io_alloc_async_data(req))
+		return req->async_data;
+	return NULL;
+}
+
+static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+	struct uring_cache *cache = req->async_data;
+
+	if (issue_flags & IO_URING_F_UNLOCKED)
+		return;
+	if (io_alloc_cache_put(&req->ctx->uring_cache, &cache->cache)) {
+		ioucmd->sqe = NULL;
+		req->async_data = NULL;
+		req->flags &= ~REQ_F_ASYNC_DATA;
+	}
+}
+
 bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
 				   struct task_struct *task, bool cancel_all)
 {
@@ -128,6 +160,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
 	io_req_set_res(req, ret, 0);
 	if (req->ctx->flags & IORING_SETUP_CQE32)
 		io_req_set_cqe32_extra(req, res2, 0);
+	io_req_uring_cleanup(req, issue_flags);
 	if (req->ctx->flags & IORING_SETUP_IOPOLL) {
 		/* order with io_iopoll_req_issued() checking ->iopoll_complete */
 		smp_store_release(&req->iopoll_completed, 1);
@@ -142,13 +175,19 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
-int io_uring_cmd_prep_async(struct io_kiocb *req)
+static int io_uring_cmd_prep_setup(struct io_kiocb *req,
+				   const struct io_uring_sqe *sqe)
 {
 	struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+	struct uring_cache *cache;
 
-	memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx));
-	ioucmd->sqe = req->async_data;
-	return 0;
+	cache = io_uring_async_get(req);
+	if (cache) {
+		memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx));
+		ioucmd->sqe = req->async_data;
+		return 0;
+	}
+	return -ENOMEM;
 }
 
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -173,9 +212,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->imu = ctx->user_bufs[index];
 		io_req_set_rsrc_node(req, ctx, 0);
 	}
-	ioucmd->sqe = sqe;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
-	return 0;
+
+	return io_uring_cmd_prep_setup(req, sqe);
 }
 
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
@@ -206,23 +245,14 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
-	if (ret == -EAGAIN) {
-		if (!req_has_async_data(req)) {
-			if (io_alloc_async_data(req))
-				return -ENOMEM;
-			io_uring_cmd_prep_async(req);
-		}
-		return -EAGAIN;
-	}
-
-	if (ret != -EIOCBQUEUED) {
-		if (ret < 0)
-			req_set_fail(req);
-		io_req_set_res(req, ret, 0);
+	if (ret == -EAGAIN || ret == -EIOCBQUEUED)
 		return ret;
-	}
 
-	return IOU_ISSUE_SKIP_COMPLETE;
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_uring_cleanup(req, issue_flags);
+	io_req_set_res(req, ret, 0);
+	return ret;
 }
 
 int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -311,3 +341,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
 #endif
+
+void io_uring_cache_free(struct io_cache_entry *entry)
+{
+	kfree(container_of(entry, struct uring_cache, cache));
+}
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index 7356bf9aa655..b0ccff7091ee 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -1,8 +1,16 @@
 // SPDX-License-Identifier: GPL-2.0
 
+struct uring_cache {
+	union {
+		struct io_cache_entry cache;
+		struct io_uring_sqe sqes[2];
+	};
+};
+
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_uring_cmd_prep_async(struct io_kiocb *req);
+void io_uring_cache_free(struct io_cache_entry *entry);
 
 bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
-				   struct task_struct *task, bool cancel_all);
\ No newline at end of file
+				   struct task_struct *task, bool cancel_all);
-- 
cgit v1.2.3


From 414d0f45c316221acbf066658afdbae5b354a5cc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 20 Mar 2024 15:19:44 -0600
Subject: io_uring/alloc_cache: switch to array based caching

Currently lists are being used to manage this, but best practice is
usually to have these in an array instead as that it cheaper to manage.

Outside of that detail, games are also played with KASAN as the list
is inside the cached entry itself.

Finally, all users of this need a struct io_cache_entry embedded in
their struct, which is union'ized with something else in there that
isn't used across the free -> realloc cycle.

Get rid of all of that, and simply have it be an array. This will not
change the memory used, as we're just trading an 8-byte member entry
for the per-elem array size.

This reduces the overhead of the recycled allocations, and it reduces
the amount of code code needed to support recycling to about half of
what it currently is.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/alloc_cache.h         | 57 +++++++++++++++++++-----------------------
 io_uring/futex.c               | 30 +++++++++-------------
 io_uring/futex.h               |  5 ++--
 io_uring/io_uring.c            | 34 ++++++++++++++-----------
 io_uring/net.c                 | 13 ++++------
 io_uring/net.h                 | 18 ++++---------
 io_uring/poll.c                | 12 +++------
 io_uring/poll.h                |  9 +------
 io_uring/rsrc.c                | 10 +++-----
 io_uring/rsrc.h                |  7 +-----
 io_uring/rw.c                  | 14 +++++------
 io_uring/rw.h                  |  7 ++----
 io_uring/uring_cmd.c           | 14 +++--------
 io_uring/uring_cmd.h           |  6 +----
 15 files changed, 93 insertions(+), 145 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 0f24fdad19c2..ef45b8bd1b35 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -220,7 +220,7 @@ struct io_ev_fd {
 };
 
 struct io_alloc_cache {
-	struct io_wq_work_node	list;
+	void			**entries;
 	unsigned int		nr_cached;
 	unsigned int		max_cached;
 	size_t			elem_size;
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 138ad14b0b12..b7a38a2069cf 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -6,61 +6,56 @@
  */
 #define IO_ALLOC_CACHE_MAX	128
 
-struct io_cache_entry {
-	struct io_wq_work_node node;
-};
-
 static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
-				      struct io_cache_entry *entry)
+				      void *entry)
 {
 	if (cache->nr_cached < cache->max_cached) {
-		cache->nr_cached++;
-		wq_stack_add_head(&entry->node, &cache->list);
-		kasan_mempool_poison_object(entry);
+		if (!kasan_mempool_poison_object(entry))
+			return false;
+		cache->entries[cache->nr_cached++] = entry;
 		return true;
 	}
 	return false;
 }
 
-static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
-{
-	return !cache->list.next;
-}
-
-static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
+static inline void *io_alloc_cache_get(struct io_alloc_cache *cache)
 {
-	if (cache->list.next) {
-		struct io_cache_entry *entry;
+	if (cache->nr_cached) {
+		void *entry = cache->entries[--cache->nr_cached];
 
-		entry = container_of(cache->list.next, struct io_cache_entry, node);
 		kasan_mempool_unpoison_object(entry, cache->elem_size);
-		cache->list.next = cache->list.next->next;
-		cache->nr_cached--;
 		return entry;
 	}
 
 	return NULL;
 }
 
-static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
+/* returns false if the cache was initialized properly */
+static inline bool io_alloc_cache_init(struct io_alloc_cache *cache,
 				       unsigned max_nr, size_t size)
 {
-	cache->list.next = NULL;
-	cache->nr_cached = 0;
-	cache->max_cached = max_nr;
-	cache->elem_size = size;
+	cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL);
+	if (cache->entries) {
+		cache->nr_cached = 0;
+		cache->max_cached = max_nr;
+		cache->elem_size = size;
+		return false;
+	}
+	return true;
 }
 
 static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
-					void (*free)(struct io_cache_entry *))
+				       void (*free)(const void *))
 {
-	while (1) {
-		struct io_cache_entry *entry = io_alloc_cache_get(cache);
+	void *entry;
+
+	if (!cache->entries)
+		return;
 
-		if (!entry)
-			break;
+	while ((entry = io_alloc_cache_get(cache)) != NULL)
 		free(entry);
-	}
-	cache->nr_cached = 0;
+
+	kvfree(cache->entries);
+	cache->entries = NULL;
 }
 #endif
diff --git a/io_uring/futex.c b/io_uring/futex.c
index 792a03df58de..914848f46beb 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -9,7 +9,7 @@
 
 #include "../kernel/futex/futex.h"
 #include "io_uring.h"
-#include "rsrc.h"
+#include "alloc_cache.h"
 #include "futex.h"
 
 struct io_futex {
@@ -27,27 +27,21 @@ struct io_futex {
 };
 
 struct io_futex_data {
-	union {
-		struct futex_q		q;
-		struct io_cache_entry	cache;
-	};
+	struct futex_q	q;
 	struct io_kiocb	*req;
 };
 
-void io_futex_cache_init(struct io_ring_ctx *ctx)
-{
-	io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX,
-				sizeof(struct io_futex_data));
-}
+#define IO_FUTEX_ALLOC_CACHE_MAX	32
 
-static void io_futex_cache_entry_free(struct io_cache_entry *entry)
+bool io_futex_cache_init(struct io_ring_ctx *ctx)
 {
-	kfree(container_of(entry, struct io_futex_data, cache));
+	return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX,
+				sizeof(struct io_futex_data));
 }
 
 void io_futex_cache_free(struct io_ring_ctx *ctx)
 {
-	io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free);
+	io_alloc_cache_free(&ctx->futex_cache, kfree);
 }
 
 static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
@@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts)
 	struct io_ring_ctx *ctx = req->ctx;
 
 	io_tw_lock(ctx, ts);
-	if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache))
+	if (!io_alloc_cache_put(&ctx->futex_cache, ifd))
 		kfree(ifd);
 	__io_futex_complete(req, ts);
 }
@@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q)
 
 static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx)
 {
-	struct io_cache_entry *entry;
+	struct io_futex_data *ifd;
 
-	entry = io_alloc_cache_get(&ctx->futex_cache);
-	if (entry)
-		return container_of(entry, struct io_futex_data, cache);
+	ifd = io_alloc_cache_get(&ctx->futex_cache);
+	if (ifd)
+		return ifd;
 
 	return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT);
 }
diff --git a/io_uring/futex.h b/io_uring/futex.h
index 0847e9e8a127..b8bb09873d57 100644
--- a/io_uring/futex.h
+++ b/io_uring/futex.h
@@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
 		    unsigned int issue_flags);
 bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task,
 			 bool cancel_all);
-void io_futex_cache_init(struct io_ring_ctx *ctx);
+bool io_futex_cache_init(struct io_ring_ctx *ctx);
 void io_futex_cache_free(struct io_ring_ctx *ctx);
 #else
 static inline int io_futex_cancel(struct io_ring_ctx *ctx,
@@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx,
 {
 	return false;
 }
-static inline void io_futex_cache_init(struct io_ring_ctx *ctx)
+static inline bool io_futex_cache_init(struct io_ring_ctx *ctx)
 {
+	return false;
 }
 static inline void io_futex_cache_free(struct io_ring_ctx *ctx)
 {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 579618fad833..1d453eb8e49f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -276,6 +276,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
 	int hash_bits;
+	bool ret;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
@@ -305,17 +306,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
 	INIT_HLIST_HEAD(&ctx->io_buf_list);
-	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
+	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
-	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
+	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct async_poll));
-	io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
+	ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
-	io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
+	ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_rw));
-	io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
+	ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct uring_cache));
-	io_futex_cache_init(ctx);
+	ret |= io_futex_cache_init(ctx);
+	if (ret)
+		goto err;
 	init_completion(&ctx->ref_comp);
 	xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1);
 	mutex_init(&ctx->uring_lock);
@@ -345,6 +348,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 
 	return ctx;
 err:
+	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
+	io_alloc_cache_free(&ctx->apoll_cache, kfree);
+	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
+	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
+	io_alloc_cache_free(&ctx->uring_cache, kfree);
+	io_futex_cache_free(ctx);
 	kfree(ctx->cancel_table.hbs);
 	kfree(ctx->cancel_table_locked.hbs);
 	xa_destroy(&ctx->io_bl_xa);
@@ -1482,7 +1491,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 
 				if (apoll->double_poll)
 					kfree(apoll->double_poll);
-				if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache))
+				if (!io_alloc_cache_put(&ctx->apoll_cache, apoll))
 					kfree(apoll);
 				req->flags &= ~REQ_F_POLLED;
 			}
@@ -2778,11 +2787,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
-static void io_rsrc_node_cache_free(struct io_cache_entry *entry)
-{
-	kfree(container_of(entry, struct io_rsrc_node, cache));
-}
-
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
@@ -2797,10 +2801,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		__io_sqe_files_unregister(ctx);
 	io_cqring_overflow_kill(ctx);
 	io_eventfd_unregister(ctx);
-	io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free);
+	io_alloc_cache_free(&ctx->apoll_cache, kfree);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
-	io_alloc_cache_free(&ctx->uring_cache, io_uring_cache_free);
+	io_alloc_cache_free(&ctx->uring_cache, kfree);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
 	mutex_unlock(&ctx->uring_lock);
@@ -2816,7 +2820,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
-	io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free);
+	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
diff --git a/io_uring/net.c b/io_uring/net.c
index 3bef562b67de..d0abc5689066 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -137,7 +137,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
 
 	/* Let normal cleanup path reap it if we fail adding to the cache */
 	iov = hdr->free_iov;
-	if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
+	if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) {
 		if (iov)
 			kasan_mempool_poison_object(iov);
 		req->async_data = NULL;
@@ -148,12 +148,10 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
 static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_cache_entry *entry;
 	struct io_async_msghdr *hdr;
 
-	entry = io_alloc_cache_get(&ctx->netmsg_cache);
-	if (entry) {
-		hdr = container_of(entry, struct io_async_msghdr, cache);
+	hdr = io_alloc_cache_get(&ctx->netmsg_cache);
+	if (hdr) {
 		if (hdr->free_iov) {
 			kasan_mempool_unpoison_object(hdr->free_iov,
 				hdr->free_iov_nr * sizeof(struct iovec));
@@ -1490,11 +1488,10 @@ out:
 	return IOU_OK;
 }
 
-void io_netmsg_cache_free(struct io_cache_entry *entry)
+void io_netmsg_cache_free(const void *entry)
 {
-	struct io_async_msghdr *kmsg;
+	struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
 
-	kmsg = container_of(entry, struct io_async_msghdr, cache);
 	if (kmsg->free_iov) {
 		kasan_mempool_unpoison_object(kmsg->free_iov,
 				kmsg->free_iov_nr * sizeof(struct iovec));
diff --git a/io_uring/net.h b/io_uring/net.h
index b47b43ec6459..0eb1c1920fc9 100644
--- a/io_uring/net.h
+++ b/io_uring/net.h
@@ -3,23 +3,15 @@
 #include <linux/net.h>
 #include <linux/uio.h>
 
-#include "alloc_cache.h"
-
 struct io_async_msghdr {
 #if defined(CONFIG_NET)
-	union {
-		struct iovec			fast_iov;
-		struct {
-			struct io_cache_entry	cache;
-			/* entry size of ->free_iov, if valid */
-			int			free_iov_nr;
-		};
-	};
+	struct iovec			fast_iov;
 	/* points to an allocated iov, if NULL we use fast_iov instead */
 	struct iovec			*free_iov;
+	int				free_iov_nr;
+	int				namelen;
 	__kernel_size_t			controllen;
 	__kernel_size_t			payloadlen;
-	int				namelen;
 	struct sockaddr __user		*uaddr;
 	struct msghdr			msg;
 	struct sockaddr_storage		addr;
@@ -57,9 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
 int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 void io_send_zc_cleanup(struct io_kiocb *req);
 
-void io_netmsg_cache_free(struct io_cache_entry *entry);
+void io_netmsg_cache_free(const void *entry);
 #else
-static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
+static inline void io_netmsg_cache_free(const void *entry)
 {
 }
 #endif
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 5d55bbf1de15..0a8e02944689 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -14,6 +14,7 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
+#include "alloc_cache.h"
 #include "refs.h"
 #include "napi.h"
 #include "opdef.h"
@@ -686,17 +687,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 					     unsigned issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_cache_entry *entry;
 	struct async_poll *apoll;
 
 	if (req->flags & REQ_F_POLLED) {
 		apoll = req->apoll;
 		kfree(apoll->double_poll);
 	} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
-		entry = io_alloc_cache_get(&ctx->apoll_cache);
-		if (entry == NULL)
+		apoll = io_alloc_cache_get(&ctx->apoll_cache);
+		if (!apoll)
 			goto alloc_apoll;
-		apoll = container_of(entry, struct async_poll, cache);
 		apoll->poll.retries = APOLL_MAX_RETRY;
 	} else {
 alloc_apoll:
@@ -1055,8 +1054,3 @@ out:
 	io_req_set_res(req, ret, 0);
 	return IOU_OK;
 }
-
-void io_apoll_cache_free(struct io_cache_entry *entry)
-{
-	kfree(container_of(entry, struct async_poll, cache));
-}
diff --git a/io_uring/poll.h b/io_uring/poll.h
index 1dacae9e816c..5c240f11069a 100644
--- a/io_uring/poll.h
+++ b/io_uring/poll.h
@@ -1,7 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 
-#include "alloc_cache.h"
-
 enum {
 	IO_APOLL_OK,
 	IO_APOLL_ABORTED,
@@ -17,10 +15,7 @@ struct io_poll {
 };
 
 struct async_poll {
-	union {
-		struct io_poll		poll;
-		struct io_cache_entry	cache;
-	};
+	struct io_poll		poll;
 	struct io_poll		*double_poll;
 };
 
@@ -46,6 +41,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags);
 bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 			bool cancel_all);
 
-void io_apoll_cache_free(struct io_cache_entry *entry);
-
 void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 4818b79231dd..7b8a056f98ed 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,6 +13,7 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
+#include "alloc_cache.h"
 #include "openclose.h"
 #include "rsrc.h"
 
@@ -169,7 +170,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node)
 
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
+	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
 		kfree(node);
 }
 
@@ -197,12 +198,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
 {
 	struct io_rsrc_node *ref_node;
-	struct io_cache_entry *entry;
 
-	entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (entry) {
-		ref_node = container_of(entry, struct io_rsrc_node, cache);
-	} else {
+	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
+	if (!ref_node) {
 		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
 		if (!ref_node)
 			return NULL;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index e21000238954..83c079a707f8 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -2,8 +2,6 @@
 #ifndef IOU_RSRC_H
 #define IOU_RSRC_H
 
-#include "alloc_cache.h"
-
 #define IO_NODE_ALLOC_CACHE_MAX 32
 
 #define IO_RSRC_TAG_TABLE_SHIFT	(PAGE_SHIFT - 3)
@@ -36,10 +34,7 @@ struct io_rsrc_data {
 };
 
 struct io_rsrc_node {
-	union {
-		struct io_cache_entry		cache;
-		struct io_ring_ctx		*ctx;
-	};
+	struct io_ring_ctx		*ctx;
 	int				refs;
 	bool				empty;
 	u16				type;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index e84d322a6150..3134a6ece1be 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -18,6 +18,7 @@
 #include "io_uring.h"
 #include "opdef.h"
 #include "kbuf.h"
+#include "alloc_cache.h"
 #include "rsrc.h"
 #include "poll.h"
 #include "rw.h"
@@ -154,7 +155,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags)
 		return;
 	}
 	iov = rw->free_iovec;
-	if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) {
+	if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) {
 		if (iov)
 			kasan_mempool_poison_object(iov);
 		req->async_data = NULL;
@@ -200,12 +201,10 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags)
 static int io_rw_alloc_async(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_cache_entry *entry;
 	struct io_async_rw *rw;
 
-	entry = io_alloc_cache_get(&ctx->rw_cache);
-	if (entry) {
-		rw = container_of(entry, struct io_async_rw, cache);
+	rw = io_alloc_cache_get(&ctx->rw_cache);
+	if (rw) {
 		if (rw->free_iovec) {
 			kasan_mempool_unpoison_object(rw->free_iovec,
 				rw->free_iov_nr * sizeof(struct iovec));
@@ -1168,11 +1167,10 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 	return nr_events;
 }
 
-void io_rw_cache_free(struct io_cache_entry *entry)
+void io_rw_cache_free(const void *entry)
 {
-	struct io_async_rw *rw;
+	struct io_async_rw *rw = (struct io_async_rw *) entry;
 
-	rw = container_of(entry, struct io_async_rw, cache);
 	if (rw->free_iovec) {
 		kasan_mempool_unpoison_object(rw->free_iovec,
 				rw->free_iov_nr * sizeof(struct iovec));
diff --git a/io_uring/rw.h b/io_uring/rw.h
index cf51d0eb407a..3f432dc75441 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -3,10 +3,7 @@
 #include <linux/pagemap.h>
 
 struct io_async_rw {
-	union {
-		size_t			bytes_done;
-		struct io_cache_entry	cache;
-	};
+	size_t				bytes_done;
 	struct iov_iter			iter;
 	struct iov_iter_state		iter_state;
 	struct iovec			fast_iov;
@@ -28,4 +25,4 @@ void io_rw_fail(struct io_kiocb *req);
 void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);
 int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags);
-void io_rw_cache_free(struct io_cache_entry *entry);
+void io_rw_cache_free(const void *entry);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 92346b5d9f5b..334d31dd6628 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -11,18 +11,17 @@
 #include <asm/ioctls.h>
 
 #include "io_uring.h"
+#include "alloc_cache.h"
 #include "rsrc.h"
 #include "uring_cmd.h"
 
 static struct uring_cache *io_uring_async_get(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_cache_entry *entry;
 	struct uring_cache *cache;
 
-	entry = io_alloc_cache_get(&ctx->uring_cache);
-	if (entry) {
-		cache = container_of(entry, struct uring_cache, cache);
+	cache = io_alloc_cache_get(&ctx->uring_cache);
+	if (cache) {
 		req->flags |= REQ_F_ASYNC_DATA;
 		req->async_data = cache;
 		return cache;
@@ -39,7 +38,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (issue_flags & IO_URING_F_UNLOCKED)
 		return;
-	if (io_alloc_cache_put(&req->ctx->uring_cache, &cache->cache)) {
+	if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) {
 		ioucmd->sqe = NULL;
 		req->async_data = NULL;
 		req->flags &= ~REQ_F_ASYNC_DATA;
@@ -354,8 +353,3 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_sock);
 #endif
-
-void io_uring_cache_free(struct io_cache_entry *entry)
-{
-	kfree(container_of(entry, struct uring_cache, cache));
-}
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h
index 477ea8865639..a361f98664d2 100644
--- a/io_uring/uring_cmd.h
+++ b/io_uring/uring_cmd.h
@@ -1,15 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
 
 struct uring_cache {
-	union {
-		struct io_cache_entry cache;
-		struct io_uring_sqe sqes[2];
-	};
+	struct io_uring_sqe sqes[2];
 };
 
 int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
 int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
-void io_uring_cache_free(struct io_cache_entry *entry);
 
 bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
 				   struct task_struct *task, bool cancel_all);
-- 
cgit v1.2.3


From 0f21a9574b1d04afbf818a3e6a60cb95eb04a616 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi <krisman@suse.de>
Date: Thu, 28 Mar 2024 17:09:35 -0400
Subject: io_uring: Avoid anonymous enums in io_uring uapi

While valid C, anonymous enums confuse Cython (Python to C translator),
as reported by Ritesh (YoSTEALTH) [1] .  Since people rely on it when
building against liburing and we want to keep this header in sync with
the library version, let's name the existing enums in the uapi header.

[1] https://github.com/cython/cython/issues/3240

Signed-off-by: Gabriel Krisman Bertazi <krisman@suse.de>
Link: https://lore.kernel.org/r/20240328210935.25640-1-krisman@suse.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7bd10201a02b..a7f847543a7f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -115,7 +115,7 @@ struct io_uring_sqe {
  */
 #define IORING_FILE_INDEX_ALLOC		(~0U)
 
-enum {
+enum io_uring_sqe_flags_bit {
 	IOSQE_FIXED_FILE_BIT,
 	IOSQE_IO_DRAIN_BIT,
 	IOSQE_IO_LINK_BIT,
@@ -374,7 +374,7 @@ enum io_uring_op {
 /*
  * IORING_OP_MSG_RING command types, stored in sqe->addr
  */
-enum {
+enum io_uring_msg_ring_flags {
 	IORING_MSG_DATA,	/* pass sqe->len as 'res' and off as user_data */
 	IORING_MSG_SEND_FD,	/* send a registered fd to another ring */
 };
@@ -425,9 +425,7 @@ struct io_uring_cqe {
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
 
-enum {
-	IORING_CQE_BUFFER_SHIFT		= 16,
-};
+#define IORING_CQE_BUFFER_SHIFT		16
 
 /*
  * Magic offsets for the application to mmap the data it needs
@@ -526,7 +524,7 @@ struct io_uring_params {
 /*
  * io_uring_register(2) opcodes and arguments
  */
-enum {
+enum io_uring_register_op {
 	IORING_REGISTER_BUFFERS			= 0,
 	IORING_UNREGISTER_BUFFERS		= 1,
 	IORING_REGISTER_FILES			= 2,
@@ -583,7 +581,7 @@ enum {
 };
 
 /* io-wq worker categories */
-enum {
+enum io_wq_type {
 	IO_WQ_BOUND,
 	IO_WQ_UNBOUND,
 };
@@ -688,7 +686,7 @@ struct io_uring_buf_ring {
  *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
  *			to get a virtual mapping for the ring.
  */
-enum {
+enum io_uring_register_pbuf_ring_flags {
 	IOU_PBUF_RING_MMAP	= 1,
 };
 
@@ -719,7 +717,7 @@ struct io_uring_napi {
 /*
  * io_uring_restriction->opcode values
  */
-enum {
+enum io_uring_register_restriction_op {
 	/* Allow an io_uring_register(2) opcode */
 	IORING_RESTRICTION_REGISTER_OP		= 0,
 
@@ -775,7 +773,7 @@ struct io_uring_recvmsg_out {
 /*
  * Argument for IORING_OP_URING_CMD when file is a socket
  */
-enum {
+enum io_uring_socket_op {
 	SOCKET_URING_OP_SIOCINQ		= 0,
 	SOCKET_URING_OP_SIOCOUTQ,
 	SOCKET_URING_OP_GETSOCKOPT,
-- 
cgit v1.2.3


From 87585b05757dc70545efb434669708d276125559 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 12 Mar 2024 20:24:21 -0600
Subject: io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring

Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_page() and have it Just Work.

This requires a bit of effort on the mmap lookup side, as the ctx
uring_lock isn't held, which  otherwise protects buffer_lists from being
torn down, and it's not safe to grab from mmap context that would
introduce an ABBA deadlock between the mmap lock and the ctx uring_lock.
Instead, lookup the buffer_list under RCU, as the the list is RCU freed
already. Use the existing reference count to determine whether it's
possible to safely grab a reference to it (eg if it's not zero already),
and drop that reference when done with the mapping. If the mmap
reference is the last one, the buffer_list and the associated memory can
go away, since the vma insertion has references to the inserted pages at
that point.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   3 -
 io_uring/io_uring.c            |  58 +++++-------------
 io_uring/io_uring.h            |   6 +-
 io_uring/kbuf.c                | 134 ++++++++---------------------------------
 io_uring/kbuf.h                |   3 +-
 5 files changed, 47 insertions(+), 157 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index ef45b8bd1b35..d34c8433caf9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -372,9 +372,6 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_cache;
 
-	/* deferred free list, protected by ->uring_lock */
-	struct hlist_head	io_buf_list;
-
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0ef418faac33..50e859da59e1 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
 			    sizeof(struct io_rsrc_node));
 	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
@@ -2598,15 +2597,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
 
-static void io_pages_unmap(void *ptr, struct page ***pages,
-			   unsigned short *npages)
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+		    bool put_pages)
 {
 	bool do_vunmap = false;
 
 	if (!ptr)
 		return;
 
-	if (*npages) {
+	if (put_pages && *npages) {
 		struct page **to_free = *pages;
 		int i;
 
@@ -2628,14 +2627,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages,
 	*npages = 0;
 }
 
-void io_mem_free(void *ptr)
-{
-	if (!ptr)
-		return;
-
-	folio_put(virt_to_folio(ptr));
-}
-
 static void io_pages_free(struct page ***pages, int npages)
 {
 	struct page **page_array = *pages;
@@ -2730,8 +2721,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
 static void io_rings_free(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
-		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
-		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
+		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
+				true);
+		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
+				true);
 	} else {
 		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
 		ctx->n_ring_pages = 0;
@@ -2788,8 +2781,8 @@ err:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
-			  size_t size)
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+		   size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
 	struct page **pages;
@@ -2819,17 +2812,6 @@ done:
 	return ret;
 }
 
-void *io_mem_alloc(size_t size)
-{
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
-	void *ret;
-
-	ret = (void *) __get_free_pages(gfp, get_order(size));
-	if (ret)
-		return ret;
-	return ERR_PTR(-ENOMEM);
-}
-
 static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
 				unsigned int cq_entries, size_t *sq_offset)
 {
@@ -2926,7 +2908,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
-	io_kbuf_mmap_list_free(ctx);
 
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -3396,10 +3377,8 @@ static void *io_uring_validate_mmap_request(struct file *file,
 {
 	struct io_ring_ctx *ctx = file->private_data;
 	loff_t offset = pgoff << PAGE_SHIFT;
-	struct page *page;
-	void *ptr;
 
-	switch (offset & IORING_OFF_MMAP_MASK) {
+	switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
 		/* Don't allow mmap if the ring was setup without it */
@@ -3414,6 +3393,7 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	case IORING_OFF_PBUF_RING: {
 		struct io_buffer_list *bl;
 		unsigned int bgid;
+		void *ptr;
 
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
 		bl = io_pbuf_get_bl(ctx, bgid);
@@ -3421,17 +3401,11 @@ static void *io_uring_validate_mmap_request(struct file *file,
 			return bl;
 		ptr = bl->buf_ring;
 		io_put_bl(ctx, bl);
-		break;
+		return ptr;
 		}
-	default:
-		return ERR_PTR(-EINVAL);
 	}
 
-	page = virt_to_head_page(ptr);
-	if (sz > page_size(page))
-		return ERR_PTR(-EINVAL);
-
-	return ptr;
+	return ERR_PTR(-EINVAL);
 }
 
 int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
@@ -3450,7 +3424,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	struct io_ring_ctx *ctx = file->private_data;
 	size_t sz = vma->vm_end - vma->vm_start;
 	long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long pfn;
 	void *ptr;
 
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
@@ -3465,10 +3438,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	case IORING_OFF_SQES:
 		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
 						ctx->n_sqe_pages);
+	case IORING_OFF_PBUF_RING:
+		return io_pbuf_mmap(file, vma);
 	}
 
-	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
-	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+	return -EINVAL;
 }
 
 static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 75230d914007..dec996a1c789 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -109,8 +109,10 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
 			bool cancel_all);
 
-void *io_mem_alloc(size_t size);
-void io_mem_free(void *ptr);
+void *io_pages_map(struct page ***out_pages, unsigned short *npages,
+		   size_t size);
+void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages,
+		    bool put_pages);
 
 enum {
 	IO_EVENTFD_OP_SIGNAL_BIT,
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 4289f4a92693..820ac599d003 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -32,25 +32,12 @@ struct io_provide_buf {
 	__u16				bid;
 };
 
-struct io_buf_free {
-	struct hlist_node		list;
-	void				*mem;
-	size_t				size;
-	int				inuse;
-};
-
-static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
-							  unsigned int bgid)
-{
-	return xa_load(&ctx->io_bl_xa, bgid);
-}
-
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
 							unsigned int bgid)
 {
 	lockdep_assert_held(&ctx->uring_lock);
 
-	return __io_buffer_get_list(ctx, bgid);
+	return xa_load(&ctx->io_bl_xa, bgid);
 }
 
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
@@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 	return ret;
 }
 
-/*
- * Mark the given mapped range as free for reuse
- */
-static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
-{
-	struct io_buf_free *ibf;
-
-	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
-		if (bl->buf_ring == ibf->mem) {
-			ibf->inuse = 0;
-			return;
-		}
-	}
-
-	/* can't happen... */
-	WARN_ON_ONCE(1);
-}
-
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 			       struct io_buffer_list *bl, unsigned nbufs)
 {
@@ -220,23 +189,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 
 	if (bl->is_buf_ring) {
 		i = bl->buf_ring->tail - bl->head;
-		if (bl->is_mmap) {
-			/*
-			 * io_kbuf_list_free() will free the page(s) at
-			 * ->release() time.
-			 */
-			io_kbuf_mark_free(ctx, bl);
-			bl->buf_ring = NULL;
-			bl->is_mmap = 0;
-		} else if (bl->buf_nr_pages) {
+		if (bl->buf_nr_pages) {
 			int j;
 
-			for (j = 0; j < bl->buf_nr_pages; j++)
-				unpin_user_page(bl->buf_pages[j]);
-			kvfree(bl->buf_pages);
-			vunmap(bl->buf_ring);
-			bl->buf_pages = NULL;
-			bl->buf_nr_pages = 0;
+			if (!bl->is_mmap) {
+				for (j = 0; j < bl->buf_nr_pages; j++)
+					unpin_user_page(bl->buf_pages[j]);
+			}
+			io_pages_unmap(bl->buf_ring, &bl->buf_pages,
+					&bl->buf_nr_pages, bl->is_mmap);
+			bl->is_mmap = 0;
 		}
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
@@ -537,63 +499,18 @@ error_unpin:
 	return ret;
 }
 
-/*
- * See if we have a suitable region that we can reuse, rather than allocate
- * both a new io_buf_free and mem region again. We leave it on the list as
- * even a reused entry will need freeing at ring release.
- */
-static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
-						    size_t ring_size)
-{
-	struct io_buf_free *ibf, *best = NULL;
-	size_t best_dist;
-
-	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
-		size_t dist;
-
-		if (ibf->inuse || ibf->size < ring_size)
-			continue;
-		dist = ibf->size - ring_size;
-		if (!best || dist < best_dist) {
-			best = ibf;
-			if (!dist)
-				break;
-			best_dist = dist;
-		}
-	}
-
-	return best;
-}
-
 static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 			      struct io_uring_buf_reg *reg,
 			      struct io_buffer_list *bl)
 {
-	struct io_buf_free *ibf;
 	size_t ring_size;
-	void *ptr;
 
 	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
 
-	/* Reuse existing entry, if we can */
-	ibf = io_lookup_buf_free_entry(ctx, ring_size);
-	if (!ibf) {
-		ptr = io_mem_alloc(ring_size);
-		if (IS_ERR(ptr))
-			return PTR_ERR(ptr);
-
-		/* Allocate and store deferred free entry */
-		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
-		if (!ibf) {
-			io_mem_free(ptr);
-			return -ENOMEM;
-		}
-		ibf->mem = ptr;
-		ibf->size = ring_size;
-		hlist_add_head(&ibf->list, &ctx->io_buf_list);
-	}
-	ibf->inuse = 1;
-	bl->buf_ring = ibf->mem;
+	bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
+	if (!bl->buf_ring)
+		return -ENOMEM;
+
 	bl->is_buf_ring = 1;
 	bl->is_mmap = 1;
 	return 0;
@@ -741,18 +658,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
 	return ERR_PTR(-EINVAL);
 }
 
-/*
- * Called at or after ->release(), free the mmap'ed buffers that we used
- * for memory mapped provided buffer rings.
- */
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct io_buf_free *ibf;
-	struct hlist_node *tmp;
+	struct io_ring_ctx *ctx = file->private_data;
+	loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
+	struct io_buffer_list *bl;
+	int bgid, ret;
 
-	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
-		hlist_del(&ibf->list);
-		io_mem_free(ibf->mem);
-		kfree(ibf);
-	}
+	bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+	bl = io_pbuf_get_bl(ctx, bgid);
+	if (IS_ERR(bl))
+		return PTR_ERR(bl);
+
+	ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
+	io_put_bl(ctx, bl);
+	return ret;
 }
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index df365b8860cf..53c141d9a8b2 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,8 +55,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
 
-void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
-
 void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
@@ -64,6 +62,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
 void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl);
 struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
 				      unsigned long bgid);
+int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma);
 
 static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 {
-- 
cgit v1.2.3


From d9713ad3fa227726a0b4d544c5a4cdd393c1933e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 5 Apr 2024 16:50:04 +0100
Subject: io_uring: remove async request cache

io_req_complete_post() was a sole user of ->locked_free_list, but
since we just gutted the function, the cache is not used anymore and
can be removed.

->locked_free_list served as an asynhronous counterpart of the main
request (i.e. struct io_kiocb) cache for all unlocked cases like io-wq.
Now they're all forced to be completed into the main cache directly,
off of the normal completion path or via io_free_req().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/7bffccd213e370abd4de480e739d8b08ab6c1326.1712331455.git.asml.silence@gmail.com
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  4 ----
 io_uring/io_uring.c            | 22 ----------------------
 2 files changed, 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d34c8433caf9..c47f412cf18e 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -346,10 +346,6 @@ struct io_ring_ctx {
 
 	spinlock_t		completion_lock;
 
-	/* IRQ completion list, under ->completion_lock */
-	unsigned int		locked_free_nr;
-	struct io_wq_work_list	locked_free_list;
-
 	struct list_head	io_buffers_comp;
 	struct list_head	cq_overflow_list;
 	struct io_hash_table	cancel_table;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8078d6944411..e291f227a1a0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -334,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
-	INIT_WQ_LIST(&ctx->locked_free_list);
 	INIT_HLIST_HEAD(&ctx->waitid_list);
 #ifdef CONFIG_FUTEX
 	INIT_HLIST_HEAD(&ctx->futex_list);
@@ -988,15 +987,6 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 	memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 }
 
-static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
-					struct io_submit_state *state)
-{
-	spin_lock(&ctx->completion_lock);
-	wq_list_splice(&ctx->locked_free_list, &state->free_list);
-	ctx->locked_free_nr = 0;
-	spin_unlock(&ctx->completion_lock);
-}
-
 /*
  * A request might get retired back into the request caches even before opcode
  * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
@@ -1010,17 +1000,6 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	void *reqs[IO_REQ_ALLOC_BATCH];
 	int ret;
 
-	/*
-	 * If we have more than a batch's worth of requests in our IRQ side
-	 * locked cache, grab the lock and move them over to our submission
-	 * side cache.
-	 */
-	if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) {
-		io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
-		if (!io_req_cache_empty(ctx))
-			return true;
-	}
-
 	ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
 
 	/*
@@ -2654,7 +2633,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 	int nr = 0;
 
 	mutex_lock(&ctx->uring_lock);
-	io_flush_cached_locked_reqs(ctx, &ctx->submit_state);
 
 	while (!io_req_cache_empty(ctx)) {
 		req = io_extract_req(ctx);
-- 
cgit v1.2.3


From 8c9a6f549e65912825e31dc1e0e3f7995984649d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 9 Apr 2024 14:05:53 -0700
Subject: io_uring: separate header for exported net bits

We're exporting some io_uring bits to networking, e.g. for implementing
a net callback for io_uring cmds, but we don't want to expose more than
needed. Add a separate header for networking.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Wei <dw@davidwei.uk>
Link: https://lore.kernel.org/r/20240409210554.1878789-1-dw@davidwei.uk
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring.h     |  6 ------
 include/linux/io_uring/net.h | 18 ++++++++++++++++++
 io_uring/uring_cmd.c         |  1 +
 net/socket.c                 |  2 +-
 4 files changed, 20 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/io_uring/net.h

(limited to 'include')

diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 68ed6697fece..e123d5e17b52 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -11,7 +11,6 @@ void __io_uring_cancel(bool cancel_all);
 void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
-int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
 bool io_is_uring_fops(struct file *file);
 
 static inline void io_uring_files_cancel(void)
@@ -45,11 +44,6 @@ static inline const char *io_uring_get_opcode(u8 opcode)
 {
 	return "";
 }
-static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
-				    unsigned int issue_flags)
-{
-	return -EOPNOTSUPP;
-}
 static inline bool io_is_uring_fops(struct file *file)
 {
 	return false;
diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h
new file mode 100644
index 000000000000..b58f39fed4d5
--- /dev/null
+++ b/include/linux/io_uring/net.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IO_URING_NET_H
+#define _LINUX_IO_URING_NET_H
+
+struct io_uring_cmd;
+
+#if defined(CONFIG_IO_URING)
+int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
+
+#else
+static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
+				    unsigned int issue_flags)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+#endif
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 334d31dd6628..21ac5fb2d5f0 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -3,6 +3,7 @@
 #include <linux/errno.h>
 #include <linux/file.h>
 #include <linux/io_uring/cmd.h>
+#include <linux/io_uring/net.h>
 #include <linux/security.h>
 #include <linux/nospec.h>
 #include <net/sock.h>
diff --git a/net/socket.c b/net/socket.c
index e5f3af49a8b6..01a71ae10c35 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -88,7 +88,7 @@
 #include <linux/xattr.h>
 #include <linux/nospec.h>
 #include <linux/indirect_call_wrapper.h>
-#include <linux/io_uring.h>
+#include <linux/io_uring/net.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
-- 
cgit v1.2.3


From e2a1cda3e0c784740751d46431973dcee32cf108 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Tue, 9 Apr 2024 18:30:40 +0200
Subject: drm/panic: Add drm panic locking

Rough sketch for the locking of drm panic printing code. The upshot of
this approach is that we can pretty much entirely rely on the atomic
commit flow, with the pair of raw_spin_lock/unlock providing any
barriers we need, without having to create really big critical
sections in code.

This also avoids the need that drivers must explicitly update the
panic handler state, which they might forget to do, or not do
consistently, and then we blow up in the worst possible times.

It is somewhat racy against a concurrent atomic update, and we might
write into a buffer which the hardware will never display. But there's
fundamentally no way to avoid that - if we do the panic state update
explicitly after writing to the hardware, we might instead write to an
old buffer that the user will barely ever see.

Note that an rcu protected deference of plane->state would give us the
the same guarantees, but it has the downside that we then need to
protect the plane state freeing functions with call_rcu too. Which
would very widely impact a lot of code and therefore doesn't seem
worth the complexity compared to a raw spinlock with very tiny
critical sections. Plus rcu cannot be used to protect access to
peek/poke registers anyway, so we'd still need it for those cases.

Peek/poke registers for vram access (or a gart pte reserved just for
panic code) are also the reason I've gone with a per-device and not
per-plane spinlock, since usually these things are global for the
entire display. Going with per-plane locks would mean drivers for such
hardware would need additional locks, which we don't want, since it
deviates from the per-console takeoverlocks design.

Longer term it might be useful if the panic notifiers grow a bit more
structure than just the absolute bare
EXPORT_SYMBOL(panic_notifier_list) - somewhat aside, why is that not
EXPORT_SYMBOL_GPL ... If panic notifiers would be more like console
drivers with proper register/unregister interfaces we could perhaps
reuse the very fancy console lock with all it's check and takeover
semantics that John Ogness is developing to fix the console_lock mess.
But for the initial cut of a drm panic printing support I don't think
we need that, because the critical sections are extremely small and
only happen once per display refresh. So generally just 60 tiny locked
sections per second, which is nothing compared to a serial console
running a 115kbaud doing really slow mmio writes for each byte. So for
now the raw spintrylock in drm panic notifier callback should be good
enough.

Another benefit of making panic notifiers more like full blown
consoles (that are used in panics only) would be that we get the two
stage design, where first all the safe outputs are used. And then the
dangerous takeover tricks are deployed (where for display drivers we
also might try to intercept any in-flight display buffer flips, which
if we race and misprogram fifos and watermarks can hang the memory
controller on some hw).

For context the actual implementation on the drm side is by Jocelyn
and this patch is meant to be combined with the overall approach in
v7 (v8 is a bit less flexible, which I think is the wrong direction):

https://lore.kernel.org/dri-devel/20240104160301.185915-1-jfalempe@redhat.com/

Note that the locking is very much not correct there, hence this
separate rfc.

Starting from v10, I (Jocelyn) have included this patch in the drm_panic
series, and done the corresponding changes.

v2:
- fix authorship, this was all my typing
- some typo oopsies
- link to the drm panic work by Jocelyn for context

v10:
- Use spinlock_irqsave/restore (John Ogness)

v11:
- Use macro instead of inline functions for drm_panic_lock/unlock (John Ogness)

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Jocelyn Falempe <jfalempe@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Cc: Lukas Wunner <lukas@wunner.de>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: John Ogness <john.ogness@linutronix.de>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: David Airlie <airlied@gmail.com>
Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240409163432.352518-2-jfalempe@redhat.com
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/drm_atomic_helper.c |  4 ++
 drivers/gpu/drm/drm_drv.c           |  1 +
 include/drm/drm_mode_config.h       | 10 ++++
 include/drm/drm_panic.h             | 95 +++++++++++++++++++++++++++++++++++++
 4 files changed, 110 insertions(+)
 create mode 100644 include/drm/drm_panic.h

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c
index 39ef0a6addeb..fb97b51b38f1 100644
--- a/drivers/gpu/drm/drm_atomic_helper.c
+++ b/drivers/gpu/drm/drm_atomic_helper.c
@@ -38,6 +38,7 @@
 #include <drm/drm_drv.h>
 #include <drm/drm_framebuffer.h>
 #include <drm/drm_gem_atomic_helper.h>
+#include <drm/drm_panic.h>
 #include <drm/drm_print.h>
 #include <drm/drm_self_refresh_helper.h>
 #include <drm/drm_vblank.h>
@@ -3016,6 +3017,7 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state,
 				  bool stall)
 {
 	int i, ret;
+	unsigned long flags;
 	struct drm_connector *connector;
 	struct drm_connector_state *old_conn_state, *new_conn_state;
 	struct drm_crtc *crtc;
@@ -3099,6 +3101,7 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state,
 		}
 	}
 
+	drm_panic_lock(state->dev, flags);
 	for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) {
 		WARN_ON(plane->state != old_plane_state);
 
@@ -3108,6 +3111,7 @@ int drm_atomic_helper_swap_state(struct drm_atomic_state *state,
 		state->planes[i].state = old_plane_state;
 		plane->state = new_plane_state;
 	}
+	drm_panic_unlock(state->dev, flags);
 
 	for_each_oldnew_private_obj_in_state(state, obj, old_obj_state, new_obj_state, i) {
 		WARN_ON(obj->state != old_obj_state);
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 243cacb3575c..c157500b3135 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -638,6 +638,7 @@ static int drm_dev_init(struct drm_device *dev,
 	mutex_init(&dev->filelist_mutex);
 	mutex_init(&dev->clientlist_mutex);
 	mutex_init(&dev->master_mutex);
+	raw_spin_lock_init(&dev->mode_config.panic_lock);
 
 	ret = drmm_add_action_or_reset(dev, drm_dev_init_release, NULL);
 	if (ret)
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 9d8acf7a10eb..06d7777a881f 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -505,6 +505,16 @@ struct drm_mode_config {
 	 */
 	struct list_head plane_list;
 
+	/**
+	 * @panic_lock:
+	 *
+	 * Raw spinlock used to protect critical sections of code that access
+	 * the display hardware or modeset software state, which the panic
+	 * printing code must be protected against. See drm_panic_trylock(),
+	 * drm_panic_lock() and drm_panic_unlock().
+	 */
+	struct raw_spinlock panic_lock;
+
 	/**
 	 * @num_crtc:
 	 *
diff --git a/include/drm/drm_panic.h b/include/drm/drm_panic.h
new file mode 100644
index 000000000000..967e02ccc6d5
--- /dev/null
+++ b/include/drm/drm_panic.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 or MIT */
+#ifndef __DRM_PANIC_H__
+#define __DRM_PANIC_H__
+
+#include <drm/drm_device.h>
+/*
+ * Copyright (c) 2024 Intel
+ */
+
+/**
+ * drm_panic_trylock - try to enter the panic printing critical section
+ * @dev: struct drm_device
+ * @flags: unsigned long irq flags you need to pass to the unlock() counterpart
+ *
+ * This function must be called by any panic printing code. The panic printing
+ * attempt must be aborted if the trylock fails.
+ *
+ * Panic printing code can make the following assumptions while holding the
+ * panic lock:
+ *
+ * - Anything protected by drm_panic_lock() and drm_panic_unlock() pairs is safe
+ *   to access.
+ *
+ * - Furthermore the panic printing code only registers in drm_dev_unregister()
+ *   and gets removed in drm_dev_unregister(). This allows the panic code to
+ *   safely access any state which is invariant in between these two function
+ *   calls, like the list of planes &drm_mode_config.plane_list or most of the
+ *   struct drm_plane structure.
+ *
+ * Specifically thanks to the protection around plane updates in
+ * drm_atomic_helper_swap_state() the following additional guarantees hold:
+ *
+ * - It is safe to deference the drm_plane.state pointer.
+ *
+ * - Anything in struct drm_plane_state or the driver's subclass thereof which
+ *   stays invariant after the atomic check code has finished is safe to access.
+ *   Specifically this includes the reference counted pointers to framebuffer
+ *   and buffer objects.
+ *
+ * - Anything set up by &drm_plane_helper_funcs.fb_prepare and cleaned up
+ *   &drm_plane_helper_funcs.fb_cleanup is safe to access, as long as it stays
+ *   invariant between these two calls. This also means that for drivers using
+ *   dynamic buffer management the framebuffer is pinned, and therefer all
+ *   relevant datastructures can be accessed without taking any further locks
+ *   (which would be impossible in panic context anyway).
+ *
+ * - Importantly, software and hardware state set up by
+ *   &drm_plane_helper_funcs.begin_fb_access and
+ *   &drm_plane_helper_funcs.end_fb_access is not safe to access.
+ *
+ * Drivers must not make any assumptions about the actual state of the hardware,
+ * unless they explicitly protected these hardware access with drm_panic_lock()
+ * and drm_panic_unlock().
+ *
+ * Return:
+ * %0 when failing to acquire the raw spinlock, nonzero on success.
+ */
+#define drm_panic_trylock(dev, flags) \
+	raw_spin_trylock_irqsave(&(dev)->mode_config.panic_lock, flags)
+
+/**
+ * drm_panic_lock - protect panic printing relevant state
+ * @dev: struct drm_device
+ * @flags: unsigned long irq flags you need to pass to the unlock() counterpart
+ *
+ * This function must be called to protect software and hardware state that the
+ * panic printing code must be able to rely on. The protected sections must be
+ * as small as possible. It uses the irqsave/irqrestore variant, and can be
+ * called from irq handler. Examples include:
+ *
+ * - Access to peek/poke or other similar registers, if that is the way the
+ *   driver prints the pixels into the scanout buffer at panic time.
+ *
+ * - Updates to pointers like &drm_plane.state, allowing the panic handler to
+ *   safely deference these. This is done in drm_atomic_helper_swap_state().
+ *
+ * - An state that isn't invariant and that the driver must be able to access
+ *   during panic printing.
+ */
+
+#define drm_panic_lock(dev, flags) \
+	raw_spin_lock_irqsave(&(dev)->mode_config.panic_lock, flags)
+
+/**
+ * drm_panic_unlock - end of the panic printing critical section
+ * @dev: struct drm_device
+ * @flags: irq flags that were returned when acquiring the lock
+ *
+ * Unlocks the raw spinlock acquired by either drm_panic_lock() or
+ * drm_panic_trylock().
+ */
+#define drm_panic_unlock(dev, flags) \
+	raw_spin_unlock_irqrestore(&(dev)->mode_config.panic_lock, flags)
+
+#endif /* __DRM_PANIC_H__ */
-- 
cgit v1.2.3


From bf9fb17c6672868d95126321762c8fdfe0ff0a2a Mon Sep 17 00:00:00 2001
From: Jocelyn Falempe <jfalempe@redhat.com>
Date: Tue, 9 Apr 2024 18:30:41 +0200
Subject: drm/panic: Add a drm panic handler

This module displays a user friendly message when a kernel panic
occurs. It currently doesn't contain any debug information,
but that can be added later.

v2
 * Use get_scanout_buffer() instead of the drm client API.
  (Thomas Zimmermann)
 * Add the panic reason to the panic message (Nerdopolis)
 * Add an exclamation mark (Nerdopolis)

v3
 * Rework the drawing functions, to write the pixels line by line and
 to use the drm conversion helper to support other formats.
 (Thomas Zimmermann)

v4
 * Use drm_fb_r1_to_32bit for fonts (Thomas Zimmermann)
 * Remove the default y to DRM_PANIC config option (Thomas Zimmermann)
 * Add foreground/background color config option
 * Fix the bottom lines not painted if the framebuffer height
   is not a multiple of the font height.
 * Automatically register the device to drm_panic, if the function
   get_scanout_buffer exists. (Thomas Zimmermann)

v5
 * Change the drawing API, use drm_fb_blit_from_r1() to draw the font.
 * Also add drm_fb_fill() to fill area with background color.
 * Add draw_pixel_xy() API for drivers that can't provide a linear buffer.
 * Add a flush() callback for drivers that needs to synchronize the buffer.
 * Add a void *private field, so drivers can pass private data to
   draw_pixel_xy() and flush().

v6
 * Fix sparse warning for panic_msg and logo.

v7
 * Add select DRM_KMS_HELPER for the color conversion functions.

v8
 * Register directly each plane to the panic notifier (Sima)
 * Add raw_spinlock to properly handle concurrency (Sima)
 * Register plane instead of device, to avoid looping through plane
   list, and simplify code.
 * Replace get_scanout_buffer() logic with drm_panic_set_buffer()
  (Thomas Zimmermann)
 * Removed the draw_pixel_xy() API, will see later if it can be added back.

v9
 * Revert to using get_scanout_buffer() (Sima)
 * Move get_scanout_buffer() and panic_flush() to the plane helper
   functions (Thomas Zimmermann)
 * Register all planes with get_scanout_buffer() to the panic notifier
 * Use drm_panic_lock() to protect against race (Sima)

v10
 * Move blit and fill functions back in drm_panic (Thomas Zimmermann).
 * Simplify the text drawing functions.
 * Use kmsg_dumper instead of panic_notifier (Sima).

v12
 * Use array for map and pitch in struct drm_scanout_buffer
   to support multi-planar format later. (Thomas Zimmermann)
 * Better indent struct drm_scanout_buffer declaration. (Thomas Zimmermann)

Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240409163432.352518-3-jfalempe@redhat.com
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 Documentation/gpu/drm-kms.rst            |  15 ++
 drivers/gpu/drm/Kconfig                  |  23 +++
 drivers/gpu/drm/Makefile                 |   1 +
 drivers/gpu/drm/drm_drv.c                |   4 +
 drivers/gpu/drm/drm_panic.c              | 290 +++++++++++++++++++++++++++++++
 include/drm/drm_modeset_helper_vtables.h |  39 +++++
 include/drm/drm_panic.h                  |  57 ++++++
 include/drm/drm_plane.h                  |   6 +
 8 files changed, 435 insertions(+)
 create mode 100644 drivers/gpu/drm/drm_panic.c

(limited to 'include')

diff --git a/Documentation/gpu/drm-kms.rst b/Documentation/gpu/drm-kms.rst
index b98b98359c90..abfe220764e1 100644
--- a/Documentation/gpu/drm-kms.rst
+++ b/Documentation/gpu/drm-kms.rst
@@ -398,6 +398,21 @@ Plane Damage Tracking Functions Reference
 .. kernel-doc:: include/drm/drm_damage_helper.h
    :internal:
 
+Plane Panic Feature
+-------------------
+
+.. kernel-doc:: drivers/gpu/drm/drm_panic.c
+   :doc: overview
+
+Plane Panic Functions Reference
+-------------------------------
+
+.. kernel-doc:: include/drm/drm_panic.h
+   :internal:
+
+.. kernel-doc:: drivers/gpu/drm/drm_panic.c
+   :export:
+
 Display Modes Function Reference
 ================================
 
diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 3914aaf443a8..f8a26423369e 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -104,6 +104,29 @@ config DRM_KMS_HELPER
 	help
 	  CRTC helpers for KMS drivers.
 
+config DRM_PANIC
+	bool "Display a user-friendly message when a kernel panic occurs"
+	depends on DRM && !FRAMEBUFFER_CONSOLE
+	select DRM_KMS_HELPER
+	select FONT_SUPPORT
+	help
+	  Enable a drm panic handler, which will display a user-friendly message
+	  when a kernel panic occurs. It's useful when using a user-space
+	  console instead of fbcon.
+	  It will only work if your graphic driver supports this feature.
+	  To support Hi-DPI Display, you can enable bigger fonts like
+	  FONT_TER16x32
+
+config DRM_PANIC_FOREGROUND_COLOR
+	hex "Drm panic screen foreground color, in RGB"
+	depends on DRM_PANIC
+	default 0xffffff
+
+config DRM_PANIC_BACKGROUND_COLOR
+	hex "Drm panic screen background color, in RGB"
+	depends on DRM_PANIC
+	default 0x000000
+
 config DRM_DEBUG_DP_MST_TOPOLOGY_REFS
         bool "Enable refcount backtrace history in the DP MST helpers"
 	depends on STACKTRACE_SUPPORT
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index a73c04d2d7a3..f9ca4f8fa6c5 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -88,6 +88,7 @@ drm-$(CONFIG_DRM_PRIVACY_SCREEN) += \
 	drm_privacy_screen.o \
 	drm_privacy_screen_x86.o
 drm-$(CONFIG_DRM_ACCEL) += ../../accel/drm_accel.o
+drm-$(CONFIG_DRM_PANIC) += drm_panic.o
 obj-$(CONFIG_DRM)	+= drm.o
 
 obj-$(CONFIG_DRM_PANEL_ORIENTATION_QUIRKS) += drm_panel_orientation_quirks.o
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index c157500b3135..535b624d4c9d 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -43,6 +43,7 @@
 #include <drm/drm_file.h>
 #include <drm/drm_managed.h>
 #include <drm/drm_mode_object.h>
+#include <drm/drm_panic.h>
 #include <drm/drm_print.h>
 #include <drm/drm_privacy_screen_machine.h>
 
@@ -944,6 +945,7 @@ int drm_dev_register(struct drm_device *dev, unsigned long flags)
 		if (ret)
 			goto err_unload;
 	}
+	drm_panic_register(dev);
 
 	DRM_INFO("Initialized %s %d.%d.%d %s for %s on minor %d\n",
 		 driver->name, driver->major, driver->minor,
@@ -988,6 +990,8 @@ void drm_dev_unregister(struct drm_device *dev)
 {
 	dev->registered = false;
 
+	drm_panic_unregister(dev);
+
 	drm_client_dev_unregister(dev);
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
diff --git a/drivers/gpu/drm/drm_panic.c b/drivers/gpu/drm/drm_panic.c
new file mode 100644
index 000000000000..7c81b3d73239
--- /dev/null
+++ b/drivers/gpu/drm/drm_panic.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+/*
+ * Copyright (c) 2023 Red Hat.
+ * Author: Jocelyn Falempe <jfalempe@redhat.com>
+ * inspired by the drm_log driver from David Herrmann <dh.herrmann@gmail.com>
+ * Tux Ascii art taken from cowsay written by Tony Monroe
+ */
+
+#include <linux/font.h>
+#include <linux/iosys-map.h>
+#include <linux/kdebug.h>
+#include <linux/kmsg_dump.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_format_helper.h>
+#include <drm/drm_fourcc.h>
+#include <drm/drm_framebuffer.h>
+#include <drm/drm_modeset_helper_vtables.h>
+#include <drm/drm_panic.h>
+#include <drm/drm_plane.h>
+#include <drm/drm_print.h>
+
+MODULE_AUTHOR("Jocelyn Falempe");
+MODULE_DESCRIPTION("DRM panic handler");
+MODULE_LICENSE("GPL");
+
+/**
+ * DOC: overview
+ *
+ * To enable DRM panic for a driver, the primary plane must implement a
+ * &drm_plane_helper_funcs.get_scanout_buffer helper function. It is then
+ * automatically registered to the drm panic handler.
+ * When a panic occurs, the &drm_plane_helper_funcs.get_scanout_buffer will be
+ * called, and the driver can provide a framebuffer so the panic handler can
+ * draw the panic screen on it. Currently only linear buffer and a few color
+ * formats are supported.
+ * Optionally the driver can also provide a &drm_plane_helper_funcs.panic_flush
+ * callback, that will be called after that, to send additional commands to the
+ * hardware to make the scanout buffer visible.
+ */
+
+/*
+ * This module displays a user friendly message on screen when a kernel panic
+ * occurs. This is conflicting with fbcon, so you can only enable it when fbcon
+ * is disabled.
+ * It's intended for end-user, so have minimal technical/debug information.
+ *
+ * Implementation details:
+ *
+ * It is a panic handler, so it can't take lock, allocate memory, run tasks/irq,
+ * or attempt to sleep. It's a best effort, and it may not be able to display
+ * the message in all situations (like if the panic occurs in the middle of a
+ * modesetting).
+ * It will display only one static frame, so performance optimizations are low
+ * priority as the machine is already in an unusable state.
+ */
+
+struct drm_panic_line {
+	u32 len;
+	const char *txt;
+};
+
+#define PANIC_LINE(s) {.len = sizeof(s) - 1, .txt = s}
+
+static struct drm_panic_line panic_msg[] = {
+	PANIC_LINE("KERNEL PANIC !"),
+	PANIC_LINE(""),
+	PANIC_LINE("Please reboot your computer."),
+};
+
+static const struct drm_panic_line logo[] = {
+	PANIC_LINE("     .--.        _"),
+	PANIC_LINE("    |o_o |      | |"),
+	PANIC_LINE("    |:_/ |      | |"),
+	PANIC_LINE("   //   \\ \\     |_|"),
+	PANIC_LINE("  (|     | )     _"),
+	PANIC_LINE(" /'\\_   _/`\\    (_)"),
+	PANIC_LINE(" \\___)=(___/"),
+};
+
+static void drm_panic_fill32(struct iosys_map *map, unsigned int pitch,
+			     unsigned int height, unsigned int width,
+			     u32 color)
+{
+	unsigned int y, x;
+
+	for (y = 0; y < height; y++)
+		for (x = 0; x < width; x++)
+			iosys_map_wr(map, y * pitch + x * sizeof(u32), u32, color);
+}
+
+static void drm_panic_blit32(struct iosys_map *dmap, unsigned int dpitch,
+			     const u8 *sbuf8, unsigned int spitch,
+			     unsigned int height, unsigned int width,
+			     u32 fg32, u32 bg32)
+{
+	unsigned int y, x;
+	u32 val32;
+
+	for (y = 0; y < height; y++) {
+		for (x = 0; x < width; x++) {
+			val32 = (sbuf8[(y * spitch) + x / 8] & (0x80 >> (x % 8))) ? fg32 : bg32;
+			iosys_map_wr(dmap, y * dpitch + x * sizeof(u32), u32, val32);
+		}
+	}
+}
+
+static const u8 *get_char_bitmap(const struct font_desc *font, char c, size_t font_pitch)
+{
+	return font->data + (c * font->height) * font_pitch;
+}
+
+static unsigned int get_max_line_len(const struct drm_panic_line *lines, int len)
+{
+	int i;
+	unsigned int max = 0;
+
+	for (i = 0; i < len; i++)
+		max = max(lines[i].len, max);
+	return max;
+}
+
+/*
+ * Draw a text in a rectangle on a framebuffer. The text is truncated if it overflows the rectangle
+ */
+static void draw_txt_rectangle(struct drm_scanout_buffer *sb,
+			       const struct font_desc *font,
+			       const struct drm_panic_line *msg,
+			       unsigned int msg_lines,
+			       bool centered,
+			       struct drm_rect *clip,
+			       u32 fg_color,
+			       u32 bg_color)
+{
+	int i, j;
+	const u8 *src;
+	size_t font_pitch = DIV_ROUND_UP(font->width, 8);
+	struct iosys_map dst;
+	unsigned int px_width = sb->format->cpp[0];
+	int left = 0;
+
+	msg_lines = min(msg_lines,  drm_rect_height(clip) / font->height);
+	for (i = 0; i < msg_lines; i++) {
+		size_t line_len = min(msg[i].len, drm_rect_width(clip) / font->width);
+
+		if (centered)
+			left = (drm_rect_width(clip) - (line_len * font->width)) / 2;
+
+		dst = sb->map[0];
+		iosys_map_incr(&dst, (clip->y1 + i * font->height) * sb->pitch[0] +
+				     (clip->x1 + left) * px_width);
+		for (j = 0; j < line_len; j++) {
+			src = get_char_bitmap(font, msg[i].txt[j], font_pitch);
+			drm_panic_blit32(&dst, sb->pitch[0], src, font_pitch,
+					 font->height, font->width,
+					 fg_color, bg_color);
+			iosys_map_incr(&dst, font->width * px_width);
+		}
+	}
+}
+
+/*
+ * Draw the panic message at the center of the screen
+ */
+static void draw_panic_static(struct drm_scanout_buffer *sb)
+{
+	size_t msg_lines = ARRAY_SIZE(panic_msg);
+	size_t logo_lines = ARRAY_SIZE(logo);
+	u32 fg_color = CONFIG_DRM_PANIC_FOREGROUND_COLOR;
+	u32 bg_color = CONFIG_DRM_PANIC_BACKGROUND_COLOR;
+	const struct font_desc *font = get_default_font(sb->width, sb->height, NULL, NULL);
+	struct drm_rect r_logo, r_msg;
+
+	if (!font)
+		return;
+
+	r_logo = DRM_RECT_INIT(0, 0,
+			       get_max_line_len(logo, logo_lines) * font->width,
+			       logo_lines * font->height);
+	r_msg = DRM_RECT_INIT(0, 0,
+			      min(get_max_line_len(panic_msg, msg_lines) * font->width, sb->width),
+			      min(msg_lines * font->height, sb->height));
+
+	/* Center the panic message */
+	drm_rect_translate(&r_msg, (sb->width - r_msg.x2) / 2, (sb->height - r_msg.y2) / 2);
+
+	/* Fill with the background color, and draw text on top */
+	drm_panic_fill32(&sb->map[0], sb->pitch[0], sb->height, sb->width, bg_color);
+
+	if ((r_msg.x1 >= drm_rect_width(&r_logo) || r_msg.y1 >= drm_rect_height(&r_logo)) &&
+	    drm_rect_width(&r_logo) < sb->width && drm_rect_height(&r_logo) < sb->height) {
+		draw_txt_rectangle(sb, font, logo, logo_lines, false, &r_logo, fg_color, bg_color);
+	}
+	draw_txt_rectangle(sb, font, panic_msg, msg_lines, true, &r_msg, fg_color, bg_color);
+}
+
+/*
+ * drm_panic_is_format_supported()
+ * @format: a fourcc color code
+ * Returns: true if supported, false otherwise.
+ *
+ * Check if drm_panic will be able to use this color format.
+ */
+static bool drm_panic_is_format_supported(const struct drm_format_info *format)
+{
+	if (format->num_planes != 1)
+		return false;
+	return format->format == DRM_FORMAT_XRGB8888;
+}
+
+static void draw_panic_plane(struct drm_plane *plane)
+{
+	struct drm_scanout_buffer sb;
+	int ret;
+	unsigned long flags;
+
+	if (!drm_panic_trylock(plane->dev, flags))
+		return;
+
+	ret = plane->helper_private->get_scanout_buffer(plane, &sb);
+
+	if (!ret && drm_panic_is_format_supported(sb.format)) {
+		draw_panic_static(&sb);
+		if (plane->helper_private->panic_flush)
+			plane->helper_private->panic_flush(plane);
+	}
+	drm_panic_unlock(plane->dev, flags);
+}
+
+static struct drm_plane *to_drm_plane(struct kmsg_dumper *kd)
+{
+	return container_of(kd, struct drm_plane, kmsg_panic);
+}
+
+static void drm_panic(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason)
+{
+	struct drm_plane *plane = to_drm_plane(dumper);
+
+	if (reason == KMSG_DUMP_PANIC)
+		draw_panic_plane(plane);
+}
+
+/**
+ * drm_panic_register() - Initialize DRM panic for a device
+ * @dev: the drm device on which the panic screen will be displayed.
+ */
+void drm_panic_register(struct drm_device *dev)
+{
+	struct drm_plane *plane;
+	int registered_plane = 0;
+
+	if (!dev->mode_config.num_total_plane)
+		return;
+
+	drm_for_each_plane(plane, dev) {
+		if (!plane->helper_private || !plane->helper_private->get_scanout_buffer)
+			continue;
+		plane->kmsg_panic.dump = drm_panic;
+		plane->kmsg_panic.max_reason = KMSG_DUMP_PANIC;
+		if (kmsg_dump_register(&plane->kmsg_panic))
+			drm_warn(dev, "Failed to register panic handler\n");
+		else
+			registered_plane++;
+	}
+	if (registered_plane)
+		drm_info(dev, "Registered %d planes with drm panic\n", registered_plane);
+}
+EXPORT_SYMBOL(drm_panic_register);
+
+/**
+ * drm_panic_unregister()
+ * @dev: the drm device previously registered.
+ */
+void drm_panic_unregister(struct drm_device *dev)
+{
+	struct drm_plane *plane;
+
+	if (!dev->mode_config.num_total_plane)
+		return;
+
+	drm_for_each_plane(plane, dev) {
+		if (!plane->helper_private || !plane->helper_private->get_scanout_buffer)
+			continue;
+		kmsg_dump_unregister(&plane->kmsg_panic);
+	}
+}
+EXPORT_SYMBOL(drm_panic_unregister);
diff --git a/include/drm/drm_modeset_helper_vtables.h b/include/drm/drm_modeset_helper_vtables.h
index 9ed42469540e..ec59015aec3c 100644
--- a/include/drm/drm_modeset_helper_vtables.h
+++ b/include/drm/drm_modeset_helper_vtables.h
@@ -48,6 +48,7 @@
  * To make this clear all the helper vtables are pulled together in this location here.
  */
 
+struct drm_scanout_buffer;
 struct drm_writeback_connector;
 struct drm_writeback_job;
 
@@ -1443,6 +1444,44 @@ struct drm_plane_helper_funcs {
 	 */
 	void (*atomic_async_update)(struct drm_plane *plane,
 				    struct drm_atomic_state *state);
+
+	/**
+	 * @get_scanout_buffer:
+	 *
+	 * Get the current scanout buffer, to display a message with drm_panic.
+	 * The driver should do the minimum changes to provide a buffer,
+	 * that can be used to display the panic screen. Currently only linear
+	 * buffers are supported. Non-linear buffer support is on the TODO list.
+	 * The device &dev.mode_config.panic_lock is taken before calling this
+	 * function, so you can safely access the &plane.state
+	 * It is called from a panic callback, and must follow its restrictions.
+	 * Please look the documentation at drm_panic_trylock() for an in-depth
+	 * discussions of what's safe and what is not allowed.
+	 * It's a best effort mode, so it's expected that in some complex cases
+	 * the panic screen won't be displayed.
+	 * The returned &drm_scanout_buffer.map must be valid if no error code is
+	 * returned.
+	 *
+	 * Return:
+	 * %0 on success, negative errno on failure.
+	 */
+	int (*get_scanout_buffer)(struct drm_plane *plane,
+				  struct drm_scanout_buffer *sb);
+
+	/**
+	 * @panic_flush:
+	 *
+	 * It is used by drm_panic, and is called after the panic screen is
+	 * drawn to the scanout buffer. In this function, the driver
+	 * can send additional commands to the hardware, to make the scanout
+	 * buffer visible.
+	 * It is only called if get_scanout_buffer() returned successfully, and
+	 * the &dev.mode_config.panic_lock is held during the entire sequence.
+	 * It is called from a panic callback, and must follow its restrictions.
+	 * Please look the documentation at drm_panic_trylock() for an in-depth
+	 * discussions of what's safe and what is not allowed.
+	 */
+	void (*panic_flush)(struct drm_plane *plane);
 };
 
 /**
diff --git a/include/drm/drm_panic.h b/include/drm/drm_panic.h
index 967e02ccc6d5..822dbb1aa9d6 100644
--- a/include/drm/drm_panic.h
+++ b/include/drm/drm_panic.h
@@ -2,11 +2,56 @@
 #ifndef __DRM_PANIC_H__
 #define __DRM_PANIC_H__
 
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/iosys-map.h>
+
 #include <drm/drm_device.h>
+#include <drm/drm_fourcc.h>
 /*
  * Copyright (c) 2024 Intel
  */
 
+/**
+ * struct drm_scanout_buffer - DRM scanout buffer
+ *
+ * This structure holds the information necessary for drm_panic to draw the
+ * panic screen, and display it.
+ */
+struct drm_scanout_buffer {
+	/**
+	 * @format:
+	 *
+	 * drm format of the scanout buffer.
+	 */
+	const struct drm_format_info *format;
+
+	/**
+	 * @map:
+	 *
+	 * Virtual address of the scanout buffer, either in memory or iomem.
+	 * The scanout buffer should be in linear format, and can be directly
+	 * sent to the display hardware. Tearing is not an issue for the panic
+	 * screen.
+	 */
+	struct iosys_map map[DRM_FORMAT_MAX_PLANES];
+
+	/**
+	 * @width: Width of the scanout buffer, in pixels.
+	 */
+	unsigned int width;
+
+	/**
+	 * @height: Height of the scanout buffer, in pixels.
+	 */
+	unsigned int height;
+
+	/**
+	 * @pitch: Length in bytes between the start of two consecutive lines.
+	 */
+	unsigned int pitch[DRM_FORMAT_MAX_PLANES];
+};
+
 /**
  * drm_panic_trylock - try to enter the panic printing critical section
  * @dev: struct drm_device
@@ -92,4 +137,16 @@
 #define drm_panic_unlock(dev, flags) \
 	raw_spin_unlock_irqrestore(&(dev)->mode_config.panic_lock, flags)
 
+#ifdef CONFIG_DRM_PANIC
+
+void drm_panic_register(struct drm_device *dev);
+void drm_panic_unregister(struct drm_device *dev);
+
+#else
+
+static inline void drm_panic_register(struct drm_device *dev) {}
+static inline void drm_panic_unregister(struct drm_device *dev) {}
+
+#endif
+
 #endif /* __DRM_PANIC_H__ */
diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h
index ec1112208b73..9507542121fa 100644
--- a/include/drm/drm_plane.h
+++ b/include/drm/drm_plane.h
@@ -25,6 +25,7 @@
 
 #include <linux/list.h>
 #include <linux/ctype.h>
+#include <linux/kmsg_dump.h>
 #include <drm/drm_mode_object.h>
 #include <drm/drm_color_mgmt.h>
 #include <drm/drm_rect.h>
@@ -780,6 +781,11 @@ struct drm_plane {
 	 * @hotspot_y_property: property to set mouse hotspot y offset.
 	 */
 	struct drm_property *hotspot_y_property;
+
+	/**
+	 * @kmsg_panic: Used to register a panic notifier for this plane
+	 */
+	struct kmsg_dumper kmsg_panic;
 };
 
 #define obj_to_plane(x) container_of(x, struct drm_plane, base)
-- 
cgit v1.2.3


From 879b3b6511fe92b1b93dfc543961347289a8aeaa Mon Sep 17 00:00:00 2001
From: Jocelyn Falempe <jfalempe@redhat.com>
Date: Tue, 9 Apr 2024 18:30:44 +0200
Subject: drm/fb_dma: Add generic get_scanout_buffer() for drm_panic

This was initialy done for imx6, but should work on most drivers
using drm_fb_dma_helper.

v8:
 * Replace get_scanout_buffer() logic with drm_panic_set_buffer()
   (Thomas Zimmermann)

v9:
 * go back to get_scanout_buffer()
 * move get_scanout_buffer() to plane helper functions

v12:
 * Rename drm_panic_gem_get_scanout_buffer to drm_fb_dma_get_scanout_buffer
   (Thomas Zimmermann)
 * Remove the #ifdef CONFIG_DRM_PANIC, and build it unconditionnaly, as
   it's a small function. (Thomas Zimmermann)

Signed-off-by: Jocelyn Falempe <jfalempe@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240409163432.352518-6-jfalempe@redhat.com
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/drm_fb_dma_helper.c | 42 +++++++++++++++++++++++++++++++++++++
 include/drm/drm_fb_dma_helper.h     |  4 ++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_fb_dma_helper.c b/drivers/gpu/drm/drm_fb_dma_helper.c
index 3b535ad1b07c..c79b5078303f 100644
--- a/drivers/gpu/drm/drm_fb_dma_helper.c
+++ b/drivers/gpu/drm/drm_fb_dma_helper.c
@@ -15,6 +15,7 @@
 #include <drm/drm_framebuffer.h>
 #include <drm/drm_gem_dma_helper.h>
 #include <drm/drm_gem_framebuffer_helper.h>
+#include <drm/drm_panic.h>
 #include <drm/drm_plane.h>
 #include <linux/dma-mapping.h>
 #include <linux/module.h>
@@ -148,3 +149,44 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
 	}
 }
 EXPORT_SYMBOL_GPL(drm_fb_dma_sync_non_coherent);
+
+/**
+ * drm_fb_dma_get_scanout_buffer - Provide a scanout buffer in case of panic
+ * @plane: DRM primary plane
+ * @drm_scanout_buffer: scanout buffer for the panic handler
+ * Returns: 0 or negative error code
+ *
+ * Generic get_scanout_buffer() implementation, for drivers that uses the
+ * drm_fb_dma_helper. It won't call vmap in the panic context, so the driver
+ * should make sure the primary plane is vmapped, otherwise the panic screen
+ * won't get displayed.
+ */
+int drm_fb_dma_get_scanout_buffer(struct drm_plane *plane,
+				  struct drm_scanout_buffer *sb)
+{
+	struct drm_gem_dma_object *dma_obj;
+	struct drm_framebuffer *fb;
+
+	fb = plane->state->fb;
+	/* Only support linear modifier */
+	if (fb->modifier != DRM_FORMAT_MOD_LINEAR)
+		return -ENODEV;
+
+	dma_obj = drm_fb_dma_get_gem_obj(fb, 0);
+
+	/* Buffer should be accessible from the CPU */
+	if (dma_obj->base.import_attach)
+		return -ENODEV;
+
+	/* Buffer should be already mapped to CPU */
+	if (!dma_obj->vaddr)
+		return -ENODEV;
+
+	iosys_map_set_vaddr(&sb->map[0], dma_obj->vaddr);
+	sb->format = fb->format;
+	sb->height = fb->height;
+	sb->width = fb->width;
+	sb->pitch[0] = fb->pitches[0];
+	return 0;
+}
+EXPORT_SYMBOL(drm_fb_dma_get_scanout_buffer);
diff --git a/include/drm/drm_fb_dma_helper.h b/include/drm/drm_fb_dma_helper.h
index d5e036c57801..61f24c2aba2f 100644
--- a/include/drm/drm_fb_dma_helper.h
+++ b/include/drm/drm_fb_dma_helper.h
@@ -7,6 +7,7 @@
 struct drm_device;
 struct drm_framebuffer;
 struct drm_plane_state;
+struct drm_scanout_buffer;
 
 struct drm_gem_dma_object *drm_fb_dma_get_gem_obj(struct drm_framebuffer *fb,
 	unsigned int plane);
@@ -19,5 +20,8 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
 				  struct drm_plane_state *old_state,
 				  struct drm_plane_state *state);
 
+int drm_panic_gem_get_scanout_buffer(struct drm_plane *plane,
+				     struct drm_scanout_buffer *sb);
+
 #endif
 
-- 
cgit v1.2.3


From e964fc77577a9afe528e54b50527cf49e24aa211 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 11 Apr 2024 20:51:07 +0300
Subject: vfs, swap: compile out IS_SWAPFILE() on swapless configs

No swap support -- no swapfiles possible.

Signed-off-by: Alexey Dobriyan (Yandex) <adobriyan@gmail.com>
Link: https://lore.kernel.org/r/2391c7f5-0f83-4188-ae56-4ec7ccbf2576@p183
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b053f98393e6..d376dd7057d5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2260,7 +2260,13 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
+
+#ifdef CONFIG_SWAP
 #define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
+#else
+#define IS_SWAPFILE(inode)	((void)(inode), 0U)
+#endif
+
 #define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
 #define IS_IMA(inode)		((inode)->i_flags & S_IMA)
 #define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
-- 
cgit v1.2.3


From 10c9e40f297d3f1b0d4df6e73618d4466a0b2cfc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 15 Mar 2024 23:29:59 -0700
Subject: rcu: Remove redundant CONFIG_PROVE_RCU #if condition

The #if condition controlling the rcu_preempt_sleep_check() definition
has a redundant check for CONFIG_PREEMPT_RCU, which is already checked
for by an enclosing #ifndef.  This commit therefore removes this redundant
condition from the inner #if.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/rcupdate.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 2c54750e36a0..382780bb60f4 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -401,15 +401,15 @@ static inline int debug_lockdep_rcu_enabled(void)
 		}							\
 	} while (0)
 
-#if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
+#ifndef CONFIG_PREEMPT_RCU
 static inline void rcu_preempt_sleep_check(void)
 {
 	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
 			 "Illegal context switch in RCU read-side critical section");
 }
-#else /* #ifdef CONFIG_PROVE_RCU */
+#else // #ifndef CONFIG_PREEMPT_RCU
 static inline void rcu_preempt_sleep_check(void) { }
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
+#endif // #else // #ifndef CONFIG_PREEMPT_RCU
 
 #define rcu_sleep_check()						\
 	do {								\
-- 
cgit v1.2.3


From d894ea562f7ac908bf848bf15e1cfab503da71a9 Mon Sep 17 00:00:00 2001
From: Maíra Canal <mcanal@igalia.com>
Date: Mon, 15 Apr 2024 12:09:53 -0300
Subject: drm/fb_dma:
 s/drm_panic_gem_get_scanout_buffer/drm_fb_dma_get_scanout_buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On version 11 of the "drm/panic: Add a drm panic handler" series [1], Thomas
suggested to change the name of the function `drm_panic_gem_get_scanout_buffer`
to `drm_fb_dma_get_scanout_buffer` and this request was applied on version 12,
which is the version that landed [2]. Although the name of the function
changed on the C file, it didn't changed on the header file, leading to a
compilation error as such:

drivers/gpu/drm/imx/ipuv3/ipuv3-plane.c:780:24: error: use of undeclared
identifier 'drm_fb_dma_get_scanout_buffer'; did you mean 'drm_panic_gem_get_scanout_buffer'?
  780 |         .get_scanout_buffer = drm_fb_dma_get_scanout_buffer,
      |                               ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      |                               drm_panic_gem_get_scanout_buffer
./include/drm/drm_fb_dma_helper.h:23:5: note: 'drm_panic_gem_get_scanout_buffer'
declared here
   23 | int drm_panic_gem_get_scanout_buffer(struct drm_plane *plane,
      |     ^
1 error generated.

Fix the compilation error by changing `drm_panic_gem_get_scanout_buffer` to
`drm_fb_dma_get_scanout_buffer` on the header file.

Link: https://lore.kernel.org/dri-devel/20240328120638.468738-1-jfalempe@redhat.com/ [1]
Link: https://lore.kernel.org/dri-devel/aea2aa01-7f03-453b-8b30-8f4d90b1b47f@redhat.com/ [2]
Fixes: 879b3b6511fe ("drm/fb_dma: Add generic get_scanout_buffer() for drm_panic")
Signed-off-by: Maíra Canal <mcanal@igalia.com>
Reviewed-by: Jocelyn Falempe <jfalempe@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240415151013.3210278-1-mcanal@igalia.com
---
 include/drm/drm_fb_dma_helper.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_fb_dma_helper.h b/include/drm/drm_fb_dma_helper.h
index 61f24c2aba2f..c950732c6d36 100644
--- a/include/drm/drm_fb_dma_helper.h
+++ b/include/drm/drm_fb_dma_helper.h
@@ -6,6 +6,7 @@
 
 struct drm_device;
 struct drm_framebuffer;
+struct drm_plane;
 struct drm_plane_state;
 struct drm_scanout_buffer;
 
@@ -20,8 +21,8 @@ void drm_fb_dma_sync_non_coherent(struct drm_device *drm,
 				  struct drm_plane_state *old_state,
 				  struct drm_plane_state *state);
 
-int drm_panic_gem_get_scanout_buffer(struct drm_plane *plane,
-				     struct drm_scanout_buffer *sb);
+int drm_fb_dma_get_scanout_buffer(struct drm_plane *plane,
+				  struct drm_scanout_buffer *sb);
 
 #endif
 
-- 
cgit v1.2.3


From bfc7bc539392f681194de341dd2bb9e83f31f588 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tursulin@ursulin.net>
Date: Thu, 28 Mar 2024 14:53:23 +0000
Subject: dma-buf: Do not build debugfs related code when !CONFIG_DEBUG_FS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no point in compiling in the list and mutex operations which are
only used from the dma-buf debugfs code, if debugfs is not compiled in.

Put the code in questions behind some kconfig guards and so save some text
and maybe even a pointer per object at runtime when not enabled.

Signed-off-by: Tvrtko Ursulin <tursulin@ursulin.net>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Christian König <christian.koenig@amd.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linaro-mm-sig@lists.linaro.org
Cc: linux-kernel@vger.kernel.org
Cc: kernel-dev@igalia.com
Reviewed-by: T.J. Mercier <tjmercier@google.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Maíra Canal <mcanal@igalia.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240328145323.68872-1-tursulin@igalia.com
---
 drivers/dma-buf/dma-buf.c | 56 ++++++++++++++++++++++++++++-------------------
 include/linux/dma-buf.h   |  2 ++
 2 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 8fe5aa67b167..8892bc701a66 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -35,12 +35,35 @@
 
 static inline int is_dma_buf_file(struct file *);
 
-struct dma_buf_list {
-	struct list_head head;
-	struct mutex lock;
-};
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+static DEFINE_MUTEX(debugfs_list_mutex);
+static LIST_HEAD(debugfs_list);
 
-static struct dma_buf_list db_list;
+static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf)
+{
+	mutex_lock(&debugfs_list_mutex);
+	list_add(&dmabuf->list_node, &debugfs_list);
+	mutex_unlock(&debugfs_list_mutex);
+}
+
+static void __dma_buf_debugfs_list_del(struct dma_buf *dmabuf)
+{
+	if (!dmabuf)
+		return;
+
+	mutex_lock(&debugfs_list_mutex);
+	list_del(&dmabuf->list_node);
+	mutex_unlock(&debugfs_list_mutex);
+}
+#else
+static void __dma_buf_debugfs_list_add(struct dma_buf *dmabuf)
+{
+}
+
+static void __dma_buf_debugfs_list_del(struct file *file)
+{
+}
+#endif
 
 static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
@@ -89,17 +112,10 @@ static void dma_buf_release(struct dentry *dentry)
 
 static int dma_buf_file_release(struct inode *inode, struct file *file)
 {
-	struct dma_buf *dmabuf;
-
 	if (!is_dma_buf_file(file))
 		return -EINVAL;
 
-	dmabuf = file->private_data;
-	if (dmabuf) {
-		mutex_lock(&db_list.lock);
-		list_del(&dmabuf->list_node);
-		mutex_unlock(&db_list.lock);
-	}
+	__dma_buf_debugfs_list_del(file->private_data);
 
 	return 0;
 }
@@ -672,9 +688,7 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
 	file->f_path.dentry->d_fsdata = dmabuf;
 	dmabuf->file = file;
 
-	mutex_lock(&db_list.lock);
-	list_add(&dmabuf->list_node, &db_list.head);
-	mutex_unlock(&db_list.lock);
+	__dma_buf_debugfs_list_add(dmabuf);
 
 	return dmabuf;
 
@@ -1611,7 +1625,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 	size_t size = 0;
 	int ret;
 
-	ret = mutex_lock_interruptible(&db_list.lock);
+	ret = mutex_lock_interruptible(&debugfs_list_mutex);
 
 	if (ret)
 		return ret;
@@ -1620,7 +1634,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 	seq_printf(s, "%-8s\t%-8s\t%-8s\t%-8s\texp_name\t%-8s\tname\n",
 		   "size", "flags", "mode", "count", "ino");
 
-	list_for_each_entry(buf_obj, &db_list.head, list_node) {
+	list_for_each_entry(buf_obj, &debugfs_list, list_node) {
 
 		ret = dma_resv_lock_interruptible(buf_obj->resv, NULL);
 		if (ret)
@@ -1657,11 +1671,11 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused)
 
 	seq_printf(s, "\nTotal %d objects, %zu bytes\n", count, size);
 
-	mutex_unlock(&db_list.lock);
+	mutex_unlock(&debugfs_list_mutex);
 	return 0;
 
 error_unlock:
-	mutex_unlock(&db_list.lock);
+	mutex_unlock(&debugfs_list_mutex);
 	return ret;
 }
 
@@ -1718,8 +1732,6 @@ static int __init dma_buf_init(void)
 	if (IS_ERR(dma_buf_mnt))
 		return PTR_ERR(dma_buf_mnt);
 
-	mutex_init(&db_list.lock);
-	INIT_LIST_HEAD(&db_list.head);
 	dma_buf_init_debugfs();
 	return 0;
 }
diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
index 8ff4add71f88..36216d28d8bd 100644
--- a/include/linux/dma-buf.h
+++ b/include/linux/dma-buf.h
@@ -370,8 +370,10 @@ struct dma_buf {
 	 */
 	struct module *owner;
 
+#if IS_ENABLED(CONFIG_DEBUG_FS)
 	/** @list_node: node for dma_buf accounting and debugging. */
 	struct list_head list_node;
+#endif
 
 	/** @priv: exporter specific private data for this buffer object. */
 	void *priv;
-- 
cgit v1.2.3


From 80cd613a9ae091dbf52e27a409d58da988ffc8f3 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 25 Mar 2024 18:39:08 +0100
Subject: rcu: Mollify sparse with RCU guard

When using "guard(rcu)();" sparse will complain, because even
though it now understands the cleanup attribute, it doesn't
evaluate the calls from it at function exit, and thus doesn't
count the context correctly.

Given that there's a conditional in the resulting code:

  static inline void class_rcu_destructor(class_rcu_t *_T)
  {
      if (_T->lock) {
          rcu_read_unlock();
      }
  }

it seems that even trying to teach sparse to evalulate the
cleanup attribute function it'd still be difficult to really
make it understand the full context here.

Suppress the sparse warning by just releasing the context in
the acquisition part of the function, after all we know it's
safe with the guard, that's the whole point of it.

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Reviewed-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/linux/rcupdate.h | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 382780bb60f4..dfd2399f2cde 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -1090,6 +1090,18 @@ rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
 extern int rcu_expedited;
 extern int rcu_normal;
 
-DEFINE_LOCK_GUARD_0(rcu, rcu_read_lock(), rcu_read_unlock())
+DEFINE_LOCK_GUARD_0(rcu,
+	do {
+		rcu_read_lock();
+		/*
+		 * sparse doesn't call the cleanup function,
+		 * so just release immediately and don't track
+		 * the context. We don't need to anyway, since
+		 * the whole point of the guard is to not need
+		 * the explicit unlock.
+		 */
+		__release(RCU);
+	} while (0),
+	rcu_read_unlock())
 
 #endif /* __LINUX_RCUPDATE_H */
-- 
cgit v1.2.3


From 2053937a310a3982de9d33af3db2dbd2b32b66e4 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Fri, 8 Mar 2024 18:34:06 +0100
Subject: rcu: Add a trace event for synchronize_rcu_normal()

Add an rcu_sr_normal() trace event. It takes three arguments
first one is the name of RCU flavour, second one is a user id
which triggeres synchronize_rcu_normal() and last one is an
event.

There are two traces in the synchronize_rcu_normal(). On entry,
when a new request is registered and on exit point when request
is completed.

Please note, CONFIG_RCU_TRACE=y is required to activate traces.

Reviewed-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 include/trace/events/rcu.h | 27 +++++++++++++++++++++++++++
 kernel/rcu/tree.c          |  7 ++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 2ef9c719772a..31b3e0d3e65f 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -707,6 +707,33 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback,
 		__entry->rcuname, __entry->p, __entry->nr_records)
 );
 
+/*
+ * Tracepoint for a normal synchronize_rcu() states. The first argument
+ * is the RCU flavor, the second argument is a pointer to rcu_head the
+ * last one is an event.
+ */
+TRACE_EVENT_RCU(rcu_sr_normal,
+
+	TP_PROTO(const char *rcuname, struct rcu_head *rhp, const char *srevent),
+
+	TP_ARGS(rcuname, rhp, srevent),
+
+	TP_STRUCT__entry(
+		__field(const char *, rcuname)
+		__field(void *, rhp)
+		__field(const char *, srevent)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->rhp = rhp;
+		__entry->srevent = srevent;
+	),
+
+	TP_printk("%s rhp=0x%p event=%s",
+		__entry->rcuname, __entry->rhp, __entry->srevent)
+);
+
 /*
  * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
  * invoked.  The first argument is the name of the RCU flavor,
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f65255205e44..2e1c5be6d64b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3863,9 +3863,11 @@ static void synchronize_rcu_normal(void)
 {
 	struct rcu_synchronize rs;
 
+	trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
+
 	if (!READ_ONCE(rcu_normal_wake_from_gp)) {
 		wait_rcu_gp(call_rcu_hurry);
-		return;
+		goto trace_complete_out;
 	}
 
 	init_rcu_head_on_stack(&rs.head);
@@ -3886,6 +3888,9 @@ static void synchronize_rcu_normal(void)
 	/* Now we can wait. */
 	wait_for_completion(&rs.completion);
 	destroy_rcu_head_on_stack(&rs.head);
+
+trace_complete_out:
+	trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
 }
 
 /**
-- 
cgit v1.2.3


From a9c3475dd67bd828d999e95d0ba985e7ac4cbbb7 Mon Sep 17 00:00:00 2001
From: Vignesh Balasubramanian <vigbalas@amd.com>
Date: Fri, 12 Apr 2024 11:51:39 +0530
Subject: Replace macro "ARCH_HAVE_EXTRA_ELF_NOTES" with kconfig

"ARCH_HAVE_EXTRA_ELF_NOTES" enables an extra note section in the
core dump. Kconfig variable is preferred over ARCH_HAVE_* macro.

Co-developed-by: Jini Susan George <jinisusan.george@amd.com>
Signed-off-by: Jini Susan George <jinisusan.george@amd.com>
Signed-off-by: Vignesh Balasubramanian <vigbalas@amd.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20240412062138.1132841-2-vigbalas@amd.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/Kconfig                   | 9 +++++++++
 arch/powerpc/Kconfig           | 1 +
 arch/powerpc/include/asm/elf.h | 2 --
 include/linux/elf.h            | 2 +-
 4 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..143f021c8a76 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -502,6 +502,15 @@ config MMU_LAZY_TLB_SHOOTDOWN
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
+config ARCH_HAVE_EXTRA_ELF_NOTES
+	bool
+	help
+	  An architecture should select this in order to enable adding an
+	  arch-specific ELF note section to core files. It must provide two
+	  functions: elf_coredump_extra_notes_size() and
+	  elf_coredump_extra_notes_write() which are invoked by the ELF core
+	  dumper.
+
 config ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
 	bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1c4be3373686..c45fa9d7fb76 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -156,6 +156,7 @@ config PPC
 	select ARCH_HAS_UACCESS_FLUSHCACHE
 	select ARCH_HAS_UBSAN
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
+	select ARCH_HAVE_EXTRA_ELF_NOTES        if SPU_BASE
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE	if PPC_RADIX_MMU
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index 79f1c480b5eb..bb4b94444d3e 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -127,8 +127,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 /* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */
 #define NT_SPU		1
 
-#define ARCH_HAVE_EXTRA_ELF_NOTES
-
 #endif /* CONFIG_SPU_BASE */
 
 #ifdef CONFIG_PPC64
diff --git a/include/linux/elf.h b/include/linux/elf.h
index c9a46c4e183b..5c402788da19 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -65,7 +65,7 @@ extern Elf64_Dyn _DYNAMIC [];
 struct file;
 struct coredump_params;
 
-#ifndef ARCH_HAVE_EXTRA_ELF_NOTES
+#ifndef CONFIG_ARCH_HAVE_EXTRA_ELF_NOTES
 static inline int elf_coredump_extra_notes_size(void) { return 0; }
 static inline int elf_coredump_extra_notes_write(struct coredump_params *cprm) { return 0; }
 #else
-- 
cgit v1.2.3


From 4854b463c4b27c94a7de86d16ad84f235f4c1a72 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 11 Apr 2024 12:22:29 -0700
Subject: net: dql: Avoid calling BUG() when WARN() is enough

If the dql_queued() function receives an invalid argument, WARN about it
and continue, instead of crashing the kernel.

This was raised by checkpatch, when I am refactoring this code (see
following patch/commit)

	WARNING: Do not crash the kernel unless it is absolutely unavoidable--use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20240411192241.2498631-2-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dynamic_queue_limits.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index 5693a4be0d9a..ff9c65841ae8 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -91,7 +91,8 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
 {
 	unsigned long map, now, now_hi, i;
 
-	BUG_ON(count > DQL_MAX_OBJECT);
+	if (WARN_ON_ONCE(count > DQL_MAX_OBJECT))
+		return;
 
 	dql->last_obj_cnt = count;
 
-- 
cgit v1.2.3


From cbe481a1b7410b0f2f303e8fd4867ece388a9729 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 11 Apr 2024 12:22:30 -0700
Subject: net: dql: Separate queue function responsibilities

The dql_queued() function currently handles both queuing object counts
and populating bitmaps for reporting stalls.

This commit splits the bitmap population into a separate function,
allowing for conditional invocation in scenarios where the feature is
disabled.

This refactor maintains functionality while improving code
organization.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20240411192241.2498631-3-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dynamic_queue_limits.h | 44 ++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index ff9c65841ae8..9980df0b7247 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -83,28 +83,11 @@ struct dql {
 #define DQL_MAX_OBJECT (UINT_MAX / 16)
 #define DQL_MAX_LIMIT ((UINT_MAX / 2) - DQL_MAX_OBJECT)
 
-/*
- * Record number of objects queued. Assumes that caller has already checked
- * availability in the queue with dql_avail.
- */
-static inline void dql_queued(struct dql *dql, unsigned int count)
+/* Populate the bitmap to be processed later in dql_check_stall() */
+static inline void dql_queue_stall(struct dql *dql)
 {
 	unsigned long map, now, now_hi, i;
 
-	if (WARN_ON_ONCE(count > DQL_MAX_OBJECT))
-		return;
-
-	dql->last_obj_cnt = count;
-
-	/* We want to force a write first, so that cpu do not attempt
-	 * to get cache line containing last_obj_cnt, num_queued, adj_limit
-	 * in Shared state, but directly does a Request For Ownership
-	 * It is only a hint, we use barrier() only.
-	 */
-	barrier();
-
-	dql->num_queued += count;
-
 	now = jiffies;
 	now_hi = now / BITS_PER_LONG;
 
@@ -134,6 +117,29 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
 		WRITE_ONCE(DQL_HIST_ENT(dql, now_hi), map | BIT_MASK(now));
 }
 
+/*
+ * Record number of objects queued. Assumes that caller has already checked
+ * availability in the queue with dql_avail.
+ */
+static inline void dql_queued(struct dql *dql, unsigned int count)
+{
+	if (WARN_ON_ONCE(count > DQL_MAX_OBJECT))
+		return;
+
+	dql->last_obj_cnt = count;
+
+	/* We want to force a write first, so that cpu do not attempt
+	 * to get cache line containing last_obj_cnt, num_queued, adj_limit
+	 * in Shared state, but directly does a Request For Ownership
+	 * It is only a hint, we use barrier() only.
+	 */
+	barrier();
+
+	dql->num_queued += count;
+
+	dql_queue_stall(dql);
+}
+
 /* Returns how many objects can be queued, < 0 indicates over limit. */
 static inline int dql_avail(const struct dql *dql)
 {
-- 
cgit v1.2.3


From 721f076b62cb05108565adf17c27875ef5015307 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 11 Apr 2024 12:22:31 -0700
Subject: net: dql: Optimize stall information population

When Dynamic Queue Limit (DQL) is set, it always populate stall
information through dql_queue_stall().  However, this information is
only necessary if a stall threshold is set, stored in struct
dql->stall_thrs.

dql_queue_stall() is cheap, but not free, since it does have memory
barriers and so forth.

Do not call dql_queue_stall() if there is no stall threshold set, and
save some CPU cycles.

Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20240411192241.2498631-4-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dynamic_queue_limits.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index 9980df0b7247..869afb800ea1 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -137,7 +137,9 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
 
 	dql->num_queued += count;
 
-	dql_queue_stall(dql);
+	/* Only populate stall information if the threshold is set */
+	if (READ_ONCE(dql->stall_thrs))
+		dql_queue_stall(dql);
 }
 
 /* Returns how many objects can be queued, < 0 indicates over limit. */
-- 
cgit v1.2.3


From 4ba67ef3a1fbb7d8dc5f00de9b93a583d05b38cc Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Thu, 11 Apr 2024 12:22:32 -0700
Subject: net: dqs: make struct dql more cache efficient

With the previous change, struct dqs->stall_thrs will be in the hot path
(at queue side), even if DQS is disabled.

The other fields accessed in this function (last_obj_cnt and num_queued)
are in the first cache line, let's move this field  (stall_thrs) to the
very first cache line, since there is a hole there.

This does not change the structure size, since it moves an short (2
bytes) to 4-bytes whole in the first cache line.

This is the new structure format now:

struct dql {
	unsigned int    num_queued;
	unsigned int    last_obj_cnt;
...
	short unsigned int    stall_thrs;
	/* XXX 2 bytes hole, try to pack */
...
	/* --- cacheline 1 boundary (64 bytes) --- */
...
 	/* Longest stall detected, reported to user */
	short unsigned int         stall_max;
	/* XXX 2 bytes hole, try to pack */
};

Also, read the stall_thrs (now in the very first cache line) earlier,
together with dql->num_queued (also in the first cache line).

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Suggested-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Link: https://lore.kernel.org/r/20240411192241.2498631-5-leitao@debian.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/dynamic_queue_limits.h |  5 +++--
 lib/dynamic_queue_limits.c           | 13 +++++++++----
 2 files changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index 869afb800ea1..281298e77a15 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -50,6 +50,9 @@ struct dql {
 	unsigned int	adj_limit;		/* limit + num_completed */
 	unsigned int	last_obj_cnt;		/* Count at last queuing */
 
+	/* Stall threshold (in jiffies), defined by user */
+	unsigned short	stall_thrs;
+
 	unsigned long	history_head;		/* top 58 bits of jiffies */
 	/* stall entries, a bit per entry */
 	unsigned long	history[DQL_HIST_LEN];
@@ -71,8 +74,6 @@ struct dql {
 	unsigned int	min_limit;		/* Minimum limit */
 	unsigned int	slack_hold_time;	/* Time to measure slack */
 
-	/* Stall threshold (in jiffies), defined by user */
-	unsigned short	stall_thrs;
 	/* Longest stall detected, reported to user */
 	unsigned short	stall_max;
 	unsigned long	last_reap;		/* Last reap (in jiffies) */
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index a1389db1c30a..e49deddd3de9 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -15,12 +15,10 @@
 #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0)
 #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)
 
-static void dql_check_stall(struct dql *dql)
+static void dql_check_stall(struct dql *dql, unsigned short stall_thrs)
 {
-	unsigned short stall_thrs;
 	unsigned long now;
 
-	stall_thrs = READ_ONCE(dql->stall_thrs);
 	if (!stall_thrs)
 		return;
 
@@ -86,9 +84,16 @@ void dql_completed(struct dql *dql, unsigned int count)
 {
 	unsigned int inprogress, prev_inprogress, limit;
 	unsigned int ovlimit, completed, num_queued;
+	unsigned short stall_thrs;
 	bool all_prev_completed;
 
 	num_queued = READ_ONCE(dql->num_queued);
+	/* Read stall_thrs in advance since it belongs to the same (first)
+	 * cache line as ->num_queued. This way, dql_check_stall() does not
+	 * need to touch the first cache line again later, reducing the window
+	 * of possible false sharing.
+	 */
+	stall_thrs = READ_ONCE(dql->stall_thrs);
 
 	/* Can't complete more than what's in queue */
 	BUG_ON(count > num_queued - dql->num_completed);
@@ -178,7 +183,7 @@ void dql_completed(struct dql *dql, unsigned int count)
 	dql->num_completed = completed;
 	dql->prev_num_queued = num_queued;
 
-	dql_check_stall(dql);
+	dql_check_stall(dql, stall_thrs);
 }
 EXPORT_SYMBOL(dql_completed);
 
-- 
cgit v1.2.3


From f60d374d2cc88034385265d193a38e3f4a4b430c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Jan 2024 21:35:38 -0500
Subject: close_on_exec(): pass files_struct instead of fdtable

both callers are happier that way...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               |  5 +----
 fs/proc/fd.c            |  4 +---
 include/linux/fdtable.h | 10 +++++-----
 3 files changed, 7 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/fs/file.c b/fs/file.c
index 3b683b9101d8..eff5ce79f66a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1219,12 +1219,9 @@ void set_close_on_exec(unsigned int fd, int flag)
 
 bool get_close_on_exec(unsigned int fd)
 {
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
 	bool res;
 	rcu_read_lock();
-	fdt = files_fdtable(files);
-	res = close_on_exec(fd, fdt);
+	res = close_on_exec(fd, current->files);
 	rcu_read_unlock();
 	return res;
 }
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e72e5ad42bc..0139aae19651 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -39,10 +39,8 @@ static int seq_show(struct seq_file *m, void *v)
 		spin_lock(&files->file_lock);
 		file = files_lookup_fd_locked(files, fd);
 		if (file) {
-			struct fdtable *fdt = files_fdtable(files);
-
 			f_flags = file->f_flags;
-			if (close_on_exec(fd, fdt))
+			if (close_on_exec(fd, files))
 				f_flags |= O_CLOEXEC;
 
 			get_file(file);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 78c8326d74ae..cc060b20502b 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -33,11 +33,6 @@ struct fdtable {
 	struct rcu_head rcu;
 };
 
-static inline bool close_on_exec(unsigned int fd, const struct fdtable *fdt)
-{
-	return test_bit(fd, fdt->close_on_exec);
-}
-
 static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
 {
 	return test_bit(fd, fdt->open_fds);
@@ -107,6 +102,11 @@ struct file *lookup_fdget_rcu(unsigned int fd);
 struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
 struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
 
+static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
+{
+	return test_bit(fd, files_fdtable(files)->close_on_exec);
+}
+
 struct task_struct;
 
 void put_files_struct(struct files_struct *fs);
-- 
cgit v1.2.3


From c4aab26253cd1f302279b8d6b5b66ccf1b120520 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Jan 2024 21:45:47 -0500
Subject: fd_is_open(): move to fs/file.c

no users outside that...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               | 5 +++++
 include/linux/fdtable.h | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/fs/file.c b/fs/file.c
index eff5ce79f66a..ab38b005633c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -271,6 +271,11 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
 	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
 }
 
+static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
+{
+	return test_bit(fd, fdt->open_fds);
+}
+
 static unsigned int count_open_files(struct fdtable *fdt)
 {
 	unsigned int size = fdt->max_fds;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index cc060b20502b..2944d4aa413b 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -33,11 +33,6 @@ struct fdtable {
 	struct rcu_head rcu;
 };
 
-static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
-{
-	return test_bit(fd, fdt->open_fds);
-}
-
 /*
  * Open file table structure
  */
-- 
cgit v1.2.3


From af58dc1f50c1946018773beca23ebaad587b9cc9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 20 Jan 2024 06:24:55 -0500
Subject: kernel_file_open(): get rid of inode argument

always equal to ->dentry->d_inode of the path argument these
days.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cachefiles/namei.c | 3 +--
 fs/open.c             | 5 ++---
 fs/overlayfs/util.c   | 2 +-
 include/linux/fs.h    | 2 +-
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7ade836beb58..f53977169db4 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -563,8 +563,7 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
 	 */
 	path.mnt = cache->mnt;
 	path.dentry = dentry;
-	file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT,
-				d_backing_inode(dentry), cache->cache_cred);
+	file = kernel_file_open(&path, O_RDWR | O_LARGEFILE | O_DIRECT, cache->cache_cred);
 	if (IS_ERR(file)) {
 		trace_cachefiles_vfs_error(object, d_backing_inode(dentry),
 					   PTR_ERR(file),
diff --git a/fs/open.c b/fs/open.c
index ee8460c83c77..ec287ac67e7f 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1155,7 +1155,6 @@ EXPORT_SYMBOL(dentry_create);
  * kernel_file_open - open a file for kernel internal use
  * @path:	path of the file to open
  * @flags:	open flags
- * @inode:	the inode
  * @cred:	credentials for open
  *
  * Open a file for use by in-kernel consumers. The file is not accounted
@@ -1165,7 +1164,7 @@ EXPORT_SYMBOL(dentry_create);
  * Return: Opened file on success, an error pointer on failure.
  */
 struct file *kernel_file_open(const struct path *path, int flags,
-				struct inode *inode, const struct cred *cred)
+				const struct cred *cred)
 {
 	struct file *f;
 	int error;
@@ -1175,7 +1174,7 @@ struct file *kernel_file_open(const struct path *path, int flags,
 		return f;
 
 	f->f_path = *path;
-	error = do_dentry_open(f, inode, NULL);
+	error = do_dentry_open(f, d_inode(path->dentry), NULL);
 	if (error) {
 		fput(f);
 		f = ERR_PTR(error);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index d285d1d7baad..edc9216f6e27 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -1376,7 +1376,7 @@ int ovl_ensure_verity_loaded(struct path *datapath)
 		 * If this inode was not yet opened, the verity info hasn't been
 		 * loaded yet, so we need to do that here to force it into memory.
 		 */
-		filp = kernel_file_open(datapath, O_RDONLY, inode, current_cred());
+		filp = kernel_file_open(datapath, O_RDONLY, current_cred());
 		if (IS_ERR(filp))
 			return PTR_ERR(filp);
 		fput(filp);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 00fc429b0af0..143e967a3af2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1906,7 +1906,7 @@ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
 				 umode_t mode, int open_flag,
 				 const struct cred *cred);
 struct file *kernel_file_open(const struct path *path, int flags,
-			      struct inode *inode, const struct cred *cred);
+			      const struct cred *cred);
 
 int vfs_mkobj(struct dentry *, umode_t,
 		int (*f)(struct dentry *, umode_t, void *),
-- 
cgit v1.2.3


From 7c98f7cb8fda964fbc60b9307ad35e94735fa35f Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Mon, 28 Aug 2023 17:13:18 +0200
Subject: remove call_{read,write}_iter() functions

These have no clear purpose.  This is effectively a revert of commit
bb7462b6fd64 ("vfs: use helpers for calling f_op->{read,write}_iter()").

The patch was created with the help of a coccinelle script.

Fixes: bb7462b6fd64 ("vfs: use helpers for calling f_op->{read,write}_iter()")
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/loop.c              |  4 ++--
 drivers/target/target_core_file.c |  4 ++--
 fs/aio.c                          |  4 ++--
 fs/read_write.c                   | 12 ++++++------
 fs/splice.c                       |  4 ++--
 include/linux/fs.h                | 12 ------------
 io_uring/rw.c                     |  4 ++--
 7 files changed, 16 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 28a95fd366fe..93780f41646b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -445,9 +445,9 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
 
 	if (rw == ITER_SOURCE)
-		ret = call_write_iter(file, &cmd->iocb, &iter);
+		ret = file->f_op->write_iter(&cmd->iocb, &iter);
 	else
-		ret = call_read_iter(file, &cmd->iocb, &iter);
+		ret = file->f_op->read_iter(&cmd->iocb, &iter);
 
 	lo_rw_aio_do_completion(cmd);
 
diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 4d447520bab8..94e6cd4e7e43 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -299,9 +299,9 @@ fd_execute_rw_aio(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 		aio_cmd->iocb.ki_flags |= IOCB_DSYNC;
 
 	if (is_write)
-		ret = call_write_iter(file, &aio_cmd->iocb, &iter);
+		ret = file->f_op->write_iter(&aio_cmd->iocb, &iter);
 	else
-		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
+		ret = file->f_op->read_iter(&aio_cmd->iocb, &iter);
 
 	if (ret != -EIOCBQUEUED)
 		cmd_rw_aio_complete(&aio_cmd->iocb, ret);
diff --git a/fs/aio.c b/fs/aio.c
index 9cdaa2faa536..13ca81c7c744 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1605,7 +1605,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
-		aio_rw_done(req, call_read_iter(file, req, &iter));
+		aio_rw_done(req, file->f_op->read_iter(req, &iter));
 	kfree(iovec);
 	return ret;
 }
@@ -1636,7 +1636,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		if (S_ISREG(file_inode(file)->i_mode))
 			kiocb_start_write(req);
 		req->ki_flags |= IOCB_WRITE;
-		aio_rw_done(req, call_write_iter(file, req, &iter));
+		aio_rw_done(req, file->f_op->write_iter(req, &iter));
 	}
 	kfree(iovec);
 	return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index d4c036e82b6c..2de7f6adb33d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -392,7 +392,7 @@ static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, lo
 	kiocb.ki_pos = (ppos ? *ppos : 0);
 	iov_iter_ubuf(&iter, ITER_DEST, buf, len);
 
-	ret = call_read_iter(filp, &kiocb, &iter);
+	ret = filp->f_op->read_iter(&kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (ppos)
 		*ppos = kiocb.ki_pos;
@@ -494,7 +494,7 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 	kiocb.ki_pos = (ppos ? *ppos : 0);
 	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
 
-	ret = call_write_iter(filp, &kiocb, &iter);
+	ret = filp->f_op->write_iter(&kiocb, &iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (ret > 0 && ppos)
 		*ppos = kiocb.ki_pos;
@@ -736,9 +736,9 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 	kiocb.ki_pos = (ppos ? *ppos : 0);
 
 	if (type == READ)
-		ret = call_read_iter(filp, &kiocb, iter);
+		ret = filp->f_op->read_iter(&kiocb, iter);
 	else
-		ret = call_write_iter(filp, &kiocb, iter);
+		ret = filp->f_op->write_iter(&kiocb, iter);
 	BUG_ON(ret == -EIOCBQUEUED);
 	if (ppos)
 		*ppos = kiocb.ki_pos;
@@ -799,7 +799,7 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 	if (ret < 0)
 		return ret;
 
-	ret = call_read_iter(file, iocb, iter);
+	ret = file->f_op->read_iter(iocb, iter);
 out:
 	if (ret >= 0)
 		fsnotify_access(file);
@@ -860,7 +860,7 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
 		return ret;
 
 	kiocb_start_write(iocb);
-	ret = call_write_iter(file, iocb, iter);
+	ret = file->f_op->write_iter(iocb, iter);
 	if (ret != -EIOCBQUEUED)
 		kiocb_end_write(iocb);
 	if (ret > 0)
diff --git a/fs/splice.c b/fs/splice.c
index 218e24b1ac40..60aed8de21f8 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -362,7 +362,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
 	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
 	init_sync_kiocb(&kiocb, in);
 	kiocb.ki_pos = *ppos;
-	ret = call_read_iter(in, &kiocb, &to);
+	ret = in->f_op->read_iter(&kiocb, &to);
 
 	if (ret > 0) {
 		keep = DIV_ROUND_UP(ret, PAGE_SIZE);
@@ -740,7 +740,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 		iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
 		init_sync_kiocb(&kiocb, out);
 		kiocb.ki_pos = sd.pos;
-		ret = call_write_iter(out, &kiocb, &from);
+		ret = out->f_op->write_iter(&kiocb, &from);
 		sd.pos = kiocb.ki_pos;
 		if (ret <= 0)
 			break;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 143e967a3af2..a94343f567cb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2096,18 +2096,6 @@ struct inode_operations {
 	struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
 } ____cacheline_aligned;
 
-static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
-				     struct iov_iter *iter)
-{
-	return file->f_op->read_iter(kio, iter);
-}
-
-static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
-				      struct iov_iter *iter)
-{
-	return file->f_op->write_iter(kio, iter);
-}
-
 static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	return file->f_op->mmap(file, vma);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 0585ebcc9773..7d335b7e00ed 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -701,7 +701,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
 	struct file *file = rw->kiocb.ki_filp;
 
 	if (likely(file->f_op->read_iter))
-		return call_read_iter(file, &rw->kiocb, iter);
+		return file->f_op->read_iter(&rw->kiocb, iter);
 	else if (file->f_op->read)
 		return loop_rw_iter(READ, rw, iter);
 	else
@@ -1047,7 +1047,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 	kiocb->ki_flags |= IOCB_WRITE;
 
 	if (likely(req->file->f_op->write_iter))
-		ret2 = call_write_iter(req->file, kiocb, &s->iter);
+		ret2 = req->file->f_op->write_iter(kiocb, &s->iter);
 	else if (req->file->f_op->write)
 		ret2 = loop_rw_iter(WRITE, rw, &s->iter);
 	else
-- 
cgit v1.2.3


From 5b9932f6001c70b984e8c9c2fe09e443beb4baba Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <djwong@kernel.org>
Date: Mon, 15 Apr 2024 14:54:13 -0700
Subject: vfs: export remap and write check helpers

Export these functions so that the next patch can use them to check the
file ranges being passed to the XFS_IOC_EXCHANGE_RANGE operation.

Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/read_write.c    | 1 +
 fs/remap_range.c   | 4 ++--
 include/linux/fs.h | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/read_write.c b/fs/read_write.c
index d4c036e82b6c..85c096f2c0d0 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1667,6 +1667,7 @@ int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(generic_write_check_limits);
 
 /* Like generic_write_checks(), but takes size of write instead of iter. */
 int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
diff --git a/fs/remap_range.c b/fs/remap_range.c
index de07f978ce3e..28246dfc8485 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -99,8 +99,7 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
 	return 0;
 }
 
-static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
-			     bool write)
+int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write)
 {
 	int mask = write ? MAY_WRITE : MAY_READ;
 	loff_t tmp;
@@ -118,6 +117,7 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
 
 	return fsnotify_file_area_perm(file, mask, &pos, len);
 }
+EXPORT_SYMBOL_GPL(remap_verify_area);
 
 /*
  * Ensure that we don't remap a partial EOF block in the middle of something
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8dfd53b52744..0835faeebe7b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2119,6 +2119,7 @@ extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
 				   loff_t, size_t, unsigned int);
+int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
 int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
 				    struct file *file_out, loff_t pos_out,
 				    loff_t *len, unsigned int remap_flags,
-- 
cgit v1.2.3


From 075b7cd7ad7dec8651a6a6654fa5ebae436ac00f Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Thu, 14 Mar 2024 11:28:36 +0530
Subject: coresight: Add helpers registering/removing both AMBA and platform
 drivers

This adds two different helpers i.e coresight_init_driver()/remove_driver()
enabling coresight devices to register or remove AMBA and platform drivers.
This changes replicator and funnel devices to use above new helpers.

Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Mike Leach <mike.leach@linaro.org>
Cc: James Clark <james.clark@arm.com>
Cc: Leo Yan <leo.yan@linaro.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: coresight@lists.linaro.org
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240314055843.2625883-5-anshuman.khandual@arm.com
---
 drivers/hwtracing/coresight/coresight-core.c       | 29 ++++++++++++++++++++++
 drivers/hwtracing/coresight/coresight-funnel.c     | 19 ++------------
 drivers/hwtracing/coresight/coresight-replicator.c | 20 +++------------
 include/linux/coresight.h                          |  7 ++++++
 4 files changed, 41 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/drivers/hwtracing/coresight/coresight-core.c b/drivers/hwtracing/coresight/coresight-core.c
index b83613e34289..9fc6f6b863e0 100644
--- a/drivers/hwtracing/coresight/coresight-core.c
+++ b/drivers/hwtracing/coresight/coresight-core.c
@@ -1398,6 +1398,35 @@ static void __exit coresight_exit(void)
 module_init(coresight_init);
 module_exit(coresight_exit);
 
+int coresight_init_driver(const char *drv, struct amba_driver *amba_drv,
+			  struct platform_driver *pdev_drv)
+{
+	int ret;
+
+	ret = amba_driver_register(amba_drv);
+	if (ret) {
+		pr_err("%s: error registering AMBA driver\n", drv);
+		return ret;
+	}
+
+	ret = platform_driver_register(pdev_drv);
+	if (!ret)
+		return 0;
+
+	pr_err("%s: error registering platform driver\n", drv);
+	amba_driver_unregister(amba_drv);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(coresight_init_driver);
+
+void coresight_remove_driver(struct amba_driver *amba_drv,
+			     struct platform_driver *pdev_drv)
+{
+	amba_driver_unregister(amba_drv);
+	platform_driver_unregister(pdev_drv);
+}
+EXPORT_SYMBOL_GPL(coresight_remove_driver);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Pratik Patel <pratikp@codeaurora.org>");
 MODULE_AUTHOR("Mathieu Poirier <mathieu.poirier@linaro.org>");
diff --git a/drivers/hwtracing/coresight/coresight-funnel.c b/drivers/hwtracing/coresight/coresight-funnel.c
index ef1a0abfee4e..ff3ea0670a5b 100644
--- a/drivers/hwtracing/coresight/coresight-funnel.c
+++ b/drivers/hwtracing/coresight/coresight-funnel.c
@@ -410,27 +410,12 @@ static struct amba_driver dynamic_funnel_driver = {
 
 static int __init funnel_init(void)
 {
-	int ret;
-
-	ret = platform_driver_register(&static_funnel_driver);
-	if (ret) {
-		pr_info("Error registering platform driver\n");
-		return ret;
-	}
-
-	ret = amba_driver_register(&dynamic_funnel_driver);
-	if (ret) {
-		pr_info("Error registering amba driver\n");
-		platform_driver_unregister(&static_funnel_driver);
-	}
-
-	return ret;
+	return coresight_init_driver("funnel", &dynamic_funnel_driver, &static_funnel_driver);
 }
 
 static void __exit funnel_exit(void)
 {
-	platform_driver_unregister(&static_funnel_driver);
-	amba_driver_unregister(&dynamic_funnel_driver);
+	coresight_remove_driver(&dynamic_funnel_driver, &static_funnel_driver);
 }
 
 module_init(funnel_init);
diff --git a/drivers/hwtracing/coresight/coresight-replicator.c b/drivers/hwtracing/coresight/coresight-replicator.c
index 73452d9dc13b..ddb530a8436f 100644
--- a/drivers/hwtracing/coresight/coresight-replicator.c
+++ b/drivers/hwtracing/coresight/coresight-replicator.c
@@ -416,27 +416,13 @@ static struct amba_driver dynamic_replicator_driver = {
 
 static int __init replicator_init(void)
 {
-	int ret;
-
-	ret = platform_driver_register(&static_replicator_driver);
-	if (ret) {
-		pr_info("Error registering platform driver\n");
-		return ret;
-	}
-
-	ret = amba_driver_register(&dynamic_replicator_driver);
-	if (ret) {
-		pr_info("Error registering amba driver\n");
-		platform_driver_unregister(&static_replicator_driver);
-	}
-
-	return ret;
+	return coresight_init_driver("replicator", &dynamic_replicator_driver,
+				     &static_replicator_driver);
 }
 
 static void __exit replicator_exit(void)
 {
-	platform_driver_unregister(&static_replicator_driver);
-	amba_driver_unregister(&dynamic_replicator_driver);
+	coresight_remove_driver(&dynamic_replicator_driver, &static_replicator_driver);
 }
 
 module_init(replicator_init);
diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 5f288d475490..653f1712eb77 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -12,6 +12,8 @@
 #include <linux/io.h>
 #include <linux/perf_event.h>
 #include <linux/sched.h>
+#include <linux/amba/bus.h>
+#include <linux/platform_device.h>
 
 /* Peripheral id registers (0xFD0-0xFEC) */
 #define CORESIGHT_PERIPHIDR4	0xfd0
@@ -658,4 +660,9 @@ coresight_find_output_type(struct coresight_platform_data *pdata,
 			   enum coresight_dev_type type,
 			   union coresight_dev_subtype subtype);
 
+int coresight_init_driver(const char *drv, struct amba_driver *amba_drv,
+			  struct platform_driver *pdev_drv);
+
+void coresight_remove_driver(struct amba_driver *amba_drv,
+			     struct platform_driver *pdev_drv);
 #endif		/* _LINUX_COREISGHT_H */
-- 
cgit v1.2.3


From 58300f8d6a48e58d1843199be743f819e2791ea3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Tue, 16 Apr 2024 00:03:03 -0400
Subject: ASoC: tracing: Export SND_SOC_DAPM_DIR_OUT to its value

The string SND_SOC_DAPM_DIR_OUT is printed in the snd_soc_dapm_path trace
event instead of its value:

   (((REC->path_dir) == SND_SOC_DAPM_DIR_OUT) ? "->" : "<-")

User space cannot parse this, as it has no idea what SND_SOC_DAPM_DIR_OUT
is. Use TRACE_DEFINE_ENUM() to convert it to its value:

   (((REC->path_dir) == 1) ? "->" : "<-")

So that user space tools, such as perf and trace-cmd, can parse it
correctly.

Reported-by: Luca Ceresoli <luca.ceresoli@bootlin.com>
Fixes: 6e588a0d839b5 ("ASoC: dapm: Consolidate path trace events")
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20240416000303.04670cdf@rorschach.local.home
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/trace/events/asoc.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h
index 4eed9028bb11..517015ef36a8 100644
--- a/include/trace/events/asoc.h
+++ b/include/trace/events/asoc.h
@@ -12,6 +12,8 @@
 #define DAPM_DIRECT "(direct)"
 #define DAPM_ARROW(dir) (((dir) == SND_SOC_DAPM_DIR_OUT) ? "->" : "<-")
 
+TRACE_DEFINE_ENUM(SND_SOC_DAPM_DIR_OUT);
+
 struct snd_soc_jack;
 struct snd_soc_card;
 struct snd_soc_dapm_widget;
-- 
cgit v1.2.3


From 9d50f95bc0d5df56f2591b950a251d90bffad094 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.cirrus.com>
Date: Tue, 16 Apr 2024 11:09:01 +0100
Subject: gpio: swnode: Add ability to specify native chip selects for SPI

SPI devices can specify a cs-gpios property to enumerate their
chip selects. Under device tree, a zero entry in this property can
be used to specify that a particular chip select is using the SPI
controllers native chip select, for example:

        cs-gpios = <&gpio1 0 0>, <0>;

Here, the second chip select is native. However, when using swnodes
there is currently no way to specify a native chip select. The
proposal here is to register a swnode_gpio_undefined software node,
that can be specified to allow the indication of a native chip
select. For example:

static const struct software_node_ref_args device_cs_refs[] = {
	{
		.node  = &device_gpiochip_swnode,
		.nargs = 2,
		.args  = { 0, GPIO_ACTIVE_LOW },
	},
	{
		.node  = &swnode_gpio_undefined,
		.nargs = 0,
	},
};

Register the swnode as the gpiolib is initialised and check in
swnode_get_gpio_device() if the returned node matches
swnode_gpio_undefined and return -ENOENT, which matches the
behaviour of the device tree system when it encounters a 0 phandle.

Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andy Shevchenko <andy@kernel.org>
Signed-off-by: Charles Keepax <ckeepax@opensource.cirrus.com>
Link: https://lore.kernel.org/r/20240416100904.3738093-2-ckeepax@opensource.cirrus.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/gpio/Kconfig          |  9 +++++++++
 drivers/gpio/gpiolib-swnode.c | 44 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/gpio/property.h |  4 ++++
 3 files changed, 57 insertions(+)

(limited to 'include')

diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index b50d0b470849..00b5c007a2bb 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -103,6 +103,15 @@ config GPIO_REGMAP
 	select REGMAP
 	tristate
 
+config GPIO_SWNODE_UNDEFINED
+	bool
+	help
+	  This adds a special place holder for software nodes to contain an
+	  undefined GPIO reference, this is primarily used by SPI to allow a
+	  list of GPIO chip selects to mark a certain chip select as being
+	  controlled the SPI device's internal chip select mechanism and not
+	  a GPIO.
+
 # put drivers in the right section, in alphabetical order
 
 # This symbol is selected by both I2C and SPI expanders
diff --git a/drivers/gpio/gpiolib-swnode.c b/drivers/gpio/gpiolib-swnode.c
index fa52bdb1a29a..cec1ab878af8 100644
--- a/drivers/gpio/gpiolib-swnode.c
+++ b/drivers/gpio/gpiolib-swnode.c
@@ -4,8 +4,13 @@
  *
  * Copyright 2022 Google LLC
  */
+
+#define pr_fmt(fmt) "gpiolib: swnode: " fmt
+
 #include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/printk.h>
 #include <linux/property.h>
@@ -17,6 +22,8 @@
 #include "gpiolib.h"
 #include "gpiolib-swnode.h"
 
+#define GPIOLIB_SWNODE_UNDEFINED_NAME "swnode-gpio-undefined"
+
 static void swnode_format_propname(const char *con_id, char *propname,
 				   size_t max_size)
 {
@@ -40,6 +47,14 @@ static struct gpio_device *swnode_get_gpio_device(struct fwnode_handle *fwnode)
 	if (!gdev_node || !gdev_node->name)
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * Check for a special node that identifies undefined GPIOs, this is
+	 * primarily used as a key for internal chip selects in SPI bindings.
+	 */
+	if (IS_ENABLED(CONFIG_GPIO_SWNODE_UNDEFINED) &&
+	    !strcmp(gdev_node->name, GPIOLIB_SWNODE_UNDEFINED_NAME))
+		return ERR_PTR(-ENOENT);
+
 	gdev = gpio_device_find_by_label(gdev_node->name);
 	return gdev ?: ERR_PTR(-EPROBE_DEFER);
 }
@@ -121,3 +136,32 @@ int swnode_gpio_count(const struct fwnode_handle *fwnode, const char *con_id)
 
 	return count ?: -ENOENT;
 }
+
+#if IS_ENABLED(CONFIG_GPIO_SWNODE_UNDEFINED)
+/*
+ * A special node that identifies undefined GPIOs, this is primarily used as
+ * a key for internal chip selects in SPI bindings.
+ */
+const struct software_node swnode_gpio_undefined = {
+	.name = GPIOLIB_SWNODE_UNDEFINED_NAME,
+};
+EXPORT_SYMBOL_NS_GPL(swnode_gpio_undefined, GPIO_SWNODE);
+
+static int __init swnode_gpio_init(void)
+{
+	int ret;
+
+	ret = software_node_register(&swnode_gpio_undefined);
+	if (ret < 0)
+		pr_err("failed to register swnode: %d\n", ret);
+
+	return ret;
+}
+subsys_initcall(swnode_gpio_init);
+
+static void __exit swnode_gpio_cleanup(void)
+{
+	software_node_unregister(&swnode_gpio_undefined);
+}
+__exitcall(swnode_gpio_cleanup);
+#endif
diff --git a/include/linux/gpio/property.h b/include/linux/gpio/property.h
index 6c75c8bd44a0..832a60c2e0b9 100644
--- a/include/linux/gpio/property.h
+++ b/include/linux/gpio/property.h
@@ -5,7 +5,11 @@
 #include <dt-bindings/gpio/gpio.h> /* for GPIO_* flags */
 #include <linux/property.h>
 
+struct software_node;
+
 #define PROPERTY_ENTRY_GPIO(_name_, _chip_node_, _idx_, _flags_) \
 	PROPERTY_ENTRY_REF(_name_, _chip_node_, _idx_, _flags_)
 
+extern const struct software_node swnode_gpio_undefined;
+
 #endif /* __LINUX_GPIO_PROPERTY_H */
-- 
cgit v1.2.3


From fd86344823b521149bb31d91eba900ba3525efa6 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Fri, 12 Apr 2024 19:19:28 -0700
Subject: af_unix: Try not to hold unix_gc_lock during accept().

Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo
socket.") added spin_lock(&unix_gc_lock) in accept() path, and it
caused regression in a stress test as reported by kernel test robot.

If the embryo socket is not part of the inflight graph, we need not
hold the lock.

To decide that in O(1) time and avoid the regression in the normal
use case,

  1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds

  2. count the number of inflight AF_UNIX sockets in the receive
     queue under unix_state_lock()

  3. move unix_update_edges() call under unix_state_lock()

  4. avoid locking if nr_unix_fds is 0 in unix_update_edges()

Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/af_unix.h |  1 +
 net/unix/af_unix.c    |  2 +-
 net/unix/garbage.c    | 20 ++++++++++++++++----
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index 7311b77edfc7..872ff2a50372 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -67,6 +67,7 @@ struct unix_skb_parms {
 
 struct scm_stat {
 	atomic_t nr_fds;
+	unsigned long nr_unix_fds;
 };
 
 #define UNIXCB(skb)	(*(struct unix_skb_parms *)&((skb)->cb))
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 142d210b5b03..ed16d5f66df8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1719,12 +1719,12 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	}
 
 	tsk = skb->sk;
-	unix_update_edges(unix_sk(tsk));
 	skb_free_datagram(sk, skb);
 	wake_up_interruptible(&unix_sk(sk)->peer_wait);
 
 	/* attach accepted sock to socket */
 	unix_state_lock(tsk);
+	unix_update_edges(unix_sk(tsk));
 	newsock->state = SS_CONNECTED;
 	unix_sock_inherit_flags(sock, newsock);
 	sock_graft(tsk, newsock);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 12a4ec27e0d4..95240a59808f 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -209,6 +209,7 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
 		unix_add_edge(fpl, edge);
 	} while (i < fpl->count_unix);
 
+	receiver->scm_stat.nr_unix_fds += fpl->count_unix;
 	WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
 out:
 	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
@@ -222,6 +223,7 @@ out:
 
 void unix_del_edges(struct scm_fp_list *fpl)
 {
+	struct unix_sock *receiver;
 	int i = 0;
 
 	spin_lock(&unix_gc_lock);
@@ -235,6 +237,8 @@ void unix_del_edges(struct scm_fp_list *fpl)
 		unix_del_edge(fpl, edge);
 	} while (i < fpl->count_unix);
 
+	receiver = fpl->edges[0].successor;
+	receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
 	WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
 out:
 	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
@@ -246,10 +250,18 @@ out:
 
 void unix_update_edges(struct unix_sock *receiver)
 {
-	spin_lock(&unix_gc_lock);
-	unix_update_graph(unix_sk(receiver->listener)->vertex);
-	receiver->listener = NULL;
-	spin_unlock(&unix_gc_lock);
+	/* nr_unix_fds is only updated under unix_state_lock().
+	 * If it's 0 here, the embryo socket is not part of the
+	 * inflight graph, and GC will not see it, so no lock needed.
+	 */
+	if (!receiver->scm_stat.nr_unix_fds) {
+		receiver->listener = NULL;
+	} else {
+		spin_lock(&unix_gc_lock);
+		unix_update_graph(unix_sk(receiver->listener)->vertex);
+		receiver->listener = NULL;
+		spin_unlock(&unix_gc_lock);
+	}
 }
 
 int unix_prepare_fpl(struct scm_fp_list *fpl)
-- 
cgit v1.2.3


From fc29e04ae1ad4c99422c0b8ae4b43cfe99c70429 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@kernel.org>
Date: Tue, 16 Apr 2024 19:51:26 +0200
Subject: cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints

This commit enhances the ability to troubleshoot the global
cgroup_rstat_lock by introducing wrapper helper functions for the lock
along with associated tracepoints.

Although global, the cgroup_rstat_lock helper APIs and tracepoints take
arguments such as cgroup pointer and cpu_in_loop variable. This
adjustment is made because flushing occurs per cgroup despite the lock
being global. Hence, when troubleshooting, it's important to identify the
relevant cgroup. The cpu_in_loop variable is necessary because the global
lock may be released within the main flushing loop that traverses CPUs.
In the tracepoints, the cpu_in_loop value is set to -1 when acquiring the
main lock; otherwise, it denotes the CPU number processed last.

The new feature in this patchset is detecting when lock is contended. The
tracepoints are implemented with production in mind. For minimum overhead
attach to cgroup:cgroup_rstat_lock_contended, which only gets activated
when trylock detects lock is contended. A quick production check for
issues could be done via this perf commands:

 perf record -g -e cgroup:cgroup_rstat_lock_contended

Next natural question would be asking how long time do lock contenders
wait for obtaining the lock. This can be answered by measuring the time
between cgroup:cgroup_rstat_lock_contended and cgroup:cgroup_rstat_locked
when args->contended is set.  Like this bpftrace script:

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs}
   tracepoint:cgroup:cgroup_rstat_locked {
     if (args->contended) {
       @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}}
   interval:s:1 {time("%H:%M:%S "); print(@wait_ns); }'

Extending with time spend holding the lock will be more expensive as this
also looks at all the non-contended cases.
Like this bpftrace script:

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_lock_contended {@start[tid]=nsecs}
   tracepoint:cgroup:cgroup_rstat_locked { @locked[tid]=nsecs;
     if (args->contended) {
       @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}}
   tracepoint:cgroup:cgroup_rstat_unlock {
       @locked_ns=hist(nsecs-@locked[tid]); delete(@locked[tid]);}
   interval:s:1 {time("%H:%M:%S ");  print(@wait_ns);print(@locked_ns); }'

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h        |  2 +-
 include/trace/events/cgroup.h | 48 +++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup/rstat.c         | 47 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 88 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 34aaf0e87def..2150ca60394b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -690,7 +690,7 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
 void cgroup_rstat_flush(struct cgroup *cgrp);
 void cgroup_rstat_flush_hold(struct cgroup *cgrp);
-void cgroup_rstat_flush_release(void);
+void cgroup_rstat_flush_release(struct cgroup *cgrp);
 
 /*
  * Basic resource stats.
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index dd7d7c9efecd..13f375800135 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -204,6 +204,54 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 	TP_ARGS(cgrp, path, val)
 );
 
+DECLARE_EVENT_CLASS(cgroup_rstat,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+
+	TP_ARGS(cgrp, cpu_in_loop, contended),
+
+	TP_STRUCT__entry(
+		__field(	int,		root			)
+		__field(	int,		level			)
+		__field(	u64,		id			)
+		__field(	int,		cpu_in_loop		)
+		__field(	bool,		contended		)
+	),
+
+	TP_fast_assign(
+		__entry->root = cgrp->root->hierarchy_id;
+		__entry->id = cgroup_id(cgrp);
+		__entry->level = cgrp->level;
+		__entry->cpu_in_loop = cpu_in_loop;
+		__entry->contended = contended;
+	),
+
+	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+		  __entry->root, __entry->id, __entry->level,
+		  __entry->cpu_in_loop, __entry->contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 07e2284bb499..ff68c904e647 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -7,6 +7,8 @@
 #include <linux/btf.h>
 #include <linux/btf_ids.h>
 
+#include <trace/events/cgroup.h>
+
 static DEFINE_SPINLOCK(cgroup_rstat_lock);
 static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
 
@@ -222,6 +224,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
 
 __bpf_hook_end();
 
+/*
+ * Helper functions for locking cgroup_rstat_lock.
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments.  The parameter @cpu_in_loop indicate lock
+ * was released and re-taken when collection data from the CPUs. The
+ * value -1 is used when obtaining the main lock else this is the CPU
+ * number processed last.
+ */
+static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
+	__acquires(&cgroup_rstat_lock)
+{
+	bool contended;
+
+	contended = !spin_trylock_irq(&cgroup_rstat_lock);
+	if (contended) {
+		trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
+		spin_lock_irq(&cgroup_rstat_lock);
+	}
+	trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
+}
+
+static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
+	__releases(&cgroup_rstat_lock)
+{
+	trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
+	spin_unlock_irq(&cgroup_rstat_lock);
+}
+
 /* see cgroup_rstat_flush() */
 static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -248,10 +279,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
 
 		/* play nice and yield if necessary */
 		if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
-			spin_unlock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_unlock(cgrp, cpu);
 			if (!cond_resched())
 				cpu_relax();
-			spin_lock_irq(&cgroup_rstat_lock);
+			__cgroup_rstat_lock(cgrp, cpu);
 		}
 	}
 }
@@ -273,9 +304,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
 {
 	might_sleep();
 
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 /**
@@ -291,17 +322,17 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp)
 	__acquires(&cgroup_rstat_lock)
 {
 	might_sleep();
-	spin_lock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_lock(cgrp, -1);
 	cgroup_rstat_flush_locked(cgrp);
 }
 
 /**
  * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
  */
-void cgroup_rstat_flush_release(void)
+void cgroup_rstat_flush_release(struct cgroup *cgrp)
 	__releases(&cgroup_rstat_lock)
 {
-	spin_unlock_irq(&cgroup_rstat_lock);
+	__cgroup_rstat_unlock(cgrp, -1);
 }
 
 int cgroup_rstat_init(struct cgroup *cgrp)
@@ -533,7 +564,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
 #ifdef CONFIG_SCHED_CORE
 		forceidle_time = cgrp->bstat.forceidle_sum;
 #endif
-		cgroup_rstat_flush_release();
+		cgroup_rstat_flush_release(cgrp);
 	} else {
 		root_cgroup_cputime(&bstat);
 		usage = bstat.cputime.sum_exec_runtime;
-- 
cgit v1.2.3


From b010bf72da5ada39f4e722fa28d4275df8673229 Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Sun, 14 Apr 2024 16:21:52 +0200
Subject: net: pse-pd: Rectify and adapt the naming of admin_cotrol member of
 struct pse_control_config

In commit 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet
Power Equipment"), the 'pse_control_config' structure was introduced,
housing a single member labeled 'admin_cotrol' responsible for maintaining
the operational state of the PoDL PSE functions.

A noticeable typographical error exists in the naming of this field
('cotrol' should be corrected to 'control'), which this commit aims to
rectify.

Furthermore, with upcoming extensions of this structure to encompass PoE
functionalities, the field is being renamed to 'podl_admin_state' to
distinctly indicate that this state is tailored specifically for PoDL."

Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://lore.kernel.org/r/20240414-feature_poe-v8-3-e4bf1e860da5@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/pse_regulator.c | 8 ++++----
 include/linux/pse-pd/pse.h         | 4 ++--
 net/ethtool/pse-pd.c               | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/net/pse-pd/pse_regulator.c b/drivers/net/pse-pd/pse_regulator.c
index e2bf8306ca90..1dedf4de296e 100644
--- a/drivers/net/pse-pd/pse_regulator.c
+++ b/drivers/net/pse-pd/pse_regulator.c
@@ -31,10 +31,10 @@ pse_reg_ethtool_set_config(struct pse_controller_dev *pcdev, unsigned long id,
 	struct pse_reg_priv *priv = to_pse_reg(pcdev);
 	int ret;
 
-	if (priv->admin_state == config->admin_cotrol)
+	if (priv->admin_state == config->podl_admin_control)
 		return 0;
 
-	switch (config->admin_cotrol) {
+	switch (config->podl_admin_control) {
 	case ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED:
 		ret = regulator_enable(priv->ps);
 		break;
@@ -43,14 +43,14 @@ pse_reg_ethtool_set_config(struct pse_controller_dev *pcdev, unsigned long id,
 		break;
 	default:
 		dev_err(pcdev->dev, "Unknown admin state %i\n",
-			config->admin_cotrol);
+			config->podl_admin_control);
 		ret = -ENOTSUPP;
 	}
 
 	if (ret)
 		return ret;
 
-	priv->admin_state = config->admin_cotrol;
+	priv->admin_state = config->podl_admin_control;
 
 	return 0;
 }
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index fb724c65c77b..199cf4ae3cf2 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -15,11 +15,11 @@ struct pse_controller_dev;
 /**
  * struct pse_control_config - PSE control/channel configuration.
  *
- * @admin_cotrol: set PoDL PSE admin control as described in
+ * @podl_admin_control: set PoDL PSE admin control as described in
  *	IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl
  */
 struct pse_control_config {
-	enum ethtool_podl_pse_admin_state admin_cotrol;
+	enum ethtool_podl_pse_admin_state podl_admin_control;
 };
 
 /**
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index cc478af77111..aef57a058f0d 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -130,7 +130,7 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
 	struct phy_device *phydev;
 
 	/* this values are already validated by the ethnl_pse_set_policy */
-	config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]);
+	config.podl_admin_control = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]);
 
 	phydev = dev->phydev;
 	if (!phydev) {
-- 
cgit v1.2.3


From 2a49b45cd0e7e8c9a0cd5e2f3993b558469ed744 Mon Sep 17 00:00:00 2001
From: Guanbing Huang <albanhuang@tencent.com>
Date: Tue, 16 Apr 2024 11:16:18 +0800
Subject: PNP: Add dev_is_pnp() macro

Add dev_is_pnp() macro to determine whether the device is a PNP device.

Signed-off-by: Guanbing Huang <albanhuang@tencent.com>
Suggested-by: Andy Shevchenko <andriy.shevchenko@intel.com>
Reviewed-by: Bing Fan <tombinfan@tencent.com>
Tested-by: Linheng Du <dylanlhdu@tencent.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/4e68f5557ad53b671ca8103e572163eca52a8f29.1713234515.git.albanhuang@tencent.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/pnp.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/pnp.h b/include/linux/pnp.h
index ddbe7c3ca4ce..82561242cda4 100644
--- a/include/linux/pnp.h
+++ b/include/linux/pnp.h
@@ -469,6 +469,8 @@ int compare_pnp_id(struct pnp_id *pos, const char *id);
 int pnp_register_driver(struct pnp_driver *drv);
 void pnp_unregister_driver(struct pnp_driver *drv);
 
+#define dev_is_pnp(d) ((d)->bus == &pnp_bus_type)
+
 #else
 
 /* device management */
@@ -500,6 +502,8 @@ static inline int compare_pnp_id(struct pnp_id *pos, const char *id) { return -E
 static inline int pnp_register_driver(struct pnp_driver *drv) { return -ENODEV; }
 static inline void pnp_unregister_driver(struct pnp_driver *drv) { }
 
+#define dev_is_pnp(d) false
+
 #endif /* CONFIG_PNP */
 
 /**
-- 
cgit v1.2.3


From 37f36cab53c9e903823b327f964227563766071a Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Wed, 10 Apr 2024 17:09:44 +0200
Subject: sched/vtime: Remove confusing arch_vtime_task_switch() declaration

Callback arch_vtime_task_switch() is only defined when
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is selected. Yet, the
function prototype forward declaration is present for
CONFIG_VIRT_CPU_ACCOUNTING_GEN variant. Remove it.

Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Link: https://lore.kernel.org/r/783d7c611864f82b0fb9edf01890b9396f3a549a.1712760275.git.agordeev@linux.ibm.com
---
 include/linux/vtime.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 3684487d01e1..593466ceebed 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -18,7 +18,6 @@ extern void vtime_account_idle(struct task_struct *tsk);
 #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-extern void arch_vtime_task_switch(struct task_struct *tsk);
 extern void vtime_user_enter(struct task_struct *tsk);
 extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
-- 
cgit v1.2.3


From 08a36a48544d73bf153960245aec6c5fa23960de Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@linux.ibm.com>
Date: Wed, 10 Apr 2024 17:09:48 +0200
Subject: sched/vtime: Do not include <asm/vtime.h> header

There is no architecture-specific code or data left
that generic <linux/vtime.h> needs to know about.
Thus, avoid the inclusion of <asm/vtime.h> header.

Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Nicholas Piggin <npiggin@gmail.com>
Link: https://lore.kernel.org/r/f7cd245668b9ae61a55184871aec494ec9199c4a.1712760275.git.agordeev@linux.ibm.com
---
 arch/powerpc/include/asm/Kbuild | 1 -
 include/asm-generic/vtime.h     | 1 -
 include/linux/vtime.h           | 4 ----
 3 files changed, 6 deletions(-)
 delete mode 100644 include/asm-generic/vtime.h

(limited to 'include')

diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 61a8d5555cd7..e5fdc336c9b2 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -6,5 +6,4 @@ generic-y += agp.h
 generic-y += kvm_types.h
 generic-y += mcs_spinlock.h
 generic-y += qrwlock.h
-generic-y += vtime.h
 generic-y += early_ioremap.h
diff --git a/include/asm-generic/vtime.h b/include/asm-generic/vtime.h
deleted file mode 100644
index b1a49677fe25..000000000000
--- a/include/asm-generic/vtime.h
+++ /dev/null
@@ -1 +0,0 @@
-/* no content, but patch(1) dislikes empty files */
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 593466ceebed..29dd5b91dd7d 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -5,10 +5,6 @@
 #include <linux/context_tracking_state.h>
 #include <linux/sched.h>
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-#include <asm/vtime.h>
-#endif
-
 /*
  * Common vtime APIs
  */
-- 
cgit v1.2.3


From 5a1a25be995e1014abd01600479915683e356f5c Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 15 Apr 2024 11:20:55 -0400
Subject: libfs: Add simple_offset_rename() API

I'm about to fix a tmpfs rename bug that requires the use of
internal simple_offset helpers that are not available in mm/shmem.c

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://lore.kernel.org/r/20240415152057.4605-3-cel@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/libfs.c         | 21 +++++++++++++++++++++
 include/linux/fs.h |  2 ++
 mm/shmem.c         |  3 +--
 3 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/libfs.c b/fs/libfs.c
index ab61fae92cde..c392a6edd393 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -357,6 +357,27 @@ int simple_offset_empty(struct dentry *dentry)
 	return ret;
 }
 
+/**
+ * simple_offset_rename - handle directory offsets for rename
+ * @old_dir: parent directory of source entry
+ * @old_dentry: dentry of source entry
+ * @new_dir: parent_directory of destination entry
+ * @new_dentry: dentry of destination
+ *
+ * Caller provides appropriate serialization.
+ *
+ * Returns zero on success, a negative errno value on failure.
+ */
+int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
+	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
+
+	simple_offset_remove(old_ctx, old_dentry);
+	return simple_offset_add(new_ctx, old_dentry);
+}
+
 /**
  * simple_offset_rename_exchange - exchange rename with directory offsets
  * @old_dir: parent of dentry being moved
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 00fc429b0af0..82c707e40fea 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3338,6 +3338,8 @@ void simple_offset_init(struct offset_ctx *octx);
 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
 int simple_offset_empty(struct dentry *dentry);
+int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
+			 struct inode *new_dir, struct dentry *new_dentry);
 int simple_offset_rename_exchange(struct inode *old_dir,
 				  struct dentry *old_dentry,
 				  struct inode *new_dir,
diff --git a/mm/shmem.c b/mm/shmem.c
index 0aad0d9a621b..c0fb65223963 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3473,8 +3473,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
 			return error;
 	}
 
-	simple_offset_remove(shmem_get_offset_ctx(old_dir), old_dentry);
-	error = simple_offset_add(shmem_get_offset_ctx(new_dir), old_dentry);
+	error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
 	if (error)
 		return error;
 
-- 
cgit v1.2.3


From 795bb82d12a16a4cee42845b0e4c7e3276574e5d Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Tue, 16 Apr 2024 21:14:52 +0300
Subject: fsnotify: fix UAF from FS_ERROR event on a shutting down filesystem

Protect against use after free when filesystem calls fsnotify_sb_error()
during fs shutdown.

Move freeing of sb->s_fsnotify_info to destroy_super_work(), because it
may be accessed from fs shutdown context.

Reported-by: syzbot+5e3f9b2a67b45f16d4e6@syzkaller.appspotmail.com
Suggested-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/linux-fsdevel/20240416173211.4lnmgctyo4jn5fha@quack3/
Fixes: 07a3b8d0bf72 ("fsnotify: lazy attach fsnotify_sb_info state to sb")
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Message-Id: <20240416181452.567070-1-amir73il@gmail.com>
---
 fs/notify/fsnotify.c             | 6 +++++-
 fs/super.c                       | 1 +
 include/linux/fsnotify_backend.h | 4 ++++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2ae965ef37e8..ff69ae24c4e8 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -103,7 +103,11 @@ void fsnotify_sb_delete(struct super_block *sb)
 	WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
 	WARN_ON(fsnotify_sb_has_priority_watchers(sb,
 						  FSNOTIFY_PRIO_PRE_CONTENT));
-	kfree(sbinfo);
+}
+
+void fsnotify_sb_free(struct super_block *sb)
+{
+	kfree(sb->s_fsnotify_info);
 }
 
 /*
diff --git a/fs/super.c b/fs/super.c
index 71d9779c42b1..81656c7807db 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -274,6 +274,7 @@ static void destroy_super_work(struct work_struct *work)
 {
 	struct super_block *s = container_of(work, struct super_block,
 							destroy_work);
+	fsnotify_sb_free(s);
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
 	kfree(s->s_subtype);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7f1ab8264e41..4dd6143db271 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -576,6 +576,7 @@ extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data
 extern void __fsnotify_inode_delete(struct inode *inode);
 extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
 extern void fsnotify_sb_delete(struct super_block *sb);
+extern void fsnotify_sb_free(struct super_block *sb);
 extern u32 fsnotify_get_cookie(void);
 
 static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
@@ -880,6 +881,9 @@ static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
 static inline void fsnotify_sb_delete(struct super_block *sb)
 {}
 
+static inline void fsnotify_sb_free(struct super_block *sb)
+{}
+
 static inline void fsnotify_update_flags(struct dentry *dentry)
 {}
 
-- 
cgit v1.2.3


From 44e030d8a5a1be503301a0f095416c5ebb93c9e6 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 16 Apr 2024 12:19:54 +0300
Subject: drm/displayid: move drm_displayid.h to drm_displayd_internal.h

There are no exported symbols for displayid, and it's all internal
interfaces. Move the header to drivers/gpu/drm/drm_displayd_internal.h.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/220713d4e3cc364ac103ba689065ae96e075f1fa.1713259151.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/drm_displayid.c          |   3 +-
 drivers/gpu/drm/drm_displayid_internal.h | 171 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_edid.c               |   2 +-
 include/drm/drm_displayid.h              | 170 ------------------------------
 4 files changed, 174 insertions(+), 172 deletions(-)
 create mode 100644 drivers/gpu/drm/drm_displayid_internal.h
 delete mode 100644 include/drm/drm_displayid.h

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_displayid.c b/drivers/gpu/drm/drm_displayid.c
index 9edc111be7ee..f72a893c7deb 100644
--- a/drivers/gpu/drm/drm_displayid.c
+++ b/drivers/gpu/drm/drm_displayid.c
@@ -3,10 +3,11 @@
  * Copyright © 2021 Intel Corporation
  */
 
-#include <drm/drm_displayid.h>
 #include <drm/drm_edid.h>
 #include <drm/drm_print.h>
 
+#include "drm_displayid_internal.h"
+
 static const struct displayid_header *
 displayid_get_header(const u8 *displayid, int length, int index)
 {
diff --git a/drivers/gpu/drm/drm_displayid_internal.h b/drivers/gpu/drm/drm_displayid_internal.h
new file mode 100644
index 000000000000..56fd3bb0a779
--- /dev/null
+++ b/drivers/gpu/drm/drm_displayid_internal.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright © 2014 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __DRM_DISPLAYID_INTERNAL_H__
+#define __DRM_DISPLAYID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/bits.h>
+
+struct drm_edid;
+
+#define VESA_IEEE_OUI				0x3a0292
+
+/* DisplayID Structure versions */
+#define DISPLAY_ID_STRUCTURE_VER_12		0x12
+#define DISPLAY_ID_STRUCTURE_VER_20		0x20
+
+/* DisplayID Structure v1r2 Data Blocks */
+#define DATA_BLOCK_PRODUCT_ID			0x00
+#define DATA_BLOCK_DISPLAY_PARAMETERS		0x01
+#define DATA_BLOCK_COLOR_CHARACTERISTICS	0x02
+#define DATA_BLOCK_TYPE_1_DETAILED_TIMING	0x03
+#define DATA_BLOCK_TYPE_2_DETAILED_TIMING	0x04
+#define DATA_BLOCK_TYPE_3_SHORT_TIMING		0x05
+#define DATA_BLOCK_TYPE_4_DMT_TIMING		0x06
+#define DATA_BLOCK_VESA_TIMING			0x07
+#define DATA_BLOCK_CEA_TIMING			0x08
+#define DATA_BLOCK_VIDEO_TIMING_RANGE		0x09
+#define DATA_BLOCK_PRODUCT_SERIAL_NUMBER	0x0a
+#define DATA_BLOCK_GP_ASCII_STRING		0x0b
+#define DATA_BLOCK_DISPLAY_DEVICE_DATA		0x0c
+#define DATA_BLOCK_INTERFACE_POWER_SEQUENCING	0x0d
+#define DATA_BLOCK_TRANSFER_CHARACTERISTICS	0x0e
+#define DATA_BLOCK_DISPLAY_INTERFACE		0x0f
+#define DATA_BLOCK_STEREO_DISPLAY_INTERFACE	0x10
+#define DATA_BLOCK_TILED_DISPLAY		0x12
+#define DATA_BLOCK_VENDOR_SPECIFIC		0x7f
+#define DATA_BLOCK_CTA				0x81
+
+/* DisplayID Structure v2r0 Data Blocks */
+#define DATA_BLOCK_2_PRODUCT_ID			0x20
+#define DATA_BLOCK_2_DISPLAY_PARAMETERS		0x21
+#define DATA_BLOCK_2_TYPE_7_DETAILED_TIMING	0x22
+#define DATA_BLOCK_2_TYPE_8_ENUMERATED_TIMING	0x23
+#define DATA_BLOCK_2_TYPE_9_FORMULA_TIMING	0x24
+#define DATA_BLOCK_2_DYNAMIC_VIDEO_TIMING	0x25
+#define DATA_BLOCK_2_DISPLAY_INTERFACE_FEATURES	0x26
+#define DATA_BLOCK_2_STEREO_DISPLAY_INTERFACE	0x27
+#define DATA_BLOCK_2_TILED_DISPLAY_TOPOLOGY	0x28
+#define DATA_BLOCK_2_CONTAINER_ID		0x29
+#define DATA_BLOCK_2_VENDOR_SPECIFIC		0x7e
+#define DATA_BLOCK_2_CTA_DISPLAY_ID		0x81
+
+/* DisplayID Structure v1r2 Product Type */
+#define PRODUCT_TYPE_EXTENSION			0
+#define PRODUCT_TYPE_TEST			1
+#define PRODUCT_TYPE_PANEL			2
+#define PRODUCT_TYPE_MONITOR			3
+#define PRODUCT_TYPE_TV				4
+#define PRODUCT_TYPE_REPEATER			5
+#define PRODUCT_TYPE_DIRECT_DRIVE		6
+
+/* DisplayID Structure v2r0 Display Product Primary Use Case (~Product Type) */
+#define PRIMARY_USE_EXTENSION			0
+#define PRIMARY_USE_TEST			1
+#define PRIMARY_USE_GENERIC			2
+#define PRIMARY_USE_TV				3
+#define PRIMARY_USE_DESKTOP_PRODUCTIVITY	4
+#define PRIMARY_USE_DESKTOP_GAMING		5
+#define PRIMARY_USE_PRESENTATION		6
+#define PRIMARY_USE_HEAD_MOUNTED_VR		7
+#define PRIMARY_USE_HEAD_MOUNTED_AR		8
+
+struct displayid_header {
+	u8 rev;
+	u8 bytes;
+	u8 prod_id;
+	u8 ext_count;
+} __packed;
+
+struct displayid_block {
+	u8 tag;
+	u8 rev;
+	u8 num_bytes;
+} __packed;
+
+struct displayid_tiled_block {
+	struct displayid_block base;
+	u8 tile_cap;
+	u8 topo[3];
+	u8 tile_size[4];
+	u8 tile_pixel_bezel[5];
+	u8 topology_id[8];
+} __packed;
+
+struct displayid_detailed_timings_1 {
+	u8 pixel_clock[3];
+	u8 flags;
+	u8 hactive[2];
+	u8 hblank[2];
+	u8 hsync[2];
+	u8 hsw[2];
+	u8 vactive[2];
+	u8 vblank[2];
+	u8 vsync[2];
+	u8 vsw[2];
+} __packed;
+
+struct displayid_detailed_timing_block {
+	struct displayid_block base;
+	struct displayid_detailed_timings_1 timings[];
+};
+
+#define DISPLAYID_VESA_MSO_OVERLAP	GENMASK(3, 0)
+#define DISPLAYID_VESA_MSO_MODE		GENMASK(6, 5)
+
+struct displayid_vesa_vendor_specific_block {
+	struct displayid_block base;
+	u8 oui[3];
+	u8 data_structure_type;
+	u8 mso;
+} __packed;
+
+/*
+ * DisplayID iteration.
+ *
+ * Do not access directly, this is private.
+ */
+struct displayid_iter {
+	const struct drm_edid *drm_edid;
+
+	const u8 *section;
+	int length;
+	int idx;
+	int ext_index;
+
+	u8 version;
+	u8 primary_use;
+};
+
+void displayid_iter_edid_begin(const struct drm_edid *drm_edid,
+			       struct displayid_iter *iter);
+const struct displayid_block *
+__displayid_iter_next(struct displayid_iter *iter);
+#define displayid_iter_for_each(__block, __iter) \
+	while (((__block) = __displayid_iter_next(__iter)))
+void displayid_iter_end(struct displayid_iter *iter);
+
+u8 displayid_version(const struct displayid_iter *iter);
+u8 displayid_primary_use(const struct displayid_iter *iter);
+
+#endif
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 1400722ae3fe..c4f799059522 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -40,7 +40,6 @@
 #include <linux/slab.h>
 #include <linux/vga_switcheroo.h>
 
-#include <drm/drm_displayid.h>
 #include <drm/drm_drv.h>
 #include <drm/drm_edid.h>
 #include <drm/drm_eld.h>
@@ -48,6 +47,7 @@
 #include <drm/drm_print.h>
 
 #include "drm_crtc_internal.h"
+#include "drm_displayid_internal.h"
 #include "drm_internal.h"
 
 static int oui(u8 first, u8 second, u8 third)
diff --git a/include/drm/drm_displayid.h b/include/drm/drm_displayid.h
deleted file mode 100644
index 566497eeb3b8..000000000000
--- a/include/drm/drm_displayid.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright © 2014 Red Hat Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-#ifndef DRM_DISPLAYID_H
-#define DRM_DISPLAYID_H
-
-#include <linux/types.h>
-#include <linux/bits.h>
-
-struct drm_edid;
-
-#define VESA_IEEE_OUI				0x3a0292
-
-/* DisplayID Structure versions */
-#define DISPLAY_ID_STRUCTURE_VER_12		0x12
-#define DISPLAY_ID_STRUCTURE_VER_20		0x20
-
-/* DisplayID Structure v1r2 Data Blocks */
-#define DATA_BLOCK_PRODUCT_ID			0x00
-#define DATA_BLOCK_DISPLAY_PARAMETERS		0x01
-#define DATA_BLOCK_COLOR_CHARACTERISTICS	0x02
-#define DATA_BLOCK_TYPE_1_DETAILED_TIMING	0x03
-#define DATA_BLOCK_TYPE_2_DETAILED_TIMING	0x04
-#define DATA_BLOCK_TYPE_3_SHORT_TIMING		0x05
-#define DATA_BLOCK_TYPE_4_DMT_TIMING		0x06
-#define DATA_BLOCK_VESA_TIMING			0x07
-#define DATA_BLOCK_CEA_TIMING			0x08
-#define DATA_BLOCK_VIDEO_TIMING_RANGE		0x09
-#define DATA_BLOCK_PRODUCT_SERIAL_NUMBER	0x0a
-#define DATA_BLOCK_GP_ASCII_STRING		0x0b
-#define DATA_BLOCK_DISPLAY_DEVICE_DATA		0x0c
-#define DATA_BLOCK_INTERFACE_POWER_SEQUENCING	0x0d
-#define DATA_BLOCK_TRANSFER_CHARACTERISTICS	0x0e
-#define DATA_BLOCK_DISPLAY_INTERFACE		0x0f
-#define DATA_BLOCK_STEREO_DISPLAY_INTERFACE	0x10
-#define DATA_BLOCK_TILED_DISPLAY		0x12
-#define DATA_BLOCK_VENDOR_SPECIFIC		0x7f
-#define DATA_BLOCK_CTA				0x81
-
-/* DisplayID Structure v2r0 Data Blocks */
-#define DATA_BLOCK_2_PRODUCT_ID			0x20
-#define DATA_BLOCK_2_DISPLAY_PARAMETERS		0x21
-#define DATA_BLOCK_2_TYPE_7_DETAILED_TIMING	0x22
-#define DATA_BLOCK_2_TYPE_8_ENUMERATED_TIMING	0x23
-#define DATA_BLOCK_2_TYPE_9_FORMULA_TIMING	0x24
-#define DATA_BLOCK_2_DYNAMIC_VIDEO_TIMING	0x25
-#define DATA_BLOCK_2_DISPLAY_INTERFACE_FEATURES	0x26
-#define DATA_BLOCK_2_STEREO_DISPLAY_INTERFACE	0x27
-#define DATA_BLOCK_2_TILED_DISPLAY_TOPOLOGY	0x28
-#define DATA_BLOCK_2_CONTAINER_ID		0x29
-#define DATA_BLOCK_2_VENDOR_SPECIFIC		0x7e
-#define DATA_BLOCK_2_CTA_DISPLAY_ID		0x81
-
-/* DisplayID Structure v1r2 Product Type */
-#define PRODUCT_TYPE_EXTENSION			0
-#define PRODUCT_TYPE_TEST			1
-#define PRODUCT_TYPE_PANEL			2
-#define PRODUCT_TYPE_MONITOR			3
-#define PRODUCT_TYPE_TV				4
-#define PRODUCT_TYPE_REPEATER			5
-#define PRODUCT_TYPE_DIRECT_DRIVE		6
-
-/* DisplayID Structure v2r0 Display Product Primary Use Case (~Product Type) */
-#define PRIMARY_USE_EXTENSION			0
-#define PRIMARY_USE_TEST			1
-#define PRIMARY_USE_GENERIC			2
-#define PRIMARY_USE_TV				3
-#define PRIMARY_USE_DESKTOP_PRODUCTIVITY	4
-#define PRIMARY_USE_DESKTOP_GAMING		5
-#define PRIMARY_USE_PRESENTATION		6
-#define PRIMARY_USE_HEAD_MOUNTED_VR		7
-#define PRIMARY_USE_HEAD_MOUNTED_AR		8
-
-struct displayid_header {
-	u8 rev;
-	u8 bytes;
-	u8 prod_id;
-	u8 ext_count;
-} __packed;
-
-struct displayid_block {
-	u8 tag;
-	u8 rev;
-	u8 num_bytes;
-} __packed;
-
-struct displayid_tiled_block {
-	struct displayid_block base;
-	u8 tile_cap;
-	u8 topo[3];
-	u8 tile_size[4];
-	u8 tile_pixel_bezel[5];
-	u8 topology_id[8];
-} __packed;
-
-struct displayid_detailed_timings_1 {
-	u8 pixel_clock[3];
-	u8 flags;
-	u8 hactive[2];
-	u8 hblank[2];
-	u8 hsync[2];
-	u8 hsw[2];
-	u8 vactive[2];
-	u8 vblank[2];
-	u8 vsync[2];
-	u8 vsw[2];
-} __packed;
-
-struct displayid_detailed_timing_block {
-	struct displayid_block base;
-	struct displayid_detailed_timings_1 timings[];
-};
-
-#define DISPLAYID_VESA_MSO_OVERLAP	GENMASK(3, 0)
-#define DISPLAYID_VESA_MSO_MODE		GENMASK(6, 5)
-
-struct displayid_vesa_vendor_specific_block {
-	struct displayid_block base;
-	u8 oui[3];
-	u8 data_structure_type;
-	u8 mso;
-} __packed;
-
-/*
- * DisplayID iteration.
- *
- * Do not access directly, this is private.
- */
-struct displayid_iter {
-	const struct drm_edid *drm_edid;
-
-	const u8 *section;
-	int length;
-	int idx;
-	int ext_index;
-
-	u8 version;
-	u8 primary_use;
-};
-
-void displayid_iter_edid_begin(const struct drm_edid *drm_edid,
-			       struct displayid_iter *iter);
-const struct displayid_block *
-__displayid_iter_next(struct displayid_iter *iter);
-#define displayid_iter_for_each(__block, __iter) \
-	while (((__block) = __displayid_iter_next(__iter)))
-void displayid_iter_end(struct displayid_iter *iter);
-
-u8 displayid_version(const struct displayid_iter *iter);
-u8 displayid_primary_use(const struct displayid_iter *iter);
-
-#endif
-- 
cgit v1.2.3


From 4e765920eeb91861a42cbc89036dedd3f7d27eef Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 16 Apr 2024 12:19:55 +0300
Subject: drm/edid: move all internal declarations to drm_crtc_internal.h

The declarations for internal EDID functions are a bit scattered. Put
them all in drm_crtc_internal.h.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/9ae137ea51f9cc2ccb3899b0acda553e6a8ce2db.1713259151.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/drm_crtc_internal.h | 6 ++++++
 drivers/gpu/drm/drm_displayid.c     | 1 +
 drivers/gpu/drm/drm_eld.c           | 4 +++-
 drivers/gpu/drm/drm_internal.h      | 5 -----
 include/drm/drm_edid.h              | 3 ---
 5 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_crtc_internal.h b/drivers/gpu/drm/drm_crtc_internal.h
index 0c693229a1c9..2256893d7d67 100644
--- a/drivers/gpu/drm/drm_crtc_internal.h
+++ b/drivers/gpu/drm/drm_crtc_internal.h
@@ -43,12 +43,14 @@ enum drm_color_range;
 enum drm_connector_force;
 enum drm_mode_status;
 
+struct cea_sad;
 struct drm_atomic_state;
 struct drm_bridge;
 struct drm_connector;
 struct drm_crtc;
 struct drm_device;
 struct drm_display_mode;
+struct drm_edid;
 struct drm_file;
 struct drm_framebuffer;
 struct drm_mode_create_dumb;
@@ -297,6 +299,10 @@ void drm_mode_fixup_1366x768(struct drm_display_mode *mode);
 int drm_edid_override_show(struct drm_connector *connector, struct seq_file *m);
 int drm_edid_override_set(struct drm_connector *connector, const void *edid, size_t size);
 int drm_edid_override_reset(struct drm_connector *connector);
+const u8 *drm_find_edid_extension(const struct drm_edid *drm_edid,
+				  int ext_id, int *ext_index);
+void drm_edid_cta_sad_get(const struct cea_sad *cta_sad, u8 *sad);
+void drm_edid_cta_sad_set(struct cea_sad *cta_sad, const u8 *sad);
 
 /* drm_edid_load.c */
 #ifdef CONFIG_DRM_LOAD_EDID_FIRMWARE
diff --git a/drivers/gpu/drm/drm_displayid.c b/drivers/gpu/drm/drm_displayid.c
index f72a893c7deb..f800dc0906d5 100644
--- a/drivers/gpu/drm/drm_displayid.c
+++ b/drivers/gpu/drm/drm_displayid.c
@@ -6,6 +6,7 @@
 #include <drm/drm_edid.h>
 #include <drm/drm_print.h>
 
+#include "drm_crtc_internal.h"
 #include "drm_displayid_internal.h"
 
 static const struct displayid_header *
diff --git a/drivers/gpu/drm/drm_eld.c b/drivers/gpu/drm/drm_eld.c
index 5177991aa272..c0428d07de53 100644
--- a/drivers/gpu/drm/drm_eld.c
+++ b/drivers/gpu/drm/drm_eld.c
@@ -3,10 +3,12 @@
  * Copyright © 2023 Intel Corporation
  */
 
+#include <linux/export.h>
+
 #include <drm/drm_edid.h>
 #include <drm/drm_eld.h>
 
-#include "drm_internal.h"
+#include "drm_crtc_internal.h"
 
 /**
  * drm_eld_sad_get - get SAD from ELD to struct cea_sad
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index 2215baef9a3e..690505a1f7a5 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -35,7 +35,6 @@
 
 #define DRM_IF_VERSION(maj, min) (maj << 16 | min)
 
-struct cea_sad;
 struct dentry;
 struct dma_buf;
 struct iosys_map;
@@ -278,8 +277,4 @@ void drm_framebuffer_print_info(struct drm_printer *p, unsigned int indent,
 				const struct drm_framebuffer *fb);
 void drm_framebuffer_debugfs_init(struct drm_device *dev);
 
-/* drm_edid.c */
-void drm_edid_cta_sad_get(const struct cea_sad *cta_sad, u8 *sad);
-void drm_edid_cta_sad_set(struct cea_sad *cta_sad, const u8 *sad);
-
 #endif /* __DRM_INTERNAL_H__ */
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 8e0e32349332..324e900cc287 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -485,7 +485,4 @@ void drm_edid_get_product_id(const struct drm_edid *drm_edid,
 void drm_edid_print_product_id(struct drm_printer *p,
 			       const struct drm_edid_product_id *id, bool raw);
 
-const u8 *drm_find_edid_extension(const struct drm_edid *drm_edid,
-				  int ext_id, int *ext_index);
-
 #endif /* __DRM_EDID_H__ */
-- 
cgit v1.2.3


From 2d798ccc89e7233f48bd7d552d95aad3de50acb1 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 16 Apr 2024 12:19:56 +0300
Subject: drm/edid: group struct drm_edid based declarations together

Keep the declarations for struct drm_edid based functions together.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/2634de73734dfbf0a772511a80eab966c8206cf2.1713259151.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 include/drm/drm_edid.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 324e900cc287..92fff199aa78 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -431,10 +431,6 @@ struct edid *drm_do_get_edid(struct drm_connector *connector,
 	void *data);
 struct edid *drm_get_edid(struct drm_connector *connector,
 			  struct i2c_adapter *adapter);
-const struct drm_edid *drm_edid_read_base_block(struct i2c_adapter *adapter);
-u32 drm_edid_get_panel_id(const struct drm_edid *drm_edid);
-bool drm_edid_match(const struct drm_edid *drm_edid,
-		    const struct drm_edid_ident *ident);
 struct edid *drm_get_edid_switcheroo(struct drm_connector *connector,
 				     struct i2c_adapter *adapter);
 struct edid *drm_edid_duplicate(const struct edid *edid);
@@ -474,6 +470,7 @@ const struct drm_edid *drm_edid_read_ddc(struct drm_connector *connector,
 const struct drm_edid *drm_edid_read_custom(struct drm_connector *connector,
 					    int (*read_block)(void *context, u8 *buf, unsigned int block, size_t len),
 					    void *context);
+const struct drm_edid *drm_edid_read_base_block(struct i2c_adapter *adapter);
 const struct drm_edid *drm_edid_read_switcheroo(struct drm_connector *connector,
 						struct i2c_adapter *adapter);
 int drm_edid_connector_update(struct drm_connector *connector,
@@ -484,5 +481,8 @@ void drm_edid_get_product_id(const struct drm_edid *drm_edid,
 			     struct drm_edid_product_id *id);
 void drm_edid_print_product_id(struct drm_printer *p,
 			       const struct drm_edid_product_id *id, bool raw);
+u32 drm_edid_get_panel_id(const struct drm_edid *drm_edid);
+bool drm_edid_match(const struct drm_edid *drm_edid,
+		    const struct drm_edid_ident *ident);
 
 #endif /* __DRM_EDID_H__ */
-- 
cgit v1.2.3


From d7bf5fcc2f8212b91cd33d91415f6a887761c32a Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Tue, 16 Apr 2024 12:19:59 +0300
Subject: drm/edid: make drm_edid_are_equal() static

drm_edid_are_equal() is only used within drm_edid.c. Make it static. Do
not encourage more uses of struct edid.

Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Link: https://patchwork.freedesktop.org/patch/msgid/aa79be9a5d0b08c71b82b86b5a8ff0f332e13c6a.1713259151.git.jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/gpu/drm/drm_edid.c | 3 +--
 include/drm/drm_edid.h     | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index 4b3ad42a8f95..463fbad85d90 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -1827,7 +1827,7 @@ static bool edid_block_is_zero(const void *edid)
  * This helper can be used during probing to determine if
  * edid had changed.
  */
-bool drm_edid_are_equal(const struct edid *edid1, const struct edid *edid2)
+static bool drm_edid_are_equal(const struct edid *edid1, const struct edid *edid2)
 {
 	int edid1_len, edid2_len;
 	bool edid1_present = edid1 != NULL;
@@ -1849,7 +1849,6 @@ bool drm_edid_are_equal(const struct edid *edid1, const struct edid *edid2)
 
 	return true;
 }
-EXPORT_SYMBOL(drm_edid_are_equal);
 
 enum edid_block_status {
 	EDID_BLOCK_OK = 0,
diff --git a/include/drm/drm_edid.h b/include/drm/drm_edid.h
index 92fff199aa78..b085525e53e2 100644
--- a/include/drm/drm_edid.h
+++ b/include/drm/drm_edid.h
@@ -348,8 +348,6 @@ int drm_edid_to_speaker_allocation(const struct edid *edid, u8 **sadb);
 int drm_av_sync_delay(struct drm_connector *connector,
 		      const struct drm_display_mode *mode);
 
-bool drm_edid_are_equal(const struct edid *edid1, const struct edid *edid2);
-
 int
 drm_hdmi_avi_infoframe_from_display_mode(struct hdmi_avi_infoframe *frame,
 					 const struct drm_connector *connector,
-- 
cgit v1.2.3


From b85a3c1b7978f942fa5bf8cfe22b6a6aaa49d3b7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:04 +0900
Subject: block: Introduce bio_straddles_zones() and
 bio_offset_from_zone_start()

Implement the inline helper functions bio_straddles_zones() and
bio_offset_from_zone_start() to respectively test if a BIO crosses a
zone boundary (the start sector and last sector belong to different
zones) and to obtain the offset of a BIO from the start sector of its
target zone.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20240408014128.205141-5-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3e8f7cf96be..ec7bd7091467 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -853,6 +853,13 @@ static inline unsigned int bio_zone_no(struct bio *bio)
 	return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
 }
 
+static inline bool bio_straddles_zones(struct bio *bio)
+{
+	return bio_sectors(bio) &&
+		bio_zone_no(bio) !=
+		disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
+}
+
 static inline unsigned int bio_zone_is_seq(struct bio *bio)
 {
 	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
@@ -1328,6 +1335,12 @@ static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev,
 	return sector & (bdev_zone_sectors(bdev) - 1);
 }
 
+static inline sector_t bio_offset_from_zone_start(struct bio *bio)
+{
+	return bdev_offset_from_zone_start(bio->bi_bdev,
+					   bio->bi_iter.bi_sector);
+}
+
 static inline bool bdev_is_zone_start(struct block_device *bdev,
 				      sector_t sector)
 {
-- 
cgit v1.2.3


From ecfe43b11b02ffeb24c203af7d3947417d412cf7 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:06 +0900
Subject: block: Remember zone capacity when revalidating zones

In preparation for adding zone write plugging, modify
blk_revalidate_disk_zones() to get the capacity of zones of a zoned
block device. This capacity value as a number of 512B sectors is stored
in the gendisk zone_capacity field.

Given that host-managed SMR disks (including zoned UFS drives) and all
known NVMe ZNS devices have the same zone capacity for all zones
blk_revalidate_disk_zones() returns an error if different capacities are
detected for different zones.

This also adds check to verify that the values reported by the device
for zone capacities are correct, that is, that the zone capacity is
never 0, does not exceed the zone size and is equal to the zone size for
conventional zones.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Link: https://lore.kernel.org/r/20240408014128.205141-7-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 26 ++++++++++++++++++++++++++
 include/linux/blkdev.h |  1 +
 2 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da0f4b2a8fa0..23d9bb21c459 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -438,6 +438,7 @@ struct blk_revalidate_zone_args {
 	unsigned long	*conv_zones_bitmap;
 	unsigned long	*seq_zones_wlock;
 	unsigned int	nr_zones;
+	unsigned int	zone_capacity;
 	sector_t	sector;
 };
 
@@ -482,9 +483,20 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 		return -ENODEV;
 	}
 
+	if (!zone->capacity || zone->capacity > zone->len) {
+		pr_warn("%s: Invalid zone capacity\n",
+			disk->disk_name);
+		return -ENODEV;
+	}
+
 	/* Check zone type */
 	switch (zone->type) {
 	case BLK_ZONE_TYPE_CONVENTIONAL:
+		if (zone->capacity != zone->len) {
+			pr_warn("%s: Invalid conventional zone capacity\n",
+				disk->disk_name);
+			return -ENODEV;
+		}
 		if (!args->conv_zones_bitmap) {
 			args->conv_zones_bitmap =
 				blk_alloc_zone_bitmap(q->node, args->nr_zones);
@@ -500,6 +512,18 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 			if (!args->seq_zones_wlock)
 				return -ENOMEM;
 		}
+
+		/*
+		 * Remember the capacity of the first sequential zone and check
+		 * if it is constant for all zones.
+		 */
+		if (!args->zone_capacity)
+			args->zone_capacity = zone->capacity;
+		if (zone->capacity != args->zone_capacity) {
+			pr_warn("%s: Invalid variable zone capacity\n",
+				disk->disk_name);
+			return -ENODEV;
+		}
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
 	default:
@@ -595,6 +619,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	blk_mq_freeze_queue(q);
 	if (ret > 0) {
 		disk->nr_zones = args.nr_zones;
+		disk->zone_capacity = args.zone_capacity;
 		swap(disk->seq_zones_wlock, args.seq_zones_wlock);
 		swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
 		if (update_driver_data)
@@ -608,6 +633,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 
 	kfree(args.seq_zones_wlock);
 	kfree(args.conv_zones_bitmap);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ec7bd7091467..4e81f714cca7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -191,6 +191,7 @@ struct gendisk {
 	 * blk_mq_unfreeze_queue().
 	 */
 	unsigned int		nr_zones;
+	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
 #endif /* CONFIG_BLK_DEV_ZONED */
-- 
cgit v1.2.3


From dd291d77cc90eb6a86e9860ba8e6e38eebd57d12 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:07 +0900
Subject: block: Introduce zone write plugging

Zone write plugging implements a per-zone "plug" for write operations
to control the submission and execution order of write operations to
sequential write required zones of a zoned block device. Per-zone
plugging guarantees that at any time there is at most only one write
request per zone being executed. This mechanism is intended to replace
zone write locking which implements a similar per-zone write throttling
at the scheduler level, but is implemented only by mq-deadline.

Unlike zone write locking which operates on requests, zone write
plugging operates on BIOs. A zone write plug is simply a BIO list that
is atomically manipulated using a spinlock and a kblockd submission
work. A write BIO to a zone is "plugged" to delay its execution if a
write BIO for the same zone was already issued, that is, if a write
request for the same zone is being executed. The next plugged BIO is
unplugged and issued once the write request completes.

This mechanism allows to:
 - Untangle zone write ordering from block IO schedulers. This allows
   removing the restriction on using mq-deadline for writing to zoned
   block devices. Any block IO scheduler, including "none" can be used.
 - Zone write plugging operates on BIOs instead of requests. Plugged
   BIOs waiting for execution thus do not hold scheduling tags and thus
   are not preventing other BIOs from executing (reads or writes to
   other zones). Depending on the workload, this can significantly
   improve the device use (higher queue depth operation) and
   performance.
 - Both blk-mq (request based) zoned devices and BIO-based zoned devices
   (e.g.  device mapper) can use zone write plugging. It is mandatory
   for the former but optional for the latter. BIO-based drivers can
   use zone write plugging to implement write ordering guarantees, or
   the drivers can implement their own if needed.
 - The code is less invasive in the block layer and is mostly limited to
   blk-zoned.c with some small changes in blk-mq.c, blk-merge.c and
   bio.c.

Zone write plugging is implemented using struct blk_zone_wplug. This
structure includes a spinlock, a BIO list and a work structure to
handle the submission of plugged BIOs. Zone write plugs structures are
managed using a per-disk hash table.

Plugging of zone write BIOs is done using the function
blk_zone_write_plug_bio() which returns false if a BIO execution does
not need to be delayed and true otherwise. This function is called
from blk_mq_submit_bio() after a BIO is split to avoid large BIOs
spanning multiple zones which would cause mishandling of zone write
plugs. This ichange enables by default zone write plugging for any mq
request-based block device. BIO-based device drivers can also use zone
write plugging by expliclty calling blk_zone_write_plug_bio() in their
->submit_bio method. For such devices, the driver must ensure that a
BIO passed to blk_zone_write_plug_bio() is already split and not
straddling zone boundaries.

Only write and write zeroes BIOs are plugged. Zone write plugging does
not introduce any significant overhead for other operations. A BIO that
is being handled through zone write plugging is flagged using the new
BIO flag BIO_ZONE_WRITE_PLUGGING. A request handling a BIO flagged with
this new flag is flagged with the new RQF_ZONE_WRITE_PLUGGING flag.
The completion of BIOs and requests flagged trigger respectively calls
to the functions blk_zone_write_bio_endio() and
blk_zone_write_complete_request(). The latter function is used to
trigger submission of the next plugged BIO using the zone plug work.
blk_zone_write_bio_endio() does the same for BIO-based devices.
This ensures that at any time, at most one request (blk-mq devices) or
one BIO (BIO-based devices) is being executed for any zone. The
handling of zone write plugs using a per-zone plug spinlock maximizes
parallelism and device usage by allowing multiple zones to be writen
simultaneously without lock contention.

Zone write plugging ignores flush BIOs without data. Hovever, any flush
BIO that has data is always plugged so that the write part of the flush
sequence is serialized with other regular writes.

Given that any BIO handled through zone write plugging will be the only
BIO in flight for the target zone when it is executed, the unplugging
and submission of a BIO will have no chance of successfully merging with
plugged requests or requests in the scheduler. To overcome this
potential performance degradation, blk_mq_submit_bio() calls the
function blk_zone_write_plug_attempt_merge() to try to merge other
plugged BIOs with the one just unplugged and submitted. Successful
merging is signaled using blk_zone_write_plug_bio_merged(), called from
bio_attempt_back_merge(). Furthermore, to avoid recalculating the number
of segments of plugged BIOs to attempt merging, the number of segments
of a plugged BIO is saved using the new struct bio field
__bi_nr_segments. To avoid growing the size of struct bio, this field is
added as a union with the bio_cookie field. This is safe to do as
polling is always disabled for plugged BIOs.

When BIOs are plugged in a zone write plug, the device request queue
usage counter is always incremented. This reference is kept and reused
for blk-mq devices when the plugged BIO is unplugged and submitted
again using submit_bio_noacct_nocheck(). For this case, the unplugged
BIO is already flagged with BIO_ZONE_WRITE_PLUGGING and
blk_mq_submit_bio() proceeds directly to allocating a new request for
the BIO, re-using the usage reference count taken when the BIO was
plugged. This extra reference count is dropped in
blk_zone_write_plug_attempt_merge() for any plugged BIO that is
successfully merged. Given that BIO-based devices will not take this
path, the extra reference is dropped after a plugged BIO is unplugged
and submitted.

Zone write plugs are dynamically allocated and managed using a hash
table (an array of struct hlist_head) with RCU protection.
A zone write plug is allocated when a write BIO is received for the
zone and not freed until the zone is fully written, reset or finished.
To detect when a zone write plug can be freed, the write state of each
zone is tracked using a write pointer offset which corresponds to the
offset of a zone write pointer relative to the zone start. Write
operations always increment this write pointer offset. Zone reset
operations set it to 0 and zone finish operations set it to the zone
size.

If a write error happens, the wp_offset value of a zone write plug may
become incorrect and out of sync with the device managed write pointer.
This is handled using the zone write plug flag BLK_ZONE_WPLUG_ERROR.
The function blk_zone_wplug_handle_error() is called from the new disk
zone write plug work when this flag is set. This function executes a
report zone to update the zone write pointer offset to the current
value as indicated by the device. The disk zone write plug work is
scheduled whenever a BIO flagged with BIO_ZONE_WRITE_PLUGGING completes
with an error or when bio_zone_wplug_prepare_bio() detects an unaligned
write. Once scheduled, the disk zone write plugs work keeps running
until all zone errors are handled.

To match the new data structures used for zoned disks, the function
disk_free_zone_bitmaps() is renamed to the more generic
disk_free_zone_resources(). The function disk_init_zone_resources() is
also introduced to initialize zone write plugs resources when a gendisk
is allocated.

In order to guarantee that the user can simultaneously write up to a
number of zones equal to a device max active zone limit or max open zone
limit, zone write plugs are allocated using a mempool sized to the
maximum of these 2 device limits. For a device that does not have
active and open zone limits, 128 is used as the default mempool size.

If a change to the device active and open zone limits is detected, the
disk mempool is resized when blk_revalidate_disk_zones() is executed.

This commit contains contributions from Christoph Hellwig <hch@lst.de>.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240408014128.205141-8-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               |    2 +
 block/blk-merge.c         |   12 +
 block/blk-mq.c            |   32 +-
 block/blk-zoned.c         | 1091 ++++++++++++++++++++++++++++++++++++++++++++-
 block/blk.h               |   51 ++-
 block/genhd.c             |    3 +-
 include/linux/blk-mq.h    |    2 +
 include/linux/blk_types.h |    8 +-
 include/linux/blkdev.h    |   11 +
 9 files changed, 1201 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index d24420ed1c4c..38baedb39c6f 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1576,6 +1576,8 @@ again:
 	if (!bio_integrity_endio(bio))
 		return;
 
+	blk_zone_bio_endio(bio);
+
 	rq_qos_done_bio(bio);
 
 	if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3363b1321908..7f8a808b74c1 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio,
 		blkcg_bio_issue_init(split);
 		bio_chain(split, bio);
 		trace_block_split(split, bio->bi_iter.bi_sector);
+		WARN_ON_ONCE(bio_zone_write_plugging(bio));
 		submit_bio_noacct(bio);
 		return split;
 	}
@@ -988,6 +989,9 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
 
 	blk_update_mixed_merge(req, bio, false);
 
+	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+		blk_zone_write_plug_bio_merged(bio);
+
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
@@ -1003,6 +1007,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
 {
 	const blk_opf_t ff = bio_failfast(bio);
 
+	/*
+	 * A front merge for writes to sequential zones of a zoned block device
+	 * can happen only if the user submitted writes out of order. Do not
+	 * merge such write to let it fail.
+	 */
+	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+		return BIO_MERGE_FAILED;
+
 	if (!ll_front_merge_fn(req, bio, nr_segs))
 		return BIO_MERGE_FAILED;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 88b541e8873f..48eb7dd049d1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -828,6 +828,8 @@ static void blk_complete_request(struct request *req)
 		bio = next;
 	} while (bio);
 
+	blk_zone_complete_request(req);
+
 	/*
 	 * Reset counters so that the request stacking driver
 	 * can find how many bytes remain in the request
@@ -939,6 +941,7 @@ bool blk_update_request(struct request *req, blk_status_t error,
 	 * completely done
 	 */
 	if (!req->bio) {
+		blk_zone_complete_request(req);
 		/*
 		 * Reset counters so that the request stacking driver
 		 * can find how many bytes remain in the request
@@ -2963,15 +2966,30 @@ void blk_mq_submit_bio(struct bio *bio)
 	struct request *rq;
 	blk_status_t ret;
 
+	/*
+	 * If the plug has a cached request for this queue, try to use it.
+	 */
+	rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+
+	/*
+	 * A BIO that was released from a zone write plug has already been
+	 * through the preparation in this function, already holds a reference
+	 * on the queue usage counter, and is the only write BIO in-flight for
+	 * the target zone. Go straight to preparing a request for it.
+	 */
+	if (bio_zone_write_plugging(bio)) {
+		nr_segs = bio->__bi_nr_segments;
+		if (rq)
+			blk_queue_exit(q);
+		goto new_request;
+	}
+
 	bio = blk_queue_bounce(bio, q);
 
 	/*
-	 * If the plug has a cached request for this queue, try use it.
-	 *
 	 * The cached request already holds a q_usage_counter reference and we
 	 * don't have to acquire a new one if we use it.
 	 */
-	rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
 	if (!rq) {
 		if (unlikely(bio_queue_enter(bio)))
 			return;
@@ -2988,6 +3006,10 @@ void blk_mq_submit_bio(struct bio *bio)
 	if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
 		goto queue_exit;
 
+	if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
+		goto queue_exit;
+
+new_request:
 	if (!rq) {
 		rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
 		if (unlikely(!rq))
@@ -3006,6 +3028,7 @@ void blk_mq_submit_bio(struct bio *bio)
 	if (ret != BLK_STS_OK) {
 		bio->bi_status = ret;
 		bio_endio(bio);
+		blk_zone_complete_request(rq);
 		blk_mq_free_request(rq);
 		return;
 	}
@@ -3013,6 +3036,9 @@ void blk_mq_submit_bio(struct bio *bio)
 	if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
 		return;
 
+	if (bio_zone_write_plugging(bio))
+		blk_zone_write_plug_attempt_merge(rq);
+
 	if (plug) {
 		blk_add_rq_to_plug(plug, rq);
 		return;
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 23d9bb21c459..fefcebd70445 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -7,6 +7,7 @@
  *
  * Copyright (c) 2016, Damien Le Moal
  * Copyright (c) 2016, Western Digital
+ * Copyright (c) 2024, Western Digital Corporation or its affiliates.
  */
 
 #include <linux/kernel.h>
@@ -16,8 +17,12 @@
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/mm.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/mempool.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
 static const char *const zone_cond_name[] = {
@@ -32,6 +37,64 @@ static const char *const zone_cond_name[] = {
 };
 #undef ZONE_COND_NAME
 
+/*
+ * Per-zone write plug.
+ * @node: hlist_node structure for managing the plug using a hash table.
+ * @link: To list the plug in the zone write plug error list of the disk.
+ * @ref: Zone write plug reference counter. A zone write plug reference is
+ *       always at least 1 when the plug is hashed in the disk plug hash table.
+ *       The reference is incremented whenever a new BIO needing plugging is
+ *       submitted and when a function needs to manipulate a plug. The
+ *       reference count is decremented whenever a plugged BIO completes and
+ *       when a function that referenced the plug returns. The initial
+ *       reference is dropped whenever the zone of the zone write plug is reset,
+ *       finished and when the zone becomes full (last write BIO to the zone
+ *       completes).
+ * @lock: Spinlock to atomically manipulate the plug.
+ * @flags: Flags indicating the plug state.
+ * @zone_no: The number of the zone the plug is managing.
+ * @wp_offset: The zone write pointer location relative to the start of the zone
+ *             as a number of 512B sectors.
+ * @bio_list: The list of BIOs that are currently plugged.
+ * @bio_work: Work struct to handle issuing of plugged BIOs
+ * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
+ */
+struct blk_zone_wplug {
+	struct hlist_node	node;
+	struct list_head	link;
+	atomic_t		ref;
+	spinlock_t		lock;
+	unsigned int		flags;
+	unsigned int		zone_no;
+	unsigned int		wp_offset;
+	struct bio_list		bio_list;
+	struct work_struct	bio_work;
+	struct rcu_head		rcu_head;
+	struct gendisk		*disk;
+};
+
+/*
+ * Zone write plug flags bits:
+ *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
+ *    that is, that write BIOs are being throttled due to a write BIO already
+ *    being executed or the zone write plug bio list is not empty.
+ *  - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
+ *    recovered with a report zone to update the zone write pointer offset.
+ *  - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
+ *    from the disk hash table and that the initial reference to the zone
+ *    write plug set when the plug was first added to the hash table has been
+ *    dropped. This flag is set when a zone is reset, finished or become full,
+ *    to prevent new references to the zone write plug to be taken for
+ *    newly incoming BIOs. A zone write plug flagged with this flag will be
+ *    freed once all remaining references from BIOs or functions are dropped.
+ */
+#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
+#define BLK_ZONE_WPLUG_ERROR		(1U << 1)
+#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)
+
+#define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
+
 /**
  * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
  * @zone_cond: BLK_ZONE_COND_XXX.
@@ -425,12 +488,1010 @@ fail:
 	return ret;
 }
 
-void disk_free_zone_bitmaps(struct gendisk *disk)
+static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
+{
+	if (!disk->conv_zones_bitmap)
+		return false;
+	return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
+}
+
+static bool disk_insert_zone_wplug(struct gendisk *disk,
+				   struct blk_zone_wplug *zwplug)
+{
+	struct blk_zone_wplug *zwplg;
+	unsigned long flags;
+	unsigned int idx =
+		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
+
+	/*
+	 * Add the new zone write plug to the hash table, but carefully as we
+	 * are racing with other submission context, so we may already have a
+	 * zone write plug for the same zone.
+	 */
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
+		if (zwplg->zone_no == zwplug->zone_no) {
+			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+			return false;
+		}
+	}
+	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+	return true;
+}
+
+static void disk_remove_zone_wplug(struct gendisk *disk,
+				   struct blk_zone_wplug *zwplug)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
+	atomic_dec(&zwplug->ref);
+	hlist_del_init_rcu(&zwplug->node);
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
+						 struct blk_zone_wplug *zwplug)
+{
+	/* If the zone is still busy, the plug cannot be removed. */
+	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+		return false;
+
+	/* We can remove zone write plugs for zones that are empty or full. */
+	return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
+}
+
+static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
+						  sector_t sector)
+{
+	unsigned int zno = disk_zone_no(disk, sector);
+	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
+	struct blk_zone_wplug *zwplug;
+
+	rcu_read_lock();
+
+	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
+		if (zwplug->zone_no == zno &&
+		    atomic_inc_not_zero(&zwplug->ref)) {
+			rcu_read_unlock();
+			return zwplug;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
+{
+	struct blk_zone_wplug *zwplug =
+		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
+}
+
+static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+	if (atomic_dec_and_test(&zwplug->ref)) {
+		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
+		WARN_ON_ONCE(!list_empty(&zwplug->link));
+
+		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
+	}
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work);
+
+/*
+ * Get a reference on the write plug for the zone containing @sector.
+ * If the plug does not exist, it is allocated and hashed.
+ * Return a pointer to the zone write plug with the plug spinlock held.
+ */
+static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
+					sector_t sector, gfp_t gfp_mask,
+					unsigned long *flags)
 {
+	unsigned int zno = disk_zone_no(disk, sector);
+	struct blk_zone_wplug *zwplug;
+
+again:
+	zwplug = disk_get_zone_wplug(disk, sector);
+	if (zwplug) {
+		/*
+		 * Check that a BIO completion or a zone reset or finish
+		 * operation has not already removed the zone write plug from
+		 * the hash table and dropped its reference count. In such case,
+		 * we need to get a new plug so start over from the beginning.
+		 */
+		spin_lock_irqsave(&zwplug->lock, *flags);
+		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+			spin_unlock_irqrestore(&zwplug->lock, *flags);
+			disk_put_zone_wplug(zwplug);
+			goto again;
+		}
+		return zwplug;
+	}
+
+	/*
+	 * Allocate and initialize a zone write plug with an extra reference
+	 * so that it is not freed when the zone write plug becomes idle without
+	 * the zone being full.
+	 */
+	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
+	if (!zwplug)
+		return NULL;
+
+	INIT_HLIST_NODE(&zwplug->node);
+	INIT_LIST_HEAD(&zwplug->link);
+	atomic_set(&zwplug->ref, 2);
+	spin_lock_init(&zwplug->lock);
+	zwplug->flags = 0;
+	zwplug->zone_no = zno;
+	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
+	bio_list_init(&zwplug->bio_list);
+	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+	zwplug->disk = disk;
+
+	spin_lock_irqsave(&zwplug->lock, *flags);
+
+	/*
+	 * Insert the new zone write plug in the hash table. This can fail only
+	 * if another context already inserted a plug. Retry from the beginning
+	 * in such case.
+	 */
+	if (!disk_insert_zone_wplug(disk, zwplug)) {
+		spin_unlock_irqrestore(&zwplug->lock, *flags);
+		mempool_free(zwplug, disk->zone_wplugs_pool);
+		goto again;
+	}
+
+	return zwplug;
+}
+
+static inline void blk_zone_wplug_bio_io_error(struct bio *bio)
+{
+	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
+
+	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+	bio_io_error(bio);
+	blk_queue_exit(q);
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug.
+ */
+static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
+{
+	struct bio *bio;
+
+	while ((bio = bio_list_pop(&zwplug->bio_list))) {
+		blk_zone_wplug_bio_io_error(bio);
+		disk_put_zone_wplug(zwplug);
+	}
+}
+
+/*
+ * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
+ * with the assumed write pointer location of the zone when the BIO will
+ * be unplugged.
+ */
+static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
+					    struct blk_zone_wplug *zwplug)
+{
+	unsigned int zone_capacity = disk->zone_capacity;
+	unsigned int wp_offset = zwplug->wp_offset;
+	struct bio_list bl = BIO_EMPTY_LIST;
+	struct bio *bio;
+
+	while ((bio = bio_list_pop(&zwplug->bio_list))) {
+		if (wp_offset >= zone_capacity ||
+		     bio_offset_from_zone_start(bio) != wp_offset) {
+			blk_zone_wplug_bio_io_error(bio);
+			disk_put_zone_wplug(zwplug);
+			continue;
+		}
+
+		wp_offset += bio_sectors(bio);
+		bio_list_add(&bl, bio);
+	}
+
+	bio_list_merge(&zwplug->bio_list, &bl);
+}
+
+/*
+ * Set a zone write plug write pointer offset to either 0 (zone reset case)
+ * or to the zone size (zone finish case). This aborts all plugged BIOs, which
+ * is fine to do as doing a zone reset or zone finish while writes are in-flight
+ * is a mistake from the user which will most likely cause all plugged BIOs to
+ * fail anyway.
+ */
+static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
+					  struct blk_zone_wplug *zwplug,
+					  unsigned int wp_offset)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&zwplug->lock, flags);
+
+	/*
+	 * Make sure that a BIO completion or another zone reset or finish
+	 * operation has not already removed the plug from the hash table.
+	 */
+	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		return;
+	}
+
+	/* Update the zone write pointer and abort all plugged BIOs. */
+	zwplug->wp_offset = wp_offset;
+	disk_zone_wplug_abort(zwplug);
+
+	/*
+	 * Updating the write pointer offset puts back the zone
+	 * in a good state. So clear the error flag and decrement the
+	 * error count if we were in error state.
+	 */
+	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
+		zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+		spin_lock(&disk->zone_wplugs_lock);
+		list_del_init(&zwplug->link);
+		spin_unlock(&disk->zone_wplugs_lock);
+	}
+
+	/*
+	 * The zone write plug now has no BIO plugged: remove it from the
+	 * hash table so that it cannot be seen. The plug will be freed
+	 * when the last reference is dropped.
+	 */
+	if (disk_should_remove_zone_wplug(disk, zwplug))
+		disk_remove_zone_wplug(disk, zwplug);
+
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
+						  unsigned int wp_offset)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct blk_zone_wplug *zwplug;
+
+	/* Conventional zones cannot be reset nor finished. */
+	if (disk_zone_is_conv(disk, sector)) {
+		bio_io_error(bio);
+		return true;
+	}
+
+	/*
+	 * If we have a zone write plug, set its write pointer offset to 0
+	 * (reset case) or to the zone size (finish case). This will abort all
+	 * BIOs plugged for the target zone. It is fine as resetting or
+	 * finishing zones while writes are still in-flight will result in the
+	 * writes failing anyway.
+	 */
+	zwplug = disk_get_zone_wplug(disk, sector);
+	if (zwplug) {
+		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
+		disk_put_zone_wplug(zwplug);
+	}
+
+	return false;
+}
+
+static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct blk_zone_wplug *zwplug;
+	sector_t sector;
+
+	/*
+	 * Set the write pointer offset of all zone write plugs to 0. This will
+	 * abort all plugged BIOs. It is fine as resetting zones while writes
+	 * are still in-flight will result in the writes failing anyway.
+	 */
+	for (sector = 0; sector < get_capacity(disk);
+	     sector += disk->queue->limits.chunk_sectors) {
+		zwplug = disk_get_zone_wplug(disk, sector);
+		if (zwplug) {
+			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
+			disk_put_zone_wplug(zwplug);
+		}
+	}
+
+	return false;
+}
+
+static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
+					  struct bio *bio, unsigned int nr_segs)
+{
+	/*
+	 * Grab an extra reference on the BIO request queue usage counter.
+	 * This reference will be reused to submit a request for the BIO for
+	 * blk-mq devices and dropped when the BIO is failed and after
+	 * it is issued in the case of BIO-based devices.
+	 */
+	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
+
+	/*
+	 * The BIO is being plugged and thus will have to wait for the on-going
+	 * write and for all other writes already plugged. So polling makes
+	 * no sense.
+	 */
+	bio_clear_polled(bio);
+
+	/*
+	 * Reuse the poll cookie field to store the number of segments when
+	 * split to the hardware limits.
+	 */
+	bio->__bi_nr_segments = nr_segs;
+
+	/*
+	 * We always receive BIOs after they are split and ready to be issued.
+	 * The block layer passes the parts of a split BIO in order, and the
+	 * user must also issue write sequentially. So simply add the new BIO
+	 * at the tail of the list to preserve the sequential write order.
+	 */
+	bio_list_add(&zwplug->bio_list, bio);
+}
+
+/*
+ * Called from bio_attempt_back_merge() when a BIO was merged with a request.
+ */
+void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+	struct blk_zone_wplug *zwplug;
+	unsigned long flags;
+
+	/*
+	 * If the BIO was already plugged, then we were called through
+	 * blk_zone_write_plug_attempt_merge() -> blk_attempt_bio_merge().
+	 * For this case, blk_zone_write_plug_attempt_merge() will handle the
+	 * zone write pointer offset update.
+	 */
+	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+		return;
+
+	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+	/*
+	 * Increase the plug reference count and advance the zone write
+	 * pointer offset.
+	 */
+	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+				     bio->bi_iter.bi_sector);
+	spin_lock_irqsave(&zwplug->lock, flags);
+	zwplug->wp_offset += bio_sectors(bio);
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+/*
+ * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
+ * already went through zone write plugging (either a new BIO or one that was
+ * unplugged).
+ */
+void blk_zone_write_plug_attempt_merge(struct request *req)
+{
+	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
+	struct request_queue *q = req->q;
+	struct gendisk *disk = q->disk;
+	unsigned int zone_capacity = disk->zone_capacity;
+	struct blk_zone_wplug *zwplug =
+		disk_get_zone_wplug(disk, blk_rq_pos(req));
+	unsigned long flags;
+	struct bio *bio;
+
+	/*
+	 * Completion of this request needs to be handled with
+	 * blk_zone_write_plug_complete_request().
+	 */
+	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
+
+	if (blk_queue_nomerges(q))
+		return;
+
+	/*
+	 * Walk through the list of plugged BIOs to check if they can be merged
+	 * into the back of the request.
+	 */
+	spin_lock_irqsave(&zwplug->lock, flags);
+	while (zwplug->wp_offset < zone_capacity) {
+		bio = bio_list_peek(&zwplug->bio_list);
+		if (!bio)
+			break;
+
+		if (bio->bi_iter.bi_sector != req_back_sector ||
+		    !blk_rq_merge_ok(req, bio))
+			break;
+
+		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
+			     !bio->__bi_nr_segments);
+
+		bio_list_pop(&zwplug->bio_list);
+		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
+		    BIO_MERGE_OK) {
+			bio_list_add_head(&zwplug->bio_list, bio);
+			break;
+		}
+
+		/*
+		 * Drop the extra reference on the queue usage we got when
+		 * plugging the BIO and advance the write pointer offset.
+		 */
+		blk_queue_exit(q);
+		zwplug->wp_offset += bio_sectors(bio);
+
+		req_back_sector += bio_sectors(bio);
+	}
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static inline void disk_zone_wplug_set_error(struct gendisk *disk,
+					     struct blk_zone_wplug *zwplug)
+{
+	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) {
+		unsigned long flags;
+
+		/*
+		 * Increase the plug reference count. The reference will be
+		 * dropped in disk_zone_wplugs_work() once the error state
+		 * is handled.
+		 */
+		zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
+		atomic_inc(&zwplug->ref);
+
+		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+		list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
+		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+	}
+}
+
+/*
+ * Check and prepare a BIO for submission by incrementing the write pointer
+ * offset of its zone write plug.
+ */
+static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
+				       struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+
+	/*
+	 * Check that the user is not attempting to write to a full zone.
+	 * We know such BIO will fail, and that would potentially overflow our
+	 * write pointer offset beyond the end of the zone.
+	 */
+	if (zwplug->wp_offset >= disk->zone_capacity)
+		goto err;
+
+	/*
+	 * Check for non-sequential writes early because we avoid a
+	 * whole lot of error handling trouble if we don't send it off
+	 * to the driver.
+	 */
+	if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
+		goto err;
+
+	/* Advance the zone write pointer offset. */
+	zwplug->wp_offset += bio_sectors(bio);
+
+	return true;
+
+err:
+	/* We detected an invalid write BIO: schedule error recovery. */
+	disk_zone_wplug_set_error(disk, zwplug);
+	kblockd_schedule_work(&disk->zone_wplugs_work);
+	return false;
+}
+
+static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct blk_zone_wplug *zwplug;
+	gfp_t gfp_mask = GFP_NOIO;
+	unsigned long flags;
+
+	/*
+	 * BIOs must be fully contained within a zone so that we use the correct
+	 * zone write plug for the entire BIO. For blk-mq devices, the block
+	 * layer should already have done any splitting required to ensure this
+	 * and this BIO should thus not be straddling zone boundaries. For
+	 * BIO-based devices, it is the responsibility of the driver to split
+	 * the bio before submitting it.
+	 */
+	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
+		bio_io_error(bio);
+		return true;
+	}
+
+	/* Conventional zones do not need write plugging. */
+	if (disk_zone_is_conv(disk, sector))
+		return false;
+
+	if (bio->bi_opf & REQ_NOWAIT)
+		gfp_mask = GFP_NOWAIT;
+
+	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
+	if (!zwplug) {
+		bio_io_error(bio);
+		return true;
+	}
+
+	/* Indicate that this BIO is being handled using zone write plugging. */
+	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+	/*
+	 * If the zone is already plugged or has a pending error, add the BIO
+	 * to the plug BIO list. Otherwise, plug and let the BIO execute.
+	 */
+	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
+		goto plug;
+
+	/*
+	 * If an error is detected when preparing the BIO, add it to the BIO
+	 * list so that error recovery can deal with it.
+	 */
+	if (!blk_zone_wplug_prepare_bio(zwplug, bio))
+		goto plug;
+
+	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+
+	return false;
+
+plug:
+	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
+	blk_zone_wplug_add_bio(zwplug, bio, nr_segs);
+
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+
+	return true;
+}
+
+/**
+ * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
+ * @bio: The BIO being submitted
+ * @nr_segs: The number of physical segments of @bio
+ *
+ * Handle write and write zeroes operations using zone write plugging.
+ *
+ * Return true whenever @bio execution needs to be delayed through the zone
+ * write plug. Otherwise, return false to let the submission path process
+ * @bio normally.
+ */
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+	struct block_device *bdev = bio->bi_bdev;
+
+	if (!bdev->bd_disk->zone_wplugs_hash)
+		return false;
+
+	/*
+	 * If the BIO already has the plugging flag set, then it was already
+	 * handled through this path and this is a submission from the zone
+	 * plug bio submit work.
+	 */
+	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
+		return false;
+
+	/*
+	 * We do not need to do anything special for empty flush BIOs, e.g
+	 * BIOs such as issued by blkdev_issue_flush(). The is because it is
+	 * the responsibility of the user to first wait for the completion of
+	 * write operations for flush to have any effect on the persistence of
+	 * the written data.
+	 */
+	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+		return false;
+
+	/*
+	 * Regular writes and write zeroes need to be handled through the target
+	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
+	 * which may need to go through the flush machinery depending on the
+	 * target device capabilities. Plugging such writes is fine as the flush
+	 * machinery operates at the request level, below the plug, and
+	 * completion of the flush sequence will go through the regular BIO
+	 * completion, which will handle zone write plugging.
+	 * Zone reset, reset all and finish commands need special treatment
+	 * to correctly track the write pointer offset of zones. These commands
+	 * are not plugged as we do not need serialization with write
+	 * operations. It is the responsibility of the user to not issue reset
+	 * and finish commands when write operations are in flight.
+	 */
+	switch (bio_op(bio)) {
+	case REQ_OP_WRITE:
+	case REQ_OP_WRITE_ZEROES:
+		return blk_zone_wplug_handle_write(bio, nr_segs);
+	case REQ_OP_ZONE_RESET:
+		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
+	case REQ_OP_ZONE_FINISH:
+		return blk_zone_wplug_handle_reset_or_finish(bio,
+						bdev_zone_sectors(bdev));
+	case REQ_OP_ZONE_RESET_ALL:
+		return blk_zone_wplug_handle_reset_all(bio);
+	default:
+		return false;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
+
+static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
+				       struct blk_zone_wplug *zwplug)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&zwplug->lock, flags);
+
+	/*
+	 * If we had an error, schedule error recovery. The recovery work
+	 * will restart submission of plugged BIOs.
+	 */
+	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		kblockd_schedule_work(&disk->zone_wplugs_work);
+		return;
+	}
+
+	/* Schedule submission of the next plugged BIO if we have one. */
+	if (!bio_list_empty(&zwplug->bio_list)) {
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		kblockd_schedule_work(&zwplug->bio_work);
+		return;
+	}
+
+	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+	/*
+	 * If the zone is full (it was fully written or finished, or empty
+	 * (it was reset), remove its zone write plug from the hash table.
+	 */
+	if (disk_should_remove_zone_wplug(disk, zwplug))
+		disk_remove_zone_wplug(disk, zwplug);
+
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+void blk_zone_write_plug_bio_endio(struct bio *bio)
+{
+	struct gendisk *disk = bio->bi_bdev->bd_disk;
+	struct blk_zone_wplug *zwplug =
+		disk_get_zone_wplug(bio->bi_bdev->bd_disk,
+				    bio->bi_iter.bi_sector);
+	unsigned long flags;
+
+	if (WARN_ON_ONCE(!zwplug))
+		return;
+
+	/* Make sure we do not see this BIO again by clearing the plug flag. */
+	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
+
+	/*
+	 * If the BIO failed, mark the plug as having an error to trigger
+	 * recovery.
+	 */
+	if (bio->bi_status != BLK_STS_OK) {
+		spin_lock_irqsave(&zwplug->lock, flags);
+		disk_zone_wplug_set_error(disk, zwplug);
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+	}
+
+	/*
+	 * For BIO-based devices, blk_zone_write_plug_complete_request()
+	 * is not called. So we need to schedule execution of the next
+	 * plugged BIO here.
+	 */
+	if (bio->bi_bdev->bd_has_submit_bio)
+		disk_zone_wplug_unplug_bio(disk, zwplug);
+
+	/* Drop the reference we took when the BIO was issued. */
+	atomic_dec(&zwplug->ref);
+	disk_put_zone_wplug(zwplug);
+}
+
+void blk_zone_write_plug_complete_request(struct request *req)
+{
+	struct gendisk *disk = req->q->disk;
+	struct blk_zone_wplug *zwplug = disk_get_zone_wplug(disk, req->__sector);
+
+	if (WARN_ON_ONCE(!zwplug))
+		return;
+
+	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
+
+	disk_zone_wplug_unplug_bio(disk, zwplug);
+
+	/*
+	 * Drop the reference we took when the request was initialized in
+	 * blk_zone_write_plug_attempt_merge().
+	 */
+	atomic_dec(&zwplug->ref);
+	disk_put_zone_wplug(zwplug);
+}
+
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+	struct blk_zone_wplug *zwplug =
+		container_of(work, struct blk_zone_wplug, bio_work);
+	struct block_device *bdev;
+	unsigned long flags;
+	struct bio *bio;
+
+	/*
+	 * Submit the next plugged BIO. If we do not have any, clear
+	 * the plugged flag.
+	 */
+	spin_lock_irqsave(&zwplug->lock, flags);
+
+	bio = bio_list_pop(&zwplug->bio_list);
+	if (!bio) {
+		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		return;
+	}
+
+	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
+		/* Error recovery will decide what to do with the BIO. */
+		bio_list_add_head(&zwplug->bio_list, bio);
+		spin_unlock_irqrestore(&zwplug->lock, flags);
+		return;
+	}
+
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+
+	bdev = bio->bi_bdev;
+	submit_bio_noacct_nocheck(bio);
+
+	/*
+	 * blk-mq devices will reuse the extra reference on the request queue
+	 * usage counter we took when the BIO was plugged, but the submission
+	 * path for BIO-based devices will not do that. So drop this extra
+	 * reference here.
+	 */
+	if (bdev->bd_has_submit_bio)
+		blk_queue_exit(bdev->bd_disk->queue);
+}
+
+static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
+{
+	switch (zone->cond) {
+	case BLK_ZONE_COND_IMP_OPEN:
+	case BLK_ZONE_COND_EXP_OPEN:
+	case BLK_ZONE_COND_CLOSED:
+		return zone->wp - zone->start;
+	case BLK_ZONE_COND_FULL:
+		return zone->len;
+	case BLK_ZONE_COND_EMPTY:
+		return 0;
+	case BLK_ZONE_COND_NOT_WP:
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+	default:
+		/*
+		 * Conventional, offline and read-only zones do not have a valid
+		 * write pointer.
+		 */
+		return UINT_MAX;
+	}
+}
+
+static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
+					 unsigned int idx, void *data)
+{
+	struct blk_zone *zonep = data;
+
+	*zonep = *zone;
+	return 0;
+}
+
+static void disk_zone_wplug_handle_error(struct gendisk *disk,
+					 struct blk_zone_wplug *zwplug)
+{
+	sector_t zone_start_sector =
+		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
+	unsigned int noio_flag;
+	struct blk_zone zone;
+	unsigned long flags;
+	int ret;
+
+	/* Get the current zone information from the device. */
+	noio_flag = memalloc_noio_save();
+	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
+				       blk_zone_wplug_report_zone_cb, &zone);
+	memalloc_noio_restore(noio_flag);
+
+	spin_lock_irqsave(&zwplug->lock, flags);
+
+	/*
+	 * A zone reset or finish may have cleared the error already. In such
+	 * case, do nothing as the report zones may have seen the "old" write
+	 * pointer value before the reset/finish operation completed.
+	 */
+	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
+		goto unlock;
+
+	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
+
+	if (ret != 1) {
+		/*
+		 * We failed to get the zone information, meaning that something
+		 * is likely really wrong with the device. Abort all remaining
+		 * plugged BIOs as otherwise we could endup waiting forever on
+		 * plugged BIOs to complete if there is a queue freeze on-going.
+		 */
+		disk_zone_wplug_abort(zwplug);
+		goto unplug;
+	}
+
+	/* Update the zone write pointer offset. */
+	zwplug->wp_offset = blk_zone_wp_offset(&zone);
+	disk_zone_wplug_abort_unaligned(disk, zwplug);
+
+	/* Restart BIO submission if we still have any BIO left. */
+	if (!bio_list_empty(&zwplug->bio_list)) {
+		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+		kblockd_schedule_work(&zwplug->bio_work);
+		goto unlock;
+	}
+
+unplug:
+	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+	if (disk_should_remove_zone_wplug(disk, zwplug))
+		disk_remove_zone_wplug(disk, zwplug);
+
+unlock:
+	spin_unlock_irqrestore(&zwplug->lock, flags);
+}
+
+static void disk_zone_wplugs_work(struct work_struct *work)
+{
+	struct gendisk *disk =
+		container_of(work, struct gendisk, zone_wplugs_work);
+	struct blk_zone_wplug *zwplug;
+	unsigned long flags;
+
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+
+	while (!list_empty(&disk->zone_wplugs_err_list)) {
+		zwplug = list_first_entry(&disk->zone_wplugs_err_list,
+					  struct blk_zone_wplug, link);
+		list_del_init(&zwplug->link);
+		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+
+		disk_zone_wplug_handle_error(disk, zwplug);
+		disk_put_zone_wplug(zwplug);
+
+		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	}
+
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
+static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
+{
+	return 1U << disk->zone_wplugs_hash_bits;
+}
+
+void disk_init_zone_resources(struct gendisk *disk)
+{
+	spin_lock_init(&disk->zone_wplugs_lock);
+	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
+	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
+}
+
+/*
+ * For the size of a disk zone write plug hash table, use the size of the
+ * zone write plug mempool, which is the maximum of the disk open zones and
+ * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
+ * 9 bits. For a disk that has no limits, mempool size defaults to 128.
+ */
+#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
+#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
+
+static int disk_alloc_zone_resources(struct gendisk *disk,
+				     unsigned int pool_size)
+{
+	unsigned int i;
+
+	disk->zone_wplugs_hash_bits =
+		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
+
+	disk->zone_wplugs_hash =
+		kcalloc(disk_zone_wplugs_hash_size(disk),
+			sizeof(struct hlist_head), GFP_KERNEL);
+	if (!disk->zone_wplugs_hash)
+		return -ENOMEM;
+
+	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
+		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
+
+	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
+						sizeof(struct blk_zone_wplug));
+	if (!disk->zone_wplugs_pool) {
+		kfree(disk->zone_wplugs_hash);
+		disk->zone_wplugs_hash = NULL;
+		disk->zone_wplugs_hash_bits = 0;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
+{
+	struct blk_zone_wplug *zwplug;
+	unsigned int i;
+
+	if (!disk->zone_wplugs_hash)
+		return;
+
+	/* Free all the zone write plugs we have. */
+	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
+		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
+			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
+					     struct blk_zone_wplug, node);
+			atomic_inc(&zwplug->ref);
+			disk_remove_zone_wplug(disk, zwplug);
+			disk_put_zone_wplug(zwplug);
+		}
+	}
+
+	kfree(disk->zone_wplugs_hash);
+	disk->zone_wplugs_hash = NULL;
+	disk->zone_wplugs_hash_bits = 0;
+}
+
+void disk_free_zone_resources(struct gendisk *disk)
+{
+	cancel_work_sync(&disk->zone_wplugs_work);
+
+	disk_destroy_zone_wplugs_hash_table(disk);
+
+	/*
+	 * Wait for the zone write plugs to be RCU-freed before
+	 * destorying the mempool.
+	 */
+	rcu_barrier();
+
+	mempool_destroy(disk->zone_wplugs_pool);
+	disk->zone_wplugs_pool = NULL;
+
 	kfree(disk->conv_zones_bitmap);
 	disk->conv_zones_bitmap = NULL;
 	kfree(disk->seq_zones_wlock);
 	disk->seq_zones_wlock = NULL;
+
+	disk->zone_capacity = 0;
+	disk->nr_zones = 0;
+}
+
+static int disk_revalidate_zone_resources(struct gendisk *disk,
+					  unsigned int nr_zones)
+{
+	struct queue_limits *lim = &disk->queue->limits;
+	unsigned int pool_size;
+
+	/*
+	 * If the device has no limit on the maximum number of open and active
+	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
+	 */
+	pool_size = max(lim->max_open_zones, lim->max_active_zones);
+	if (!pool_size)
+		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+
+	if (!disk->zone_wplugs_hash)
+		return disk_alloc_zone_resources(disk, pool_size);
+
+	/* Resize the zone write plug memory pool if needed. */
+	if (disk->zone_wplugs_pool->min_nr != pool_size)
+		return mempool_resize(disk->zone_wplugs_pool, pool_size);
+
+	return 0;
 }
 
 struct blk_revalidate_zone_args {
@@ -453,6 +1514,9 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 	struct request_queue *q = disk->queue;
 	sector_t capacity = get_capacity(disk);
 	sector_t zone_sectors = q->limits.chunk_sectors;
+	struct blk_zone_wplug *zwplug;
+	unsigned long flags;
+	unsigned int wp_offset;
 
 	/* Check for bad zones and holes in the zone report */
 	if (zone->start != args->sector) {
@@ -524,6 +1588,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 				disk->disk_name);
 			return -ENODEV;
 		}
+
+		/*
+		 * We need to track the write pointer of all zones that are not
+		 * empty nor full. So make sure we have a zone write plug for
+		 * such zone.
+		 */
+		wp_offset = blk_zone_wp_offset(zone);
+		if (wp_offset && wp_offset < zone_sectors) {
+			zwplug = disk_get_and_lock_zone_wplug(disk, zone->start,
+							      GFP_NOIO, &flags);
+			if (!zwplug)
+				return -ENOMEM;
+			spin_unlock_irqrestore(&zwplug->lock, flags);
+			disk_put_zone_wplug(zwplug);
+		}
+
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
 	default:
@@ -560,7 +1640,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	sector_t capacity = get_capacity(disk);
 	struct blk_revalidate_zone_args args = { };
 	unsigned int noio_flag;
-	int ret;
+	int ret = -ENOMEM;
 
 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
 		return -EIO;
@@ -593,6 +1673,11 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	args.disk = disk;
 	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
 	noio_flag = memalloc_noio_save();
+	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+	if (ret) {
+		memalloc_noio_restore(noio_flag);
+		return ret;
+	}
 	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
 				       blk_revalidate_zone_cb, &args);
 	if (!ret) {
@@ -627,7 +1712,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 		ret = 0;
 	} else {
 		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
-		disk_free_zone_bitmaps(disk);
+		disk_free_zone_resources(disk);
 	}
 	blk_mq_unfreeze_queue(q);
 
diff --git a/block/blk.h b/block/blk.h
index bca50a9510c8..4df969f8fa28 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -415,7 +415,14 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
-void disk_free_zone_bitmaps(struct gendisk *disk);
+void disk_init_zone_resources(struct gendisk *disk);
+void disk_free_zone_resources(struct gendisk *disk);
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+	return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
+}
+void blk_zone_write_plug_bio_merged(struct bio *bio);
+void blk_zone_write_plug_attempt_merge(struct request *rq);
 static inline void blk_zone_update_request_bio(struct request *rq,
 					       struct bio *bio)
 {
@@ -423,22 +430,60 @@ static inline void blk_zone_update_request_bio(struct request *rq,
 	 * For zone append requests, the request sector indicates the location
 	 * at which the BIO data was written. Return this value to the BIO
 	 * issuer through the BIO iter sector.
+	 * For plugged zone writes, we need the original BIO sector so
+	 * that blk_zone_write_plug_bio_endio() can lookup the zone write plug.
 	 */
-	if (req_op(rq) == REQ_OP_ZONE_APPEND)
+	if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
 		bio->bi_iter.bi_sector = rq->__sector;
 }
+void blk_zone_write_plug_bio_endio(struct bio *bio);
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+	/*
+	 * For write BIOs to zoned devices, signal the completion of the BIO so
+	 * that the next write BIO can be submitted by zone write plugging.
+	 */
+	if (bio_zone_write_plugging(bio))
+		blk_zone_write_plug_bio_endio(bio);
+}
+
+void blk_zone_write_plug_complete_request(struct request *rq);
+static inline void blk_zone_complete_request(struct request *rq)
+{
+	if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING)
+		blk_zone_write_plug_complete_request(rq);
+}
 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
 		unsigned long arg);
 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 		unsigned int cmd, unsigned long arg);
 #else /* CONFIG_BLK_DEV_ZONED */
-static inline void disk_free_zone_bitmaps(struct gendisk *disk)
+static inline void disk_init_zone_resources(struct gendisk *disk)
+{
+}
+static inline void disk_free_zone_resources(struct gendisk *disk)
+{
+}
+static inline bool bio_zone_write_plugging(struct bio *bio)
+{
+	return false;
+}
+static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
+{
+}
+static inline void blk_zone_write_plug_attempt_merge(struct request *rq)
 {
 }
 static inline void blk_zone_update_request_bio(struct request *rq,
 					       struct bio *bio)
 {
 }
+static inline void blk_zone_bio_endio(struct bio *bio)
+{
+}
+static inline void blk_zone_complete_request(struct request *rq)
+{
+}
 static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
 		unsigned int cmd, unsigned long arg)
 {
diff --git a/block/genhd.c b/block/genhd.c
index bb29a68e1d67..eb893df56d51 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev)
 
 	disk_release_events(disk);
 	kfree(disk->random);
-	disk_free_zone_bitmaps(disk);
+	disk_free_zone_resources(disk);
 	xa_destroy(&disk->part_tbl);
 
 	disk->queue->disk = NULL;
@@ -1364,6 +1364,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
 	if (blkcg_init_disk(disk))
 		goto out_erase_part0;
 
+	disk_init_zone_resources(disk);
 	rand_initialize_disk(disk);
 	disk_to_dev(disk)->class = &block_class;
 	disk_to_dev(disk)->type = &disk_type;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d3d8fd8e229b..60090c8366fb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -56,6 +56,8 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
 /* The per-zone write lock is held for this request */
 #define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
+/* The request completion needs to be signaled to zone write pluging. */
+#define RQF_ZONE_WRITE_PLUGGING	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
 #define RQF_TIMED_OUT		((__force req_flags_t)(1 << 21))
 #define RQF_RESV		((__force req_flags_t)(1 << 23))
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cb1526ec44b5..ed45de07d2ef 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -234,7 +234,12 @@ struct bio {
 
 	struct bvec_iter	bi_iter;
 
-	blk_qc_t		bi_cookie;
+	union {
+		/* for polled bios: */
+		blk_qc_t		bi_cookie;
+		/* for plugged zoned writes only: */
+		unsigned int		__bi_nr_segments;
+	};
 	bio_end_io_t		*bi_end_io;
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
@@ -305,6 +310,7 @@ enum {
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
+	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
 	BIO_FLAG_LAST
 };
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4e81f714cca7..348b57ca0425 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -194,6 +194,12 @@ struct gendisk {
 	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
+	unsigned int            zone_wplugs_hash_bits;
+	spinlock_t              zone_wplugs_lock;
+	struct mempool_s	*zone_wplugs_pool;
+	struct hlist_head       *zone_wplugs_hash;
+	struct list_head        zone_wplugs_err_list;
+	struct work_struct	zone_wplugs_work;
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 #if IS_ENABLED(CONFIG_CDROM)
@@ -663,6 +669,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 	return bdev->bd_disk->queue->limits.max_active_zones;
 }
 
+bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int bdev_nr_zones(struct block_device *bdev)
 {
@@ -690,6 +697,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
 {
 	return 0;
 }
+static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
+{
+	return false;
+}
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 static inline unsigned int blk_queue_depth(struct request_queue *q)
-- 
cgit v1.2.3


From ccdbf0aad2523ca133cceb22ce0f8306730e7ac3 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:09 +0900
Subject: block: Allow zero value of max_zone_append_sectors queue limit

In preparation for adding a generic zone append emulation using zone
write plugging, allow device drivers supporting zoned block device to
set a the max_zone_append_sectors queue limit of a device to 0 to
indicate the lack of native support for zone append operations and that
the block layer should emulate these operations using regular write
operations.

blk_queue_max_zone_append_sectors() is modified to allow passing 0 as
the max_zone_append_sectors argument. The function
queue_max_zone_append_sectors() is also modified to ensure that the
minimum of the max_hw_sectors and chunk_sectors limit is used whenever
the max_zone_append_sectors limit is 0. This minimum is consistent with
the value set for the max_zone_append_sectors limit by the function
blk_validate_zoned_limits() when limits for a queue are validated.

The helper functions queue_emulates_zone_append() and
bdev_emulates_zone_append() are added to test if a queue (or block
device) emulates zone append operations.

In order for blk_revalidate_disk_zones() to accept zoned block devices
relying on zone append emulation, the direct check to the
max_zone_append_sectors queue limit of the disk is replaced by a check
using the value returned by queue_max_zone_append_sectors(). Similarly,
queue_zone_append_max_show() is modified to use the same accessor so
that the sysfs attribute advertizes the non-zero limit that will be
used, regardless if it is for native or emulated commands.

For stacking drivers, a top device should not need to care if the
underlying devices have native or emulated zone append operations.
blk_stack_limits() is thus modified to set the top device
max_zone_append_sectors limit using the new accessor
queue_limits_max_zone_append_sectors(). queue_max_zone_append_sectors()
is modified to use this function as well. Stacking drivers that require
zone append emulation, e.g. dm-crypt, can still request this feature by
calling blk_queue_max_zone_append_sectors() with a 0 limit.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-10-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  2 +-
 block/blk-settings.c   | 30 +++++++++++++++++++-----------
 block/blk-sysfs.c      |  2 +-
 block/blk-zoned.c      |  2 +-
 include/linux/blkdev.h | 23 ++++++++++++++++++++---
 5 files changed, 42 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index a16b5abdbbf5..3bf28149e104 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -602,7 +602,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 		return BLK_STS_IOERR;
 
 	/* Make sure the BIO is small enough and will not get split */
-	if (nr_sectors > q->limits.max_zone_append_sectors)
+	if (nr_sectors > queue_max_zone_append_sectors(q))
 		return BLK_STS_IOERR;
 
 	bio->bi_opf |= REQ_NOMERGE;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index cdbaef159c4b..c0197e1e7485 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -412,24 +412,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
  * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
  * @q:  the request queue for the device
  * @max_zone_append_sectors: maximum number of sectors to write per command
+ *
+ * Sets the maximum number of sectors allowed for zone append commands. If
+ * Specifying 0 for @max_zone_append_sectors indicates that the queue does
+ * not natively support zone append operations and that the block layer must
+ * emulate these operations using regular writes.
  **/
 void blk_queue_max_zone_append_sectors(struct request_queue *q,
 		unsigned int max_zone_append_sectors)
 {
-	unsigned int max_sectors;
+	unsigned int max_sectors = 0;
 
 	if (WARN_ON(!blk_queue_is_zoned(q)))
 		return;
 
-	max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
-	max_sectors = min(q->limits.chunk_sectors, max_sectors);
+	if (max_zone_append_sectors) {
+		max_sectors = min(q->limits.max_hw_sectors,
+				  max_zone_append_sectors);
+		max_sectors = min(q->limits.chunk_sectors, max_sectors);
 
-	/*
-	 * Signal eventual driver bugs resulting in the max_zone_append sectors limit
-	 * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
-	 * or the max_hw_sectors limit not set.
-	 */
-	WARN_ON(!max_sectors);
+		/*
+		 * Signal eventual driver bugs resulting in the max_zone_append
+		 * sectors limit being 0 due to the chunk_sectors limit (zone
+		 * size) not set or the max_hw_sectors limit not set.
+		 */
+		WARN_ON_ONCE(!max_sectors);
+	}
 
 	q->limits.max_zone_append_sectors = max_sectors;
 }
@@ -756,8 +764,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 	t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
 	t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
 					b->max_write_zeroes_sectors);
-	t->max_zone_append_sectors = min(t->max_zone_append_sectors,
-					b->max_zone_append_sectors);
+	t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t),
+					 queue_limits_max_zone_append_sectors(b));
 	t->bounce = max(t->bounce, b->bounce);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 8c8f69d8ba48..e3ed5a921aff 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
 
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
-	unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+	unsigned long long max_sectors = queue_max_zone_append_sectors(q);
 
 	return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
 }
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 4b21a1ec00d4..fcc1284b7c19 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1692,7 +1692,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 		return -ENODEV;
 	}
 
-	if (!q->limits.max_zone_append_sectors) {
+	if (!queue_max_zone_append_sectors(q)) {
 		pr_warn("%s: Invalid 0 maximum zone append limit\n",
 			disk->disk_name);
 		return -ENODEV;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 348b57ca0425..46613bf6a402 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1173,12 +1173,29 @@ static inline unsigned int queue_max_segment_size(const struct request_queue *q)
 	return q->limits.max_segment_size;
 }
 
-static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q)
+static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l)
 {
+	unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors);
 
-	const struct queue_limits *l = &q->limits;
+	return min_not_zero(l->max_zone_append_sectors, max_sectors);
+}
+
+static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q)
+{
+	if (!blk_queue_is_zoned(q))
+		return 0;
 
-	return min(l->max_zone_append_sectors, l->max_sectors);
+	return queue_limits_max_zone_append_sectors(&q->limits);
+}
+
+static inline bool queue_emulates_zone_append(struct request_queue *q)
+{
+	return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors;
+}
+
+static inline bool bdev_emulates_zone_append(struct block_device *bdev)
+{
+	return queue_emulates_zone_append(bdev_get_queue(bdev));
 }
 
 static inline unsigned int
-- 
cgit v1.2.3


From 9b1ce7f0c6f82e241196febabddba5fab66c8f05 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:10 +0900
Subject: block: Implement zone append emulation

Given that zone write plugging manages all writes to zones of a zoned
block device and tracks the write pointer position of all zones that are
not full nor empty, emulating zone append operations using regular
writes can be implemented generically, without relying on the underlying
device driver to implement such emulation. This is needed for devices
that do not natively support the zone append command (e.g. SMR
hard-disks).

A device may request zone append emulation by setting its
max_zone_append_sectors queue limit to 0. For such device, the function
blk_zone_wplug_prepare_bio() changes zone append BIOs into
non-mergeable regular write BIOs. Modified zone append BIOs are flagged
with the new BIO flag BIO_EMULATES_ZONE_APPEND. This flag is checked
on completion of the BIO in blk_zone_write_plug_bio_endio() to restore
the original REQ_OP_ZONE_APPEND operation code of the BIO.

The block layer internal inline helper function bio_is_zone_append() is
added to test if a BIO is either a native zone append operation
(REQ_OP_ZONE_APPEND operation code) or if it is flagged with
BIO_EMULATES_ZONE_APPEND. Given that both native and emulated zone
append BIO completion handling should be similar, The functions
blk_update_request() and blk_zone_complete_request_bio() are modified to
use bio_is_zone_append() to execute blk_zone_update_request_bio() for
both native and emulated zone append operations.

This commit contains contributions from Christoph Hellwig <hch@lst.de>.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-11-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c            |  3 +--
 block/blk-zoned.c         | 64 +++++++++++++++++++++++++++++++++++++++--------
 block/blk.h               | 14 +++++++++--
 include/linux/blk_types.h |  1 +
 4 files changed, 67 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 48eb7dd049d1..6f9cc1c4d4fb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -906,8 +906,7 @@ bool blk_update_request(struct request *req, blk_status_t error,
 
 		if (bio_bytes == bio->bi_iter.bi_size) {
 			req->bio = bio->bi_next;
-		} else if (req_op(req) == REQ_OP_ZONE_APPEND &&
-			   error == BLK_STS_OK) {
+		} else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
 			/*
 			 * Partial zone append completions cannot be supported
 			 * as the BIO fragments may end up not being written
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index fcc1284b7c19..a60ac5b3e637 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -689,7 +689,8 @@ static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
 
 	while ((bio = bio_list_pop(&zwplug->bio_list))) {
 		if (wp_offset >= zone_capacity ||
-		     bio_offset_from_zone_start(bio) != wp_offset) {
+		    (bio_op(bio) != REQ_OP_ZONE_APPEND &&
+		     bio_offset_from_zone_start(bio) != wp_offset)) {
 			blk_zone_wplug_bio_io_error(bio);
 			disk_put_zone_wplug(zwplug);
 			continue;
@@ -951,7 +952,8 @@ static inline void disk_zone_wplug_set_error(struct gendisk *disk,
 
 /*
  * Check and prepare a BIO for submission by incrementing the write pointer
- * offset of its zone write plug.
+ * offset of its zone write plug and changing zone append operations into
+ * regular write when zone append emulation is needed.
  */
 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
 				       struct bio *bio)
@@ -966,13 +968,30 @@ static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
 	if (zwplug->wp_offset >= disk->zone_capacity)
 		goto err;
 
-	/*
-	 * Check for non-sequential writes early because we avoid a
-	 * whole lot of error handling trouble if we don't send it off
-	 * to the driver.
-	 */
-	if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
-		goto err;
+	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+		/*
+		 * Use a regular write starting at the current write pointer.
+		 * Similarly to native zone append operations, do not allow
+		 * merging.
+		 */
+		bio->bi_opf &= ~REQ_OP_MASK;
+		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
+		bio->bi_iter.bi_sector += zwplug->wp_offset;
+
+		/*
+		 * Remember that this BIO is in fact a zone append operation
+		 * so that we can restore its operation code on completion.
+		 */
+		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
+	} else {
+		/*
+		 * Check for non-sequential writes early because we avoid a
+		 * whole lot of error handling trouble if we don't send it off
+		 * to the driver.
+		 */
+		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
+			goto err;
+	}
 
 	/* Advance the zone write pointer offset. */
 	zwplug->wp_offset += bio_sectors(bio);
@@ -1008,8 +1027,14 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
 	}
 
 	/* Conventional zones do not need write plugging. */
-	if (disk_zone_is_conv(disk, sector))
+	if (disk_zone_is_conv(disk, sector)) {
+		/* Zone append to conventional zones is not allowed. */
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+			bio_io_error(bio);
+			return true;
+		}
 		return false;
+	}
 
 	if (bio->bi_opf & REQ_NOWAIT)
 		gfp_mask = GFP_NOWAIT;
@@ -1057,7 +1082,8 @@ plug:
  * @bio: The BIO being submitted
  * @nr_segs: The number of physical segments of @bio
  *
- * Handle write and write zeroes operations using zone write plugging.
+ * Handle write, write zeroes and zone append operations requiring emulation
+ * using zone write plugging.
  *
  * Return true whenever @bio execution needs to be delayed through the zone
  * write plug. Otherwise, return false to let the submission path process
@@ -1096,6 +1122,9 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
 	 * machinery operates at the request level, below the plug, and
 	 * completion of the flush sequence will go through the regular BIO
 	 * completion, which will handle zone write plugging.
+	 * Zone append operations for devices that requested emulation must
+	 * also be plugged so that these BIOs can be changed into regular
+	 * write BIOs.
 	 * Zone reset, reset all and finish commands need special treatment
 	 * to correctly track the write pointer offset of zones. These commands
 	 * are not plugged as we do not need serialization with write
@@ -1103,6 +1132,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
 	 * and finish commands when write operations are in flight.
 	 */
 	switch (bio_op(bio)) {
+	case REQ_OP_ZONE_APPEND:
+		if (!bdev_emulates_zone_append(bdev))
+			return false;
+		fallthrough;
 	case REQ_OP_WRITE:
 	case REQ_OP_WRITE_ZEROES:
 		return blk_zone_wplug_handle_write(bio, nr_segs);
@@ -1171,6 +1204,15 @@ void blk_zone_write_plug_bio_endio(struct bio *bio)
 	/* Make sure we do not see this BIO again by clearing the plug flag. */
 	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
 
+	/*
+	 * If this is a regular write emulating a zone append operation,
+	 * restore the original operation code.
+	 */
+	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
+		bio->bi_opf &= ~REQ_OP_MASK;
+		bio->bi_opf |= REQ_OP_ZONE_APPEND;
+	}
+
 	/*
 	 * If the BIO failed, mark the plug as having an error to trigger
 	 * recovery.
diff --git a/block/blk.h b/block/blk.h
index 4df969f8fa28..1140c4a0be03 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -421,6 +421,11 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
 }
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+	return bio_op(bio) == REQ_OP_ZONE_APPEND ||
+		bio_flagged(bio, BIO_EMULATES_ZONE_APPEND);
+}
 void blk_zone_write_plug_bio_merged(struct bio *bio);
 void blk_zone_write_plug_attempt_merge(struct request *rq);
 static inline void blk_zone_update_request_bio(struct request *rq,
@@ -430,8 +435,9 @@ static inline void blk_zone_update_request_bio(struct request *rq,
 	 * For zone append requests, the request sector indicates the location
 	 * at which the BIO data was written. Return this value to the BIO
 	 * issuer through the BIO iter sector.
-	 * For plugged zone writes, we need the original BIO sector so
-	 * that blk_zone_write_plug_bio_endio() can lookup the zone write plug.
+	 * For plugged zone writes, which include emulated zone append, we need
+	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
+	 * lookup the zone write plug.
 	 */
 	if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio))
 		bio->bi_iter.bi_sector = rq->__sector;
@@ -468,6 +474,10 @@ static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return false;
 }
+static inline bool bio_is_zone_append(struct bio *bio)
+{
+	return false;
+}
 static inline void blk_zone_write_plug_bio_merged(struct bio *bio)
 {
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ed45de07d2ef..29b3170431e7 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -311,6 +311,7 @@ enum {
 	BIO_REMAPPED,
 	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
 	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
+	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
 	BIO_FLAG_LAST
 };
 
-- 
cgit v1.2.3


From 63b5385e781417e73bda3fd652c2199826afda6e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:19 +0900
Subject: block: Remove BLK_STS_ZONE_RESOURCE

The zone append emulation of the scsi disk driver was the only driver
using BLK_STS_ZONE_RESOURCE. With this code removed,
BLK_STS_ZONE_RESOURCE is now unused. Remove this macro definition and
simplify blk_mq_dispatch_rq_list() where this status code was handled.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-20-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c            | 26 --------------------------
 drivers/scsi/scsi_lib.c   |  1 -
 include/linux/blk_types.h | 20 ++++----------------
 3 files changed, 4 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6f9cc1c4d4fb..9f2d9970eeba 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1921,19 +1921,6 @@ static void blk_mq_handle_dev_resource(struct request *rq,
 	__blk_mq_requeue_request(rq);
 }
 
-static void blk_mq_handle_zone_resource(struct request *rq,
-					struct list_head *zone_list)
-{
-	/*
-	 * If we end up here it is because we cannot dispatch a request to a
-	 * specific zone due to LLD level zone-write locking or other zone
-	 * related resource not being available. In this case, set the request
-	 * aside in zone_list for retrying it later.
-	 */
-	list_add(&rq->queuelist, zone_list);
-	__blk_mq_requeue_request(rq);
-}
-
 enum prep_dispatch {
 	PREP_DISPATCH_OK,
 	PREP_DISPATCH_NO_TAG,
@@ -2019,7 +2006,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 	struct request *rq;
 	int queued;
 	blk_status_t ret = BLK_STS_OK;
-	LIST_HEAD(zone_list);
 	bool needs_resource = false;
 
 	if (list_empty(list))
@@ -2061,23 +2047,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 		case BLK_STS_DEV_RESOURCE:
 			blk_mq_handle_dev_resource(rq, list);
 			goto out;
-		case BLK_STS_ZONE_RESOURCE:
-			/*
-			 * Move the request to zone_list and keep going through
-			 * the dispatch list to find more requests the drive can
-			 * accept.
-			 */
-			blk_mq_handle_zone_resource(rq, &zone_list);
-			needs_resource = true;
-			break;
 		default:
 			blk_mq_end_request(rq, ret);
 		}
 	} while (!list_empty(list));
 out:
-	if (!list_empty(&zone_list))
-		list_splice_tail_init(&zone_list, list);
-
 	/* If we didn't flush the entire list, we could have told the driver
 	 * there was more coming, but that turned out to be a lie.
 	 */
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2e28e2360c85..9ca96116bd33 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1870,7 +1870,6 @@ out_put_budget:
 	case BLK_STS_OK:
 		break;
 	case BLK_STS_RESOURCE:
-	case BLK_STS_ZONE_RESOURCE:
 		if (scsi_device_blocked(sdev))
 			ret = BLK_STS_DEV_RESOURCE;
 		break;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 29b3170431e7..ffe0c112b128 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -136,18 +136,6 @@ typedef u16 blk_short_t;
  */
 #define BLK_STS_DEV_RESOURCE	((__force blk_status_t)13)
 
-/*
- * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone
- * related resources are unavailable, but the driver can guarantee the queue
- * will be rerun in the future once the resources become available again.
- *
- * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references
- * a zone specific resource and IO to a different zone on the same device could
- * still be served. Examples of that are zones that are write-locked, but a read
- * to the same zone could be served.
- */
-#define BLK_STS_ZONE_RESOURCE	((__force blk_status_t)14)
-
 /*
  * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion
  * path if the device returns a status indicating that too many zone resources
@@ -155,7 +143,7 @@ typedef u16 blk_short_t;
  * after the number of open zones decreases below the device's limits, which is
  * reported in the request_queue's max_open_zones.
  */
-#define BLK_STS_ZONE_OPEN_RESOURCE	((__force blk_status_t)15)
+#define BLK_STS_ZONE_OPEN_RESOURCE	((__force blk_status_t)14)
 
 /*
  * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion
@@ -164,20 +152,20 @@ typedef u16 blk_short_t;
  * after the number of active zones decreases below the device's limits, which
  * is reported in the request_queue's max_active_zones.
  */
-#define BLK_STS_ZONE_ACTIVE_RESOURCE	((__force blk_status_t)16)
+#define BLK_STS_ZONE_ACTIVE_RESOURCE	((__force blk_status_t)15)
 
 /*
  * BLK_STS_OFFLINE is returned from the driver when the target device is offline
  * or is being taken offline. This could help differentiate the case where a
  * device is intentionally being shut down from a real I/O error.
  */
-#define BLK_STS_OFFLINE		((__force blk_status_t)17)
+#define BLK_STS_OFFLINE		((__force blk_status_t)16)
 
 /*
  * BLK_STS_DURATION_LIMIT is returned from the driver when the target device
  * aborted the command because it exceeded one of its Command Duration Limits.
  */
-#define BLK_STS_DURATION_LIMIT	((__force blk_status_t)18)
+#define BLK_STS_DURATION_LIMIT	((__force blk_status_t)17)
 
 /**
  * blk_path_error - returns true if error may be path related
-- 
cgit v1.2.3


From 9b3c08b90fc212de58c34621d83e74977170b2cd Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:20 +0900
Subject: block: Simplify blk_revalidate_disk_zones() interface

The only user of blk_revalidate_disk_zones() second argument was the
SCSI disk driver (sd). Now that this driver does not require this
update_driver_data argument, remove it to simplify the interface of
blk_revalidate_disk_zones(). Also update the function kdoc comment to
be more accurate (i.e. there is no gendisk ->revalidate method).

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-21-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c              | 21 +++++++--------------
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c       |  2 +-
 drivers/block/virtio_blk.c     |  2 +-
 drivers/md/dm-zone.c           |  2 +-
 drivers/nvme/host/core.c       |  2 +-
 drivers/scsi/sd_zbc.c          |  2 +-
 include/linux/blkdev.h         |  3 +--
 8 files changed, 14 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da0fc7e2d00a..e46d23ad2fa9 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1713,21 +1713,17 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 /**
  * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
  * @disk:	Target disk
- * @update_driver_data:	Callback to update driver data on the frozen disk
  *
- * Helper function for low-level device drivers to check and (re) allocate and
- * initialize a disk request queue zone bitmaps. This functions should normally
- * be called within the disk ->revalidate method for blk-mq based drivers.
+ * Helper function for low-level device drivers to check, (re) allocate and
+ * initialize resources used for managing zoned disks. This function should
+ * normally be called by blk-mq based drivers when a zoned gendisk is probed
+ * and when the zone configuration of the gendisk changes (e.g. after a format).
  * Before calling this function, the device driver must already have set the
  * device zone size (chunk_sector limit) and the max zone append limit.
  * BIO based drivers can also use this function as long as the device queue
  * can be safely frozen.
- * If the @update_driver_data callback function is not NULL, the callback is
- * executed with the device request queue frozen after all zones have been
- * checked.
  */
-int blk_revalidate_disk_zones(struct gendisk *disk,
-			      void (*update_driver_data)(struct gendisk *disk))
+int blk_revalidate_disk_zones(struct gendisk *disk)
 {
 	struct request_queue *q = disk->queue;
 	sector_t zone_sectors = q->limits.chunk_sectors;
@@ -1794,13 +1790,10 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
 	 * referencing the bitmaps).
 	 */
 	blk_mq_freeze_queue(q);
-	if (ret > 0) {
+	if (ret > 0)
 		ret = disk_update_zone_resources(disk, &args);
-		if (update_driver_data)
-			update_driver_data(disk);
-	} else {
+	else
 		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
-	}
 	if (ret)
 		disk_free_zone_resources(disk);
 	blk_mq_unfreeze_queue(q);
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 0b2af273adaf..4ddd84752557 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -177,7 +177,7 @@ int null_register_zoned_dev(struct nullb *nullb)
 		disk->disk_name,
 		queue_emulates_zone_append(q) ? "emulated" : "native");
 
-	return blk_revalidate_disk_zones(disk, NULL);
+	return blk_revalidate_disk_zones(disk);
 }
 
 void null_free_zoned_dev(struct nullb_device *dev)
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index ab6af84e327c..851c78913de2 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub)
 
 static int ublk_revalidate_disk_zones(struct ublk_device *ub)
 {
-	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+	return blk_revalidate_disk_zones(ub->ub_disk);
 }
 
 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 42dea7601d87..c1af0a7d56c8 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 	 */
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
 		blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
-		err = blk_revalidate_disk_zones(vblk->disk, NULL);
+		err = blk_revalidate_disk_zones(vblk->disk);
 		if (err)
 			goto out_cleanup_disk;
 	}
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 34769f3e3175..d17ae4486a6a 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -169,7 +169,7 @@ static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
 	 * our table for dm_blk_report_zones() to use directly.
 	 */
 	md->zone_revalidate_map = t;
-	ret = blk_revalidate_disk_zones(disk, NULL);
+	ret = blk_revalidate_disk_zones(disk);
 	md->zone_revalidate_map = NULL;
 
 	if (ret) {
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 943d72bdd794..c9955ecd1790 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2150,7 +2150,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	blk_mq_unfreeze_queue(ns->disk->queue);
 
 	if (blk_queue_is_zoned(ns->queue)) {
-		ret = blk_revalidate_disk_zones(ns->disk, NULL);
+		ret = blk_revalidate_disk_zones(ns->disk);
 		if (ret && !nvme_first_scan(ns->disk))
 			goto out;
 	}
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index d0ead9858954..806036e48abe 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -572,7 +572,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
 	blk_queue_max_zone_append_sectors(q, 0);
 
 	flags = memalloc_noio_save();
-	ret = blk_revalidate_disk_zones(disk, NULL);
+	ret = blk_revalidate_disk_zones(disk);
 	memalloc_noio_restore(flags);
 	if (ret) {
 		sdkp->zone_info = (struct zoned_disk_info){ };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 46613bf6a402..fbc6860b3622 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -336,8 +336,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 		unsigned int nr_zones, report_zones_cb cb, void *data);
 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
 		sector_t sectors, sector_t nr_sectors);
-int blk_revalidate_disk_zones(struct gendisk *disk,
-		void (*update_driver_data)(struct gendisk *disk));
+int blk_revalidate_disk_zones(struct gendisk *disk);
 
 /*
  * Independent access ranges: struct blk_independent_access_range describes
-- 
cgit v1.2.3


From e4eb37cc0f3ed8971c50dddfbeb35a799e5b504e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:22 +0900
Subject: block: Remove elevator required features

The only elevator feature ever implemented is ELEVATOR_F_ZBD_SEQ_WRITE
for signaling that a scheduler implements zone write locking to tightly
control the dispatching order of write operations to zoned block
devices. With the removal of zone write locking support in mq-deadline
and the reliance of all block device drivers on the block layer zone
write plugging to control ordering of write operations to zones, the
elevator feature ELEVATOR_F_ZBD_SEQ_WRITE is completely unused.
Remove it, and also remove the now unused code for filtering the
possible schedulers for a block device based on required features.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-23-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-settings.c   | 16 ----------------
 block/elevator.c       | 46 +++++-----------------------------------------
 block/elevator.h       |  1 -
 include/linux/blkdev.h | 10 ----------
 4 files changed, 5 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index c0197e1e7485..715f4b6356c4 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -1052,22 +1052,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
-/**
- * blk_queue_required_elevator_features - Set a queue required elevator features
- * @q:		the request queue for the target device
- * @features:	Required elevator features OR'ed together
- *
- * Tell the block layer that for the device controlled through @q, only the
- * only elevators that can be used are those that implement at least the set of
- * features specified by @features.
- */
-void blk_queue_required_elevator_features(struct request_queue *q,
-					  unsigned int features)
-{
-	q->required_elevator_features = features;
-}
-EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
-
 /**
  * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
  * @q:		the request queue for the device
diff --git a/block/elevator.c b/block/elevator.c
index 5ff093cb3cf8..f64ebd726e58 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
 }
 EXPORT_SYMBOL(elv_bio_merge_ok);
 
-static inline bool elv_support_features(struct request_queue *q,
-		const struct elevator_type *e)
-{
-	return (q->required_elevator_features & e->elevator_features) ==
-		q->required_elevator_features;
-}
-
 /**
  * elevator_match - Check whether @e's name or alias matches @name
  * @e: Scheduler to test
@@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q,
 
 	spin_lock(&elv_list_lock);
 	e = __elevator_find(name);
-	if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
+	if (e && (!elevator_tryget(e)))
 		e = NULL;
 	spin_unlock(&elv_list_lock);
 	return e;
@@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
 }
 
 /*
- * Get the first elevator providing the features required by the request queue.
- * Default to "none" if no matching elevator is found.
- */
-static struct elevator_type *elevator_get_by_features(struct request_queue *q)
-{
-	struct elevator_type *e, *found = NULL;
-
-	spin_lock(&elv_list_lock);
-
-	list_for_each_entry(e, &elv_list, list) {
-		if (elv_support_features(q, e)) {
-			found = e;
-			break;
-		}
-	}
-
-	if (found && !elevator_tryget(found))
-		found = NULL;
-
-	spin_unlock(&elv_list_lock);
-	return found;
-}
-
-/*
- * For a device queue that has no required features, use the default elevator
- * settings. Otherwise, use the first elevator available matching the required
- * features. If no suitable elevator is find or if the chosen elevator
- * initialization fails, fall back to the "none" elevator (no elevator).
+ * Use the default elevator settings. If the chosen elevator initialization
+ * fails, fall back to the "none" elevator (no elevator).
  */
 void elevator_init_mq(struct request_queue *q)
 {
@@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q)
 	if (unlikely(q->elevator))
 		return;
 
-	if (!q->required_elevator_features)
-		e = elevator_get_default(q);
-	else
-		e = elevator_get_by_features(q);
+	e = elevator_get_default(q);
 	if (!e)
 		return;
 
@@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 	list_for_each_entry(e, &elv_list, list) {
 		if (e == cur)
 			len += sprintf(name+len, "[%s] ", e->elevator_name);
-		else if (elv_support_features(q, e))
+		else
 			len += sprintf(name+len, "%s ", e->elevator_name);
 	}
 	spin_unlock(&elv_list_lock);
diff --git a/block/elevator.h b/block/elevator.h
index 7ca3d7b6ed82..e9a050a96e53 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -74,7 +74,6 @@ struct elevator_type
 	struct elv_fs_entry *elevator_attrs;
 	const char *elevator_name;
 	const char *elevator_alias;
-	const unsigned int elevator_features;
 	struct module *elevator_owner;
 #ifdef CONFIG_BLK_DEBUG_FS
 	const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index fbc6860b3622..017e9d064177 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -453,8 +453,6 @@ struct request_queue {
 
 	atomic_t		nr_active_requests_shared_tags;
 
-	unsigned int		required_elevator_features;
-
 	struct blk_mq_tags	*sched_shared_tags;
 
 	struct list_head	icq_list;
@@ -958,14 +956,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges);
 void disk_set_independent_access_ranges(struct gendisk *disk,
 				struct blk_independent_access_ranges *iars);
 
-/*
- * Elevator features for blk_queue_required_elevator_features:
- */
-/* Supports zoned block devices sequential write constraint */
-#define ELEVATOR_F_ZBD_SEQ_WRITE	(1U << 0)
-
-extern void blk_queue_required_elevator_features(struct request_queue *q,
-						 unsigned int features);
 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
 					      struct device *dev);
 
-- 
cgit v1.2.3


From 02ccd7c360b1692da164842f211d41fab7d83adc Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:26 +0900
Subject: block: Remove zone write locking

Zone write locking is now unused and replaced with zone write plugging.
Remove all code that was implementing zone write locking, that is, the
various helper functions controlling request zone write locking and
the gendisk attached zone bitmaps.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-27-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c    |  1 -
 block/blk-zoned.c         | 66 ++-----------------------------------
 include/linux/blk-mq.h    | 83 -----------------------------------------------
 include/linux/blk_types.h |  1 -
 include/linux/blkdev.h    | 35 +++-----------------
 5 files changed, 7 insertions(+), 179 deletions(-)

(limited to 'include')

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index ca1f2b9422d5..770c0c2b72fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -256,7 +256,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(HASHED),
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
-	RQF_NAME(ZONE_WRITE_LOCKED),
 	RQF_NAME(TIMED_OUT),
 	RQF_NAME(RESV),
 };
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 44699b431ad0..3befebe6b319 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -115,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
 }
 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
 
-/*
- * Return true if a request is a write requests that needs zone write locking.
- */
-bool blk_req_needs_zone_write_lock(struct request *rq)
-{
-	if (!rq->q->disk->seq_zones_wlock)
-		return false;
-
-	return blk_rq_is_seq_zoned_write(rq);
-}
-EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
-
-bool blk_req_zone_write_trylock(struct request *rq)
-{
-	unsigned int zno = blk_rq_zone_no(rq);
-
-	if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
-		return false;
-
-	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
-	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-
-	return true;
-}
-EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
-
-void __blk_req_zone_write_lock(struct request *rq)
-{
-	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
-					  rq->q->disk->seq_zones_wlock)))
-		return;
-
-	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
-	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
-
-void __blk_req_zone_write_unlock(struct request *rq)
-{
-	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
-	if (rq->q->disk->seq_zones_wlock)
-		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
-						 rq->q->disk->seq_zones_wlock));
-}
-EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-
 /**
  * bdev_nr_zones - Get number of zones
  * @bdev:	Target device
@@ -1506,9 +1460,6 @@ void disk_free_zone_resources(struct gendisk *disk)
 
 	kfree(disk->conv_zones_bitmap);
 	disk->conv_zones_bitmap = NULL;
-	kfree(disk->seq_zones_wlock);
-	disk->seq_zones_wlock = NULL;
-
 	disk->zone_capacity = 0;
 	disk->nr_zones = 0;
 }
@@ -1556,7 +1507,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 struct blk_revalidate_zone_args {
 	struct gendisk	*disk;
 	unsigned long	*conv_zones_bitmap;
-	unsigned long	*seq_zones_wlock;
 	unsigned int	nr_zones;
 	unsigned int	zone_capacity;
 	sector_t	sector;
@@ -1574,7 +1524,6 @@ static int disk_update_zone_resources(struct gendisk *disk,
 
 	disk->nr_zones = args->nr_zones;
 	disk->zone_capacity = args->zone_capacity;
-	swap(disk->seq_zones_wlock, args->seq_zones_wlock);
 	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
 
 	/*
@@ -1664,13 +1613,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 		set_bit(idx, args->conv_zones_bitmap);
 		break;
 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
-		if (!args->seq_zones_wlock) {
-			args->seq_zones_wlock =
-				blk_alloc_zone_bitmap(q->node, args->nr_zones);
-			if (!args->seq_zones_wlock)
-				return -ENOMEM;
-		}
-
 		/*
 		 * Remember the capacity of the first sequential zone and check
 		 * if it is constant for all zones.
@@ -1712,7 +1654,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 }
 
 /**
- * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
+ * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
  * @disk:	Target disk
  *
  * Helper function for low-level device drivers to check, (re) allocate and
@@ -1786,9 +1728,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	}
 
 	/*
-	 * Install the new bitmaps and update nr_zones only once the queue is
-	 * stopped and all I/Os are completed (i.e. a scheduler is not
-	 * referencing the bitmaps).
+	 * Set the new disk zone parameters only once the queue is frozen and
+	 * all I/Os are completed.
 	 */
 	blk_mq_freeze_queue(q);
 	if (ret > 0)
@@ -1799,7 +1740,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 		disk_free_zone_resources(disk);
 	blk_mq_unfreeze_queue(q);
 
-	kfree(args.seq_zones_wlock);
 	kfree(args.conv_zones_bitmap);
 
 	return ret;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 60090c8366fb..89ba6b16fe8b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -54,8 +54,6 @@ typedef __u32 __bitwise req_flags_t;
 /* Look at ->special_vec for the actual data payload instead of the
    bio chain. */
 #define RQF_SPECIAL_PAYLOAD	((__force req_flags_t)(1 << 18))
-/* The per-zone write lock is held for this request */
-#define RQF_ZONE_WRITE_LOCKED	((__force req_flags_t)(1 << 19))
 /* The request completion needs to be signaled to zone write pluging. */
 #define RQF_ZONE_WRITE_PLUGGING	((__force req_flags_t)(1 << 20))
 /* ->timeout has been called, don't expire again */
@@ -1152,85 +1150,4 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 void blk_dump_rq_flags(struct request *, char *);
 
-#ifdef CONFIG_BLK_DEV_ZONED
-static inline unsigned int blk_rq_zone_no(struct request *rq)
-{
-	return disk_zone_no(rq->q->disk, blk_rq_pos(rq));
-}
-
-static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
-{
-	return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq));
-}
-
-/**
- * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization.
- * @rq: Request to examine.
- *
- * Note: REQ_OP_ZONE_APPEND requests do not require serialization.
- */
-static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
-{
-	return op_needs_zoned_write_locking(req_op(rq)) &&
-		blk_rq_zone_is_seq(rq);
-}
-
-bool blk_req_needs_zone_write_lock(struct request *rq);
-bool blk_req_zone_write_trylock(struct request *rq);
-void __blk_req_zone_write_lock(struct request *rq);
-void __blk_req_zone_write_unlock(struct request *rq);
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-	if (blk_req_needs_zone_write_lock(rq))
-		__blk_req_zone_write_lock(rq);
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-	if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
-		__blk_req_zone_write_unlock(rq);
-}
-
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-	return rq->q->disk->seq_zones_wlock &&
-		test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock);
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-	if (!blk_req_needs_zone_write_lock(rq))
-		return true;
-	return !blk_req_zone_is_write_locked(rq);
-}
-#else /* CONFIG_BLK_DEV_ZONED */
-static inline bool blk_rq_is_seq_zoned_write(struct request *rq)
-{
-	return false;
-}
-
-static inline bool blk_req_needs_zone_write_lock(struct request *rq)
-{
-	return false;
-}
-
-static inline void blk_req_zone_write_lock(struct request *rq)
-{
-}
-
-static inline void blk_req_zone_write_unlock(struct request *rq)
-{
-}
-static inline bool blk_req_zone_is_write_locked(struct request *rq)
-{
-	return false;
-}
-
-static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
-{
-	return true;
-}
-#endif /* CONFIG_BLK_DEV_ZONED */
-
 #endif /* BLK_MQ_H */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index ffe0c112b128..5751292fee6a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -297,7 +297,6 @@ enum {
 	BIO_QOS_THROTTLED,	/* bio went through rq_qos throttle path */
 	BIO_QOS_MERGED,		/* but went through rq_qos merge path */
 	BIO_REMAPPED,
-	BIO_ZONE_WRITE_LOCKED,	/* Owns a zoned device zone write lock */
 	BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
 	BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
 	BIO_FLAG_LAST
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 017e9d064177..7e8a68805324 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -177,23 +177,14 @@ struct gendisk {
 
 #ifdef CONFIG_BLK_DEV_ZONED
 	/*
-	 * Zoned block device information for request dispatch control.
-	 * nr_zones is the total number of zones of the device. This is always
-	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
-	 * bits which indicates if a zone is conventional (bit set) or
-	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
-	 * bits which indicates if a zone is write locked, that is, if a write
-	 * request targeting the zone was dispatched.
-	 *
-	 * Reads of this information must be protected with blk_queue_enter() /
-	 * blk_queue_exit(). Modifying this information is only allowed while
-	 * no requests are being processed. See also blk_mq_freeze_queue() and
-	 * blk_mq_unfreeze_queue().
+	 * Zoned block device information. Reads of this information must be
+	 * protected with blk_queue_enter() / blk_queue_exit(). Modifying this
+	 * information is only allowed while no requests are being processed.
+	 * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue().
 	 */
 	unsigned int		nr_zones;
 	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
-	unsigned long		*seq_zones_wlock;
 	unsigned int            zone_wplugs_hash_bits;
 	spinlock_t              zone_wplugs_lock;
 	struct mempool_s	*zone_wplugs_pool;
@@ -635,15 +626,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
 	return sector >> ilog2(disk->queue->limits.chunk_sectors);
 }
 
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
-	if (!blk_queue_is_zoned(disk->queue))
-		return false;
-	if (!disk->conv_zones_bitmap)
-		return true;
-	return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
-}
-
 static inline void disk_set_max_open_zones(struct gendisk *disk,
 		unsigned int max_open_zones)
 {
@@ -677,10 +659,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
 }
-static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
-{
-	return false;
-}
 static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector)
 {
 	return 0;
@@ -869,11 +847,6 @@ static inline bool bio_straddles_zones(struct bio *bio)
 		disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1);
 }
 
-static inline unsigned int bio_zone_is_seq(struct bio *bio)
-{
-	return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector);
-}
-
 /*
  * Return how much of the chunk is left to be used for I/O at a given offset.
  */
-- 
cgit v1.2.3


From 99a9476b27e89525cef653b91e542baf61f105d2 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 8 Apr 2024 10:41:28 +0900
Subject: block: Do not special-case plugging of zone write operations

With the block layer zone write plugging being automatically done for
any write operation to a zone of a zoned block device, a regular request
plugging handled through current->plug can only ever see at most a
single write request per zone. In such case, any potential reordering
of the plugged requests will be harmless. We can thus remove the special
casing for write operations to zones and have these requests plugged as
well. This allows removing the function blk_mq_plug and instead directly
using current->plug where needed.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-29-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  6 ------
 block/blk-merge.c      |  3 +--
 block/blk-mq.c         |  7 +------
 block/blk-mq.h         | 31 -------------------------------
 include/linux/blkdev.h | 12 ------------
 5 files changed, 2 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index e1a5344c2257..47400a4fe851 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -907,12 +907,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
 	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
 		return 0;
 
-	/*
-	 * As the requests that require a zone lock are not plugged in the
-	 * first place, directly accessing the plug instead of using
-	 * blk_mq_plug() should not have any consequences during flushing for
-	 * zoned devices.
-	 */
 	blk_flush_plug(current->plug, false);
 
 	/*
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7f8a808b74c1..f64115d72f3d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -1113,10 +1113,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs)
 {
-	struct blk_plug *plug;
+	struct blk_plug *plug = current->plug;
 	struct request *rq;
 
-	plug = blk_mq_plug(bio);
 	if (!plug || rq_list_empty(plug->mq_list))
 		return false;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9f2d9970eeba..434d45219e23 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1330,11 +1330,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
 
 	blk_account_io_start(rq);
 
-	/*
-	 * As plugging can be enabled for passthrough requests on a zoned
-	 * device, directly accessing the plug instead of using blk_mq_plug()
-	 * should not have any consequences.
-	 */
 	if (current->plug && !at_head) {
 		blk_add_rq_to_plug(current->plug, rq);
 		return;
@@ -2932,7 +2927,7 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
 void blk_mq_submit_bio(struct bio *bio)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	struct blk_plug *plug = blk_mq_plug(bio);
+	struct blk_plug *plug = current->plug;
 	const int is_sync = op_is_sync(bio->bi_opf);
 	struct blk_mq_hw_ctx *hctx;
 	unsigned int nr_segs = 1;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index f75a9ecfebde..260beea8e332 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 		qmap->mq_map[cpu] = 0;
 }
 
-/*
- * blk_mq_plug() - Get caller context plug
- * @bio : the bio being submitted by the caller context
- *
- * Plugging, by design, may delay the insertion of BIOs into the elevator in
- * order to increase BIO merging opportunities. This however can cause BIO
- * insertion order to change from the order in which submit_bio() is being
- * executed in the case of multiple contexts concurrently issuing BIOs to a
- * device, even if these context are synchronized to tightly control BIO issuing
- * order. While this is not a problem with regular block devices, this ordering
- * change can cause write BIO failures with zoned block devices as these
- * require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if it is for a zoned block
- * device and the BIO to plug is a write operation.
- *
- * Return current->plug if the bio can be plugged and NULL otherwise
- */
-static inline struct blk_plug *blk_mq_plug( struct bio *bio)
-{
-	/* Zoned block device write operation case: do not plug the BIO */
-	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
-	    bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
-		return NULL;
-
-	/*
-	 * For regular block devices or read operations, use the context plug
-	 * which may be NULL if blk_start_plug() was not executed.
-	 */
-	return current->plug;
-}
-
 /* Free all requests on the list */
 static inline void blk_mq_free_requests(struct list_head *list)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7e8a68805324..1ce8ba08e318 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1299,18 +1299,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec)
 	return disk_zone_no(bdev->bd_disk, sec);
 }
 
-/* Whether write serialization is required for @op on zoned devices. */
-static inline bool op_needs_zoned_write_locking(enum req_op op)
-{
-	return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
-}
-
-static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
-					  enum req_op op)
-{
-	return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op);
-}
-
 static inline sector_t bdev_zone_sectors(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-- 
cgit v1.2.3


From 8a7a6103258715857310253ec2193bcc4d1d7082 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 21 Feb 2024 23:31:56 +0200
Subject: gpiolib: Get rid of never false gpio_is_valid() calls

In the cases when gpio_is_valid() is called with unsigned parameter
the result is always true in the GPIO library code, hence the check
for false won't ever be true. Get rid of such calls.

While at it, move GPIO device base to be unsigned to clearly show
it won't ever be negative. This requires a new definition for the
maximum GPIO number in the system.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 drivers/gpio/gpiolib-legacy.c | 10 +++++-----
 drivers/gpio/gpiolib-sysfs.c  |  2 +-
 drivers/gpio/gpiolib.c        | 19 +++++++++----------
 drivers/gpio/gpiolib.h        |  2 +-
 include/linux/gpio.h          |  6 ++++++
 5 files changed, 22 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/gpio/gpiolib-legacy.c b/drivers/gpio/gpiolib-legacy.c
index 616d60187f85..5a9911ae9125 100644
--- a/drivers/gpio/gpiolib-legacy.c
+++ b/drivers/gpio/gpiolib-legacy.c
@@ -28,10 +28,9 @@ int gpio_request_one(unsigned gpio, unsigned long flags, const char *label)
 	struct gpio_desc *desc;
 	int err;
 
-	desc = gpio_to_desc(gpio);
-
 	/* Compatibility: assume unavailable "valid" GPIOs will appear later */
-	if (!desc && gpio_is_valid(gpio))
+	desc = gpio_to_desc(gpio);
+	if (!desc)
 		return -EPROBE_DEFER;
 
 	err = gpiod_request(desc, label);
@@ -63,10 +62,11 @@ EXPORT_SYMBOL_GPL(gpio_request_one);
  */
 int gpio_request(unsigned gpio, const char *label)
 {
-	struct gpio_desc *desc = gpio_to_desc(gpio);
+	struct gpio_desc *desc;
 
 	/* Compatibility: assume unavailable "valid" GPIOs will appear later */
-	if (!desc && gpio_is_valid(gpio))
+	desc = gpio_to_desc(gpio);
+	if (!desc)
 		return -EPROBE_DEFER;
 
 	return gpiod_request(desc, label);
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index 6853ecd98bcb..26202586fd39 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -412,7 +412,7 @@ static ssize_t base_show(struct device *dev,
 {
 	const struct gpio_device *gdev = dev_get_drvdata(dev);
 
-	return sysfs_emit(buf, "%d\n", gdev->base);
+	return sysfs_emit(buf, "%u\n", gdev->base);
 }
 static DEVICE_ATTR_RO(base);
 
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index f20367b1ea8e..f749ece2d3cd 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -150,9 +150,6 @@ struct gpio_desc *gpio_to_desc(unsigned gpio)
 		}
 	}
 
-	if (!gpio_is_valid(gpio))
-		pr_warn("invalid GPIO %d\n", gpio);
-
 	return NULL;
 }
 EXPORT_SYMBOL_GPL(gpio_to_desc);
@@ -297,10 +294,10 @@ struct gpio_chip *gpio_device_get_chip(struct gpio_device *gdev)
 EXPORT_SYMBOL_GPL(gpio_device_get_chip);
 
 /* dynamic allocation of GPIOs, e.g. on a hotplugged device */
-static int gpiochip_find_base_unlocked(int ngpio)
+static int gpiochip_find_base_unlocked(u16 ngpio)
 {
+	unsigned int base = GPIO_DYNAMIC_BASE;
 	struct gpio_device *gdev;
-	int base = GPIO_DYNAMIC_BASE;
 
 	list_for_each_entry_srcu(gdev, &gpio_devices, list,
 				 lockdep_is_held(&gpio_devices_lock)) {
@@ -311,9 +308,11 @@ static int gpiochip_find_base_unlocked(int ngpio)
 		base = gdev->base + gdev->ngpio;
 		if (base < GPIO_DYNAMIC_BASE)
 			base = GPIO_DYNAMIC_BASE;
+		if (base > GPIO_DYNAMIC_MAX - ngpio)
+			break;
 	}
 
-	if (gpio_is_valid(base)) {
+	if (base <= GPIO_DYNAMIC_MAX - ngpio) {
 		pr_debug("%s: found new base at %d\n", __func__, base);
 		return base;
 	} else {
@@ -749,7 +748,7 @@ static int gpiochip_setup_dev(struct gpio_device *gdev)
 	if (ret)
 		goto err_remove_device;
 
-	dev_dbg(&gdev->dev, "registered GPIOs %d to %d on %s\n", gdev->base,
+	dev_dbg(&gdev->dev, "registered GPIOs %u to %u on %s\n", gdev->base,
 		gdev->base + gdev->ngpio - 1, gdev->label);
 
 	return 0;
@@ -4788,14 +4787,14 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_device *gdev)
 			value = gpio_chip_get_value(gc, desc);
 			is_irq = test_bit(FLAG_USED_AS_IRQ, &desc->flags);
 			active_low = test_bit(FLAG_ACTIVE_LOW, &desc->flags);
-			seq_printf(s, " gpio-%-3d (%-20.20s|%-20.20s) %s %s %s%s\n",
+			seq_printf(s, " gpio-%-3u (%-20.20s|%-20.20s) %s %s %s%s\n",
 				   gpio, desc->name ?: "", gpiod_get_label(desc),
 				   is_out ? "out" : "in ",
 				   value >= 0 ? (value ? "hi" : "lo") : "?  ",
 				   is_irq ? "IRQ " : "",
 				   active_low ? "ACTIVE LOW" : "");
 		} else if (desc->name) {
-			seq_printf(s, " gpio-%-3d (%-20.20s)\n", gpio, desc->name);
+			seq_printf(s, " gpio-%-3u (%-20.20s)\n", gpio, desc->name);
 		}
 
 		gpio++;
@@ -4867,7 +4866,7 @@ static int gpiolib_seq_show(struct seq_file *s, void *v)
 		return 0;
 	}
 
-	seq_printf(s, "%s%s: GPIOs %d-%d", priv->newline ? "\n" : "",
+	seq_printf(s, "%s%s: GPIOs %u-%u", priv->newline ? "\n" : "",
 		   dev_name(&gdev->dev),
 		   gdev->base, gdev->base + gdev->ngpio - 1);
 	parent = gc->parent;
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index f67d5991ab1c..7f94580efdbc 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -61,7 +61,7 @@ struct gpio_device {
 	struct module		*owner;
 	struct gpio_chip __rcu	*chip;
 	struct gpio_desc	*descs;
-	int			base;
+	unsigned int		base;
 	u16			ngpio;
 	bool			can_sleep;
 	const char		*label;
diff --git a/include/linux/gpio.h b/include/linux/gpio.h
index 4aaedcf424ce..56ac7e7a2889 100644
--- a/include/linux/gpio.h
+++ b/include/linux/gpio.h
@@ -74,6 +74,12 @@ static inline bool gpio_is_valid(int number)
  * Until they are all fixed, leave 0-512 space for them.
  */
 #define GPIO_DYNAMIC_BASE	512
+/*
+ * Define the maximum of the possible GPIO in the global numberspace.
+ * While the GPIO base and numbers are positive, we limit it with signed
+ * maximum as a lot of code is using negative values for special cases.
+ */
+#define GPIO_DYNAMIC_MAX	INT_MAX
 
 /* Always use the library code for GPIO management calls,
  * or when sleeping may be involved.
-- 
cgit v1.2.3


From 8c467f3300591a206fa8dcc6988d768910799872 Mon Sep 17 00:00:00 2001
From: Alexey Gladkov <legion@kernel.org>
Date: Wed, 17 Apr 2024 19:37:35 +0200
Subject: VT: Use macros to define ioctls

All other headers use _IOC() macros to describe ioctls for a long time
now. This header is stuck in the last century.

Simply use the _IO() macro. No other changes.

Signed-off-by: Alexey Gladkov <legion@kernel.org>
Link: https://lore.kernel.org/r/e4229fe2933a003341e338b558ab1ea8b63a51f6.1713375378.git.legion@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/kd.h | 96 +++++++++++++++++++++++++------------------------
 1 file changed, 49 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/kd.h b/include/uapi/linux/kd.h
index 6b384065c013..8ddb2219a84b 100644
--- a/include/uapi/linux/kd.h
+++ b/include/uapi/linux/kd.h
@@ -5,60 +5,61 @@
 #include <linux/compiler.h>
 
 /* 0x4B is 'K', to avoid collision with termios and vt */
+#define KD_IOCTL_BASE	'K'
 
-#define GIO_FONT	0x4B60	/* gets font in expanded form */
-#define PIO_FONT	0x4B61	/* use font in expanded form */
+#define GIO_FONT	_IO(KD_IOCTL_BASE, 0x60)	/* gets font in expanded form */
+#define PIO_FONT	_IO(KD_IOCTL_BASE, 0x61)	/* use font in expanded form */
 
-#define GIO_FONTX	0x4B6B	/* get font using struct consolefontdesc */
-#define PIO_FONTX	0x4B6C	/* set font using struct consolefontdesc */
+#define GIO_FONTX	_IO(KD_IOCTL_BASE, 0x6B)	/* get font using struct consolefontdesc */
+#define PIO_FONTX	_IO(KD_IOCTL_BASE, 0x6C)	/* set font using struct consolefontdesc */
 struct consolefontdesc {
 	unsigned short charcount;	/* characters in font (256 or 512) */
 	unsigned short charheight;	/* scan lines per character (1-32) */
 	char __user *chardata;		/* font data in expanded form */
 };
 
-#define PIO_FONTRESET   0x4B6D	/* reset to default font */
+#define PIO_FONTRESET	_IO(KD_IOCTL_BASE, 0x6D)	/* reset to default font */
 
-#define GIO_CMAP	0x4B70	/* gets colour palette on VGA+ */
-#define PIO_CMAP	0x4B71	/* sets colour palette on VGA+ */
+#define GIO_CMAP	_IO(KD_IOCTL_BASE, 0x70)	/* gets colour palette on VGA+ */
+#define PIO_CMAP	_IO(KD_IOCTL_BASE, 0x71)	/* sets colour palette on VGA+ */
 
-#define KIOCSOUND	0x4B2F	/* start sound generation (0 for off) */
-#define KDMKTONE	0x4B30	/* generate tone */
+#define KIOCSOUND	_IO(KD_IOCTL_BASE, 0x2F)	/* start sound generation (0 for off) */
+#define KDMKTONE	_IO(KD_IOCTL_BASE, 0x30)	/* generate tone */
 
-#define KDGETLED	0x4B31	/* return current led state */
-#define KDSETLED	0x4B32	/* set led state [lights, not flags] */
+#define KDGETLED	_IO(KD_IOCTL_BASE, 0x31)	/* return current led state */
+#define KDSETLED	_IO(KD_IOCTL_BASE, 0x32)	/* set led state [lights, not flags] */
 #define 	LED_SCR		0x01	/* scroll lock led */
 #define 	LED_NUM		0x02	/* num lock led */
 #define 	LED_CAP		0x04	/* caps lock led */
 
-#define KDGKBTYPE	0x4B33	/* get keyboard type */
+#define KDGKBTYPE	_IO(KD_IOCTL_BASE, 0x33)	/* get keyboard type */
 #define 	KB_84		0x01
 #define 	KB_101		0x02 	/* this is what we always answer */
 #define 	KB_OTHER	0x03
 
-#define KDADDIO		0x4B34	/* add i/o port as valid */
-#define KDDELIO		0x4B35	/* del i/o port as valid */
-#define KDENABIO	0x4B36	/* enable i/o to video board */
-#define KDDISABIO	0x4B37	/* disable i/o to video board */
+#define KDADDIO		_IO(KD_IOCTL_BASE, 0x34)	/* add i/o port as valid */
+#define KDDELIO		_IO(KD_IOCTL_BASE, 0x35)	/* del i/o port as valid */
+#define KDENABIO	_IO(KD_IOCTL_BASE, 0x36)	/* enable i/o to video board */
+#define KDDISABIO	_IO(KD_IOCTL_BASE, 0x37)	/* disable i/o to video board */
 
-#define KDSETMODE	0x4B3A	/* set text/graphics mode */
+#define KDSETMODE	_IO(KD_IOCTL_BASE, 0x3A)	/* set text/graphics mode */
 #define		KD_TEXT		0x00
 #define		KD_GRAPHICS	0x01
 #define		KD_TEXT0	0x02	/* obsolete */
 #define		KD_TEXT1	0x03	/* obsolete */
-#define KDGETMODE	0x4B3B	/* get current mode */
+#define KDGETMODE	_IO(KD_IOCTL_BASE, 0x3B)	/* get current mode */
 
-#define KDMAPDISP	0x4B3C	/* map display into address space */
-#define KDUNMAPDISP	0x4B3D	/* unmap display from address space */
+#define KDMAPDISP	_IO(KD_IOCTL_BASE, 0x3C)	/* map display into address space */
+#define KDUNMAPDISP	_IO(KD_IOCTL_BASE, 0x3D)	/* unmap display from address space */
 
 typedef char scrnmap_t;
 #define		E_TABSZ		256
-#define GIO_SCRNMAP	0x4B40	/* get screen mapping from kernel */
-#define PIO_SCRNMAP	0x4B41	/* put screen mapping table in kernel */
-#define GIO_UNISCRNMAP  0x4B69	/* get full Unicode screen mapping */
-#define PIO_UNISCRNMAP  0x4B6A  /* set full Unicode screen mapping */
+#define GIO_SCRNMAP	_IO(KD_IOCTL_BASE, 0x40)	/* get screen mapping from kernel */
+#define PIO_SCRNMAP	_IO(KD_IOCTL_BASE, 0x41)	/* put screen mapping table in kernel */
+#define GIO_UNISCRNMAP	_IO(KD_IOCTL_BASE, 0x69)	/* get full Unicode screen mapping */
+#define PIO_UNISCRNMAP	_IO(KD_IOCTL_BASE, 0x6A)	/* set full Unicode screen mapping */
 
-#define GIO_UNIMAP	0x4B66	/* get unicode-to-font mapping from kernel */
+#define GIO_UNIMAP	_IO(KD_IOCTL_BASE, 0x66)	/* get unicode-to-font mapping from kernel */
 struct unipair {
 	unsigned short unicode;
 	unsigned short fontpos;
@@ -67,8 +68,8 @@ struct unimapdesc {
 	unsigned short entry_ct;
 	struct unipair __user *entries;
 };
-#define PIO_UNIMAP	0x4B67	/* put unicode-to-font mapping in kernel */
-#define PIO_UNIMAPCLR	0x4B68	/* clear table, possibly advise hash algorithm */
+#define PIO_UNIMAP	_IO(KD_IOCTL_BASE, 0x67)	/* put unicode-to-font mapping in kernel */
+#define PIO_UNIMAPCLR	_IO(KD_IOCTL_BASE, 0x68)	/* clear table, possibly advise hash algorithm */
 struct unimapinit {
 	unsigned short advised_hashsize;  /* 0 if no opinion */
 	unsigned short advised_hashstep;  /* 0 if no opinion */
@@ -83,19 +84,19 @@ struct unimapinit {
 #define		K_MEDIUMRAW	0x02
 #define		K_UNICODE	0x03
 #define		K_OFF		0x04
-#define KDGKBMODE	0x4B44	/* gets current keyboard mode */
-#define KDSKBMODE	0x4B45	/* sets current keyboard mode */
+#define KDGKBMODE	_IO(KD_IOCTL_BASE, 0x44)	/* gets current keyboard mode */
+#define KDSKBMODE	_IO(KD_IOCTL_BASE, 0x45)	/* sets current keyboard mode */
 
 #define		K_METABIT	0x03
 #define		K_ESCPREFIX	0x04
-#define KDGKBMETA	0x4B62	/* gets meta key handling mode */
-#define KDSKBMETA	0x4B63	/* sets meta key handling mode */
+#define KDGKBMETA	_IO(KD_IOCTL_BASE, 0x62)	/* gets meta key handling mode */
+#define KDSKBMETA	_IO(KD_IOCTL_BASE, 0x63)	/* sets meta key handling mode */
 
 #define		K_SCROLLLOCK	0x01
 #define		K_NUMLOCK	0x02
 #define		K_CAPSLOCK	0x04
-#define	KDGKBLED	0x4B64	/* get led flags (not lights) */
-#define	KDSKBLED	0x4B65	/* set led flags (not lights) */
+#define	KDGKBLED	_IO(KD_IOCTL_BASE, 0x64)	/* get led flags (not lights) */
+#define	KDSKBLED	_IO(KD_IOCTL_BASE, 0x65)	/* set led flags (not lights) */
 
 struct kbentry {
 	unsigned char kb_table;
@@ -107,15 +108,15 @@ struct kbentry {
 #define		K_ALTTAB	0x02
 #define		K_ALTSHIFTTAB	0x03
 
-#define KDGKBENT	0x4B46	/* gets one entry in translation table */
-#define KDSKBENT	0x4B47	/* sets one entry in translation table */
+#define KDGKBENT	_IO(KD_IOCTL_BASE, 0x46)	/* gets one entry in translation table */
+#define KDSKBENT	_IO(KD_IOCTL_BASE, 0x47)	/* sets one entry in translation table */
 
 struct kbsentry {
 	unsigned char kb_func;
 	unsigned char kb_string[512];
 };
-#define KDGKBSENT	0x4B48	/* gets one function key string entry */
-#define KDSKBSENT	0x4B49	/* sets one function key string entry */
+#define KDGKBSENT	_IO(KD_IOCTL_BASE, 0x48)	/* gets one function key string entry */
+#define KDSKBSENT	_IO(KD_IOCTL_BASE, 0x49)	/* sets one function key string entry */
 
 struct kbdiacr {
         unsigned char diacr, base, result;
@@ -124,8 +125,8 @@ struct kbdiacrs {
         unsigned int kb_cnt;    /* number of entries in following array */
 	struct kbdiacr kbdiacr[256];    /* MAX_DIACR from keyboard.h */
 };
-#define KDGKBDIACR      0x4B4A  /* read kernel accent table */
-#define KDSKBDIACR      0x4B4B  /* write kernel accent table */
+#define KDGKBDIACR	_IO(KD_IOCTL_BASE, 0x4A)  /* read kernel accent table */
+#define KDSKBDIACR	_IO(KD_IOCTL_BASE, 0x4B)  /* write kernel accent table */
 
 struct kbdiacruc {
 	unsigned int diacr, base, result;
@@ -134,16 +135,16 @@ struct kbdiacrsuc {
         unsigned int kb_cnt;    /* number of entries in following array */
 	struct kbdiacruc kbdiacruc[256];    /* MAX_DIACR from keyboard.h */
 };
-#define KDGKBDIACRUC    0x4BFA  /* read kernel accent table - UCS */
-#define KDSKBDIACRUC    0x4BFB  /* write kernel accent table - UCS */
+#define KDGKBDIACRUC	_IO(KD_IOCTL_BASE, 0xFA)  /* read kernel accent table - UCS */
+#define KDSKBDIACRUC	_IO(KD_IOCTL_BASE, 0xFB)  /* write kernel accent table - UCS */
 
 struct kbkeycode {
 	unsigned int scancode, keycode;
 };
-#define KDGETKEYCODE	0x4B4C	/* read kernel keycode table entry */
-#define KDSETKEYCODE	0x4B4D	/* write kernel keycode table entry */
+#define KDGETKEYCODE	_IO(KD_IOCTL_BASE, 0x4C)	/* read kernel keycode table entry */
+#define KDSETKEYCODE	_IO(KD_IOCTL_BASE, 0x4D)	/* write kernel keycode table entry */
 
-#define KDSIGACCEPT	0x4B4E	/* accept kbd generated signals */
+#define KDSIGACCEPT	_IO(KD_IOCTL_BASE, 0x4E)	/* accept kbd generated signals */
 
 struct kbd_repeat {
 	int delay;	/* in msec; <= 0: don't change */
@@ -151,10 +152,11 @@ struct kbd_repeat {
 			/* earlier this field was misnamed "rate" */
 };
 
-#define KDKBDREP        0x4B52  /* set keyboard delay/repeat rate;
-				 * actually used values are returned */
+#define KDKBDREP	_IO(KD_IOCTL_BASE, 0x52)	/* set keyboard delay/repeat rate;
+							 * actually used values are returned
+							 */
 
-#define KDFONTOP	0x4B72	/* font operations */
+#define KDFONTOP	_IO(KD_IOCTL_BASE, 0x72)	/* font operations */
 
 struct console_font_op {
 	unsigned int op;	/* operation code KD_FONT_OP_* */
-- 
cgit v1.2.3


From a1de26c0b74f060ee29f35a6d71765c56c925f5e Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Tue, 9 Apr 2024 11:38:08 +0300
Subject: ALSA: hda: Introduce flags to force commands via PIO instead of CORB

Add AZX_DCAPS_PIO_COMMANDS quirk (bit 31) and use_pio_for_commands flag to
be able to select PIO mode as alternative for CORB based command sending
while retaining the RIRB functionality to receive unsolicited responses.

This mode differs from the azx single_cmd mode when RIRB is disabled.

The mixed mode is needed on Lunar Lake family because it is recommended to
use Immediate Command Response (PIO mode) instead of CORB for HDA commands.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Liam Girdwood <liam.r.girdwood@intel.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240409083812.14001-2-peter.ujfalusi@linux.intel.com>
---
 include/sound/hdaudio.h        | 1 +
 sound/pci/hda/hda_controller.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index a73d7f34f4e5..85d2c3d00a27 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -351,6 +351,7 @@ struct hdac_bus {
 	bool needs_damn_long_delay:1;
 	bool not_use_interrupts:1;	/* prohibiting the RIRB IRQ */
 	bool access_sdnctl_in_dword:1;	/* accessing the sdnctl register by dword */
+	bool use_pio_for_commands:1;	/* Use PIO instead of CORB for commands */
 
 	int poll_count;
 
diff --git a/sound/pci/hda/hda_controller.h b/sound/pci/hda/hda_controller.h
index 8556031bcd68..c2d0109866e6 100644
--- a/sound/pci/hda/hda_controller.h
+++ b/sound/pci/hda/hda_controller.h
@@ -45,6 +45,7 @@
 #define AZX_DCAPS_CORBRP_SELF_CLEAR (1 << 28)	/* CORBRP clears itself after reset */
 #define AZX_DCAPS_NO_MSI64      (1 << 29)	/* Stick to 32-bit MSIs */
 #define AZX_DCAPS_SEPARATE_STREAM_TAG	(1 << 30) /* capture and playback use separate stream tag */
+#define AZX_DCAPS_PIO_COMMANDS (1 << 31)	/* Use PIO instead of CORB for commands */
 
 enum {
 	AZX_SNOOP_TYPE_NONE,
-- 
cgit v1.2.3


From ef752c60e41e4e2d7030238e8978e057878a97b1 Mon Sep 17 00:00:00 2001
From: Stefan Binding <sbinding@opensource.cirrus.com>
Date: Thu, 11 Apr 2024 12:08:07 +0100
Subject: ALSA: hda: cs35l41: Set the max PCM Gain using tuning setting

Some systems requires different max PCM Gains settings than the default.
The current default value, when running firmware is 17.5 dB, which is
used for all systems. Some systems require lower values.
Value when running without firmware is 4.5 dB and remains unchanged.

Since the gain value is dependent on Tuning and Firmware, it can
change, so it cannot be saved in _DSD. Instead we can store it inside
a configuration binary file alongside the Firmware and Tuning files.

The gain value increments in steps of 1 dB, with value 0 representing
0.5 dB. The max value is 20, which corresponds to 20.5 dB.

Signed-off-by: Stefan Binding <sbinding@opensource.cirrus.com>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240411110813.330483-2-sbinding@opensource.cirrus.com>
---
 include/sound/cs35l41.h     |   5 ++
 sound/pci/hda/cs35l41_hda.c | 170 +++++++++++++++++++++++++++++++++++++++++---
 sound/pci/hda/cs35l41_hda.h |   3 +
 3 files changed, 167 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/sound/cs35l41.h b/include/sound/cs35l41.h
index 68e053fe7340..bb70782d15d0 100644
--- a/include/sound/cs35l41.h
+++ b/include/sound/cs35l41.h
@@ -554,6 +554,11 @@
 #define CS35L41_LRCLK_FRC_SHIFT		1
 
 #define CS35L41_AMP_GAIN_PCM_MASK	0x3E0
+#define CS35L41_AMP_GAIN_PCM_SHIFT	5
+#define CS35L41_AMP_GAIN_PDM_MASK	0x1F
+#define CS35L41_AMP_GAIN_PDM_SHIFT	0
+#define CS35L41_AMP_GAIN_PCM_MAX	20
+#define CS35L41_AMP_GAIN_PDM_MAX	20
 #define CS35L41_AMP_GAIN_ZC_MASK	0x0400
 #define CS35L41_AMP_GAIN_ZC_SHIFT	10
 
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index d3fa6e136744..12609d6d96c2 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -37,6 +37,32 @@
 #define CS35L41_UUID			"50d90cdc-3de4-4f18-b528-c7fe3b71f40d"
 #define CS35L41_DSM_GET_MUTE		5
 #define CS35L41_NOTIFY_EVENT		0x91
+#define CS35L41_TUNING_SIG		0x109A4A35
+
+enum cs35l41_tuning_param_types {
+	TUNING_PARAM_GAIN,
+};
+
+struct cs35l41_tuning_param_hdr {
+	__le32 tuning_index;
+	__le32 type;
+	__le32 size;
+} __packed;
+
+struct cs35l41_tuning_param {
+	struct cs35l41_tuning_param_hdr hdr;
+	union {
+		__le32 gain;
+	};
+} __packed;
+
+struct cs35l41_tuning_params {
+	__le32 signature;
+	__le32 version;
+	__le32 size;
+	__le32 num_entries;
+	u8 data[];
+} __packed;
 
 static bool firmware_autostart = 1;
 module_param(firmware_autostart, bool, 0444);
@@ -92,11 +118,6 @@ static const struct reg_sequence cs35l41_hda_unmute[] = {
 	{ CS35L41_AMP_GAIN_CTRL,	0x00000084 }, // AMP_GAIN_PCM 4.5 dB
 };
 
-static const struct reg_sequence cs35l41_hda_unmute_dsp[] = {
-	{ CS35L41_AMP_DIG_VOL_CTRL,	0x00008000 }, // AMP_HPF_PCM_EN = 1, AMP_VOL_PCM  0.0 dB
-	{ CS35L41_AMP_GAIN_CTRL,	0x00000233 }, // AMP_GAIN_PCM = 17.5dB AMP_GAIN_PDM = 19.5dB
-};
-
 static const struct reg_sequence cs35l41_hda_mute[] = {
 	{ CS35L41_AMP_GAIN_CTRL,	0x00000000 }, // AMP_GAIN_PCM 0.5 dB
 	{ CS35L41_AMP_DIG_VOL_CTRL,	0x0000A678 }, // AMP_HPF_PCM_EN = 1, AMP_VOL_PCM Mute
@@ -117,6 +138,27 @@ static const struct cs_dsp_client_ops client_ops = {
 	.control_remove = hda_cs_dsp_control_remove,
 };
 
+static int cs35l41_request_tuning_param_file(struct cs35l41_hda *cs35l41, char *tuning_filename,
+					     const struct firmware **firmware, char **filename,
+					     const char *ssid)
+{
+	int ret = 0;
+
+	/* Filename is the same as the tuning file with "cfg" suffix */
+	*filename = kasprintf(GFP_KERNEL, "%scfg", tuning_filename);
+	if (*filename == NULL)
+		return -ENOMEM;
+
+	ret = firmware_request_nowarn(firmware, *filename, cs35l41->dev);
+	if (ret != 0) {
+		dev_dbg(cs35l41->dev, "Failed to request '%s'\n", *filename);
+		kfree(*filename);
+		*filename = NULL;
+	}
+
+	return ret;
+}
+
 static int cs35l41_request_firmware_file(struct cs35l41_hda *cs35l41,
 					 const struct firmware **firmware, char **filename,
 					 const char *dir, const char *ssid, const char *amp_name,
@@ -451,6 +493,94 @@ static int cs35l41_save_calibration(struct cs35l41_hda *cs35l41)
 }
 #endif
 
+static void cs35l41_set_default_tuning_params(struct cs35l41_hda *cs35l41)
+{
+	cs35l41->tuning_gain = DEFAULT_AMP_GAIN_PCM;
+}
+
+static int cs35l41_read_tuning_params(struct cs35l41_hda *cs35l41, const struct firmware *firmware)
+{
+	struct cs35l41_tuning_params *params;
+	unsigned int offset = 0;
+	unsigned int end;
+	int i;
+
+	params = (void *)&firmware->data[0];
+
+	if (le32_to_cpu(params->size) != firmware->size) {
+		dev_err(cs35l41->dev, "Wrong Size for Tuning Param file. Expected %d got %zu\n",
+			le32_to_cpu(params->size), firmware->size);
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(params->version) != 1) {
+		dev_err(cs35l41->dev, "Unsupported Tuning Param Version: %d\n",
+			le32_to_cpu(params->version));
+		return -EINVAL;
+	}
+
+	if (le32_to_cpu(params->signature) != CS35L41_TUNING_SIG) {
+		dev_err(cs35l41->dev,
+			"Mismatched Signature for Tuning Param file. Expected %#x got %#x\n",
+			CS35L41_TUNING_SIG, le32_to_cpu(params->signature));
+		return -EINVAL;
+	}
+
+	end = firmware->size - sizeof(struct cs35l41_tuning_params);
+
+	for (i = 0; i < le32_to_cpu(params->num_entries); i++) {
+		struct cs35l41_tuning_param *param;
+
+		if ((offset >= end) || ((offset + sizeof(struct cs35l41_tuning_param_hdr)) >= end))
+			return -EFAULT;
+
+		param = (void *)&params->data[offset];
+		offset += le32_to_cpu(param->hdr.size);
+
+		if (offset > end)
+			return -EFAULT;
+
+		switch (le32_to_cpu(param->hdr.type)) {
+		case TUNING_PARAM_GAIN:
+			cs35l41->tuning_gain = le32_to_cpu(param->gain);
+			dev_dbg(cs35l41->dev, "Applying Gain: %d\n", cs35l41->tuning_gain);
+			break;
+		default:
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static int cs35l41_load_tuning_params(struct cs35l41_hda *cs35l41, char *tuning_filename)
+{
+	const struct firmware *tuning_param_file = NULL;
+	char *tuning_param_filename = NULL;
+	int ret;
+
+	ret = cs35l41_request_tuning_param_file(cs35l41, tuning_filename, &tuning_param_file,
+						&tuning_param_filename, cs35l41->acpi_subsystem_id);
+	if (ret) {
+		dev_dbg(cs35l41->dev, "Missing Tuning Param for file: %s: %d\n", tuning_filename,
+			ret);
+		return 0;
+	}
+
+	ret = cs35l41_read_tuning_params(cs35l41, tuning_param_file);
+	if (ret) {
+		dev_err(cs35l41->dev, "Error reading Tuning Params from file: %s: %d\n",
+			tuning_param_filename, ret);
+		/* Reset to default Tuning Parameters */
+		cs35l41_set_default_tuning_params(cs35l41);
+	}
+
+	release_firmware(tuning_param_file);
+	kfree(tuning_param_filename);
+
+	return ret;
+}
+
 static int cs35l41_init_dsp(struct cs35l41_hda *cs35l41)
 {
 	const struct firmware *coeff_firmware = NULL;
@@ -470,27 +600,35 @@ static int cs35l41_init_dsp(struct cs35l41_hda *cs35l41)
 		cs35l41->halo_initialized = true;
 	}
 
+	cs35l41_set_default_tuning_params(cs35l41);
+
 	ret = cs35l41_request_firmware_files(cs35l41, &wmfw_firmware, &wmfw_filename,
 					     &coeff_firmware, &coeff_filename);
 	if (ret < 0)
 		return ret;
 
 	dev_dbg(cs35l41->dev, "Loading WMFW Firmware: %s\n", wmfw_filename);
-	if (coeff_filename)
+	if (coeff_filename) {
 		dev_dbg(cs35l41->dev, "Loading Coefficient File: %s\n", coeff_filename);
-	else
+		ret = cs35l41_load_tuning_params(cs35l41, coeff_filename);
+		if (ret)
+			dev_warn(cs35l41->dev, "Unable to load Tuning Parameters: %d\n", ret);
+	} else {
 		dev_warn(cs35l41->dev, "No Coefficient File available.\n");
+	}
 
 	ret = cs_dsp_power_up(dsp, wmfw_firmware, wmfw_filename, coeff_firmware, coeff_filename,
 			      hda_cs_dsp_fw_ids[cs35l41->firmware_type]);
 	if (ret)
-		goto err_release;
+		goto err;
 
 	cs35l41_add_controls(cs35l41);
 
 	ret = cs35l41_save_calibration(cs35l41);
 
-err_release:
+err:
+	if (ret)
+		cs35l41_set_default_tuning_params(cs35l41);
 	release_firmware(wmfw_firmware);
 	release_firmware(coeff_firmware);
 	kfree(wmfw_filename);
@@ -503,6 +641,7 @@ static void cs35l41_shutdown_dsp(struct cs35l41_hda *cs35l41)
 {
 	struct cs_dsp *dsp = &cs35l41->cs_dsp;
 
+	cs35l41_set_default_tuning_params(cs35l41);
 	cs_dsp_stop(dsp);
 	cs_dsp_power_down(dsp);
 	dev_dbg(cs35l41->dev, "Unloaded Firmware\n");
@@ -570,6 +709,7 @@ static void cs35l41_mute(struct device *dev, bool mute)
 {
 	struct cs35l41_hda *cs35l41 = dev_get_drvdata(dev);
 	struct regmap *reg = cs35l41->regmap;
+	unsigned int amp_gain;
 
 	dev_dbg(dev, "Mute(%d:%d) Playback Started: %d\n", mute, cs35l41->mute_override,
 		cs35l41->playback_started);
@@ -581,8 +721,13 @@ static void cs35l41_mute(struct device *dev, bool mute)
 		} else {
 			dev_dbg(dev, "Unmuting\n");
 			if (cs35l41->cs_dsp.running) {
-				regmap_multi_reg_write(reg, cs35l41_hda_unmute_dsp,
-						ARRAY_SIZE(cs35l41_hda_unmute_dsp));
+				dev_dbg(dev, "Using Tuned Gain: %d\n", cs35l41->tuning_gain);
+				amp_gain = (cs35l41->tuning_gain << CS35L41_AMP_GAIN_PCM_SHIFT) |
+					(DEFAULT_AMP_GAIN_PDM << CS35L41_AMP_GAIN_PDM_SHIFT);
+
+				/* AMP_HPF_PCM_EN = 1, AMP_VOL_PCM  0.0 dB */
+				regmap_write(reg, CS35L41_AMP_DIG_VOL_CTRL, 0x00008000);
+				regmap_write(reg, CS35L41_AMP_GAIN_CTRL, amp_gain);
 			} else {
 				regmap_multi_reg_write(reg, cs35l41_hda_unmute,
 						ARRAY_SIZE(cs35l41_hda_unmute));
@@ -1056,6 +1201,9 @@ static int cs35l41_smart_amp(struct cs35l41_hda *cs35l41)
 		goto clean_dsp;
 	}
 
+	dev_info(cs35l41->dev, "Firmware Loaded - Type: %s, Gain: %d\n",
+		 hda_cs_dsp_fw_ids[cs35l41->firmware_type], cs35l41->tuning_gain);
+
 	return 0;
 
 clean_dsp:
diff --git a/sound/pci/hda/cs35l41_hda.h b/sound/pci/hda/cs35l41_hda.h
index 43d55292b327..d60aa98bfafc 100644
--- a/sound/pci/hda/cs35l41_hda.h
+++ b/sound/pci/hda/cs35l41_hda.h
@@ -21,6 +21,8 @@
 #include <linux/firmware/cirrus/wmfw.h>
 
 #define CS35L41_MAX_ACCEPTABLE_SPI_SPEED_HZ	1000000
+#define DEFAULT_AMP_GAIN_PCM			17	/* 17.5dB Gain */
+#define DEFAULT_AMP_GAIN_PDM			19	/* 19.5dB Gain */
 
 struct cs35l41_amp_cal_data {
 	u32 calTarget[2];
@@ -83,6 +85,7 @@ struct cs35l41_hda {
 	bool mute_override;
 	enum control_bus control_bus;
 	bool bypass_fw;
+	unsigned int tuning_gain;
 
 };
 
-- 
cgit v1.2.3


From ad88ea67b135f74a9f32dc5404a35c773b2a3925 Mon Sep 17 00:00:00 2001
From: Pavel Hofman <pavel.hofman@ivitera.com>
Date: Tue, 16 Apr 2024 14:17:25 +0200
Subject: ALSA: pcm: add support for 705.6kHz and 768kHz sample rates

Many modern codecs support 705.6kHz and 768kHz sample rates. Current HW
params fail to set 705.6kHz and 768kHz sample rates as these are not in the
known-rates list.

Add these new rates to the known-rates list to allow them.

Also add defines in pcm.h so that drivers can use it.

Signed-off-by: Pavel Hofman <pavel.hofman@ivitera.com>
Reviewed-by: Jaroslav Kysela <perex@perex.cz>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240416121726.628679-3-pavel.hofman@ivitera.com>
---
 include/sound/pcm.h     | 5 +++++
 sound/core/pcm_native.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/pcm.h b/include/sound/pcm.h
index 210096f124ee..61c6054618c8 100644
--- a/include/sound/pcm.h
+++ b/include/sound/pcm.h
@@ -120,6 +120,8 @@ struct snd_pcm_ops {
 #define SNDRV_PCM_RATE_192000		(1U<<12)	/* 192000Hz */
 #define SNDRV_PCM_RATE_352800		(1U<<13)	/* 352800Hz */
 #define SNDRV_PCM_RATE_384000		(1U<<14)	/* 384000Hz */
+#define SNDRV_PCM_RATE_705600		(1U<<15)	/* 705600Hz */
+#define SNDRV_PCM_RATE_768000		(1U<<16)	/* 768000Hz */
 
 #define SNDRV_PCM_RATE_CONTINUOUS	(1U<<30)	/* continuous range */
 #define SNDRV_PCM_RATE_KNOT		(1U<<31)	/* supports more non-continuos rates */
@@ -135,6 +137,9 @@ struct snd_pcm_ops {
 #define SNDRV_PCM_RATE_8000_384000	(SNDRV_PCM_RATE_8000_192000|\
 					 SNDRV_PCM_RATE_352800|\
 					 SNDRV_PCM_RATE_384000)
+#define SNDRV_PCM_RATE_8000_768000	(SNDRV_PCM_RATE_8000_384000|\
+					 SNDRV_PCM_RATE_705600|\
+					 SNDRV_PCM_RATE_768000)
 #define _SNDRV_PCM_FMTBIT(fmt)		(1ULL << (__force int)SNDRV_PCM_FORMAT_##fmt)
 #define SNDRV_PCM_FMTBIT_S8		_SNDRV_PCM_FMTBIT(S8)
 #define SNDRV_PCM_FMTBIT_U8		_SNDRV_PCM_FMTBIT(U8)
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 0b76e76823d2..521ba56392a0 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2416,7 +2416,7 @@ static int snd_pcm_hw_rule_sample_bits(struct snd_pcm_hw_params *params,
 
 static const unsigned int rates[] = {
 	5512, 8000, 11025, 16000, 22050, 32000, 44100,
-	48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000
+	48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000
 };
 
 const struct snd_pcm_hw_constraint_list snd_pcm_known_rates = {
-- 
cgit v1.2.3


From aeb48a428d7dbe636203ae892e981bcc3e2ac042 Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Fri, 12 Apr 2024 13:50:44 +0200
Subject: udpencap: Remove Obsolete UDP_ENCAP_ESPINUDP_NON_IKE Support

The UDP_ENCAP_ESPINUDP_NON_IKE mode, introduced into the Linux kernel
in 2004 [2], has remained inactive and obsolete for an extended period.

This mode was originally defined in an early version of an IETF draft
[1] from 2001. By the time it was integrated into the kernel in 2004 [2],
it had already been replaced by UDP_ENCAP_ESPINUDP [3] in later
versions of draft-ietf-ipsec-udp-encaps, particularly in version 06.

Over time, UDP_ENCAP_ESPINUDP_NON_IKE has lost its relevance, with no
known use cases.

With this commit, we remove support for UDP_ENCAP_ESPINUDP_NON_IKE,
simplifying the codebase and eliminating unnecessary complexity.
Kernel will return an error -ENOPROTOOPT if the userspace tries to set
this option.

References:
[1] https://datatracker.ietf.org/doc/html/draft-ietf-ipsec-udp-encaps-00.txt

[2] Commit that added UDP_ENCAP_ESPINUDP_NON_IKE to the Linux historic
    repository.

    Author: Andreas Gruenbacher <agruen@suse.de>
    Date: Fri Apr 9 01:47:47 2004 -0700

   [IPSEC]: Support draft-ietf-ipsec-udp-encaps-00/01, some ipec impls need it.

[3] Commit that added UDP_ENCAP_ESPINUDP to the Linux historic
    repository.

    Author: Derek Atkins <derek@ihtfp.com>
    Date: Wed Apr 2 13:21:02 2003 -0800

    [IPSEC]: Implement UDP Encapsulation framework.

Signed-off-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/udp.h |  2 +-
 net/ipv4/esp4.c          | 12 ------------
 net/ipv4/udp.c           |  2 --
 net/ipv4/xfrm4_input.c   | 13 -------------
 net/ipv6/esp6.c          | 12 ------------
 net/ipv6/xfrm6_input.c   | 13 -------------
 6 files changed, 1 insertion(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/udp.h b/include/uapi/linux/udp.h
index 4828794efcf8..1a0fe8b151fb 100644
--- a/include/uapi/linux/udp.h
+++ b/include/uapi/linux/udp.h
@@ -36,7 +36,7 @@ struct udphdr {
 #define UDP_GRO		104	/* This socket can receive UDP GRO packets */
 
 /* UDP encapsulation types */
-#define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* draft-ietf-ipsec-nat-t-ike-00/01 */
+#define UDP_ENCAP_ESPINUDP_NON_IKE	1 /* unused  draft-ietf-ipsec-nat-t-ike-00/01 */
 #define UDP_ENCAP_ESPINUDP	2 /* draft-ietf-ipsec-udp-encaps-06 */
 #define UDP_ENCAP_L2TPINUDP	3 /* rfc2661 */
 #define UDP_ENCAP_GTP0		4 /* GSM TS 09.60 */
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 3d647c9a7a21..7d38ddd64115 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -347,7 +347,6 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
 					       __be16 dport)
 {
 	struct udphdr *uh;
-	__be32 *udpdata32;
 	unsigned int len;
 
 	len = skb->len + esp->tailen - skb_transport_offset(skb);
@@ -362,12 +361,6 @@ static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
 
 	*skb_mac_header(skb) = IPPROTO_UDP;
 
-	if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
-		udpdata32 = (__be32 *)(uh + 1);
-		udpdata32[0] = udpdata32[1] = 0;
-		return (struct ip_esp_hdr *)(udpdata32 + 2);
-	}
-
 	return (struct ip_esp_hdr *)(uh + 1);
 }
 
@@ -423,7 +416,6 @@ static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
 	switch (encap_type) {
 	default:
 	case UDP_ENCAP_ESPINUDP:
-	case UDP_ENCAP_ESPINUDP_NON_IKE:
 		esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
 		break;
 	case TCP_ENCAP_ESPINTCP:
@@ -775,7 +767,6 @@ int esp_input_done2(struct sk_buff *skb, int err)
 			source = th->source;
 			break;
 		case UDP_ENCAP_ESPINUDP:
-		case UDP_ENCAP_ESPINUDP_NON_IKE:
 			source = uh->source;
 			break;
 		default:
@@ -1179,9 +1170,6 @@ static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 		case UDP_ENCAP_ESPINUDP:
 			x->props.header_len += sizeof(struct udphdr);
 			break;
-		case UDP_ENCAP_ESPINUDP_NON_IKE:
-			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
-			break;
 #ifdef CONFIG_INET_ESPINTCP
 		case TCP_ENCAP_ESPINTCP:
 			/* only the length field, TCP encap is done by
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7613daa339b0..4ca781065a07 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2699,8 +2699,6 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
 #ifdef CONFIG_XFRM
 		case UDP_ENCAP_ESPINUDP:
 			set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk);
-			fallthrough;
-		case UDP_ENCAP_ESPINUDP_NON_IKE:
 #if IS_ENABLED(CONFIG_IPV6)
 			if (sk->sk_family == AF_INET6)
 				WRITE_ONCE(up->encap_rcv,
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index dae35101d189..0918b0682174 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -113,19 +113,6 @@ static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull
 			/* Must be an IKE packet.. pass it through */
 			return 1;
 		break;
-	case UDP_ENCAP_ESPINUDP_NON_IKE:
-		/* Check if this is a keepalive packet.  If so, eat it. */
-		if (len == 1 && udpdata[0] == 0xff) {
-			return -EINVAL;
-		} else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
-			   udpdata32[0] == 0 && udpdata32[1] == 0) {
-
-			/* ESP Packet with Non-IKE marker */
-			len = sizeof(struct udphdr) + 2 * sizeof(u32);
-		} else
-			/* Must be an IKE packet.. pass it through */
-			return 1;
-		break;
 	}
 
 	/* At this point we are sure that this is an ESPinUDP packet,
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index fe8d53f5a5ee..27df148530a6 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -383,7 +383,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
 					       __be16 dport)
 {
 	struct udphdr *uh;
-	__be32 *udpdata32;
 	unsigned int len;
 
 	len = skb->len + esp->tailen - skb_transport_offset(skb);
@@ -398,12 +397,6 @@ static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
 
 	*skb_mac_header(skb) = IPPROTO_UDP;
 
-	if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) {
-		udpdata32 = (__be32 *)(uh + 1);
-		udpdata32[0] = udpdata32[1] = 0;
-		return (struct ip_esp_hdr *)(udpdata32 + 2);
-	}
-
 	return (struct ip_esp_hdr *)(uh + 1);
 }
 
@@ -459,7 +452,6 @@ static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
 	switch (encap_type) {
 	default:
 	case UDP_ENCAP_ESPINUDP:
-	case UDP_ENCAP_ESPINUDP_NON_IKE:
 		esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport);
 		break;
 	case TCP_ENCAP_ESPINTCP:
@@ -822,7 +814,6 @@ int esp6_input_done2(struct sk_buff *skb, int err)
 			source = th->source;
 			break;
 		case UDP_ENCAP_ESPINUDP:
-		case UDP_ENCAP_ESPINUDP_NON_IKE:
 			source = uh->source;
 			break;
 		default:
@@ -1232,9 +1223,6 @@ static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
 		case UDP_ENCAP_ESPINUDP:
 			x->props.header_len += sizeof(struct udphdr);
 			break;
-		case UDP_ENCAP_ESPINUDP_NON_IKE:
-			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
-			break;
 #ifdef CONFIG_INET6_ESPINTCP
 		case TCP_ENCAP_ESPINTCP:
 			/* only the length field, TCP encap is done by
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index a17d783dc7c0..2c6aeb090b7a 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -109,19 +109,6 @@ static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull
 			/* Must be an IKE packet.. pass it through */
 			return 1;
 		break;
-	case UDP_ENCAP_ESPINUDP_NON_IKE:
-		/* Check if this is a keepalive packet.  If so, eat it. */
-		if (len == 1 && udpdata[0] == 0xff) {
-			return -EINVAL;
-		} else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
-			   udpdata32[0] == 0 && udpdata32[1] == 0) {
-
-			/* ESP Packet with Non-IKE marker */
-			len = sizeof(struct udphdr) + 2 * sizeof(u32);
-		} else
-			/* Must be an IKE packet.. pass it through */
-			return 1;
-		break;
 	}
 
 	/* At this point we are sure that this is an ESPinUDP packet,
-- 
cgit v1.2.3


From d12e36494dc2bf221867ecbfa7059e1e231f6ac2 Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Mon, 8 Apr 2024 22:06:07 +0300
Subject: drm/vblank: Introduce drm_crtc_vblank_crtc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make life easier by providing a function that hands
out the correct drm_vblank_crtc for a given a drm_crtc.

Also abstract the lower level internals of the vblank
code in a similar fashion.

Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240408190611.24914-1-ville.syrjala@linux.intel.com
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
---
 drivers/gpu/drm/drm_vblank.c      | 58 +++++++++++++++++++++++----------------
 drivers/gpu/drm/drm_vblank_work.c |  2 +-
 include/drm/drm_vblank.h          |  1 +
 3 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
index 702a12bc93bd..cc3571e25a9a 100644
--- a/drivers/gpu/drm/drm_vblank.c
+++ b/drivers/gpu/drm/drm_vblank.c
@@ -166,11 +166,24 @@ module_param_named(timestamp_precision_usec, drm_timestamp_precision, int, 0600)
 MODULE_PARM_DESC(vblankoffdelay, "Delay until vblank irq auto-disable [msecs] (0: never disable, <0: disable immediately)");
 MODULE_PARM_DESC(timestamp_precision_usec, "Max. error on timestamps [usecs]");
 
+static struct drm_vblank_crtc *
+drm_vblank_crtc(struct drm_device *dev, unsigned int pipe)
+{
+	return &dev->vblank[pipe];
+}
+
+struct drm_vblank_crtc *
+drm_crtc_vblank_crtc(struct drm_crtc *crtc)
+{
+	return drm_vblank_crtc(crtc->dev, drm_crtc_index(crtc));
+}
+EXPORT_SYMBOL(drm_crtc_vblank_crtc);
+
 static void store_vblank(struct drm_device *dev, unsigned int pipe,
 			 u32 vblank_count_inc,
 			 ktime_t t_vblank, u32 last)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 
 	assert_spin_locked(&dev->vblank_time_lock);
 
@@ -184,7 +197,7 @@ static void store_vblank(struct drm_device *dev, unsigned int pipe,
 
 static u32 drm_max_vblank_count(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 
 	return vblank->max_vblank_count ?: dev->max_vblank_count;
 }
@@ -273,7 +286,7 @@ static void drm_reset_vblank_timestamp(struct drm_device *dev, unsigned int pipe
 static void drm_update_vblank_count(struct drm_device *dev, unsigned int pipe,
 				    bool in_vblank_irq)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	u32 cur_vblank, diff;
 	bool rc;
 	ktime_t t_vblank;
@@ -364,7 +377,7 @@ static void drm_update_vblank_count(struct drm_device *dev, unsigned int pipe,
 
 u64 drm_vblank_count(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	u64 count;
 
 	if (drm_WARN_ON(dev, pipe >= dev->num_crtcs))
@@ -438,7 +451,7 @@ static void __disable_vblank(struct drm_device *dev, unsigned int pipe)
  */
 void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	unsigned long irqflags;
 
 	assert_spin_locked(&dev->vbl_lock);
@@ -600,7 +613,7 @@ void drm_calc_timestamping_constants(struct drm_crtc *crtc,
 {
 	struct drm_device *dev = crtc->dev;
 	unsigned int pipe = drm_crtc_index(crtc);
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
 	int linedur_ns = 0, framedur_ns = 0;
 	int dotclock = mode->crtc_clock;
 
@@ -930,7 +943,7 @@ EXPORT_SYMBOL(drm_crtc_vblank_count);
 static u64 drm_vblank_count_and_time(struct drm_device *dev, unsigned int pipe,
 				     ktime_t *vblanktime)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	u64 vblank_count;
 	unsigned int seq;
 
@@ -985,7 +998,6 @@ EXPORT_SYMBOL(drm_crtc_vblank_count_and_time);
  */
 int drm_crtc_next_vblank_start(struct drm_crtc *crtc, ktime_t *vblanktime)
 {
-	unsigned int pipe = drm_crtc_index(crtc);
 	struct drm_vblank_crtc *vblank;
 	struct drm_display_mode *mode;
 	u64 vblank_start;
@@ -993,7 +1005,7 @@ int drm_crtc_next_vblank_start(struct drm_crtc *crtc, ktime_t *vblanktime)
 	if (!drm_dev_has_vblank(crtc->dev))
 		return -EINVAL;
 
-	vblank = &crtc->dev->vblank[pipe];
+	vblank = drm_crtc_vblank_crtc(crtc);
 	mode = &vblank->hwmode;
 
 	if (!vblank->framedur_ns || !vblank->linedur_ns)
@@ -1147,7 +1159,7 @@ static int __enable_vblank(struct drm_device *dev, unsigned int pipe)
 
 static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	int ret = 0;
 
 	assert_spin_locked(&dev->vbl_lock);
@@ -1185,7 +1197,7 @@ static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe)
 
 int drm_vblank_get(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	unsigned long irqflags;
 	int ret = 0;
 
@@ -1228,7 +1240,7 @@ EXPORT_SYMBOL(drm_crtc_vblank_get);
 
 void drm_vblank_put(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 
 	if (drm_WARN_ON(dev, pipe >= dev->num_crtcs))
 		return;
@@ -1274,7 +1286,7 @@ EXPORT_SYMBOL(drm_crtc_vblank_put);
  */
 void drm_wait_one_vblank(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	int ret;
 	u64 last;
 
@@ -1327,7 +1339,7 @@ void drm_crtc_vblank_off(struct drm_crtc *crtc)
 {
 	struct drm_device *dev = crtc->dev;
 	unsigned int pipe = drm_crtc_index(crtc);
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
 	struct drm_pending_vblank_event *e, *t;
 	ktime_t now;
 	u64 seq;
@@ -1405,8 +1417,7 @@ EXPORT_SYMBOL(drm_crtc_vblank_off);
 void drm_crtc_vblank_reset(struct drm_crtc *crtc)
 {
 	struct drm_device *dev = crtc->dev;
-	unsigned int pipe = drm_crtc_index(crtc);
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
 
 	spin_lock_irq(&dev->vbl_lock);
 	/*
@@ -1445,8 +1456,7 @@ void drm_crtc_set_max_vblank_count(struct drm_crtc *crtc,
 				   u32 max_vblank_count)
 {
 	struct drm_device *dev = crtc->dev;
-	unsigned int pipe = drm_crtc_index(crtc);
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
 
 	drm_WARN_ON(dev, dev->max_vblank_count);
 	drm_WARN_ON(dev, !READ_ONCE(vblank->inmodeset));
@@ -1469,7 +1479,7 @@ void drm_crtc_vblank_on(struct drm_crtc *crtc)
 {
 	struct drm_device *dev = crtc->dev;
 	unsigned int pipe = drm_crtc_index(crtc);
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_crtc_vblank_crtc(crtc);
 
 	if (drm_WARN_ON(dev, pipe >= dev->num_crtcs))
 		return;
@@ -1512,7 +1522,7 @@ static void drm_vblank_restore(struct drm_device *dev, unsigned int pipe)
 	assert_spin_locked(&dev->vbl_lock);
 	assert_spin_locked(&dev->vblank_time_lock);
 
-	vblank = &dev->vblank[pipe];
+	vblank = drm_vblank_crtc(dev, pipe);
 	drm_WARN_ONCE(dev,
 		      drm_debug_enabled(DRM_UT_VBL) && !vblank->framedur_ns,
 		      "Cannot compute missed vblanks without frame duration\n");
@@ -1564,7 +1574,7 @@ static int drm_queue_vblank_event(struct drm_device *dev, unsigned int pipe,
 				  union drm_wait_vblank *vblwait,
 				  struct drm_file *file_priv)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	struct drm_pending_vblank_event *e;
 	ktime_t now;
 	u64 seq;
@@ -1872,7 +1882,7 @@ static void drm_handle_vblank_events(struct drm_device *dev, unsigned int pipe)
  */
 bool drm_handle_vblank(struct drm_device *dev, unsigned int pipe)
 {
-	struct drm_vblank_crtc *vblank = &dev->vblank[pipe];
+	struct drm_vblank_crtc *vblank = drm_vblank_crtc(dev, pipe);
 	unsigned long irqflags;
 	bool disable_irq;
 
@@ -1981,7 +1991,7 @@ int drm_crtc_get_sequence_ioctl(struct drm_device *dev, void *data,
 
 	pipe = drm_crtc_index(crtc);
 
-	vblank = &dev->vblank[pipe];
+	vblank = drm_crtc_vblank_crtc(crtc);
 	vblank_enabled = dev->vblank_disable_immediate && READ_ONCE(vblank->enabled);
 
 	if (!vblank_enabled) {
@@ -2046,7 +2056,7 @@ int drm_crtc_queue_sequence_ioctl(struct drm_device *dev, void *data,
 
 	pipe = drm_crtc_index(crtc);
 
-	vblank = &dev->vblank[pipe];
+	vblank = drm_crtc_vblank_crtc(crtc);
 
 	e = kzalloc(sizeof(*e), GFP_KERNEL);
 	if (e == NULL)
diff --git a/drivers/gpu/drm/drm_vblank_work.c b/drivers/gpu/drm/drm_vblank_work.c
index 43cd5c0f4f6f..4fe9b1d3b00f 100644
--- a/drivers/gpu/drm/drm_vblank_work.c
+++ b/drivers/gpu/drm/drm_vblank_work.c
@@ -245,7 +245,7 @@ void drm_vblank_work_init(struct drm_vblank_work *work, struct drm_crtc *crtc,
 {
 	kthread_init_work(&work->base, func);
 	INIT_LIST_HEAD(&work->node);
-	work->vblank = &crtc->dev->vblank[drm_crtc_index(crtc)];
+	work->vblank = drm_crtc_vblank_crtc(crtc);
 }
 EXPORT_SYMBOL(drm_vblank_work_init);
 
diff --git a/include/drm/drm_vblank.h b/include/drm/drm_vblank.h
index 7f3957943dd1..c8f829b4307c 100644
--- a/include/drm/drm_vblank.h
+++ b/include/drm/drm_vblank.h
@@ -225,6 +225,7 @@ struct drm_vblank_crtc {
 	wait_queue_head_t work_wait_queue;
 };
 
+struct drm_vblank_crtc *drm_crtc_vblank_crtc(struct drm_crtc *crtc);
 int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs);
 bool drm_dev_has_vblank(const struct drm_device *dev);
 u64 drm_crtc_vblank_count(struct drm_crtc *crtc);
-- 
cgit v1.2.3


From 5677b17c33246fb69ecc250d493c635500b78980 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Date: Tue, 2 Apr 2024 10:51:08 +0100
Subject: ARM: 9361/1: amba: store owner from modules with
 amba_driver_register()

Modules registering driver with amba_driver_register() often forget to
set .owner field.  The field is used by some of other kernel parts for
reference counting (try_module_get()), so it is expected that drivers
will set it.

Solve the problem by moving this task away from the drivers to the core
amba bus code, just like we did for platform_driver in
commit 9447057eaff8 ("platform_device: use a macro instead of
platform_driver_register").

Link: https://lore.kernel.org/r/20240326-module-owner-amba-v1-1-4517b091385b@linaro.org

Reviewed-by: Andi Shyti <andi.shyti@kernel.org>
Acked-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 drivers/amba/bus.c       | 11 +++++++----
 include/linux/amba/bus.h | 11 +++++++++--
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index a24c152bfaac..aba3aa95b224 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -488,28 +488,31 @@ static int __init amba_stub_drv_init(void)
 	 * waiting on amba_match(). So, register a stub driver to make sure
 	 * amba_match() is called even if no amba driver has been registered.
 	 */
-	return amba_driver_register(&amba_proxy_drv);
+	return __amba_driver_register(&amba_proxy_drv, NULL);
 }
 late_initcall_sync(amba_stub_drv_init);
 
 /**
- *	amba_driver_register - register an AMBA device driver
+ *	__amba_driver_register - register an AMBA device driver
  *	@drv: amba device driver structure
+ *	@owner: owning module/driver
  *
  *	Register an AMBA device driver with the Linux device model
  *	core.  If devices pre-exist, the drivers probe function will
  *	be called.
  */
-int amba_driver_register(struct amba_driver *drv)
+int __amba_driver_register(struct amba_driver *drv,
+			   struct module *owner)
 {
 	if (!drv->probe)
 		return -EINVAL;
 
+	drv->drv.owner = owner;
 	drv->drv.bus = &amba_bustype;
 
 	return driver_register(&drv->drv);
 }
-EXPORT_SYMBOL(amba_driver_register);
+EXPORT_SYMBOL(__amba_driver_register);
 
 /**
  *	amba_driver_unregister - remove an AMBA device driver
diff --git a/include/linux/amba/bus.h b/include/linux/amba/bus.h
index c60a6a14638c..958a55bcc708 100644
--- a/include/linux/amba/bus.h
+++ b/include/linux/amba/bus.h
@@ -112,11 +112,18 @@ extern struct bus_type amba_bustype;
 #define amba_get_drvdata(d)	dev_get_drvdata(&d->dev)
 #define amba_set_drvdata(d,p)	dev_set_drvdata(&d->dev, p)
 
+/*
+ * use a macro to avoid include chaining to get THIS_MODULE
+ */
+#define amba_driver_register(drv) \
+	__amba_driver_register(drv, THIS_MODULE)
+
 #ifdef CONFIG_ARM_AMBA
-int amba_driver_register(struct amba_driver *);
+int __amba_driver_register(struct amba_driver *, struct module *);
 void amba_driver_unregister(struct amba_driver *);
 #else
-static inline int amba_driver_register(struct amba_driver *drv)
+static inline int __amba_driver_register(struct amba_driver *drv,
+					 struct module *owner)
 {
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From bd2cd796d2859c1a7cee326d7a9c0f77c6b95c71 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Tue, 16 Apr 2024 17:53:45 +0200
Subject: usb: renesas_usbhs: Remove renesas_usbhs_get_info() wrapper

The renesas_usbhs_get_info() wrapper was useful for legacy board code.
Since commit 1fa59bda21c7fa36 ("ARM: shmobile: Remove legacy board code
for Armadillo-800 EVA") in v4.3, it is no longer used outside the USBHS
driver, and provides no added value over dev_get_platdata(), while
obfuscating the real operation.

Drop it, and replace it by dev_get_platdata() in its sole user.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Biju Das <biju.das.jz@bp.renesas.com>
Reviewed-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Link: https://lore.kernel.org/r/fa296af4452dfe394a58b75fd44c3bb9591936eb.1713282736.git.geert+renesas@glider.be
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/renesas_usbhs/common.c | 2 +-
 include/linux/usb/renesas_usbhs.h  | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/renesas_usbhs/common.c b/drivers/usb/renesas_usbhs/common.c
index b6bef9081bf2..edc43f169d49 100644
--- a/drivers/usb/renesas_usbhs/common.c
+++ b/drivers/usb/renesas_usbhs/common.c
@@ -613,7 +613,7 @@ static int usbhs_probe(struct platform_device *pdev)
 
 	info = of_device_get_match_data(dev);
 	if (!info) {
-		info = renesas_usbhs_get_info(pdev);
+		info = dev_get_platdata(dev);
 		if (!info)
 			return dev_err_probe(dev, -EINVAL, "no platform info\n");
 	}
diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h
index 372898d9eeb0..67bfcda6c7d2 100644
--- a/include/linux/usb/renesas_usbhs.h
+++ b/include/linux/usb/renesas_usbhs.h
@@ -194,9 +194,4 @@ struct renesas_usbhs_platform_info {
 	struct renesas_usbhs_driver_param	driver_param;
 };
 
-/*
- * macro for platform
- */
-#define renesas_usbhs_get_info(pdev)\
-	((struct renesas_usbhs_platform_info *)(pdev)->dev.platform_data)
 #endif /* RENESAS_USB_H */
-- 
cgit v1.2.3


From 6b9391b581fddd8579239dad4de4f0393149e10a Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Tue, 12 Mar 2024 16:53:41 -0700
Subject: riscv: Include riscv_set_icache_flush_ctx prctl

Support new prctl with key PR_RISCV_SET_ICACHE_FLUSH_CTX to enable
optimization of cross modifying code. This prctl enables userspace code
to use icache flushing instructions such as fence.i with the guarantee
that the icache will continue to be clean after thread migration.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Samuel Holland <samuel.holland@sifive.com>
Link: https://lore.kernel.org/r/20240312-fencei-v13-2-4b6bdc2bbf32@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/mmu.h       |   2 +
 arch/riscv/include/asm/processor.h |  10 ++++
 arch/riscv/include/asm/switch_to.h |  23 ++++++++
 arch/riscv/mm/cacheflush.c         | 113 +++++++++++++++++++++++++++++++++++++
 arch/riscv/mm/context.c            |  19 ++++---
 include/uapi/linux/prctl.h         |   6 ++
 kernel/sys.c                       |   6 ++
 7 files changed, 171 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h
index 355504b37f8e..60be458e94da 100644
--- a/arch/riscv/include/asm/mmu.h
+++ b/arch/riscv/include/asm/mmu.h
@@ -19,6 +19,8 @@ typedef struct {
 #ifdef CONFIG_SMP
 	/* A local icache flush is needed before user execution can resume. */
 	cpumask_t icache_stale_mask;
+	/* Force local icache flush on all migrations. */
+	bool force_icache_flush;
 #endif
 #ifdef CONFIG_BINFMT_ELF_FDPIC
 	unsigned long exec_fdpic_loadmap;
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index a8509cc31ab2..cca62013c3c0 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -69,6 +69,7 @@
 #endif
 
 #ifndef __ASSEMBLY__
+#include <linux/cpumask.h>
 
 struct task_struct;
 struct pt_regs;
@@ -123,6 +124,12 @@ struct thread_struct {
 	struct __riscv_v_ext_state vstate;
 	unsigned long align_ctl;
 	struct __riscv_v_ext_state kernel_vstate;
+#ifdef CONFIG_SMP
+	/* Flush the icache on migration */
+	bool force_icache_flush;
+	/* A forced icache flush is not needed if migrating to the previous cpu. */
+	unsigned int prev_cpu;
+#endif
 };
 
 /* Whitelist the fstate from the task_struct for hardened usercopy */
@@ -184,6 +191,9 @@ extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
 #define GET_UNALIGN_CTL(tsk, addr)	get_unalign_ctl((tsk), (addr))
 #define SET_UNALIGN_CTL(tsk, val)	set_unalign_ctl((tsk), (val))
 
+#define RISCV_SET_ICACHE_FLUSH_CTX(arg1, arg2)	riscv_set_icache_flush_ctx(arg1, arg2)
+extern int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long per_thread);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_PROCESSOR_H */
diff --git a/arch/riscv/include/asm/switch_to.h b/arch/riscv/include/asm/switch_to.h
index 7efdb0584d47..7594df37cc9f 100644
--- a/arch/riscv/include/asm/switch_to.h
+++ b/arch/riscv/include/asm/switch_to.h
@@ -8,6 +8,7 @@
 
 #include <linux/jump_label.h>
 #include <linux/sched/task_stack.h>
+#include <linux/mm_types.h>
 #include <asm/vector.h>
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
@@ -72,14 +73,36 @@ static __always_inline bool has_fpu(void) { return false; }
 extern struct task_struct *__switch_to(struct task_struct *,
 				       struct task_struct *);
 
+static inline bool switch_to_should_flush_icache(struct task_struct *task)
+{
+#ifdef CONFIG_SMP
+	bool stale_mm = task->mm && task->mm->context.force_icache_flush;
+	bool stale_thread = task->thread.force_icache_flush;
+	bool thread_migrated = smp_processor_id() != task->thread.prev_cpu;
+
+	return thread_migrated && (stale_mm || stale_thread);
+#else
+	return false;
+#endif
+}
+
+#ifdef CONFIG_SMP
+#define __set_prev_cpu(thread) ((thread).prev_cpu = smp_processor_id())
+#else
+#define __set_prev_cpu(thread)
+#endif
+
 #define switch_to(prev, next, last)			\
 do {							\
 	struct task_struct *__prev = (prev);		\
 	struct task_struct *__next = (next);		\
+	__set_prev_cpu(__prev->thread);			\
 	if (has_fpu())					\
 		__switch_to_fpu(__prev, __next);	\
 	if (has_vector())					\
 		__switch_to_vector(__prev, __next);	\
+	if (switch_to_should_flush_icache(__next))	\
+		local_flush_icache_all();		\
 	((last) = __switch_to(__prev, __next));		\
 } while (0)
 
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index 55a34f2020a8..15a95578e670 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -5,6 +5,7 @@
 
 #include <linux/acpi.h>
 #include <linux/of.h>
+#include <linux/prctl.h>
 #include <asm/acpi.h>
 #include <asm/cacheflush.h>
 
@@ -152,3 +153,115 @@ void __init riscv_init_cbo_blocksizes(void)
 	if (cboz_block_size)
 		riscv_cboz_block_size = cboz_block_size;
 }
+
+#ifdef CONFIG_SMP
+static void set_icache_stale_mask(void)
+{
+	cpumask_t *mask;
+	bool stale_cpu;
+
+	/*
+	 * Mark every other hart's icache as needing a flush for
+	 * this MM. Maintain the previous value of the current
+	 * cpu to handle the case when this function is called
+	 * concurrently on different harts.
+	 */
+	mask = &current->mm->context.icache_stale_mask;
+	stale_cpu = cpumask_test_cpu(smp_processor_id(), mask);
+
+	cpumask_setall(mask);
+	assign_bit(cpumask_check(smp_processor_id()), cpumask_bits(mask), stale_cpu);
+}
+#endif
+
+/**
+ * riscv_set_icache_flush_ctx() - Enable/disable icache flushing instructions in
+ * userspace.
+ * @ctx: Set the type of icache flushing instructions permitted/prohibited in
+ *	 userspace. Supported values described below.
+ *
+ * Supported values for ctx:
+ *
+ * * %PR_RISCV_CTX_SW_FENCEI_ON: Allow fence.i in user space.
+ *
+ * * %PR_RISCV_CTX_SW_FENCEI_OFF: Disallow fence.i in user space. All threads in
+ *   a process will be affected when ``scope == PR_RISCV_SCOPE_PER_PROCESS``.
+ *   Therefore, caution must be taken; use this flag only when you can guarantee
+ *   that no thread in the process will emit fence.i from this point onward.
+ *
+ * @scope: Set scope of where icache flushing instructions are allowed to be
+ *	   emitted. Supported values described below.
+ *
+ * Supported values for scope:
+ *
+ * * %PR_RISCV_SCOPE_PER_PROCESS: Ensure the icache of any thread in this process
+ *                               is coherent with instruction storage upon
+ *                               migration.
+ *
+ * * %PR_RISCV_SCOPE_PER_THREAD: Ensure the icache of the current thread is
+ *                              coherent with instruction storage upon
+ *                              migration.
+ *
+ * When ``scope == PR_RISCV_SCOPE_PER_PROCESS``, all threads in the process are
+ * permitted to emit icache flushing instructions. Whenever any thread in the
+ * process is migrated, the corresponding hart's icache will be guaranteed to be
+ * consistent with instruction storage. This does not enforce any guarantees
+ * outside of migration. If a thread modifies an instruction that another thread
+ * may attempt to execute, the other thread must still emit an icache flushing
+ * instruction before attempting to execute the potentially modified
+ * instruction. This must be performed by the user-space program.
+ *
+ * In per-thread context (eg. ``scope == PR_RISCV_SCOPE_PER_THREAD``) only the
+ * thread calling this function is permitted to emit icache flushing
+ * instructions. When the thread is migrated, the corresponding hart's icache
+ * will be guaranteed to be consistent with instruction storage.
+ *
+ * On kernels configured without SMP, this function is a nop as migrations
+ * across harts will not occur.
+ */
+int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope)
+{
+#ifdef CONFIG_SMP
+	switch (ctx) {
+	case PR_RISCV_CTX_SW_FENCEI_ON:
+		switch (scope) {
+		case PR_RISCV_SCOPE_PER_PROCESS:
+			current->mm->context.force_icache_flush = true;
+			break;
+		case PR_RISCV_SCOPE_PER_THREAD:
+			current->thread.force_icache_flush = true;
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	case PR_RISCV_CTX_SW_FENCEI_OFF:
+		switch (scope) {
+		case PR_RISCV_SCOPE_PER_PROCESS:
+			current->mm->context.force_icache_flush = false;
+
+			set_icache_stale_mask();
+			break;
+		case PR_RISCV_SCOPE_PER_THREAD:
+			current->thread.force_icache_flush = false;
+
+			set_icache_stale_mask();
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+#else
+	switch (ctx) {
+	case PR_RISCV_CTX_SW_FENCEI_ON:
+	case PR_RISCV_CTX_SW_FENCEI_OFF:
+		return 0;
+	default:
+		return -EINVAL;
+	}
+#endif
+}
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 217fd4de6134..beef30f42d5c 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
+#include <asm/switch_to.h>
 
 #ifdef CONFIG_MMU
 
@@ -297,21 +298,23 @@ static inline void set_mm(struct mm_struct *prev,
  *
  * The "cpu" argument must be the current local CPU number.
  */
-static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu)
+static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu,
+					 struct task_struct *task)
 {
 #ifdef CONFIG_SMP
-	cpumask_t *mask = &mm->context.icache_stale_mask;
-
-	if (cpumask_test_cpu(cpu, mask)) {
-		cpumask_clear_cpu(cpu, mask);
+	if (cpumask_test_and_clear_cpu(cpu, &mm->context.icache_stale_mask)) {
 		/*
 		 * Ensure the remote hart's writes are visible to this hart.
 		 * This pairs with a barrier in flush_icache_mm.
 		 */
 		smp_mb();
-		local_flush_icache_all();
-	}
 
+		/*
+		 * If cache will be flushed in switch_to, no need to flush here.
+		 */
+		if (!(task && switch_to_should_flush_icache(task)))
+			local_flush_icache_all();
+	}
 #endif
 }
 
@@ -332,5 +335,5 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 
 	set_mm(prev, next, cpu);
 
-	flush_icache_deferred(next, cpu);
+	flush_icache_deferred(next, cpu, task);
 }
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 370ed14b1ae0..524d546d697b 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -306,4 +306,10 @@ struct prctl_mm_map {
 # define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK	0xc
 # define PR_RISCV_V_VSTATE_CTRL_MASK		0x1f
 
+#define PR_RISCV_SET_ICACHE_FLUSH_CTX	71
+# define PR_RISCV_CTX_SW_FENCEI_ON	0
+# define PR_RISCV_CTX_SW_FENCEI_OFF	1
+# define PR_RISCV_SCOPE_PER_PROCESS	0
+# define PR_RISCV_SCOPE_PER_THREAD	1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index e219fcfa112d..69afdd8b430f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -146,6 +146,9 @@
 #ifndef RISCV_V_GET_CONTROL
 # define RISCV_V_GET_CONTROL()		(-EINVAL)
 #endif
+#ifndef RISCV_SET_ICACHE_FLUSH_CTX
+# define RISCV_SET_ICACHE_FLUSH_CTX(a, b)	(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2743,6 +2746,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_RISCV_V_GET_CONTROL:
 		error = RISCV_V_GET_CONTROL();
 		break;
+	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
+		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.2.3


From decde1fa209323c77a7498e803350c5f3e992d62 Mon Sep 17 00:00:00 2001
From: Charlie Jenkins <charlie@rivosinc.com>
Date: Tue, 12 Mar 2024 16:53:43 -0700
Subject: cpumask: Add assign cpu

Standardize an assign_cpu function for cpumasks.

Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20240312-fencei-v13-4-4b6bdc2bbf32@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/mm/cacheflush.c |  2 +-
 include/linux/cpumask.h    | 16 ++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index 15a95578e670..9dad35f0ce8b 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -170,7 +170,7 @@ static void set_icache_stale_mask(void)
 	stale_cpu = cpumask_test_cpu(smp_processor_id(), mask);
 
 	cpumask_setall(mask);
-	assign_bit(cpumask_check(smp_processor_id()), cpumask_bits(mask), stale_cpu);
+	cpumask_assign_cpu(smp_processor_id(), mask, stale_cpu);
 }
 #endif
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cfb545841a2c..1b85e09c4ba5 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -492,6 +492,22 @@ static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
 	__clear_bit(cpumask_check(cpu), cpumask_bits(dstp));
 }
 
+/**
+ * cpumask_assign_cpu - assign a cpu in a cpumask
+ * @cpu: cpu number (< nr_cpu_ids)
+ * @dstp: the cpumask pointer
+ * @bool: the value to assign
+ */
+static __always_inline void cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
+{
+	assign_bit(cpumask_check(cpu), cpumask_bits(dstp), value);
+}
+
+static __always_inline void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
+{
+	__assign_bit(cpumask_check(cpu), cpumask_bits(dstp), value);
+}
+
 /**
  * cpumask_test_cpu - test for a cpu in a cpumask
  * @cpu: cpu number (< nr_cpu_ids)
-- 
cgit v1.2.3


From 270f3a4886ef78889d0b9a887808e6c425931131 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 10 Apr 2024 13:05:08 +0300
Subject: drm/i915: use system include for drm headers

Use <> instead of "" for including headers from include/, even if the
file is in the same directory.

Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Acked-by: Lucas De Marchi <lucas.demarchi@intel.com>
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/ed02b81fcbbc0d27be4af460c521aefbfdc76375.1712743191.git.jani.nikula@intel.com
---
 include/drm/i915_component.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/i915_component.h b/include/drm/i915_component.h
index 56a84ee1c64c..4ea3b17aa143 100644
--- a/include/drm/i915_component.h
+++ b/include/drm/i915_component.h
@@ -24,7 +24,7 @@
 #ifndef _I915_COMPONENT_H_
 #define _I915_COMPONENT_H_
 
-#include "drm_audio_component.h"
+#include <drm/drm_audio_component.h>
 
 enum i915_component_type {
 	I915_COMPONENT_AUDIO = 1,
-- 
cgit v1.2.3


From 3c258bf6bf29d8c9f9b358c64f6e9f4510c91ff9 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Wed, 17 Apr 2024 10:09:21 +0100
Subject: firmware: arm_ffa: Stash the partition properties for query purposes

The properies obtained from the partition information descriptor as
part of initial partitions discovery is useful as it contain info
if the partition
- Runs in AArch64 or AArch32 execution state
- Can send and/or receive direct requests
- Can send and receive indirect message
- Does support receipt of notifications.

These can be used for querying before attempting to do any of the
above operations.

Link: https://lore.kernel.org/r/20240417090921.2866447-1-sudeep.holla@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c |  2 ++
 include/linux/arm_ffa.h           | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index b64e8ee64bc2..99a23dd3d189 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -1228,6 +1228,8 @@ static int ffa_setup_partitions(void)
 			continue;
 		}
 
+		ffa_dev->properties = tpbuf->properties;
+
 		if (drv_info->version > FFA_VERSION_1_0 &&
 		    !(tpbuf->properties & FFA_PARTITION_AARCH64_EXEC))
 			ffa_mode_32bit_set(ffa_dev);
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index c906f666ff5d..94a49612ecec 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -126,6 +126,7 @@
 /* FFA Bus/Device/Driver related */
 struct ffa_device {
 	u32 id;
+	u32 properties;
 	int vm_id;
 	bool mode_32bit;
 	uuid_t uuid;
@@ -221,12 +222,29 @@ struct ffa_partition_info {
 #define FFA_PARTITION_DIRECT_SEND	BIT(1)
 /* partition can send and receive indirect messages. */
 #define FFA_PARTITION_INDIRECT_MSG	BIT(2)
+/* partition can receive notifications */
+#define FFA_PARTITION_NOTIFICATION_RECV	BIT(3)
 /* partition runs in the AArch64 execution state. */
 #define FFA_PARTITION_AARCH64_EXEC	BIT(8)
 	u32 properties;
 	u32 uuid[4];
 };
 
+static inline
+bool ffa_partition_check_property(struct ffa_device *dev, u32 property)
+{
+	return dev->properties & property;
+}
+
+#define ffa_partition_supports_notify_recv(dev)	\
+	ffa_partition_check_property(dev, FFA_PARTITION_NOTIFICATION_RECV)
+
+#define ffa_partition_supports_indirect_msg(dev)	\
+	ffa_partition_check_property(dev, FFA_PARTITION_INDIRECT_MSG)
+
+#define ffa_partition_supports_direct_recv(dev)	\
+	ffa_partition_check_property(dev, FFA_PARTITION_DIRECT_RECV)
+
 /* For use with FFA_MSG_SEND_DIRECT_{REQ,RESP} which pass data via registers */
 struct ffa_send_direct_data {
 	unsigned long data0; /* w3/x3 */
-- 
cgit v1.2.3


From 02c19d84c7c5026624d181b8e4cdc8488134d013 Mon Sep 17 00:00:00 2001
From: Sudeep Holla <sudeep.holla@arm.com>
Date: Wed, 17 Apr 2024 10:09:31 +0100
Subject: firmware: arm_ffa: Add support for FFA_MSG_SEND2

The FFA_MSG_SEND2 can be  used to transmit a partition message from
the Tx buffer of the sender(the driver in this case) endpoint to the Rx
buffer of the receiver endpoint.

An invocation of the FFA_MSG_SEND2 transfers the ownership of the Tx
buffer to the receiver endpoint(or any intermediate consumer). Completion
of an FFA_MSG_SEND2 invocation transfers the ownership of the buffer
back to the sender endpoint.

The framework defines the FFA_MSG_SEND2 interface to transmit a partition
message from the Tx buffer of the sender to the Rx buffer of a receiver
and inform the scheduler that the receiver must be run.

Link: https://lore.kernel.org/r/20240417090931.2866487-1-sudeep.holla@arm.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_ffa/driver.c | 38 ++++++++++++++++++++++++++++++++++++++
 include/linux/arm_ffa.h           |  9 +++++++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/firmware/arm_ffa/driver.c b/drivers/firmware/arm_ffa/driver.c
index 99a23dd3d189..26968edac5b2 100644
--- a/drivers/firmware/arm_ffa/driver.c
+++ b/drivers/firmware/arm_ffa/driver.c
@@ -345,6 +345,38 @@ static int ffa_msg_send_direct_req(u16 src_id, u16 dst_id, bool mode_32bit,
 	return -EINVAL;
 }
 
+static int ffa_msg_send2(u16 src_id, u16 dst_id, void *buf, size_t sz)
+{
+	u32 src_dst_ids = PACK_TARGET_INFO(src_id, dst_id);
+	struct ffa_indirect_msg_hdr *msg;
+	ffa_value_t ret;
+	int retval = 0;
+
+	if (sz > (RXTX_BUFFER_SIZE - sizeof(*msg)))
+		return -ERANGE;
+
+	mutex_lock(&drv_info->tx_lock);
+
+	msg = drv_info->tx_buffer;
+	msg->flags = 0;
+	msg->res0 = 0;
+	msg->offset = sizeof(*msg);
+	msg->send_recv_id = src_dst_ids;
+	msg->size = sz;
+	memcpy(msg + msg->offset, buf, sz);
+
+	/* flags = 0, sender VMID = 0 works for both physical/virtual NS */
+	invoke_ffa_fn((ffa_value_t){
+		      .a0 = FFA_MSG_SEND2, .a1 = 0, .a2 = 0
+		      }, &ret);
+
+	if (ret.a0 == FFA_ERROR)
+		retval = ffa_to_linux_errno((int)ret.a2);
+
+	mutex_unlock(&drv_info->tx_lock);
+	return retval;
+}
+
 static int ffa_mem_first_frag(u32 func_id, phys_addr_t buf, u32 buf_sz,
 			      u32 frag_len, u32 len, u64 *handle)
 {
@@ -871,6 +903,11 @@ static int ffa_sync_send_receive(struct ffa_device *dev,
 				       dev->mode_32bit, data);
 }
 
+static int ffa_indirect_msg_send(struct ffa_device *dev, void *buf, size_t sz)
+{
+	return ffa_msg_send2(drv_info->vm_id, dev->vm_id, buf, sz);
+}
+
 static int ffa_memory_share(struct ffa_mem_ops_args *args)
 {
 	if (drv_info->mem_ops_native)
@@ -1146,6 +1183,7 @@ static const struct ffa_info_ops ffa_drv_info_ops = {
 static const struct ffa_msg_ops ffa_drv_msg_ops = {
 	.mode_32bit_set = ffa_mode_32bit_set,
 	.sync_send_receive = ffa_sync_send_receive,
+	.indirect_send = ffa_indirect_msg_send,
 };
 
 static const struct ffa_mem_ops ffa_drv_mem_ops = {
diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h
index 94a49612ecec..c82d56768101 100644
--- a/include/linux/arm_ffa.h
+++ b/include/linux/arm_ffa.h
@@ -254,6 +254,14 @@ struct ffa_send_direct_data {
 	unsigned long data4; /* w7/x7 */
 };
 
+struct ffa_indirect_msg_hdr {
+	u32 flags;
+	u32 res0;
+	u32 offset;
+	u32 send_recv_id;
+	u32 size;
+};
+
 struct ffa_mem_region_addr_range {
 	/* The base IPA of the constituent memory region, aligned to 4 kiB */
 	u64 address;
@@ -414,6 +422,7 @@ struct ffa_msg_ops {
 	void (*mode_32bit_set)(struct ffa_device *dev);
 	int (*sync_send_receive)(struct ffa_device *dev,
 				 struct ffa_send_direct_data *data);
+	int (*indirect_send)(struct ffa_device *dev, void *buf, size_t sz);
 };
 
 struct ffa_mem_ops {
-- 
cgit v1.2.3


From b58be8db6327b21e0c7fbee559b8eb47f5110efe Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Wed, 17 Apr 2024 16:39:49 +0200
Subject: ethtool: Expand Ethernet Power Equipment with c33 (PoE) alongside
 PoDL

In the current PSE interface for Ethernet Power Equipment, support is
limited to PoDL. This patch extends the interface to accommodate the
objects specified in IEEE 802.3-2022 145.2 for Power sourcing
Equipment (PSE).

The following objects are now supported and considered mandatory:
- IEEE 802.3-2022 30.9.1.1.5 aPSEPowerDetectionStatus
- IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState
- IEEE 802.3-2022 30.9.1.2.1 aPSEAdminControl

To avoid confusion between "PoDL PSE" and "PoE PSE", which have similar
names but distinct values, we have followed the suggestion of Oleksij
Rempel and Andrew Lunn to maintain separate naming schemes for each,
using c33 (clause 33) prefix for "PoE PSE".
You can find more details in the discussion threads here:
https://lore.kernel.org/netdev/20230912110637.GI780075@pengutronix.de/
https://lore.kernel.org/netdev/2539b109-72ad-470a-9dae-9f53de4f64ec@lunn.ch/

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://lore.kernel.org/r/20240417-feature_poe-v9-1-242293fd1900@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/index.rst               |  1 +
 Documentation/networking/pse-pd/index.rst        |  9 +++
 Documentation/networking/pse-pd/introduction.rst | 73 ++++++++++++++++++++++++
 include/linux/pse-pd/pse.h                       |  9 +++
 include/uapi/linux/ethtool.h                     | 43 ++++++++++++++
 include/uapi/linux/ethtool_netlink.h             |  3 +
 6 files changed, 138 insertions(+)
 create mode 100644 Documentation/networking/pse-pd/index.rst
 create mode 100644 Documentation/networking/pse-pd/introduction.rst

(limited to 'include')

diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst
index 473d72c36d61..7664c0bfe461 100644
--- a/Documentation/networking/index.rst
+++ b/Documentation/networking/index.rst
@@ -93,6 +93,7 @@ Contents:
    plip
    ppp_generic
    proc_net_tcp
+   pse-pd/index
    radiotap-headers
    rds
    regulatory
diff --git a/Documentation/networking/pse-pd/index.rst b/Documentation/networking/pse-pd/index.rst
new file mode 100644
index 000000000000..18197bc7303d
--- /dev/null
+++ b/Documentation/networking/pse-pd/index.rst
@@ -0,0 +1,9 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Power Sourcing Equipment (PSE) Documentation
+============================================
+
+.. toctree::
+   :maxdepth: 2
+
+   introduction
diff --git a/Documentation/networking/pse-pd/introduction.rst b/Documentation/networking/pse-pd/introduction.rst
new file mode 100644
index 000000000000..e3d3faaef717
--- /dev/null
+++ b/Documentation/networking/pse-pd/introduction.rst
@@ -0,0 +1,73 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+Power Sourcing Equipment (PSE) in IEEE 802.3 Standard
+=====================================================
+
+Overview
+--------
+
+Power Sourcing Equipment (PSE) is essential in networks for delivering power
+along with data over Ethernet cables. It usually refers to devices like
+switches and hubs that supply power to Powered Devices (PDs) such as IP
+cameras, VoIP phones, and wireless access points.
+
+PSE vs. PoDL PSE
+----------------
+
+PSE in the IEEE 802.3 standard generally refers to equipment that provides
+power alongside data over Ethernet cables, typically associated with Power over
+Ethernet (PoE).
+
+PoDL PSE, or Power over Data Lines PSE, specifically denotes PSEs operating
+with single balanced twisted-pair PHYs, as per Clause 104 of IEEE 802.3. PoDL
+is significant in contexts like automotive and industrial controls where power
+and data delivery over a single pair is advantageous.
+
+IEEE 802.3-2018 Addendums and Related Clauses
+---------------------------------------------
+
+Key addenda to the IEEE 802.3-2018 standard relevant to power delivery over
+Ethernet are as follows:
+
+- **802.3af (Approved in 2003-06-12)**: Known as PoE in the market, detailed in
+  Clause 33, delivering up to 15.4W of power.
+- **802.3at (Approved in 2009-09-11)**: Marketed as PoE+, enhancing PoE as
+  covered in Clause 33, increasing power delivery to up to 30W.
+- **802.3bt (Approved in 2018-09-27)**: Known as 4PPoE in the market, outlined
+  in Clause 33. Type 3 delivers up to 60W, and Type 4 up to 100W.
+- **802.3bu (Approved in 2016-12-07)**: Formerly referred to as PoDL, detailed
+  in Clause 104. Introduces Classes 0 - 9. Class 9 PoDL PSE delivers up to ~65W
+
+Kernel Naming Convention Recommendations
+----------------------------------------
+
+For clarity and consistency within the Linux kernel's networking subsystem, the
+following naming conventions are recommended:
+
+- For general PSE (PoE) code, use "c33_pse" key words. For example:
+  ``enum ethtool_c33_pse_admin_state c33_admin_control;``.
+  This aligns with Clause 33, encompassing various PoE forms.
+
+- For PoDL PSE - specific code, use "podl_pse". For example:
+  ``enum ethtool_podl_pse_admin_state podl_admin_control;`` to differentiate
+  PoDL PSE settings according to Clause 104.
+
+Summary of Clause 33: Data Terminal Equipment (DTE) Power via Media Dependent Interface (MDI)
+---------------------------------------------------------------------------------------------
+
+Clause 33 of the IEEE 802.3 standard defines the functional and electrical
+characteristics of Powered Device (PD) and Power Sourcing Equipment (PSE).
+These entities enable power delivery using the same generic cabling as for data
+transmission, integrating power with data communication for devices such as
+10BASE-T, 100BASE-TX, or 1000BASE-T.
+
+Summary of Clause 104: Power over Data Lines (PoDL) of Single Balanced Twisted-Pair Ethernet
+--------------------------------------------------------------------------------------------
+
+Clause 104 of the IEEE 802.3 standard delineates the functional and electrical
+characteristics of PoDL Powered Devices (PDs) and PoDL Power Sourcing Equipment
+(PSEs). These are designed for use with single balanced twisted-pair Ethernet
+Physical Layers. In this clause, 'PSE' refers specifically to PoDL PSE, and
+'PD' to PoDL PD. The key intent is to provide devices with a unified interface
+for both data and the power required to process this data over a single
+balanced twisted-pair Ethernet connection.
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index 199cf4ae3cf2..be4e5754eb24 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -17,9 +17,12 @@ struct pse_controller_dev;
  *
  * @podl_admin_control: set PoDL PSE admin control as described in
  *	IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl
+ * @c33_admin_control: set PSE admin control as described in
+ *	IEEE 802.3-2022 30.9.1.2.1 acPSEAdminControl
  */
 struct pse_control_config {
 	enum ethtool_podl_pse_admin_state podl_admin_control;
+	enum ethtool_c33_pse_admin_state c33_admin_control;
 };
 
 /**
@@ -29,10 +32,16 @@ struct pse_control_config {
  *	functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
  * @podl_pw_status: power detection status of the PoDL PSE.
  *	IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus:
+ * @c33_admin_state: operational state of the PSE
+ *	functions. IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState
+ * @c33_pw_status: power detection status of the PSE.
+ *	IEEE 802.3-2022 30.9.1.1.5 aPSEPowerDetectionStatus:
  */
 struct pse_control_status {
 	enum ethtool_podl_pse_admin_state podl_admin_state;
 	enum ethtool_podl_pse_pw_d_status podl_pw_status;
+	enum ethtool_c33_pse_admin_state c33_admin_state;
+	enum ethtool_c33_pse_pw_d_status c33_pw_status;
 };
 
 /**
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 95c2f09f0d0a..7b9a3d890949 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -752,6 +752,49 @@ enum ethtool_module_power_mode {
 	ETHTOOL_MODULE_POWER_MODE_HIGH,
 };
 
+/**
+ * enum ethtool_c33_pse_admin_state - operational state of the PoDL PSE
+ *	functions. IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState
+ * @ETHTOOL_C33_PSE_ADMIN_STATE_UNKNOWN: state of PSE functions is unknown
+ * @ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED: PSE functions are disabled
+ * @ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED: PSE functions are enabled
+ */
+enum ethtool_c33_pse_admin_state {
+	ETHTOOL_C33_PSE_ADMIN_STATE_UNKNOWN = 1,
+	ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED,
+	ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED,
+};
+
+/**
+ * enum ethtool_c33_pse_pw_d_status - power detection status of the PSE.
+ *	IEEE 802.3-2022 30.9.1.1.3 aPoDLPSEPowerDetectionStatus:
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_UNKNOWN: PSE status is unknown
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_DISABLED: The enumeration "disabled"
+ *	indicates that the PSE State diagram is in the state DISABLED.
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_SEARCHING: The enumeration "searching"
+ *	indicates the PSE State diagram is in a state other than those
+ *	listed.
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_DELIVERING: The enumeration
+ *	"deliveringPower" indicates that the PSE State diagram is in the
+ *	state POWER_ON.
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_TEST: The enumeration "test" indicates that
+ *	the PSE State diagram is in the state TEST_MODE.
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_FAULT: The enumeration "fault" indicates that
+ *	the PSE State diagram is in the state TEST_ERROR.
+ * @ETHTOOL_C33_PSE_PW_D_STATUS_OTHERFAULT: The enumeration "otherFault"
+ *	indicates that the PSE State diagram is in the state IDLE due to
+ *	the variable error_condition = true.
+ */
+enum ethtool_c33_pse_pw_d_status {
+	ETHTOOL_C33_PSE_PW_D_STATUS_UNKNOWN = 1,
+	ETHTOOL_C33_PSE_PW_D_STATUS_DISABLED,
+	ETHTOOL_C33_PSE_PW_D_STATUS_SEARCHING,
+	ETHTOOL_C33_PSE_PW_D_STATUS_DELIVERING,
+	ETHTOOL_C33_PSE_PW_D_STATUS_TEST,
+	ETHTOOL_C33_PSE_PW_D_STATUS_FAULT,
+	ETHTOOL_C33_PSE_PW_D_STATUS_OTHERFAULT,
+};
+
 /**
  * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE
  *	functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index b4f0d233d048..f17dbe54bf5e 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -913,6 +913,9 @@ enum {
 	ETHTOOL_A_PODL_PSE_ADMIN_STATE,		/* u32 */
 	ETHTOOL_A_PODL_PSE_ADMIN_CONTROL,	/* u32 */
 	ETHTOOL_A_PODL_PSE_PW_D_STATUS,		/* u32 */
+	ETHTOOL_A_C33_PSE_ADMIN_STATE,		/* u32 */
+	ETHTOOL_A_C33_PSE_ADMIN_CONTROL,	/* u32 */
+	ETHTOOL_A_C33_PSE_PW_D_STATUS,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_PSE_CNT,
-- 
cgit v1.2.3


From 47e0dd53c5eb9dab66689592523f31ea39fc32fa Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Wed, 17 Apr 2024 16:39:50 +0200
Subject: net: pse-pd: Introduce PSE types enumeration

Introduce an enumeration to define PSE types (C33 or PoDL),
utilizing a bitfield for potential future support of both types.
Include 'pse_get_types' helper for external access to PSE type info.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://lore.kernel.org/r/20240417-feature_poe-v9-2-242293fd1900@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/pse_core.c      | 12 ++++++++++++
 drivers/net/pse-pd/pse_regulator.c |  1 +
 include/linux/pse-pd/pse.h         | 15 +++++++++++++++
 include/uapi/linux/ethtool.h       | 12 ++++++++++++
 4 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 146b81f08a89..fed006cbc185 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -312,3 +312,15 @@ int pse_ethtool_set_config(struct pse_control *psec,
 	return err;
 }
 EXPORT_SYMBOL_GPL(pse_ethtool_set_config);
+
+bool pse_has_podl(struct pse_control *psec)
+{
+	return psec->pcdev->types & ETHTOOL_PSE_PODL;
+}
+EXPORT_SYMBOL_GPL(pse_has_podl);
+
+bool pse_has_c33(struct pse_control *psec)
+{
+	return psec->pcdev->types & ETHTOOL_PSE_C33;
+}
+EXPORT_SYMBOL_GPL(pse_has_c33);
diff --git a/drivers/net/pse-pd/pse_regulator.c b/drivers/net/pse-pd/pse_regulator.c
index 1dedf4de296e..547af384764b 100644
--- a/drivers/net/pse-pd/pse_regulator.c
+++ b/drivers/net/pse-pd/pse_regulator.c
@@ -116,6 +116,7 @@ pse_reg_probe(struct platform_device *pdev)
 	priv->pcdev.owner = THIS_MODULE;
 	priv->pcdev.ops = &pse_reg_ops;
 	priv->pcdev.dev = dev;
+	priv->pcdev.types = ETHTOOL_PSE_PODL;
 	ret = devm_pse_controller_register(dev, &priv->pcdev);
 	if (ret) {
 		dev_err(dev, "failed to register PSE controller (%pe)\n",
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index be4e5754eb24..19589571157f 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -77,6 +77,7 @@ struct pse_control;
  *            device tree to id as given to the PSE control ops
  * @nr_lines: number of PSE controls in this controller device
  * @lock: Mutex for serialization access to the PSE controller
+ * @types: types of the PSE controller
  */
 struct pse_controller_dev {
 	const struct pse_controller_ops *ops;
@@ -89,6 +90,7 @@ struct pse_controller_dev {
 			const struct of_phandle_args *pse_spec);
 	unsigned int nr_lines;
 	struct mutex lock;
+	enum ethtool_pse_types types;
 };
 
 #if IS_ENABLED(CONFIG_PSE_CONTROLLER)
@@ -108,6 +110,9 @@ int pse_ethtool_set_config(struct pse_control *psec,
 			   struct netlink_ext_ack *extack,
 			   const struct pse_control_config *config);
 
+bool pse_has_podl(struct pse_control *psec);
+bool pse_has_c33(struct pse_control *psec);
+
 #else
 
 static inline struct pse_control *of_pse_control_get(struct device_node *node)
@@ -133,6 +138,16 @@ static inline int pse_ethtool_set_config(struct pse_control *psec,
 	return -ENOTSUPP;
 }
 
+static inline bool pse_has_podl(struct pse_control *psec)
+{
+	return false;
+}
+
+static inline bool pse_has_c33(struct pse_control *psec)
+{
+	return false;
+}
+
 #endif
 
 #endif
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 7b9a3d890949..041e09c3515d 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -752,6 +752,18 @@ enum ethtool_module_power_mode {
 	ETHTOOL_MODULE_POWER_MODE_HIGH,
 };
 
+/**
+ * enum ethtool_pse_types - Types of PSE controller.
+ * @ETHTOOL_PSE_UNKNOWN: Type of PSE controller is unknown
+ * @ETHTOOL_PSE_PODL: PSE controller which support PoDL
+ * @ETHTOOL_PSE_C33: PSE controller which support Clause 33 (PoE)
+ */
+enum ethtool_pse_types {
+	ETHTOOL_PSE_UNKNOWN =	1 << 0,
+	ETHTOOL_PSE_PODL =	1 << 1,
+	ETHTOOL_PSE_C33 =	1 << 2,
+};
+
 /**
  * enum ethtool_c33_pse_admin_state - operational state of the PoDL PSE
  *	functions. IEEE 802.3-2022 30.9.1.1.2 aPSEAdminState
-- 
cgit v1.2.3


From de1c705c50326acaceaf1f02bc5bf6f267c572bd Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Mon, 8 Apr 2024 02:53:51 +0300
Subject: drm/mipi-dsi: use correct return type for the DSC functions

The functions mipi_dsi_compression_mode() and
mipi_dsi_picture_parameter_set() return 0-or-error rather than a buffer
size. Follow example of other similar MIPI DSI functions and use int
return type instead of size_t.

Fixes: f4dea1aaa9a1 ("drm/dsi: add helpers for DSI compression mode and PPS packets")
Reviewed-by: Marijn Suijten <marijn.suijten@somainline.org>
Reviewed-by: Jessica Zhang <quic_jesszhan@quicinc.com>
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240408-lg-sw43408-panel-v5-2-4e092da22991@linaro.org
---
 drivers/gpu/drm/drm_mipi_dsi.c | 6 +++---
 include/drm/drm_mipi_dsi.h     | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_mipi_dsi.c b/drivers/gpu/drm/drm_mipi_dsi.c
index ef6e416522f8..9874ff6d4718 100644
--- a/drivers/gpu/drm/drm_mipi_dsi.c
+++ b/drivers/gpu/drm/drm_mipi_dsi.c
@@ -654,7 +654,7 @@ EXPORT_SYMBOL(mipi_dsi_set_maximum_return_packet_size);
  *
  * Return: 0 on success or a negative error code on failure.
  */
-ssize_t mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable)
+int mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable)
 {
 	/* Note: Needs updating for non-default PPS or algorithm */
 	u8 tx[2] = { enable << 0, 0 };
@@ -679,8 +679,8 @@ EXPORT_SYMBOL(mipi_dsi_compression_mode);
  *
  * Return: 0 on success or a negative error code on failure.
  */
-ssize_t mipi_dsi_picture_parameter_set(struct mipi_dsi_device *dsi,
-				       const struct drm_dsc_picture_parameter_set *pps)
+int mipi_dsi_picture_parameter_set(struct mipi_dsi_device *dsi,
+				   const struct drm_dsc_picture_parameter_set *pps)
 {
 	struct mipi_dsi_msg msg = {
 		.channel = dsi->channel,
diff --git a/include/drm/drm_mipi_dsi.h b/include/drm/drm_mipi_dsi.h
index c0aec0d4d664..3011d33eccbd 100644
--- a/include/drm/drm_mipi_dsi.h
+++ b/include/drm/drm_mipi_dsi.h
@@ -241,9 +241,9 @@ int mipi_dsi_shutdown_peripheral(struct mipi_dsi_device *dsi);
 int mipi_dsi_turn_on_peripheral(struct mipi_dsi_device *dsi);
 int mipi_dsi_set_maximum_return_packet_size(struct mipi_dsi_device *dsi,
 					    u16 value);
-ssize_t mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable);
-ssize_t mipi_dsi_picture_parameter_set(struct mipi_dsi_device *dsi,
-				       const struct drm_dsc_picture_parameter_set *pps);
+int mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable);
+int mipi_dsi_picture_parameter_set(struct mipi_dsi_device *dsi,
+				   const struct drm_dsc_picture_parameter_set *pps);
 
 ssize_t mipi_dsi_generic_write(struct mipi_dsi_device *dsi, const void *payload,
 			       size_t size);
-- 
cgit v1.2.3


From b724455e89ea9be900b81492897aadebcdc5ba92 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Mon, 8 Apr 2024 02:53:52 +0300
Subject: drm/mipi-dsi: add mipi_dsi_compression_mode_ext()

Add the extended version of mipi_dsi_compression_mode(). It provides
a way to specify the algorithm and PPS selector.

Reviewed-by: Marijn Suijten <marijn.suijten@somainline.org>
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240408-lg-sw43408-panel-v5-3-4e092da22991@linaro.org
---
 drivers/gpu/drm/drm_mipi_dsi.c | 41 ++++++++++++++++++++++++++++++++++-------
 include/drm/drm_mipi_dsi.h     |  9 +++++++++
 2 files changed, 43 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_mipi_dsi.c b/drivers/gpu/drm/drm_mipi_dsi.c
index 9874ff6d4718..795001bb7ff1 100644
--- a/drivers/gpu/drm/drm_mipi_dsi.c
+++ b/drivers/gpu/drm/drm_mipi_dsi.c
@@ -645,29 +645,56 @@ int mipi_dsi_set_maximum_return_packet_size(struct mipi_dsi_device *dsi,
 EXPORT_SYMBOL(mipi_dsi_set_maximum_return_packet_size);
 
 /**
- * mipi_dsi_compression_mode() - enable/disable DSC on the peripheral
+ * mipi_dsi_compression_mode_ext() - enable/disable DSC on the peripheral
  * @dsi: DSI peripheral device
  * @enable: Whether to enable or disable the DSC
+ * @algo: Selected compression algorithm
+ * @pps_selector: Select PPS from the table of pre-stored or uploaded PPS entries
  *
- * Enable or disable Display Stream Compression on the peripheral using the
- * default Picture Parameter Set and VESA DSC 1.1 algorithm.
+ * Enable or disable Display Stream Compression on the peripheral.
  *
  * Return: 0 on success or a negative error code on failure.
  */
-int mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable)
+int mipi_dsi_compression_mode_ext(struct mipi_dsi_device *dsi, bool enable,
+				  enum mipi_dsi_compression_algo algo,
+				  unsigned int pps_selector)
 {
-	/* Note: Needs updating for non-default PPS or algorithm */
-	u8 tx[2] = { enable << 0, 0 };
+	u8 tx[2] = { };
 	struct mipi_dsi_msg msg = {
 		.channel = dsi->channel,
 		.type = MIPI_DSI_COMPRESSION_MODE,
 		.tx_len = sizeof(tx),
 		.tx_buf = tx,
 	};
-	int ret = mipi_dsi_device_transfer(dsi, &msg);
+	int ret;
+
+	if (algo > 3 || pps_selector > 3)
+		return -EINVAL;
+
+	tx[0] = (enable << 0) |
+		(algo << 1) |
+		(pps_selector << 4);
+
+	ret = mipi_dsi_device_transfer(dsi, &msg);
 
 	return (ret < 0) ? ret : 0;
 }
+EXPORT_SYMBOL(mipi_dsi_compression_mode_ext);
+
+/**
+ * mipi_dsi_compression_mode() - enable/disable DSC on the peripheral
+ * @dsi: DSI peripheral device
+ * @enable: Whether to enable or disable the DSC
+ *
+ * Enable or disable Display Stream Compression on the peripheral using the
+ * default Picture Parameter Set and VESA DSC 1.1 algorithm.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+int mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable)
+{
+	return mipi_dsi_compression_mode_ext(dsi, enable, MIPI_DSI_COMPRESSION_DSC, 0);
+}
 EXPORT_SYMBOL(mipi_dsi_compression_mode);
 
 /**
diff --git a/include/drm/drm_mipi_dsi.h b/include/drm/drm_mipi_dsi.h
index 3011d33eccbd..82b1cc434ea3 100644
--- a/include/drm/drm_mipi_dsi.h
+++ b/include/drm/drm_mipi_dsi.h
@@ -226,6 +226,12 @@ static inline int mipi_dsi_pixel_format_to_bpp(enum mipi_dsi_pixel_format fmt)
 	return -EINVAL;
 }
 
+enum mipi_dsi_compression_algo {
+	MIPI_DSI_COMPRESSION_DSC = 0,
+	MIPI_DSI_COMPRESSION_VENDOR = 3,
+	/* other two values are reserved, DSI 1.3 */
+};
+
 struct mipi_dsi_device *
 mipi_dsi_device_register_full(struct mipi_dsi_host *host,
 			      const struct mipi_dsi_device_info *info);
@@ -242,6 +248,9 @@ int mipi_dsi_turn_on_peripheral(struct mipi_dsi_device *dsi);
 int mipi_dsi_set_maximum_return_packet_size(struct mipi_dsi_device *dsi,
 					    u16 value);
 int mipi_dsi_compression_mode(struct mipi_dsi_device *dsi, bool enable);
+int mipi_dsi_compression_mode_ext(struct mipi_dsi_device *dsi, bool enable,
+				  enum mipi_dsi_compression_algo algo,
+				  unsigned int pps_selector);
 int mipi_dsi_picture_parameter_set(struct mipi_dsi_device *dsi,
 				   const struct drm_dsc_picture_parameter_set *pps);
 
-- 
cgit v1.2.3


From 9be9567a7c59b7314ea776f56945fe3fc28efe99 Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Wed, 17 Apr 2024 16:39:55 +0200
Subject: net: pse-pd: Add support for PSE PIs

The Power Sourcing Equipment Power Interface (PSE PI) plays a pivotal role
in the architecture of Power over Ethernet (PoE) systems. It is essentially
a blueprint that outlines how one or multiple power sources are connected
to the eight-pin modular jack, commonly known as the Ethernet RJ45 port.
This connection scheme is crucial for enabling the delivery of power
alongside data over Ethernet cables.

This patch adds support for getting the PSE controller node through PSE PI
device subnode.

This supports adds a way to get the PSE PI id from the pse_pi devicetree
subnode of a PSE controller node simply by reading the reg property.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Link: https://lore.kernel.org/r/20240417-feature_poe-v9-7-242293fd1900@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/pse-pd/index.rst  |   1 +
 Documentation/networking/pse-pd/pse-pi.rst | 301 +++++++++++++++++++++++++++++
 drivers/net/pse-pd/pse_core.c              | 262 +++++++++++++++++++++----
 include/linux/pse-pd/pse.h                 |  38 +++-
 4 files changed, 565 insertions(+), 37 deletions(-)
 create mode 100644 Documentation/networking/pse-pd/pse-pi.rst

(limited to 'include')

diff --git a/Documentation/networking/pse-pd/index.rst b/Documentation/networking/pse-pd/index.rst
index 18197bc7303d..de28a5aee316 100644
--- a/Documentation/networking/pse-pd/index.rst
+++ b/Documentation/networking/pse-pd/index.rst
@@ -7,3 +7,4 @@ Power Sourcing Equipment (PSE) Documentation
    :maxdepth: 2
 
    introduction
+   pse-pi
diff --git a/Documentation/networking/pse-pd/pse-pi.rst b/Documentation/networking/pse-pd/pse-pi.rst
new file mode 100644
index 000000000000..5cad14fedc13
--- /dev/null
+++ b/Documentation/networking/pse-pd/pse-pi.rst
@@ -0,0 +1,301 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+PSE Power Interface (PSE PI) Documentation
+==========================================
+
+The Power Sourcing Equipment Power Interface (PSE PI) plays a pivotal role in
+the architecture of Power over Ethernet (PoE) systems. It is essentially a
+blueprint that outlines how one or multiple power sources are connected to the
+eight-pin modular jack, commonly known as the Ethernet RJ45 port. This
+connection scheme is crucial for enabling the delivery of power alongside data
+over Ethernet cables.
+
+Documentation and Standards
+---------------------------
+
+The IEEE 802.3 standard provides detailed documentation on the PSE PI.
+Specifically:
+
+- Section "33.2.3 PI pin assignments" covers the pin assignments for PoE
+  systems that utilize two pairs for power delivery.
+- Section "145.2.4 PSE PI" addresses the configuration for PoE systems that
+  deliver power over all four pairs of an Ethernet cable.
+
+PSE PI and Single Pair Ethernet
+-------------------------------
+
+Single Pair Ethernet (SPE) represents a different approach to Ethernet
+connectivity, utilizing just one pair of conductors for both data and power
+transmission. Unlike the configurations detailed in the PSE PI for standard
+Ethernet, which can involve multiple power sourcing arrangements across four or
+two pairs of wires, SPE operates on a simpler model due to its single-pair
+design. As a result, the complexities of choosing between alternative pin
+assignments for power delivery, as described in the PSE PI for multi-pair
+Ethernet, are not applicable to SPE.
+
+Understanding PSE PI
+--------------------
+
+The Power Sourcing Equipment Power Interface (PSE PI) is a framework defining
+how Power Sourcing Equipment (PSE) delivers power to Powered Devices (PDs) over
+Ethernet cables. It details two main configurations for power delivery, known
+as Alternative A and Alternative B, which are distinguished not only by their
+method of power transmission but also by the implications for polarity and data
+transmission direction.
+
+Alternative A and B Overview
+----------------------------
+
+- **Alternative A:** Utilizes RJ45 conductors 1, 2, 3 and 6. In either case of
+  networks 10/100BaseT or 1G/2G/5G/10GBaseT, the pairs used are carrying data.
+  The power delivery's polarity in this alternative can vary based on the MDI
+  (Medium Dependent Interface) or MDI-X (Medium Dependent Interface Crossover)
+  configuration.
+
+- **Alternative B:** Utilizes RJ45 conductors 4, 5, 7 and 8. In case of
+  10/100BaseT network the pairs used are spare pairs without data and are less
+  influenced by data transmission direction. This is not the case for
+  1G/2G/5G/10GBaseT network. Alternative B includes two configurations with
+  different polarities, known as variant X and variant S, to accommodate
+  different network requirements and device specifications.
+
+Table 145-3 PSE Pinout Alternatives
+-----------------------------------
+
+The following table outlines the pin configurations for both Alternative A and
+Alternative B.
+
++------------+-------------------+-----------------+-----------------+-----------------+
+| Conductor  | Alternative A     | Alternative A   | Alternative B   | Alternative B   |
+|            |    (MDI-X)        |      (MDI)      |        (X)      |        (S)      |
++============+===================+=================+=================+=================+
+| 1          | Negative V        | Positive V      | -               | -               |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 2          | Negative V        | Positive V      | -               | -               |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 3          | Positive V        | Negative V      | -               | -               |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 4          | -                 | -               | Negative V      | Positive V      |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 5          | -                 | -               | Negative V      | Positive V      |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 6          | Positive V        | Negative V      | -               | -               |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 7          | -                 | -               | Positive V      | Negative V      |
++------------+-------------------+-----------------+-----------------+-----------------+
+| 8          | -                 | -               | Positive V      | Negative V      |
++------------+-------------------+-----------------+-----------------+-----------------+
+
+.. note::
+    - "Positive V" and "Negative V" indicate the voltage polarity for each pin.
+    - "-" indicates that the pin is not used for power delivery in that
+      specific configuration.
+
+PSE PI compatibilities
+----------------------
+
+The following table outlines the compatibility between the pinout alternative
+and the 1000/2.5G/5G/10GBaseT in the PSE 2 pairs connection.
+
++---------+---------------+---------------------+-----------------------+
+| Variant | Alternative   | Power Feeding Type  | Compatibility with    |
+|         | (A/B)         | (Direct/Phantom)    | 1000/2.5G/5G/10GBaseT |
++=========+===============+=====================+=======================+
+| 1       | A             | Phantom             | Yes                   |
++---------+---------------+---------------------+-----------------------+
+| 2       | B             | Phantom             | Yes                   |
++---------+---------------+---------------------+-----------------------+
+| 3       | B             | Direct              | No                    |
++---------+---------------+---------------------+-----------------------+
+
+.. note::
+    - "Direct" indicate a variant where the power is injected directly to pairs
+       without using magnetics in case of spare pairs.
+    - "Phantom" indicate power path over coils/magnetics as it is done for
+       Alternative A variant.
+
+In case of PSE 4 pairs, a PSE supporting only 10/100BaseT (which mean Direct
+Power on pinout Alternative B) is not compatible with a 4 pairs
+1000/2.5G/5G/10GBaseT.
+
+PSE Power Interface (PSE PI) Connection Diagram
+-----------------------------------------------
+
+The diagram below illustrates the connection architecture between the RJ45
+port, the Ethernet PHY (Physical Layer), and the PSE PI (Power Sourcing
+Equipment Power Interface), demonstrating how power and data are delivered
+simultaneously through an Ethernet cable. The RJ45 port serves as the physical
+interface for these connections, with each of its eight pins connected to both
+the Ethernet PHY for data transmission and the PSE PI for power delivery.
+
+.. code-block::
+
+    +--------------------------+
+    |                          |
+    |          RJ45 Port       |
+    |                          |
+    +--+--+--+--+--+--+--+--+--+                +-------------+
+      1| 2| 3| 4| 5| 6| 7| 8|                   |             |
+       |  |  |  |  |  |  |  o-------------------+             |
+       |  |  |  |  |  |  o--|-------------------+             +<--- PSE 1
+       |  |  |  |  |  o--|--|-------------------+             |
+       |  |  |  |  o--|--|--|-------------------+             |
+       |  |  |  o--|--|--|--|-------------------+  PSE PI     |
+       |  |  o--|--|--|--|--|-------------------+             |
+       |  o--|--|--|--|--|--|-------------------+             +<--- PSE 2 (optional)
+       o--|--|--|--|--|--|--|-------------------+             |
+       |  |  |  |  |  |  |  |                   |             |
+    +--+--+--+--+--+--+--+--+--+                +-------------+
+    |                          |
+    |       Ethernet PHY       |
+    |                          |
+    +--------------------------+
+
+Simple PSE PI Configuration for Alternative A
+---------------------------------------------
+
+The diagram below illustrates a straightforward PSE PI (Power Sourcing
+Equipment Power Interface) configuration designed to support the Alternative A
+setup for Power over Ethernet (PoE). This implementation is tailored to provide
+power delivery through the data-carrying pairs of an Ethernet cable, suitable
+for either MDI or MDI-X configurations, albeit supporting one variation at a
+time.
+
+.. code-block::
+
+         +-------------+
+         |    PSE PI   |
+ 8  -----+                             +-------------+
+ 7  -----+                    Rail 1   |
+ 6  -----+------+----------------------+
+ 5  -----+      |                      |
+ 4  -----+      |             Rail 2   |  PSE 1
+ 3  -----+------/         +------------+
+ 2  -----+--+-------------/            |
+ 1  -----+--/                          +-------------+
+         |
+         +-------------+
+
+In this configuration:
+
+- Pins 1 and 2, as well as pins 3 and 6, are utilized for power delivery in
+  addition to data transmission. This aligns with the standard wiring for
+  10/100BaseT Ethernet networks where these pairs are used for data.
+- Rail 1 and Rail 2 represent the positive and negative voltage rails, with
+  Rail 1 connected to pins 1 and 2, and Rail 2 connected to pins 3 and 6.
+  More advanced PSE PI configurations may include integrated or external
+  switches to change the polarity of the voltage rails, allowing for
+  compatibility with both MDI and MDI-X configurations.
+
+More complex PSE PI configurations may include additional components, to support
+Alternative B, or to provide additional features such as power management, or
+additional power delivery capabilities such as 2-pair or 4-pair power delivery.
+
+.. code-block::
+
+         +-------------+
+         |    PSE PI   |
+         |        +---+
+ 8  -----+--------+   |                 +-------------+
+ 7  -----+--------+   |       Rail 1   |
+ 6  -----+--------+   +-----------------+
+ 5  -----+--------+   |                |
+ 4  -----+--------+   |       Rail 2   |  PSE 1
+ 3  -----+--------+   +----------------+
+ 2  -----+--------+   |                |
+ 1  -----+--------+   |                 +-------------+
+         |        +---+
+         +-------------+
+
+Device Tree Configuration: Describing PSE PI Configurations
+-----------------------------------------------------------
+
+The necessity for a separate PSE PI node in the device tree is influenced by
+the intricacy of the Power over Ethernet (PoE) system's setup. Here are
+descriptions of both simple and complex PSE PI configurations to illustrate
+this decision-making process:
+
+**Simple PSE PI Configuration:**
+In a straightforward scenario, the PSE PI setup involves a direct, one-to-one
+connection between a single PSE controller and an Ethernet port. This setup
+typically supports basic PoE functionality without the need for dynamic
+configuration or management of multiple power delivery modes. For such simple
+configurations, detailing the PSE PI within the existing PSE controller's node
+may suffice, as the system does not encompass additional complexity that
+warrants a separate node. The primary focus here is on the clear and direct
+association of power delivery to a specific Ethernet port.
+
+**Complex PSE PI Configuration:**
+Contrastingly, a complex PSE PI setup may encompass multiple PSE controllers or
+auxiliary circuits that collectively manage power delivery to one Ethernet
+port. Such configurations might support a range of PoE standards and require
+the capability to dynamically configure power delivery based on the operational
+mode (e.g., PoE2 versus PoE4) or specific requirements of connected devices. In
+these instances, a dedicated PSE PI node becomes essential for accurately
+documenting the system architecture. This node would serve to detail the
+interactions between different PSE controllers, the support for various PoE
+modes, and any additional logic required to coordinate power delivery across
+the network infrastructure.
+
+**Guidance:**
+
+For simple PSE setups, including PSE PI information in the PSE controller node
+might suffice due to the straightforward nature of these systems. However,
+complex configurations, involving multiple components or advanced PoE features,
+benefit from a dedicated PSE PI node. This method adheres to IEEE 802.3
+specifications, improving documentation clarity and ensuring accurate
+representation of the PoE system's complexity.
+
+PSE PI Node: Essential Information
+----------------------------------
+
+The PSE PI (Power Sourcing Equipment Power Interface) node in a device tree can
+include several key pieces of information critical for defining the power
+delivery capabilities and configurations of a PoE (Power over Ethernet) system.
+Below is a list of such information, along with explanations for their
+necessity and reasons why they might not be found within a PSE controller node:
+
+1. **Powered Pairs Configuration**
+
+   - *Description:* Identifies the pairs used for power delivery in the
+     Ethernet cable.
+   - *Necessity:* Essential to ensure the correct pairs are powered according
+     to the board's design.
+   - *PSE Controller Node:* Typically lacks details on physical pair usage,
+     focusing on power regulation.
+
+2. **Polarity of Powered Pairs**
+
+   - *Description:* Specifies the polarity (positive or negative) for each
+     powered pair.
+   - *Necessity:* Critical for safe and effective power transmission to PDs.
+   - *PSE Controller Node:* Polarity management may exceed the standard
+     functionalities of PSE controllers.
+
+3. **PSE Cells Association**
+
+   - *Description:* Details the association of PSE cells with Ethernet ports or
+     pairs in multi-cell configurations.
+   - *Necessity:* Allows for optimized power resource allocation in complex
+     systems.
+   - *PSE Controller Node:* Controllers may not manage cell associations
+     directly, focusing instead on power flow regulation.
+
+4. **Support for PoE Standards**
+
+   - *Description:* Lists the PoE standards and configurations supported by the
+     system.
+   - *Necessity:* Ensures system compatibility with various PDs and adherence
+     to industry standards.
+   - *PSE Controller Node:* Specific capabilities may depend on the overall PSE
+     PI design rather than the controller alone. Multiple PSE cells per PI
+     do not necessarily imply support for multiple PoE standards.
+
+5. **Protection Mechanisms**
+
+   - *Description:* Outlines additional protection mechanisms, such as
+     overcurrent protection and thermal management.
+   - *Necessity:* Provides extra safety and stability, complementing PSE
+     controller protections.
+   - *PSE Controller Node:* Some protections may be implemented via
+     board-specific hardware or algorithms external to the controller.
diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index fed006cbc185..ca5ced8e0d8a 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -27,38 +27,182 @@ struct pse_control {
 	struct kref refcnt;
 };
 
+static int of_load_single_pse_pi_pairset(struct device_node *node,
+					 struct pse_pi *pi,
+					 int pairset_num)
+{
+	struct device_node *pairset_np;
+	const char *name;
+	int ret;
+
+	ret = of_property_read_string_index(node, "pairset-names",
+					    pairset_num, &name);
+	if (ret)
+		return ret;
+
+	if (!strcmp(name, "alternative-a")) {
+		pi->pairset[pairset_num].pinout = ALTERNATIVE_A;
+	} else if (!strcmp(name, "alternative-b")) {
+		pi->pairset[pairset_num].pinout = ALTERNATIVE_B;
+	} else {
+		pr_err("pse: wrong pairset-names value %s (%pOF)\n",
+		       name, node);
+		return -EINVAL;
+	}
+
+	pairset_np = of_parse_phandle(node, "pairsets", pairset_num);
+	if (!pairset_np)
+		return -ENODEV;
+
+	pi->pairset[pairset_num].np = pairset_np;
+
+	return 0;
+}
+
 /**
- * of_pse_zero_xlate - dummy function for controllers with one only control
- * @pcdev: a pointer to the PSE controller device
- * @pse_spec: PSE line specifier as found in the device tree
+ * of_load_pse_pi_pairsets - load PSE PI pairsets pinout and polarity
+ * @node: a pointer of the device node
+ * @pi: a pointer of the PSE PI to fill
+ * @npairsets: the number of pairsets (1 or 2) used by the PI
  *
- * This static translation function is used by default if of_xlate in
- * :c:type:`pse_controller_dev` is not set. It is useful for all PSE
- * controllers with #pse-cells = <0>.
+ * Return: 0 on success and failure value on error
  */
-static int of_pse_zero_xlate(struct pse_controller_dev *pcdev,
-			     const struct of_phandle_args *pse_spec)
+static int of_load_pse_pi_pairsets(struct device_node *node,
+				   struct pse_pi *pi,
+				   int npairsets)
 {
-	return 0;
+	int i, ret;
+
+	ret = of_property_count_strings(node, "pairset-names");
+	if (ret != npairsets) {
+		pr_err("pse: amount of pairsets and pairset-names is not equal %d != %d (%pOF)\n",
+		       npairsets, ret, node);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < npairsets; i++) {
+		ret = of_load_single_pse_pi_pairset(node, pi, i);
+		if (ret)
+			goto out;
+	}
+
+	if (npairsets == 2 &&
+	    pi->pairset[0].pinout == pi->pairset[1].pinout) {
+		pr_err("pse: two PI pairsets can not have identical pinout (%pOF)",
+		       node);
+		ret = -EINVAL;
+	}
+
+out:
+	/* If an error appears, release all the pairset device node kref */
+	if (ret) {
+		of_node_put(pi->pairset[0].np);
+		pi->pairset[0].np = NULL;
+		of_node_put(pi->pairset[1].np);
+		pi->pairset[1].np = NULL;
+	}
+
+	return ret;
+}
+
+static void pse_release_pis(struct pse_controller_dev *pcdev)
+{
+	int i;
+
+	for (i = 0; i <= pcdev->nr_lines; i++) {
+		of_node_put(pcdev->pi[i].pairset[0].np);
+		of_node_put(pcdev->pi[i].pairset[1].np);
+		of_node_put(pcdev->pi[i].np);
+	}
+	kfree(pcdev->pi);
 }
 
 /**
- * of_pse_simple_xlate - translate pse_spec to the PSE line number
+ * of_load_pse_pis - load all the PSE PIs
  * @pcdev: a pointer to the PSE controller device
- * @pse_spec: PSE line specifier as found in the device tree
  *
- * This static translation function is used by default if of_xlate in
- * :c:type:`pse_controller_dev` is not set. It is useful for all PSE
- * controllers with 1:1 mapping, where PSE lines can be indexed by number
- * without gaps.
+ * Return: 0 on success and failure value on error
  */
-static int of_pse_simple_xlate(struct pse_controller_dev *pcdev,
-			       const struct of_phandle_args *pse_spec)
+static int of_load_pse_pis(struct pse_controller_dev *pcdev)
 {
-	if (pse_spec->args[0] >= pcdev->nr_lines)
-		return -EINVAL;
+	struct device_node *np = pcdev->dev->of_node;
+	struct device_node *node, *pis;
+	int ret;
 
-	return pse_spec->args[0];
+	if (!np)
+		return -ENODEV;
+
+	pis = of_get_child_by_name(np, "pse-pis");
+	if (!pis) {
+		/* no description of PSE PIs */
+		pcdev->no_of_pse_pi = true;
+		return 0;
+	}
+
+	pcdev->pi = kcalloc(pcdev->nr_lines, sizeof(*pcdev->pi), GFP_KERNEL);
+	if (!pcdev->pi) {
+		of_node_put(pis);
+		return -ENOMEM;
+	}
+
+	for_each_child_of_node(pis, node) {
+		struct pse_pi pi = {0};
+		u32 id;
+
+		if (!of_node_name_eq(node, "pse-pi"))
+			continue;
+
+		ret = of_property_read_u32(node, "reg", &id);
+		if (ret) {
+			dev_err(pcdev->dev,
+				"can't get reg property for node '%pOF'",
+				node);
+			goto out;
+		}
+
+		if (id >= pcdev->nr_lines) {
+			dev_err(pcdev->dev,
+				"reg value (%u) is out of range (%u) (%pOF)\n",
+				id, pcdev->nr_lines, node);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (pcdev->pi[id].np) {
+			dev_err(pcdev->dev,
+				"other node with same reg value was already registered. %pOF : %pOF\n",
+				pcdev->pi[id].np, node);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		ret = of_count_phandle_with_args(node, "pairsets", NULL);
+		/* npairsets is limited to value one or two */
+		if (ret == 1 || ret == 2) {
+			ret = of_load_pse_pi_pairsets(node, &pi, ret);
+			if (ret)
+				goto out;
+		} else if (ret != ENOENT) {
+			dev_err(pcdev->dev,
+				"error: wrong number of pairsets. Should be 1 or 2, got %d (%pOF)\n",
+				ret, node);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		of_node_get(node);
+		pi.np = node;
+		memcpy(&pcdev->pi[id], &pi, sizeof(pi));
+	}
+
+	of_node_put(pis);
+	return 0;
+
+out:
+	pse_release_pis(pcdev);
+	of_node_put(node);
+	of_node_put(pis);
+	return ret;
 }
 
 /**
@@ -67,16 +211,18 @@ static int of_pse_simple_xlate(struct pse_controller_dev *pcdev,
  */
 int pse_controller_register(struct pse_controller_dev *pcdev)
 {
-	if (!pcdev->of_xlate) {
-		if (pcdev->of_pse_n_cells == 0)
-			pcdev->of_xlate = of_pse_zero_xlate;
-		else if (pcdev->of_pse_n_cells == 1)
-			pcdev->of_xlate = of_pse_simple_xlate;
-	}
+	int ret;
 
 	mutex_init(&pcdev->lock);
 	INIT_LIST_HEAD(&pcdev->pse_control_head);
 
+	if (!pcdev->nr_lines)
+		pcdev->nr_lines = 1;
+
+	ret = of_load_pse_pis(pcdev);
+	if (ret)
+		return ret;
+
 	mutex_lock(&pse_list_mutex);
 	list_add(&pcdev->list, &pse_controller_list);
 	mutex_unlock(&pse_list_mutex);
@@ -91,6 +237,7 @@ EXPORT_SYMBOL_GPL(pse_controller_register);
  */
 void pse_controller_unregister(struct pse_controller_dev *pcdev)
 {
+	pse_release_pis(pcdev);
 	mutex_lock(&pse_list_mutex);
 	list_del(&pcdev->list);
 	mutex_unlock(&pse_list_mutex);
@@ -203,8 +350,48 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index)
 	return psec;
 }
 
-struct pse_control *
-of_pse_control_get(struct device_node *node)
+/**
+ * of_pse_match_pi - Find the PSE PI id matching the device node phandle
+ * @pcdev: a pointer to the PSE controller device
+ * @np: a pointer to the device node
+ *
+ * Return: id of the PSE PI, -EINVAL if not found
+ */
+static int of_pse_match_pi(struct pse_controller_dev *pcdev,
+			   struct device_node *np)
+{
+	int i;
+
+	for (i = 0; i <= pcdev->nr_lines; i++) {
+		if (pcdev->pi[i].np == np)
+			return i;
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * psec_id_xlate - translate pse_spec to the PSE line number according
+ *		   to the number of pse-cells in case of no pse_pi node
+ * @pcdev: a pointer to the PSE controller device
+ * @pse_spec: PSE line specifier as found in the device tree
+ *
+ * Return: 0 if #pse-cells = <0>. Return PSE line number otherwise.
+ */
+static int psec_id_xlate(struct pse_controller_dev *pcdev,
+			 const struct of_phandle_args *pse_spec)
+{
+	if (!pcdev->of_pse_n_cells)
+		return 0;
+
+	if (pcdev->of_pse_n_cells > 1 ||
+	    pse_spec->args[0] >= pcdev->nr_lines)
+		return -EINVAL;
+
+	return pse_spec->args[0];
+}
+
+struct pse_control *of_pse_control_get(struct device_node *node)
 {
 	struct pse_controller_dev *r, *pcdev;
 	struct of_phandle_args args;
@@ -222,7 +409,14 @@ of_pse_control_get(struct device_node *node)
 	mutex_lock(&pse_list_mutex);
 	pcdev = NULL;
 	list_for_each_entry(r, &pse_controller_list, list) {
-		if (args.np == r->dev->of_node) {
+		if (!r->no_of_pse_pi) {
+			ret = of_pse_match_pi(r, args.np);
+			if (ret >= 0) {
+				pcdev = r;
+				psec_id = ret;
+				break;
+			}
+		} else if (args.np == r->dev->of_node) {
 			pcdev = r;
 			break;
 		}
@@ -238,10 +432,12 @@ of_pse_control_get(struct device_node *node)
 		goto out;
 	}
 
-	psec_id = pcdev->of_xlate(pcdev, &args);
-	if (psec_id < 0) {
-		psec = ERR_PTR(psec_id);
-		goto out;
+	if (pcdev->no_of_pse_pi) {
+		psec_id = psec_id_xlate(pcdev, &args);
+		if (psec_id < 0) {
+			psec = ERR_PTR(psec_id);
+			goto out;
+		}
 	}
 
 	/* pse_list_mutex also protects the pcdev's pse_control list */
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index 19589571157f..e19d58b5e777 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -64,6 +64,36 @@ struct device_node;
 struct of_phandle_args;
 struct pse_control;
 
+/* PSE PI pairset pinout can either be Alternative A or Alternative B */
+enum pse_pi_pairset_pinout {
+	ALTERNATIVE_A,
+	ALTERNATIVE_B,
+};
+
+/**
+ * struct pse_pi_pairset - PSE PI pairset entity describing the pinout
+ *			   alternative ant its phandle
+ *
+ * @pinout: description of the pinout alternative
+ * @np: device node pointer describing the pairset phandle
+ */
+struct pse_pi_pairset {
+	enum pse_pi_pairset_pinout pinout;
+	struct device_node *np;
+};
+
+/**
+ * struct pse_pi - PSE PI (Power Interface) entity as described in
+ *		   IEEE 802.3-2022 145.2.4
+ *
+ * @pairset: table of the PSE PI pinout alternative for the two pairset
+ * @np: device node pointer of the PSE PI node
+ */
+struct pse_pi {
+	struct pse_pi_pairset pairset[2];
+	struct device_node *np;
+};
+
 /**
  * struct pse_controller_dev - PSE controller entity that might
  *                             provide multiple PSE controls
@@ -73,11 +103,11 @@ struct pse_control;
  * @pse_control_head: head of internal list of requested PSE controls
  * @dev: corresponding driver model device struct
  * @of_pse_n_cells: number of cells in PSE line specifiers
- * @of_xlate: translation function to translate from specifier as found in the
- *            device tree to id as given to the PSE control ops
  * @nr_lines: number of PSE controls in this controller device
  * @lock: Mutex for serialization access to the PSE controller
  * @types: types of the PSE controller
+ * @pi: table of PSE PIs described in this controller device
+ * @no_of_pse_pi: flag set if the pse_pis devicetree node is not used
  */
 struct pse_controller_dev {
 	const struct pse_controller_ops *ops;
@@ -86,11 +116,11 @@ struct pse_controller_dev {
 	struct list_head pse_control_head;
 	struct device *dev;
 	int of_pse_n_cells;
-	int (*of_xlate)(struct pse_controller_dev *pcdev,
-			const struct of_phandle_args *pse_spec);
 	unsigned int nr_lines;
 	struct mutex lock;
 	enum ethtool_pse_types types;
+	struct pse_pi *pi;
+	bool no_of_pse_pi;
 };
 
 #if IS_ENABLED(CONFIG_PSE_CONTROLLER)
-- 
cgit v1.2.3


From 29e28d1d7a16a3f6dba55c1df5cc6e0f9edf3017 Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Wed, 17 Apr 2024 16:39:57 +0200
Subject: net: pse-pd: Add support for setup_pi_matrix callback

Implement setup_pi_matrix callback to configure the PSE PI matrix. This
functionality is invoked before registering the PSE and following the core
parsing of the pse_pis devicetree subnode.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240417-feature_poe-v9-9-242293fd1900@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/pse_core.c | 6 ++++++
 include/linux/pse-pd/pse.h    | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index ca5ced8e0d8a..a7ff7676ab77 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -223,6 +223,12 @@ int pse_controller_register(struct pse_controller_dev *pcdev)
 	if (ret)
 		return ret;
 
+	if (pcdev->ops->setup_pi_matrix) {
+		ret = pcdev->ops->setup_pi_matrix(pcdev);
+		if (ret)
+			return ret;
+	}
+
 	mutex_lock(&pse_list_mutex);
 	list_add(&pcdev->list, &pse_controller_list);
 	mutex_unlock(&pse_list_mutex);
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index e19d58b5e777..fa0c73da0cf1 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -49,6 +49,7 @@ struct pse_control_status {
  *
  * @ethtool_get_status: get PSE control status for ethtool interface
  * @ethtool_set_config: set PSE control configuration over ethtool interface
+ * @setup_pi_matrix: setup PI matrix of the PSE controller
  */
 struct pse_controller_ops {
 	int (*ethtool_get_status)(struct pse_controller_dev *pcdev,
@@ -57,6 +58,7 @@ struct pse_controller_ops {
 	int (*ethtool_set_config)(struct pse_controller_dev *pcdev,
 		unsigned long id, struct netlink_ext_ack *extack,
 		const struct pse_control_config *config);
+	int (*setup_pi_matrix)(struct pse_controller_dev *pcdev);
 };
 
 struct module;
-- 
cgit v1.2.3


From d83e13761d5b0568376963729abcccf6de5a43ba Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Wed, 17 Apr 2024 16:39:58 +0200
Subject: net: pse-pd: Use regulator framework within PSE framework

Integrate the regulator framework to the PSE framework for enhanced
access to features such as voltage, power measurement, and limits, which
are akin to regulators. Additionally, PSE features like port priorities
could potentially enhance the regulator framework. Note that this
integration introduces some implementation complexity, including wrapper
callbacks, but the potential benefits make it worthwhile.

Regulator are using enable counter with specific behavior.
Two calls to regulator_disable will trigger kernel warnings.
If the counter exceeds one, regulator_disable call won't disable the
PSE PI. These behavior isn't suitable for PSE control.
Added a boolean 'enabled' state to prevent multiple calls to
regulator_enable/disable. These calls will only be called from PSE
framework as it won't have any regulator children, therefore no mutex are
needed to safeguards this boolean.

regulator_get needs the consumer device pointer. Use PSE as regulator
provider and consumer device until we have RJ45 ports represented in
the Kernel.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20240417-feature_poe-v9-10-242293fd1900@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/pse_core.c      | 249 ++++++++++++++++++++++++++++++++++---
 drivers/net/pse-pd/pse_regulator.c |  48 +++----
 include/linux/pse-pd/pse.h         |  15 ++-
 3 files changed, 267 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index a7ff7676ab77..31f23c454678 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -8,6 +8,8 @@
 #include <linux/device.h>
 #include <linux/of.h>
 #include <linux/pse-pd/pse.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
 
 static DEFINE_MUTEX(pse_list_mutex);
 static LIST_HEAD(pse_controller_list);
@@ -16,12 +18,14 @@ static LIST_HEAD(pse_controller_list);
  * struct pse_control - a PSE control
  * @pcdev: a pointer to the PSE controller device
  *         this PSE control belongs to
+ * @ps: PSE PI supply of the PSE control
  * @list: list entry for the pcdev's PSE controller list
  * @id: ID of the PSE line in the PSE controller device
  * @refcnt: Number of gets of this pse_control
  */
 struct pse_control {
 	struct pse_controller_dev *pcdev;
+	struct regulator *ps;
 	struct list_head list;
 	unsigned int id;
 	struct kref refcnt;
@@ -132,6 +136,10 @@ static int of_load_pse_pis(struct pse_controller_dev *pcdev)
 	if (!np)
 		return -ENODEV;
 
+	pcdev->pi = kcalloc(pcdev->nr_lines, sizeof(*pcdev->pi), GFP_KERNEL);
+	if (!pcdev->pi)
+		return -ENOMEM;
+
 	pis = of_get_child_by_name(np, "pse-pis");
 	if (!pis) {
 		/* no description of PSE PIs */
@@ -139,12 +147,6 @@ static int of_load_pse_pis(struct pse_controller_dev *pcdev)
 		return 0;
 	}
 
-	pcdev->pi = kcalloc(pcdev->nr_lines, sizeof(*pcdev->pi), GFP_KERNEL);
-	if (!pcdev->pi) {
-		of_node_put(pis);
-		return -ENOMEM;
-	}
-
 	for_each_child_of_node(pis, node) {
 		struct pse_pi pi = {0};
 		u32 id;
@@ -205,13 +207,124 @@ out:
 	return ret;
 }
 
+static int pse_pi_is_enabled(struct regulator_dev *rdev)
+{
+	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
+	const struct pse_controller_ops *ops;
+	int id, ret;
+
+	ops = pcdev->ops;
+	if (!ops->pi_is_enabled)
+		return -EOPNOTSUPP;
+
+	id = rdev_get_id(rdev);
+	mutex_lock(&pcdev->lock);
+	ret = ops->pi_is_enabled(pcdev, id);
+	mutex_unlock(&pcdev->lock);
+
+	return ret;
+}
+
+static int pse_pi_enable(struct regulator_dev *rdev)
+{
+	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
+	const struct pse_controller_ops *ops;
+	int id, ret;
+
+	ops = pcdev->ops;
+	if (!ops->pi_enable)
+		return -EOPNOTSUPP;
+
+	id = rdev_get_id(rdev);
+	mutex_lock(&pcdev->lock);
+	ret = ops->pi_enable(pcdev, id);
+	if (!ret)
+		pcdev->pi[id].admin_state_enabled = 1;
+	mutex_unlock(&pcdev->lock);
+
+	return ret;
+}
+
+static int pse_pi_disable(struct regulator_dev *rdev)
+{
+	struct pse_controller_dev *pcdev = rdev_get_drvdata(rdev);
+	const struct pse_controller_ops *ops;
+	int id, ret;
+
+	ops = pcdev->ops;
+	if (!ops->pi_disable)
+		return -EOPNOTSUPP;
+
+	id = rdev_get_id(rdev);
+	mutex_lock(&pcdev->lock);
+	ret = ops->pi_disable(pcdev, id);
+	if (!ret)
+		pcdev->pi[id].admin_state_enabled = 0;
+	mutex_unlock(&pcdev->lock);
+
+	return ret;
+}
+
+static const struct regulator_ops pse_pi_ops = {
+	.is_enabled = pse_pi_is_enabled,
+	.enable = pse_pi_enable,
+	.disable = pse_pi_disable,
+};
+
+static int
+devm_pse_pi_regulator_register(struct pse_controller_dev *pcdev,
+			       char *name, int id)
+{
+	struct regulator_init_data *rinit_data;
+	struct regulator_config rconfig = {0};
+	struct regulator_desc *rdesc;
+	struct regulator_dev *rdev;
+
+	rinit_data = devm_kzalloc(pcdev->dev, sizeof(*rinit_data),
+				  GFP_KERNEL);
+	if (!rinit_data)
+		return -ENOMEM;
+
+	rdesc = devm_kzalloc(pcdev->dev, sizeof(*rdesc), GFP_KERNEL);
+	if (!rdesc)
+		return -ENOMEM;
+
+	/* Regulator descriptor id have to be the same as its associated
+	 * PSE PI id for the well functioning of the PSE controls.
+	 */
+	rdesc->id = id;
+	rdesc->name = name;
+	rdesc->type = REGULATOR_CURRENT;
+	rdesc->ops = &pse_pi_ops;
+	rdesc->owner = pcdev->owner;
+
+	rinit_data->constraints.valid_ops_mask = REGULATOR_CHANGE_STATUS;
+	rinit_data->supply_regulator = "vpwr";
+
+	rconfig.dev = pcdev->dev;
+	rconfig.driver_data = pcdev;
+	rconfig.init_data = rinit_data;
+
+	rdev = devm_regulator_register(pcdev->dev, rdesc, &rconfig);
+	if (IS_ERR(rdev)) {
+		dev_err_probe(pcdev->dev, PTR_ERR(rdev),
+			      "Failed to register regulator\n");
+		return PTR_ERR(rdev);
+	}
+
+	pcdev->pi[id].rdev = rdev;
+
+	return 0;
+}
+
 /**
  * pse_controller_register - register a PSE controller device
  * @pcdev: a pointer to the initialized PSE controller device
  */
 int pse_controller_register(struct pse_controller_dev *pcdev)
 {
-	int ret;
+	size_t reg_name_len;
+	int ret, i;
 
 	mutex_init(&pcdev->lock);
 	INIT_LIST_HEAD(&pcdev->pse_control_head);
@@ -229,6 +342,31 @@ int pse_controller_register(struct pse_controller_dev *pcdev)
 			return ret;
 	}
 
+	/* Each regulator name len is pcdev dev name + 7 char +
+	 * int max digit number (10) + 1
+	 */
+	reg_name_len = strlen(dev_name(pcdev->dev)) + 18;
+
+	/* Register PI regulators */
+	for (i = 0; i < pcdev->nr_lines; i++) {
+		char *reg_name;
+
+		/* Do not register regulator for PIs not described */
+		if (!pcdev->no_of_pse_pi && !pcdev->pi[i].np)
+			continue;
+
+		reg_name = devm_kzalloc(pcdev->dev, reg_name_len, GFP_KERNEL);
+		if (!reg_name)
+			return -ENOMEM;
+
+		snprintf(reg_name, reg_name_len, "pse-%s_pi%d",
+			 dev_name(pcdev->dev), i);
+
+		ret = devm_pse_pi_regulator_register(pcdev, reg_name, i);
+		if (ret)
+			return ret;
+	}
+
 	mutex_lock(&pse_list_mutex);
 	list_add(&pcdev->list, &pse_controller_list);
 	mutex_unlock(&pse_list_mutex);
@@ -297,6 +435,10 @@ static void __pse_control_release(struct kref *kref)
 
 	lockdep_assert_held(&pse_list_mutex);
 
+	if (psec->pcdev->pi[psec->id].admin_state_enabled)
+		regulator_disable(psec->ps);
+	devm_regulator_put(psec->ps);
+
 	module_put(psec->pcdev->owner);
 
 	list_del(&psec->list);
@@ -329,6 +471,7 @@ static struct pse_control *
 pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index)
 {
 	struct pse_control *psec;
+	int ret;
 
 	lockdep_assert_held(&pse_list_mutex);
 
@@ -344,16 +487,38 @@ pse_control_get_internal(struct pse_controller_dev *pcdev, unsigned int index)
 		return ERR_PTR(-ENOMEM);
 
 	if (!try_module_get(pcdev->owner)) {
-		kfree(psec);
-		return ERR_PTR(-ENODEV);
+		ret = -ENODEV;
+		goto free_psec;
 	}
 
+	psec->ps = devm_regulator_get_exclusive(pcdev->dev,
+						rdev_get_name(pcdev->pi[index].rdev));
+	if (IS_ERR(psec->ps)) {
+		ret = PTR_ERR(psec->ps);
+		goto put_module;
+	}
+
+	ret = regulator_is_enabled(psec->ps);
+	if (ret < 0)
+		goto regulator_put;
+
+	pcdev->pi[index].admin_state_enabled = ret;
+
 	psec->pcdev = pcdev;
 	list_add(&psec->list, &pcdev->pse_control_head);
 	psec->id = index;
 	kref_init(&psec->refcnt);
 
 	return psec;
+
+regulator_put:
+	devm_regulator_put(psec->ps);
+put_module:
+	module_put(pcdev->owner);
+free_psec:
+	kfree(psec);
+
+	return ERR_PTR(ret);
 }
 
 /**
@@ -486,6 +651,54 @@ int pse_ethtool_get_status(struct pse_control *psec,
 }
 EXPORT_SYMBOL_GPL(pse_ethtool_get_status);
 
+static int pse_ethtool_c33_set_config(struct pse_control *psec,
+				      const struct pse_control_config *config)
+{
+	int err = 0;
+
+	/* Look at admin_state_enabled status to not call regulator_enable
+	 * or regulator_disable twice creating a regulator counter mismatch
+	 */
+	switch (config->c33_admin_control) {
+	case ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED:
+		if (!psec->pcdev->pi[psec->id].admin_state_enabled)
+			err = regulator_enable(psec->ps);
+		break;
+	case ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED:
+		if (psec->pcdev->pi[psec->id].admin_state_enabled)
+			err = regulator_disable(psec->ps);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
+static int pse_ethtool_podl_set_config(struct pse_control *psec,
+				       const struct pse_control_config *config)
+{
+	int err = 0;
+
+	/* Look at admin_state_enabled status to not call regulator_enable
+	 * or regulator_disable twice creating a regulator counter mismatch
+	 */
+	switch (config->podl_admin_control) {
+	case ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED:
+		if (!psec->pcdev->pi[psec->id].admin_state_enabled)
+			err = regulator_enable(psec->ps);
+		break;
+	case ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED:
+		if (psec->pcdev->pi[psec->id].admin_state_enabled)
+			err = regulator_disable(psec->ps);
+		break;
+	default:
+		err = -EOPNOTSUPP;
+	}
+
+	return err;
+}
+
 /**
  * pse_ethtool_set_config - set PSE control configuration
  * @psec: PSE control pointer
@@ -496,20 +709,16 @@ int pse_ethtool_set_config(struct pse_control *psec,
 			   struct netlink_ext_ack *extack,
 			   const struct pse_control_config *config)
 {
-	const struct pse_controller_ops *ops;
-	int err;
-
-	ops = psec->pcdev->ops;
+	int err = 0;
 
-	if (!ops->ethtool_set_config) {
-		NL_SET_ERR_MSG(extack,
-			       "PSE driver does not configuration");
-		return -EOPNOTSUPP;
+	if (pse_has_c33(psec)) {
+		err = pse_ethtool_c33_set_config(psec, config);
+		if (err)
+			return err;
 	}
 
-	mutex_lock(&psec->pcdev->lock);
-	err = ops->ethtool_set_config(psec->pcdev, psec->id, extack, config);
-	mutex_unlock(&psec->pcdev->lock);
+	if (pse_has_podl(psec))
+		err = pse_ethtool_podl_set_config(psec, config);
 
 	return err;
 }
diff --git a/drivers/net/pse-pd/pse_regulator.c b/drivers/net/pse-pd/pse_regulator.c
index 547af384764b..64ab36974fe0 100644
--- a/drivers/net/pse-pd/pse_regulator.c
+++ b/drivers/net/pse-pd/pse_regulator.c
@@ -24,37 +24,41 @@ static struct pse_reg_priv *to_pse_reg(struct pse_controller_dev *pcdev)
 }
 
 static int
-pse_reg_ethtool_set_config(struct pse_controller_dev *pcdev, unsigned long id,
-			   struct netlink_ext_ack *extack,
-			   const struct pse_control_config *config)
+pse_reg_pi_enable(struct pse_controller_dev *pcdev, int id)
 {
 	struct pse_reg_priv *priv = to_pse_reg(pcdev);
 	int ret;
 
-	if (priv->admin_state == config->podl_admin_control)
-		return 0;
-
-	switch (config->podl_admin_control) {
-	case ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED:
-		ret = regulator_enable(priv->ps);
-		break;
-	case ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED:
-		ret = regulator_disable(priv->ps);
-		break;
-	default:
-		dev_err(pcdev->dev, "Unknown admin state %i\n",
-			config->podl_admin_control);
-		ret = -ENOTSUPP;
-	}
-
+	ret = regulator_enable(priv->ps);
 	if (ret)
 		return ret;
 
-	priv->admin_state = config->podl_admin_control;
+	priv->admin_state = ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED;
+	return 0;
+}
+
+static int
+pse_reg_pi_disable(struct pse_controller_dev *pcdev, int id)
+{
+	struct pse_reg_priv *priv = to_pse_reg(pcdev);
+	int ret;
 
+	ret = regulator_disable(priv->ps);
+	if (ret)
+		return ret;
+
+	priv->admin_state = ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED;
 	return 0;
 }
 
+static int
+pse_reg_pi_is_enabled(struct pse_controller_dev *pcdev, int id)
+{
+	struct pse_reg_priv *priv = to_pse_reg(pcdev);
+
+	return regulator_is_enabled(priv->ps);
+}
+
 static int
 pse_reg_ethtool_get_status(struct pse_controller_dev *pcdev, unsigned long id,
 			   struct netlink_ext_ack *extack,
@@ -80,7 +84,9 @@ pse_reg_ethtool_get_status(struct pse_controller_dev *pcdev, unsigned long id,
 
 static const struct pse_controller_ops pse_reg_ops = {
 	.ethtool_get_status = pse_reg_ethtool_get_status,
-	.ethtool_set_config = pse_reg_ethtool_set_config,
+	.pi_enable = pse_reg_pi_enable,
+	.pi_is_enabled = pse_reg_pi_is_enabled,
+	.pi_disable = pse_reg_pi_disable,
 };
 
 static int
diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h
index fa0c73da0cf1..6d07c95dabb9 100644
--- a/include/linux/pse-pd/pse.h
+++ b/include/linux/pse-pd/pse.h
@@ -48,17 +48,20 @@ struct pse_control_status {
  * struct pse_controller_ops - PSE controller driver callbacks
  *
  * @ethtool_get_status: get PSE control status for ethtool interface
- * @ethtool_set_config: set PSE control configuration over ethtool interface
  * @setup_pi_matrix: setup PI matrix of the PSE controller
+ * @pi_is_enabled: Return 1 if the PSE PI is enabled, 0 if not.
+ *		   May also return negative errno.
+ * @pi_enable: Configure the PSE PI as enabled.
+ * @pi_disable: Configure the PSE PI as disabled.
  */
 struct pse_controller_ops {
 	int (*ethtool_get_status)(struct pse_controller_dev *pcdev,
 		unsigned long id, struct netlink_ext_ack *extack,
 		struct pse_control_status *status);
-	int (*ethtool_set_config)(struct pse_controller_dev *pcdev,
-		unsigned long id, struct netlink_ext_ack *extack,
-		const struct pse_control_config *config);
 	int (*setup_pi_matrix)(struct pse_controller_dev *pcdev);
+	int (*pi_is_enabled)(struct pse_controller_dev *pcdev, int id);
+	int (*pi_enable)(struct pse_controller_dev *pcdev, int id);
+	int (*pi_disable)(struct pse_controller_dev *pcdev, int id);
 };
 
 struct module;
@@ -90,10 +93,14 @@ struct pse_pi_pairset {
  *
  * @pairset: table of the PSE PI pinout alternative for the two pairset
  * @np: device node pointer of the PSE PI node
+ * @rdev: regulator represented by the PSE PI
+ * @admin_state_enabled: PI enabled state
  */
 struct pse_pi {
 	struct pse_pi_pairset pairset[2];
 	struct device_node *np;
+	struct regulator_dev *rdev;
+	bool admin_state_enabled;
 };
 
 /**
-- 
cgit v1.2.3


From c936e287df26929ad6b5fd7fef22b356e434e90e Mon Sep 17 00:00:00 2001
From: Mika Westerberg <mika.westerberg@linux.intel.com>
Date: Mon, 12 Feb 2024 10:48:34 +0200
Subject: thunderbolt: Get rid of TB_CFG_PKG_PREPARE_TO_SLEEP

This is not used anywhere in the driver so remove it. No functional
changes.

Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
---
 drivers/thunderbolt/tb_msgs.h | 6 ------
 include/linux/thunderbolt.h   | 1 -
 2 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/drivers/thunderbolt/tb_msgs.h b/drivers/thunderbolt/tb_msgs.h
index cd750e4b3440..a1670a96cbdc 100644
--- a/drivers/thunderbolt/tb_msgs.h
+++ b/drivers/thunderbolt/tb_msgs.h
@@ -98,12 +98,6 @@ struct cfg_reset_pkg {
 	struct tb_cfg_header header;
 } __packed;
 
-/* TB_CFG_PKG_PREPARE_TO_SLEEP */
-struct cfg_pts_pkg {
-	struct tb_cfg_header header;
-	u32 data;
-} __packed;
-
 /* ICM messages */
 
 enum icm_pkg_code {
diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h
index 4338ea9ac4fd..7d902d8c054b 100644
--- a/include/linux/thunderbolt.h
+++ b/include/linux/thunderbolt.h
@@ -33,7 +33,6 @@ enum tb_cfg_pkg_type {
 	TB_CFG_PKG_ICM_EVENT = 10,
 	TB_CFG_PKG_ICM_CMD = 11,
 	TB_CFG_PKG_ICM_RESP = 12,
-	TB_CFG_PKG_PREPARE_TO_SLEEP = 13,
 };
 
 /**
-- 
cgit v1.2.3


From 2887af4d22f90bcddd3ff9a6eb93ecdaab3acd94 Mon Sep 17 00:00:00 2001
From: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Date: Tue, 16 Apr 2024 13:53:58 +0300
Subject: wifi: iwlwifi: mvm: implement link grading

For selecting what link(s) out of the usable ones
to activate, calculate a grade for a given link.

Calculation of a link grade is done as follows:
1. get the estimated throughput according to the RSSI of the link, this
   will be the base grade
2. get the channel load from the BSS Load Element, subtracting the load
   caused by us. Apply the factor on the grade.
3. puncturing factor: calculate the percentage of the punctured
   subchannels (out of the total subchannels). Apply this on the grade.

The link grading will be used by the link selection mechanism in a later
patch.

Also add KUnit tests for it.

Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
Reviewed-by: Johannes Berg <johannes.berg@intel.com>
Link: https://msgid.link/20240416134215.a6799dbd5643.If137ca6dc443606c7d8c99ec1fc38b325003a7c1@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/intel/iwlwifi/mvm/Makefile    |   3 +-
 drivers/net/wireless/intel/iwlwifi/mvm/link.c      | 172 +++++++++++++++++
 drivers/net/wireless/intel/iwlwifi/mvm/mvm.h       |   1 +
 .../net/wireless/intel/iwlwifi/mvm/tests/Makefile  |   3 +
 .../net/wireless/intel/iwlwifi/mvm/tests/links.c   | 210 +++++++++++++++++++++
 .../net/wireless/intel/iwlwifi/mvm/tests/module.c  |  10 +
 include/linux/ieee80211.h                          |  18 ++
 7 files changed, 416 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/wireless/intel/iwlwifi/mvm/tests/Makefile
 create mode 100644 drivers/net/wireless/intel/iwlwifi/mvm/tests/links.c
 create mode 100644 drivers/net/wireless/intel/iwlwifi/mvm/tests/module.c

(limited to 'include')

diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/Makefile b/drivers/net/wireless/intel/iwlwifi/mvm/Makefile
index 593fe28d89cf..5c754b87ea20 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/Makefile
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_IWLMVM)   += iwlmvm.o
+obj-$(CONFIG_IWLWIFI_KUNIT_TESTS) += tests/
 iwlmvm-y += fw.o mac80211.o nvm.o ops.o phy-ctxt.o mac-ctxt.o
 iwlmvm-y += utils.o rx.o rxmq.o tx.o binding.o quota.o sta.o sf.o
 iwlmvm-y += scan.o time-event.o rs.o rs-fw.o
@@ -15,4 +16,4 @@ iwlmvm-$(CONFIG_IWLWIFI_LEDS) += led.o
 iwlmvm-$(CONFIG_PM) += d3.o
 iwlmvm-$(CONFIG_IWLMEI) += vendor-cmd.o
 
-ccflags-y += -I $(srctree)/$(src)/../
+subdir-ccflags-y += -I $(srctree)/$(src)/../
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/link.c b/drivers/net/wireless/intel/iwlwifi/mvm/link.c
index 9f69e04594e4..23660238348b 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/link.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/link.c
@@ -329,3 +329,175 @@ int iwl_mvm_disable_link(struct iwl_mvm *mvm, struct ieee80211_vif *vif,
 
 	return ret;
 }
+
+struct iwl_mvm_rssi_to_grade {
+	s8 rssi[2];
+	u16 grade;
+};
+
+#define RSSI_TO_GRADE_LINE(_lb, _hb_uhb, _grade) \
+	{ \
+		.rssi = {_lb, _hb_uhb}, \
+		.grade = _grade \
+	}
+
+/*
+ * This array must be sorted by increasing RSSI for proper functionality.
+ * The grades are actually estimated throughput, represented as fixed-point
+ * with a scale factor of 1/10.
+ */
+static const struct iwl_mvm_rssi_to_grade rssi_to_grade_map[] = {
+	RSSI_TO_GRADE_LINE(-85, -89, 177),
+	RSSI_TO_GRADE_LINE(-83, -86, 344),
+	RSSI_TO_GRADE_LINE(-82, -85, 516),
+	RSSI_TO_GRADE_LINE(-80, -83, 688),
+	RSSI_TO_GRADE_LINE(-77, -79, 1032),
+	RSSI_TO_GRADE_LINE(-73, -76, 1376),
+	RSSI_TO_GRADE_LINE(-70, -74, 1548),
+	RSSI_TO_GRADE_LINE(-69, -72, 1750),
+	RSSI_TO_GRADE_LINE(-65, -68, 2064),
+	RSSI_TO_GRADE_LINE(-61, -66, 2294),
+	RSSI_TO_GRADE_LINE(-58, -61, 2580),
+	RSSI_TO_GRADE_LINE(-55, -58, 2868),
+	RSSI_TO_GRADE_LINE(-46, -55, 3098),
+	RSSI_TO_GRADE_LINE(-43, -54, 3442)
+};
+
+#define MAX_GRADE (rssi_to_grade_map[ARRAY_SIZE(rssi_to_grade_map) - 1].grade)
+
+#define DEFAULT_CHAN_LOAD_LB	30
+#define DEFAULT_CHAN_LOAD_HB	15
+#define DEFAULT_CHAN_LOAD_UHB	0
+
+/* Factors calculation is done with fixed-point with a scaling factor of 1/256 */
+#define SCALE_FACTOR 256
+
+/* Convert a percentage from [0,100] to [0,255] */
+#define NORMALIZE_PERCENT_TO_255(percentage) ((percentage) * SCALE_FACTOR / 100)
+
+static unsigned int
+iwl_mvm_get_puncturing_factor(const struct ieee80211_bss_conf *link_conf)
+{
+	enum nl80211_chan_width chan_width =
+		link_conf->chanreq.oper.width;
+	int mhz = nl80211_chan_width_to_mhz(chan_width);
+	unsigned int n_subchannels, n_punctured, puncturing_penalty;
+
+	if (WARN_ONCE(mhz < 20 || mhz > 320,
+		      "Invalid channel width : (%d)\n", mhz))
+		return SCALE_FACTOR;
+
+	/* No puncturing, no penalty */
+	if (mhz < 80)
+		return SCALE_FACTOR;
+
+	/* total number of subchannels */
+	n_subchannels = mhz / 20;
+	/* how many of these are punctured */
+	n_punctured = hweight16(link_conf->chanreq.oper.punctured);
+
+	puncturing_penalty = n_punctured * SCALE_FACTOR / n_subchannels;
+	return SCALE_FACTOR - puncturing_penalty;
+}
+
+static unsigned int
+iwl_mvm_get_chan_load_factor(struct ieee80211_bss_conf *link_conf)
+{
+	struct iwl_mvm_vif_link_info *mvm_link =
+		iwl_mvm_vif_from_mac80211(link_conf->vif)->link[link_conf->link_id];
+	const struct element *bss_load_elem =
+		ieee80211_bss_get_elem(link_conf->bss, WLAN_EID_QBSS_LOAD);
+	const struct ieee80211_bss_load_elem *bss_load;
+	enum nl80211_band band = link_conf->chanreq.oper.chan->band;
+	unsigned int chan_load;
+	u32 chan_load_by_us;
+
+	/* If there isn't BSS Load element, take the defaults */
+	if (!bss_load_elem ||
+	    bss_load_elem->datalen != sizeof(*bss_load)) {
+		switch (band) {
+		case NL80211_BAND_2GHZ:
+			chan_load = DEFAULT_CHAN_LOAD_LB;
+			break;
+		case NL80211_BAND_5GHZ:
+			chan_load = DEFAULT_CHAN_LOAD_HB;
+			break;
+		case NL80211_BAND_6GHZ:
+			chan_load = DEFAULT_CHAN_LOAD_UHB;
+			break;
+		default:
+			chan_load = 0;
+			break;
+		}
+		/* The defaults are given in percentage */
+		return SCALE_FACTOR - NORMALIZE_PERCENT_TO_255(chan_load);
+	}
+
+	bss_load = (const void *)bss_load_elem->data;
+	/* Channel util is in range 0-255 */
+	chan_load = bss_load->channel_util;
+
+	if (!mvm_link || !mvm_link->active)
+		goto done;
+
+	if (WARN_ONCE(!mvm_link->phy_ctxt,
+		      "Active link (%u) without phy ctxt assigned!\n",
+		      link_conf->link_id))
+		goto done;
+
+	/* channel load by us is given in percentage */
+	chan_load_by_us =
+		NORMALIZE_PERCENT_TO_255(mvm_link->phy_ctxt->channel_load_by_us);
+
+	/* Use only values that firmware sends that can possibly be valid */
+	if (chan_load_by_us <= chan_load)
+		chan_load -= chan_load_by_us;
+done:
+	return  SCALE_FACTOR - chan_load;
+}
+
+/* This function calculates the grade of a link. Returns 0 in error case */
+unsigned int iwl_mvm_get_link_grade(struct ieee80211_bss_conf *link_conf)
+{
+	enum nl80211_band band;
+	int i, rssi_idx;
+	s32 link_rssi;
+	unsigned int grade = MAX_GRADE;
+
+	if (WARN_ON_ONCE(!link_conf))
+		return 0;
+
+	band = link_conf->chanreq.oper.chan->band;
+	if (WARN_ONCE(band != NL80211_BAND_2GHZ &&
+		      band != NL80211_BAND_5GHZ &&
+		      band != NL80211_BAND_6GHZ,
+		      "Invalid band (%u)\n", band))
+		return 0;
+
+	link_rssi = MBM_TO_DBM(link_conf->bss->signal);
+	/*
+	 * For 6 GHz the RSSI of the beacons is lower than
+	 * the RSSI of the data.
+	 */
+	if (band == NL80211_BAND_6GHZ)
+		link_rssi += 4;
+
+	rssi_idx = band == NL80211_BAND_2GHZ ? 0 : 1;
+
+	/* Get grade based on RSSI */
+	for (i = 0; i < ARRAY_SIZE(rssi_to_grade_map); i++) {
+		const struct iwl_mvm_rssi_to_grade *line =
+			&rssi_to_grade_map[i];
+
+		if (link_rssi > line->rssi[rssi_idx])
+			continue;
+		grade = line->grade;
+		break;
+	}
+
+	/* apply the channel load and puncturing factors */
+	grade = grade * iwl_mvm_get_chan_load_factor(link_conf) / SCALE_FACTOR;
+	grade = grade * iwl_mvm_get_puncturing_factor(link_conf) / SCALE_FACTOR;
+	return grade;
+}
+EXPORT_SYMBOL_IF_IWLWIFI_KUNIT(iwl_mvm_get_link_grade);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
index 0dd83e0171ba..c477978d8fa3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h
@@ -2449,6 +2449,7 @@ u32 iwl_mvm_get_sec_flags(struct iwl_mvm *mvm,
 			  struct ieee80211_vif *vif,
 			  struct ieee80211_sta *sta,
 			  struct ieee80211_key_conf *keyconf);
+unsigned int iwl_mvm_get_link_grade(struct ieee80211_bss_conf *link_conf);
 
 bool iwl_rfi_supported(struct iwl_mvm *mvm);
 int iwl_rfi_send_config_cmd(struct iwl_mvm *mvm,
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tests/Makefile b/drivers/net/wireless/intel/iwlwifi/mvm/tests/Makefile
new file mode 100644
index 000000000000..b13aebbf7d5e
--- /dev/null
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tests/Makefile
@@ -0,0 +1,3 @@
+iwlmvm-tests-y += module.o links.o
+
+obj-$(CONFIG_IWLWIFI_KUNIT_TESTS) += iwlmvm-tests.o
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tests/links.c b/drivers/net/wireless/intel/iwlwifi/mvm/tests/links.c
new file mode 100644
index 000000000000..321d18de1ca3
--- /dev/null
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tests/links.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for channel helper functions
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+#include <net/mac80211.h>
+#include "../mvm.h"
+#include <kunit/test.h>
+
+MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);
+
+static struct ieee80211_channel chan_5ghz = {
+	.band = NL80211_BAND_5GHZ,
+};
+
+static struct ieee80211_channel chan_6ghz = {
+	.band = NL80211_BAND_6GHZ,
+};
+
+static struct ieee80211_channel chan_2ghz = {
+	.band = NL80211_BAND_2GHZ,
+};
+
+static struct iwl_mvm_phy_ctxt ctx = {};
+
+static struct iwl_mvm_vif_link_info mvm_link = {
+	.phy_ctxt = &ctx,
+	.active = true
+};
+
+static struct cfg80211_bss bss = {};
+
+static struct ieee80211_bss_conf link_conf = {.bss = &bss};
+
+static const struct link_grading_case {
+	const char *desc;
+	const struct cfg80211_chan_def chandef;
+	s32 signal;
+	s16 channel_util;
+	int chan_load_by_us;
+	unsigned int grade;
+} link_grading_cases[] = {
+	{
+		.desc = "UHB, RSSI below range, no factors",
+		.chandef = {
+			.chan = &chan_6ghz,
+			.width = NL80211_CHAN_WIDTH_20,
+		},
+		.signal = -100,
+		.grade = 177,
+	},
+	{
+		.desc = "LB, RSSI in range, no factors",
+		.chandef = {
+			.chan = &chan_2ghz,
+			.width = NL80211_CHAN_WIDTH_20,
+		},
+		.signal = -84,
+		.grade = 344,
+	},
+	{
+		.desc = "HB, RSSI above range, no factors",
+		.chandef = {
+			.chan = &chan_5ghz,
+			.width = NL80211_CHAN_WIDTH_20,
+		},
+		.signal = -50,
+		.grade = 3442,
+	},
+	{
+		.desc = "HB, BSS Load IE (20 percent), inactive link, no puncturing factor",
+		.chandef = {
+			.chan = &chan_5ghz,
+			.width = NL80211_CHAN_WIDTH_20,
+		},
+		.signal = -66,
+		.channel_util = 51,
+		.grade = 1836,
+	},
+	{
+		.desc = "LB, BSS Load IE (20 percent), active link, chan_load_by_us=10 percent. No puncturing factor",
+		.chandef = {
+			.chan = &chan_2ghz,
+			.width = NL80211_CHAN_WIDTH_20,
+		},
+		.signal = -61,
+		.channel_util = 51,
+		.chan_load_by_us = 10,
+		.grade = 2061,
+	},
+	{
+		.desc = "UHB, BSS Load IE (40 percent), active link, chan_load_by_us=50 (invalid) percent. No puncturing factor",
+		.chandef = {
+			.chan = &chan_6ghz,
+			.width = NL80211_CHAN_WIDTH_20,
+		},
+		.signal = -66,
+		.channel_util = 102,
+		.chan_load_by_us = 50,
+		.grade = 1552,
+	},
+	{	.desc = "HB, 80 MHz, no channel load factor, punctured percentage 0",
+		.chandef = {
+			.chan = &chan_5ghz,
+			.width = NL80211_CHAN_WIDTH_80,
+			.punctured = 0x0000
+		},
+		.signal = -72,
+		.grade = 1750,
+	},
+	{	.desc = "HB, 160 MHz, no channel load factor, punctured percentage 25",
+		.chandef = {
+			.chan = &chan_5ghz,
+			.width = NL80211_CHAN_WIDTH_160,
+			.punctured = 0x3
+		},
+		.signal = -72,
+		.grade = 1312,
+	},
+	{	.desc = "UHB, 320 MHz, no channel load factor, punctured percentage 12.5 (2/16)",
+		.chandef = {
+			.chan = &chan_6ghz,
+			.width = NL80211_CHAN_WIDTH_320,
+			.punctured = 0x3
+		},
+		.signal = -72,
+		.grade = 1806,
+	},
+	{	.desc = "HB, 160 MHz, channel load 20, channel load by us 10, punctured percentage 25",
+		.chandef = {
+			.chan = &chan_5ghz,
+			.width = NL80211_CHAN_WIDTH_160,
+			.punctured = 0x3
+		},
+		.channel_util = 51,
+		.chan_load_by_us = 10,
+		.signal = -72,
+		.grade = 1179,
+	},
+};
+
+KUNIT_ARRAY_PARAM_DESC(link_grading, link_grading_cases, desc)
+
+static void setup_link_conf(struct kunit *test)
+{
+	const struct link_grading_case *params = test->param_value;
+	size_t vif_size = sizeof(struct ieee80211_vif) +
+		sizeof(struct iwl_mvm_vif);
+	struct ieee80211_vif *vif = kunit_kzalloc(test, vif_size, GFP_KERNEL);
+	struct ieee80211_bss_load_elem *bss_load;
+	struct element *element;
+	size_t ies_size = sizeof(struct cfg80211_bss_ies) + sizeof(*bss_load) + sizeof(element);
+	struct cfg80211_bss_ies *ies;
+	struct iwl_mvm_vif *mvmvif;
+
+	KUNIT_ASSERT_NOT_NULL(test, vif);
+
+	mvmvif = iwl_mvm_vif_from_mac80211(vif);
+	if (params->chan_load_by_us > 0) {
+		ctx.channel_load_by_us = params->chan_load_by_us;
+		mvmvif->link[0] = &mvm_link;
+	}
+
+	link_conf.vif = vif;
+	link_conf.chanreq.oper = params->chandef;
+	bss.signal = DBM_TO_MBM(params->signal);
+
+	ies = kunit_kzalloc(test, ies_size, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_NULL(test, ies);
+	ies->len = sizeof(*bss_load) + sizeof(struct element);
+
+	element = (void *)ies->data;
+	element->datalen = sizeof(*bss_load);
+	element->id = 11;
+
+	bss_load = (void *)element->data;
+	bss_load->channel_util = params->channel_util;
+
+	rcu_assign_pointer(bss.ies, ies);
+}
+
+static void test_link_grading(struct kunit *test)
+{
+	const struct link_grading_case *params = test->param_value;
+	unsigned int ret;
+
+	setup_link_conf(test);
+
+	rcu_read_lock();
+	ret = iwl_mvm_get_link_grade(&link_conf);
+	rcu_read_unlock();
+
+	KUNIT_EXPECT_EQ(test, ret, params->grade);
+
+	kunit_kfree(test, link_conf.vif);
+	RCU_INIT_POINTER(bss.ies, NULL);
+}
+
+static struct kunit_case link_grading_test_cases[] = {
+	KUNIT_CASE_PARAM(test_link_grading, link_grading_gen_params),
+	{}
+};
+
+static struct kunit_suite link_grading = {
+	.name = "iwlmvm-link-grading",
+	.test_cases = link_grading_test_cases,
+};
+
+kunit_test_suite(link_grading);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tests/module.c b/drivers/net/wireless/intel/iwlwifi/mvm/tests/module.c
new file mode 100644
index 000000000000..f556acbac77f
--- /dev/null
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tests/module.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This is just module boilerplate for the iwlmvm kunit module.
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+#include <linux/module.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("kunit tests for iwlmvm");
diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h
index 7f9b829dcb1e..de2dce743ee2 100644
--- a/include/linux/ieee80211.h
+++ b/include/linux/ieee80211.h
@@ -1287,6 +1287,24 @@ struct ieee80211_ttlm_elem {
 	u8 optional[];
 } __packed;
 
+/**
+ * struct ieee80211_bss_load_elem - BSS Load elemen
+ *
+ * Defined in section 9.4.2.26 in IEEE 802.11-REVme D4.1
+ *
+ * @sta_count: total number of STAs currently associated with the AP.
+ * @channel_util: Percentage of time that the access point sensed the channel
+ *	was busy. This value is in range [0, 255], the highest value means
+ *	100% busy.
+ * @avail_admission_capa: remaining amount of medium time used for admission
+ *	control.
+ */
+struct ieee80211_bss_load_elem {
+	__le16 sta_count;
+	u8 channel_util;
+	__le16 avail_admission_capa;
+} __packed;
+
 struct ieee80211_mgmt {
 	__le16 frame_control;
 	__le16 duration;
-- 
cgit v1.2.3


From 62cc206cb18b6e4fbb78d027705cba95a555ec70 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Wed, 17 Apr 2024 17:27:42 -0700
Subject: wifi: cfg80211: fix cfg80211 function kernel-doc

Running "scripts/kernel-doc -Wall -Werror -none include/net/cfg80211.h"
produces many warnings of the form "warning: No description found for
return value of <function>", so make sure all of them have a Return:
clause. In some instances also add a Context: clause.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Link: https://msgid.link/20240417-cfg80211-kdoc-v1-1-d54cb7143417@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 111 ++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 82 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 1e09329acc42..0112d645a2b4 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1149,6 +1149,8 @@ ieee80211_chandef_max_power(struct cfg80211_chan_def *chandef)
  * @band_mask: which bands to check on
  * @prohibited_flags: which channels to not consider usable,
  *	%IEEE80211_CHAN_DISABLED is always taken into account
+ *
+ * Return: %true if usable channels found, %false otherwise
  */
 bool cfg80211_any_usable_channels(struct wiphy *wiphy,
 				  unsigned long band_mask,
@@ -1833,9 +1835,11 @@ enum cfg80211_station_type {
  *
  * Utility function for the @change_station driver method. Call this function
  * with the appropriate station type looking up the station (and checking that
- * it exists). It will verify whether the station change is acceptable, and if
- * not will return an error code. Note that it may modify the parameters for
- * backward compatibility reasons, so don't use them before calling this.
+ * it exists). It will verify whether the station change is acceptable.
+ *
+ * Return: 0 if the change is acceptable, otherwise an error code. Note that
+ * it may modify the parameters for backward compatibility reasons, so don't
+ * use them before calling this.
  */
 int cfg80211_check_station_change(struct wiphy *wiphy,
 				  struct station_parameters *params,
@@ -2229,7 +2233,7 @@ struct cfg80211_sar_capa {
  * @mac_addr: the mac address of the station of interest
  * @sinfo: pointer to the structure to fill with the information
  *
- * Returns 0 on success and sinfo is filled with the available information
+ * Return: 0 on success and sinfo is filled with the available information
  * otherwise returns a negative error code and the content of sinfo has to be
  * considered undefined.
  */
@@ -5334,6 +5338,8 @@ struct wiphy_iftype_ext_capab {
  * cfg80211_get_iftype_ext_capa - lookup interface type extended capability
  * @wiphy: the wiphy to look up from
  * @type: the interface type to look up
+ *
+ * Return: The extended capability for the given interface @type, may be %NULL
  */
 const struct wiphy_iftype_ext_capab *
 cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type);
@@ -5904,6 +5910,10 @@ int wiphy_register(struct wiphy *wiphy);
 /**
  * get_wiphy_regdom - get custom regdomain for the given wiphy
  * @wiphy: the wiphy to get the regdomain from
+ *
+ * Context: Requires any of RTNL, wiphy mutex or RCU protection.
+ *
+ * Return: pointer to the regulatory domain associated with the wiphy
  */
 const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy);
 
@@ -6421,6 +6431,8 @@ ieee80211_get_channel(struct wiphy *wiphy, int freq)
  *
  * The Preferred Scanning Channels (PSC) are defined in
  * Draft IEEE P802.11ax/D5.0, 26.17.2.3.3
+ *
+ * Return: %true if channel is a PSC, %false otherwise
  */
 static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan)
 {
@@ -6450,8 +6462,8 @@ ieee80211_get_response_rate(struct ieee80211_supported_band *sband,
  * ieee80211_mandatory_rates - get mandatory rates for a given band
  * @sband: the band to look for rates in
  *
- * This function returns a bitmap of the mandatory rates for the given
- * band, bits are set according to the rate position in the bitrates array.
+ * Return: a bitmap of the mandatory rates for the given band, bits
+ * are set according to the rate position in the bitrates array.
  */
 u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband);
 
@@ -6665,6 +6677,8 @@ bool ieee80211_get_8023_tunnel_proto(const void *hdr, __be16 *proto);
  * header to the ethernet header (if present).
  *
  * @skb: The 802.3 frame with embedded mesh header
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 int ieee80211_strip_8023_mesh_hdr(struct sk_buff *skb);
 
@@ -7043,6 +7057,8 @@ const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy,
  *
  * You can use this to map the regulatory request initiator enum to a
  * proper string representation.
+ *
+ * Return: pointer to string representation of the initiator
  */
 const char *reg_initiator_name(enum nl80211_reg_initiator initiator);
 
@@ -7051,6 +7067,8 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator);
  * @wiphy: wiphy for which pre-CAC capability is checked.
  *
  * Pre-CAC is allowed only in some regdomains (notable ETSI).
+ *
+ * Return: %true if allowed, %false otherwise
  */
 bool regulatory_pre_cac_allowed(struct wiphy *wiphy);
 
@@ -7185,6 +7203,8 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid,
  * cfg80211_is_element_inherited - returns if element ID should be inherited
  * @element: element to check
  * @non_inherit_element: non inheritance element
+ *
+ * Return: %true if should be inherited, %false otherwise
  */
 bool cfg80211_is_element_inherited(const struct element *element,
 				   const struct element *non_inherit_element);
@@ -7197,6 +7217,8 @@ bool cfg80211_is_element_inherited(const struct element *element,
  * @sub_elem: current MBSSID subelement (profile)
  * @merged_ie: location of the merged profile
  * @max_copy_len: max merged profile length
+ *
+ * Return: the number of bytes merged
  */
 size_t cfg80211_merge_profile(const u8 *ie, size_t ielen,
 			      const struct element *mbssid_elem,
@@ -7224,7 +7246,7 @@ enum cfg80211_bss_frame_type {
  * @ielen: length of IEs
  * @band: enum nl80211_band of the channel
  *
- * Returns the channel number, or -1 if none could be determined.
+ * Return: the channel number, or -1 if none could be determined.
  */
 int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen,
 				    enum nl80211_band band);
@@ -7302,6 +7324,8 @@ cfg80211_inform_bss(struct wiphy *wiphy,
  * @bss_type: type of BSS, see &enum ieee80211_bss_type
  * @privacy: privacy filter, see &enum ieee80211_privacy
  * @use_for: indicates which use is intended
+ *
+ * Return: Reference-counted BSS on success. %NULL on error.
  */
 struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy,
 					struct ieee80211_channel *channel,
@@ -7322,6 +7346,8 @@ struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy,
  * @privacy: privacy filter, see &enum ieee80211_privacy
  *
  * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL.
+ *
+ * Return: Reference-counted BSS on success. %NULL on error.
  */
 static inline struct cfg80211_bss *
 cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel,
@@ -7706,8 +7732,9 @@ int cfg80211_vendor_cmd_reply(struct sk_buff *skb);
  * cfg80211_vendor_cmd_get_sender - get the current sender netlink ID
  * @wiphy: the wiphy
  *
- * Return the current netlink port ID in a vendor command handler.
- * Valid to call only there.
+ * Return: the current netlink port ID in a vendor command handler.
+ *
+ * Context: May only be called from a vendor command handler
  */
 unsigned int cfg80211_vendor_cmd_get_sender(struct wiphy *wiphy);
 
@@ -8260,6 +8287,8 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie,
  *
  * @sinfo: the station information
  * @gfp: allocation flags
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp);
 
@@ -8777,13 +8806,13 @@ bool cfg80211_reg_can_beacon(struct wiphy *wiphy,
  * also checks if IR-relaxation conditions apply, to allow beaconing under
  * more permissive conditions.
  *
- * Requires the wiphy mutex to be held.
+ * Context: Requires the wiphy mutex to be held.
  */
 bool cfg80211_reg_can_beacon_relax(struct wiphy *wiphy,
 				   struct cfg80211_chan_def *chandef,
 				   enum nl80211_iftype iftype);
 
-/*
+/**
  * cfg80211_ch_switch_notify - update wdev channel and notify userspace
  * @dev: the device which switched channels
  * @chandef: the new channel definition
@@ -8796,7 +8825,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev,
 			       struct cfg80211_chan_def *chandef,
 			       unsigned int link_id);
 
-/*
+/**
  * cfg80211_ch_switch_started_notify - notify channel switch start
  * @dev: the device on which the channel switch started
  * @chandef: the future channel definition
@@ -8819,7 +8848,7 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev,
  * @operating_class: the operating class to convert
  * @band: band pointer to fill
  *
- * Returns %true if the conversion was successful, %false otherwise.
+ * Return: %true if the conversion was successful, %false otherwise.
  */
 bool ieee80211_operating_class_to_band(u8 operating_class,
 				       enum nl80211_band *band);
@@ -8831,7 +8860,7 @@ bool ieee80211_operating_class_to_band(u8 operating_class,
  * @chan: the ieee80211_channel to convert
  * @chandef: a pointer to the resulting chandef
  *
- * Returns %true if the conversion was successful, %false otherwise.
+ * Return: %true if the conversion was successful, %false otherwise.
  */
 bool ieee80211_operating_class_to_chandef(u8 operating_class,
 					  struct ieee80211_channel *chan,
@@ -8843,7 +8872,7 @@ bool ieee80211_operating_class_to_chandef(u8 operating_class,
  * @chandef: the chandef to convert
  * @op_class: a pointer to the resulting operating class
  *
- * Returns %true if the conversion was successful, %false otherwise.
+ * Return: %true if the conversion was successful, %false otherwise.
  */
 bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
 					  u8 *op_class);
@@ -8853,7 +8882,7 @@ bool ieee80211_chandef_to_operating_class(struct cfg80211_chan_def *chandef,
  *
  * @chandef: the chandef to convert
  *
- * Returns the center frequency of chandef (1st segment) in KHz.
+ * Return: the center frequency of chandef (1st segment) in KHz.
  */
 static inline u32
 ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef)
@@ -8861,7 +8890,7 @@ ieee80211_chandef_to_khz(const struct cfg80211_chan_def *chandef)
 	return MHZ_TO_KHZ(chandef->center_freq1) + chandef->freq1_offset;
 }
 
-/*
+/**
  * cfg80211_tdls_oper_request - request userspace to perform TDLS operation
  * @dev: the device on which the operation is requested
  * @peer: the MAC address of the peer device
@@ -8880,11 +8909,11 @@ void cfg80211_tdls_oper_request(struct net_device *dev, const u8 *peer,
 				enum nl80211_tdls_operation oper,
 				u16 reason_code, gfp_t gfp);
 
-/*
+/**
  * cfg80211_calculate_bitrate - calculate actual bitrate (in 100Kbps units)
  * @rate: given rate_info to calculate bitrate from
  *
- * return 0 if MCS index >= 32
+ * Return: calculated bitrate
  */
 u32 cfg80211_calculate_bitrate(struct rate_info *rate);
 
@@ -8898,7 +8927,7 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate);
  * when the driver wishes to unregister the wdev, e.g. when the hardware device
  * is unbound from the driver.
  *
- * Requires the RTNL and wiphy mutex to be held.
+ * Context: Requires the RTNL and wiphy mutex to be held.
  */
 void cfg80211_unregister_wdev(struct wireless_dev *wdev);
 
@@ -8911,7 +8940,9 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev);
  * held. Otherwise, both register_netdevice() and register_netdev() are usable
  * instead as well.
  *
- * Requires the RTNL and wiphy mutex to be held.
+ * Context: Requires the RTNL and wiphy mutex to be held.
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 int cfg80211_register_netdevice(struct net_device *dev);
 
@@ -8924,7 +8955,7 @@ int cfg80211_register_netdevice(struct net_device *dev);
  * is held. Otherwise, both unregister_netdevice() and unregister_netdev() are
  * usable instead as well.
  *
- * Requires the RTNL and wiphy mutex to be held.
+ * Context: Requires the RTNL and wiphy mutex to be held.
  */
 static inline void cfg80211_unregister_netdevice(struct net_device *dev)
 {
@@ -9000,9 +9031,9 @@ int cfg80211_get_p2p_attr(const u8 *ies, unsigned int len,
  * correctly, if not the result of using this function will not
  * be ordered correctly either, i.e. it does no reordering.
  *
- * The function returns the offset where the next part of the
- * buffer starts, which may be @ielen if the entire (remainder)
- * of the buffer should be used.
+ * Return: The offset where the next part of the buffer starts, which
+ * may be @ielen if the entire (remainder) of the buffer should be
+ * used.
  */
 size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
 			      const u8 *ids, int n_ids,
@@ -9030,9 +9061,9 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
  * correctly, if not the result of using this function will not
  * be ordered correctly either, i.e. it does no reordering.
  *
- * The function returns the offset where the next part of the
- * buffer starts, which may be @ielen if the entire (remainder)
- * of the buffer should be used.
+ * Return: The offset where the next part of the buffer starts, which
+ * may be @ielen if the entire (remainder) of the buffer should be
+ * used.
  */
 static inline size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
 					const u8 *ids, int n_ids, size_t offset)
@@ -9096,6 +9127,8 @@ unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy);
  * This function can be called by the driver to check whether a
  * combination of interfaces and their types are allowed according to
  * the interface combinations.
+ *
+ * Return: 0 if combinations are allowed. Non-zero on error.
  */
 int cfg80211_check_combinations(struct wiphy *wiphy,
 				struct iface_combination_params *params);
@@ -9111,6 +9144,8 @@ int cfg80211_check_combinations(struct wiphy *wiphy,
  * This function can be called by the driver to check what possible
  * combinations it fits in at a given moment, e.g. for channel switching
  * purposes.
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 int cfg80211_iter_combinations(struct wiphy *wiphy,
 			       struct iface_combination_params *params,
@@ -9118,7 +9153,7 @@ int cfg80211_iter_combinations(struct wiphy *wiphy,
 					    void *data),
 			       void *data);
 
-/*
+/**
  * cfg80211_stop_iface - trigger interface disconnection
  *
  * @wiphy: the wiphy
@@ -9173,6 +9208,8 @@ static inline void wiphy_ext_feature_set(struct wiphy *wiphy,
  *
  * The extended features are flagged in multiple bytes (see
  * &struct wiphy.@ext_features)
+ *
+ * Return: %true if extended feature flag is set, %false otherwise
  */
 static inline bool
 wiphy_ext_feature_isset(struct wiphy *wiphy,
@@ -9294,6 +9331,8 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev,
  * Check whether the interface is allowed to operate; additionally, this API
  * can be used to check iftype against the software interfaces when
  * check_swif is '1'.
+ *
+ * Return: %true if allowed, %false otherwise
  */
 bool cfg80211_iftype_allowed(struct wiphy *wiphy, enum nl80211_iftype iftype,
 			     bool is_4addr, u8 check_swif);
@@ -9386,6 +9425,8 @@ void cfg80211_bss_flush(struct wiphy *wiphy);
  * @cmd: the actual event we want to notify
  * @count: the number of TBTTs until the color change happens
  * @color_bitmap: representations of the colors that the local BSS is aware of
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 int cfg80211_bss_color_notify(struct net_device *dev,
 			      enum nl80211_commands cmd, u8 count,
@@ -9395,6 +9436,8 @@ int cfg80211_bss_color_notify(struct net_device *dev,
  * cfg80211_obss_color_collision_notify - notify about bss color collision
  * @dev: network device
  * @color_bitmap: representations of the colors that the local BSS is aware of
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
 						       u64 color_bitmap)
@@ -9409,6 +9452,8 @@ static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
  * @count: the number of TBTTs until the color change happens
  *
  * Inform the userspace about the color change that has started.
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 static inline int cfg80211_color_change_started_notify(struct net_device *dev,
 						       u8 count)
@@ -9422,6 +9467,8 @@ static inline int cfg80211_color_change_started_notify(struct net_device *dev,
  * @dev: the device on which the color is switched
  *
  * Inform the userspace about the color change that has aborted.
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 static inline int cfg80211_color_change_aborted_notify(struct net_device *dev)
 {
@@ -9434,6 +9481,8 @@ static inline int cfg80211_color_change_aborted_notify(struct net_device *dev)
  * @dev: the device on which the color was switched
  *
  * Inform the userspace about the color change that has completed.
+ *
+ * Return: 0 on success. Non-zero on error.
  */
 static inline int cfg80211_color_change_notify(struct net_device *dev)
 {
@@ -9477,6 +9526,8 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev);
  * @ppos: read position
  * @handler: the read handler to call (under wiphy lock)
  * @data: additional data to pass to the read handler
+ *
+ * Return: the number of characters read, or a negative errno
  */
 ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file,
 				  char *buf, size_t bufsize,
@@ -9499,6 +9550,8 @@ ssize_t wiphy_locked_debugfs_read(struct wiphy *wiphy, struct file *file,
  * @count: read count
  * @handler: the write handler to call (under wiphy lock)
  * @data: additional data to pass to the write handler
+ *
+ * Return: the number of characters written, or a negative errno
  */
 ssize_t wiphy_locked_debugfs_write(struct wiphy *wiphy, struct file *file,
 				   char *buf, size_t bufsize,
-- 
cgit v1.2.3


From 7253c1d1e7a52504ffd08138e9843d0a80788690 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 18 Apr 2024 07:32:38 +0000
Subject: net_sched: sch_choke: implement lockless choke_dump()

Instead of relying on RTNL, choke_dump() can use READ_ONCE()
annotations, paired with WRITE_ONCE() ones in choke_change().

v2: added a WRITE_ONCE(p->Scell_log, Scell_log)
    per Simon feedback in V1
    Removed the READ_ONCE(q->limit) in choke_enqueue()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/red.h     | 12 ++++++------
 net/sched/sch_choke.c | 21 +++++++++++----------
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/red.h b/include/net/red.h
index 425364de0df7..802287d52c9e 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -233,10 +233,10 @@ static inline void red_set_parms(struct red_parms *p,
 	int delta = qth_max - qth_min;
 	u32 max_p_delta;
 
-	p->qth_min	= qth_min << Wlog;
-	p->qth_max	= qth_max << Wlog;
-	p->Wlog		= Wlog;
-	p->Plog		= Plog;
+	WRITE_ONCE(p->qth_min, qth_min << Wlog);
+	WRITE_ONCE(p->qth_max, qth_max << Wlog);
+	WRITE_ONCE(p->Wlog, Wlog);
+	WRITE_ONCE(p->Plog, Plog);
 	if (delta <= 0)
 		delta = 1;
 	p->qth_delta	= delta;
@@ -244,7 +244,7 @@ static inline void red_set_parms(struct red_parms *p,
 		max_P = red_maxp(Plog);
 		max_P *= delta; /* max_P = (qth_max - qth_min)/2^Plog */
 	}
-	p->max_P = max_P;
+	WRITE_ONCE(p->max_P, max_P);
 	max_p_delta = max_P / delta;
 	max_p_delta = max(max_p_delta, 1U);
 	p->max_P_reciprocal  = reciprocal_value(max_p_delta);
@@ -257,7 +257,7 @@ static inline void red_set_parms(struct red_parms *p,
 	p->target_min = qth_min + 2*delta;
 	p->target_max = qth_min + 3*delta;
 
-	p->Scell_log	= Scell_log;
+	WRITE_ONCE(p->Scell_log, Scell_log);
 	p->Scell_max	= (255 << Scell_log);
 
 	if (stab)
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index ea108030c6b4..91072010923d 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -405,8 +405,8 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt,
 	} else
 		sch_tree_lock(sch);
 
-	q->flags = ctl->flags;
-	q->limit = ctl->limit;
+	WRITE_ONCE(q->flags, ctl->flags);
+	WRITE_ONCE(q->limit, ctl->limit);
 
 	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
 		      ctl->Plog, ctl->Scell_log,
@@ -431,15 +431,16 @@ static int choke_init(struct Qdisc *sch, struct nlattr *opt,
 static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct choke_sched_data *q = qdisc_priv(sch);
+	u8 Wlog = READ_ONCE(q->parms.Wlog);
 	struct nlattr *opts = NULL;
 	struct tc_red_qopt opt = {
-		.limit		= q->limit,
-		.flags		= q->flags,
-		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
-		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
-		.Wlog		= q->parms.Wlog,
-		.Plog		= q->parms.Plog,
-		.Scell_log	= q->parms.Scell_log,
+		.limit		= READ_ONCE(q->limit),
+		.flags		= READ_ONCE(q->flags),
+		.qth_min	= READ_ONCE(q->parms.qth_min) >> Wlog,
+		.qth_max	= READ_ONCE(q->parms.qth_max) >> Wlog,
+		.Wlog		= Wlog,
+		.Plog		= READ_ONCE(q->parms.Plog),
+		.Scell_log	= READ_ONCE(q->parms.Scell_log),
 	};
 
 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
@@ -447,7 +448,7 @@ static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
 		goto nla_put_failure;
 
 	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
-	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
+	    nla_put_u32(skb, TCA_CHOKE_MAX_P, READ_ONCE(q->parms.max_P)))
 		goto nla_put_failure;
 	return nla_nest_end(skb, opts);
 
-- 
cgit v1.2.3


From f1d6aec41f13aad3c3ff8daa9fddb38539afe8f6 Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 17 Apr 2024 01:10:00 +0300
Subject: drm/i915/dp: Fix DSC line buffer depth programming

Fix the calculation of the DSC line buffer depth. This is limited both
by the source's and sink's maximum line buffer depth, but the former one
was not taken into account. On all Intel platform's the source's maximum
buffer depth is 13, so the overall limit is simply the minimum of the
source/sink's limit, regardless of the DSC version.

This leaves the DSI DSC line buffer depth calculation as-is, trusting
VBT.

On DSC version 1.2 for sinks reporting a maximum line buffer depth of 16
the line buffer depth was incorrectly programmed as 0, leading to a
corruption in color gradients / lines on the decompressed screen image.

Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Reviewed-by: Manasi Navare <navaremanasi@chromium.org>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240416221010.376865-2-imre.deak@intel.com
---
 drivers/gpu/drm/i915/display/intel_dp.c | 16 ++++++----------
 include/drm/display/drm_dsc.h           |  3 ---
 2 files changed, 6 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index 163da48bc406..23808e9d41d5 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -89,6 +89,9 @@
 #define DP_DSC_MAX_ENC_THROUGHPUT_0		340000
 #define DP_DSC_MAX_ENC_THROUGHPUT_1		400000
 
+/* Max DSC line buffer depth supported by HW. */
+#define INTEL_DP_DSC_MAX_LINE_BUF_DEPTH		13
+
 /* DP DSC FEC Overhead factor in ppm = 1/(0.972261) = 1.028530 */
 #define DP_DSC_FEC_OVERHEAD_FACTOR		1028530
 
@@ -1705,7 +1708,6 @@ static int intel_dp_dsc_compute_params(const struct intel_connector *connector,
 {
 	struct drm_i915_private *i915 = to_i915(connector->base.dev);
 	struct drm_dsc_config *vdsc_cfg = &crtc_state->dsc.config;
-	u8 line_buf_depth;
 	int ret;
 
 	/*
@@ -1734,20 +1736,14 @@ static int intel_dp_dsc_compute_params(const struct intel_connector *connector,
 			connector->dp.dsc_dpcd[DP_DSC_DEC_COLOR_FORMAT_CAP - DP_DSC_SUPPORT] &
 			DP_DSC_RGB;
 
-	line_buf_depth = drm_dp_dsc_sink_line_buf_depth(connector->dp.dsc_dpcd);
-	if (!line_buf_depth) {
+	vdsc_cfg->line_buf_depth = min(INTEL_DP_DSC_MAX_LINE_BUF_DEPTH,
+				       drm_dp_dsc_sink_line_buf_depth(connector->dp.dsc_dpcd));
+	if (!vdsc_cfg->line_buf_depth) {
 		drm_dbg_kms(&i915->drm,
 			    "DSC Sink Line Buffer Depth invalid\n");
 		return -EINVAL;
 	}
 
-	if (vdsc_cfg->dsc_version_minor == 2)
-		vdsc_cfg->line_buf_depth = (line_buf_depth == DSC_1_2_MAX_LINEBUF_DEPTH_BITS) ?
-			DSC_1_2_MAX_LINEBUF_DEPTH_VAL : line_buf_depth;
-	else
-		vdsc_cfg->line_buf_depth = (line_buf_depth > DSC_1_1_MAX_LINEBUF_DEPTH_BITS) ?
-			DSC_1_1_MAX_LINEBUF_DEPTH_BITS : line_buf_depth;
-
 	vdsc_cfg->block_pred_enable =
 		connector->dp.dsc_dpcd[DP_DSC_BLK_PREDICTION_SUPPORT - DP_DSC_SUPPORT] &
 		DP_DSC_BLK_PREDICTION_IS_SUPPORTED;
diff --git a/include/drm/display/drm_dsc.h b/include/drm/display/drm_dsc.h
index bc90273d06a6..bbbe7438473d 100644
--- a/include/drm/display/drm_dsc.h
+++ b/include/drm/display/drm_dsc.h
@@ -40,9 +40,6 @@
 #define DSC_PPS_RC_RANGE_MINQP_SHIFT		11
 #define DSC_PPS_RC_RANGE_MAXQP_SHIFT		6
 #define DSC_PPS_NATIVE_420_SHIFT		1
-#define DSC_1_2_MAX_LINEBUF_DEPTH_BITS		16
-#define DSC_1_2_MAX_LINEBUF_DEPTH_VAL		0
-#define DSC_1_1_MAX_LINEBUF_DEPTH_BITS		13
 
 /**
  * struct drm_dsc_rc_range_parameters - DSC Rate Control range parameters
-- 
cgit v1.2.3


From 2145af01b9cb326b7dd202a4281976fae614183e Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Thu, 18 Apr 2024 09:32:30 +0800
Subject: firmware: arm_scmi: Add basic support for SCMI v3.2 pincontrol
 protocol

Add basic implementation of the SCMI v3.2 pincontrol protocol.

Co-developed-by: Oleksii Moisieiev <oleksii_moisieiev@epam.com>
Signed-off-by: Oleksii Moisieiev <oleksii_moisieiev@epam.com>
Co-developed-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Peng Fan <peng.fan@nxp.com>
Link: https://lore.kernel.org/r/20240418-pinctrl-scmi-v11-3-499dca9864a7@nxp.com
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
---
 drivers/firmware/arm_scmi/Makefile    |   3 +-
 drivers/firmware/arm_scmi/driver.c    |   2 +
 drivers/firmware/arm_scmi/pinctrl.c   | 916 ++++++++++++++++++++++++++++++++++
 drivers/firmware/arm_scmi/protocols.h |   1 +
 include/linux/scmi_protocol.h         |  84 ++++
 5 files changed, 1005 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firmware/arm_scmi/pinctrl.c

(limited to 'include')

diff --git a/drivers/firmware/arm_scmi/Makefile b/drivers/firmware/arm_scmi/Makefile
index a7bc4796519c..fd59f58ce8a2 100644
--- a/drivers/firmware/arm_scmi/Makefile
+++ b/drivers/firmware/arm_scmi/Makefile
@@ -10,7 +10,8 @@ scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_SMC) += smc.o
 scmi-transport-$(CONFIG_ARM_SCMI_HAVE_MSG) += msg.o
 scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_VIRTIO) += virtio.o
 scmi-transport-$(CONFIG_ARM_SCMI_TRANSPORT_OPTEE) += optee.o
-scmi-protocols-y = base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o powercap.o
+scmi-protocols-y := base.o clock.o perf.o power.o reset.o sensors.o system.o voltage.o powercap.o
+scmi-protocols-y += pinctrl.o
 scmi-module-objs := $(scmi-driver-y) $(scmi-protocols-y) $(scmi-transport-y)
 
 obj-$(CONFIG_ARM_SCMI_PROTOCOL) += scmi-core.o
diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
index ab5c138e0d09..6b6957f4743f 100644
--- a/drivers/firmware/arm_scmi/driver.c
+++ b/drivers/firmware/arm_scmi/driver.c
@@ -3334,6 +3334,7 @@ static int __init scmi_driver_init(void)
 	scmi_voltage_register();
 	scmi_system_register();
 	scmi_powercap_register();
+	scmi_pinctrl_register();
 
 	return platform_driver_register(&scmi_driver);
 }
@@ -3351,6 +3352,7 @@ static void __exit scmi_driver_exit(void)
 	scmi_voltage_unregister();
 	scmi_system_unregister();
 	scmi_powercap_unregister();
+	scmi_pinctrl_unregister();
 
 	scmi_transports_exit();
 
diff --git a/drivers/firmware/arm_scmi/pinctrl.c b/drivers/firmware/arm_scmi/pinctrl.c
new file mode 100644
index 000000000000..a2a7f880d6a3
--- /dev/null
+++ b/drivers/firmware/arm_scmi/pinctrl.c
@@ -0,0 +1,916 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * System Control and Management Interface (SCMI) Pinctrl Protocol
+ *
+ * Copyright (C) 2024 EPAM
+ * Copyright 2024 NXP
+ */
+
+#include <asm/byteorder.h>
+#include <linux/bits.h>
+#include <linux/bitfield.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/scmi_protocol.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "common.h"
+#include "protocols.h"
+
+/* Updated only after ALL the mandatory features for that version are merged */
+#define SCMI_PROTOCOL_SUPPORTED_VERSION		0x10000
+
+#define GET_GROUPS_NR(x)	le32_get_bits((x), GENMASK(31, 16))
+#define GET_PINS_NR(x)		le32_get_bits((x), GENMASK(15, 0))
+#define GET_FUNCTIONS_NR(x)	le32_get_bits((x), GENMASK(15, 0))
+
+#define EXT_NAME_FLAG(x)	le32_get_bits((x), BIT(31))
+#define NUM_ELEMS(x)		le32_get_bits((x), GENMASK(15, 0))
+
+#define REMAINING(x)		le32_get_bits((x), GENMASK(31, 16))
+#define RETURNED(x)		le32_get_bits((x), GENMASK(11, 0))
+
+#define CONFIG_FLAG_MASK	GENMASK(19, 18)
+#define SELECTOR_MASK		GENMASK(17, 16)
+#define SKIP_CONFIGS_MASK	GENMASK(15, 8)
+#define CONFIG_TYPE_MASK	GENMASK(7, 0)
+
+enum scmi_pinctrl_protocol_cmd {
+	PINCTRL_ATTRIBUTES = 0x3,
+	PINCTRL_LIST_ASSOCIATIONS = 0x4,
+	PINCTRL_SETTINGS_GET = 0x5,
+	PINCTRL_SETTINGS_CONFIGURE = 0x6,
+	PINCTRL_REQUEST = 0x7,
+	PINCTRL_RELEASE = 0x8,
+	PINCTRL_NAME_GET = 0x9,
+	PINCTRL_SET_PERMISSIONS = 0xa,
+};
+
+struct scmi_msg_settings_conf {
+	__le32 identifier;
+	__le32 function_id;
+	__le32 attributes;
+	__le32 configs[];
+};
+
+struct scmi_msg_settings_get {
+	__le32 identifier;
+	__le32 attributes;
+};
+
+struct scmi_resp_settings_get {
+	__le32 function_selected;
+	__le32 num_configs;
+	__le32 configs[];
+};
+
+struct scmi_msg_pinctrl_protocol_attributes {
+	__le32 attributes_low;
+	__le32 attributes_high;
+};
+
+struct scmi_msg_pinctrl_attributes {
+	__le32 identifier;
+	__le32 flags;
+};
+
+struct scmi_resp_pinctrl_attributes {
+	__le32 attributes;
+	u8 name[SCMI_SHORT_NAME_MAX_SIZE];
+};
+
+struct scmi_msg_pinctrl_list_assoc {
+	__le32 identifier;
+	__le32 flags;
+	__le32 index;
+};
+
+struct scmi_resp_pinctrl_list_assoc {
+	__le32 flags;
+	__le16 array[];
+};
+
+struct scmi_msg_request {
+	__le32 identifier;
+	__le32 flags;
+};
+
+struct scmi_group_info {
+	char name[SCMI_MAX_STR_SIZE];
+	bool present;
+	u32 *group_pins;
+	u32 nr_pins;
+};
+
+struct scmi_function_info {
+	char name[SCMI_MAX_STR_SIZE];
+	bool present;
+	u32 *groups;
+	u32 nr_groups;
+};
+
+struct scmi_pin_info {
+	char name[SCMI_MAX_STR_SIZE];
+	bool present;
+};
+
+struct scmi_pinctrl_info {
+	u32 version;
+	int nr_groups;
+	int nr_functions;
+	int nr_pins;
+	struct scmi_group_info *groups;
+	struct scmi_function_info *functions;
+	struct scmi_pin_info *pins;
+};
+
+static int scmi_pinctrl_attributes_get(const struct scmi_protocol_handle *ph,
+				       struct scmi_pinctrl_info *pi)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_pinctrl_protocol_attributes *attr;
+
+	ret = ph->xops->xfer_get_init(ph, PROTOCOL_ATTRIBUTES, 0, sizeof(*attr), &t);
+	if (ret)
+		return ret;
+
+	attr = t->rx.buf;
+
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret) {
+		pi->nr_functions = GET_FUNCTIONS_NR(attr->attributes_high);
+		pi->nr_groups = GET_GROUPS_NR(attr->attributes_low);
+		pi->nr_pins = GET_PINS_NR(attr->attributes_low);
+		if (pi->nr_pins == 0) {
+			dev_warn(ph->dev, "returned zero pins\n");
+			ret = -EINVAL;
+		}
+	}
+
+	ph->xops->xfer_put(ph, t);
+	return ret;
+}
+
+static int scmi_pinctrl_count_get(const struct scmi_protocol_handle *ph,
+				  enum scmi_pinctrl_selector_type type)
+{
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	switch (type) {
+	case PIN_TYPE:
+		return pi->nr_pins;
+	case GROUP_TYPE:
+		return pi->nr_groups;
+	case FUNCTION_TYPE:
+		return pi->nr_functions;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int scmi_pinctrl_validate_id(const struct scmi_protocol_handle *ph,
+				    u32 selector,
+				    enum scmi_pinctrl_selector_type type)
+{
+	int value;
+
+	value = scmi_pinctrl_count_get(ph, type);
+	if (value < 0)
+		return value;
+
+	if (selector >= value || value == 0)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int scmi_pinctrl_attributes(const struct scmi_protocol_handle *ph,
+				   enum scmi_pinctrl_selector_type type,
+				   u32 selector, char *name,
+				   u32 *n_elems)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_pinctrl_attributes *tx;
+	struct scmi_resp_pinctrl_attributes *rx;
+	bool ext_name_flag;
+
+	if (!name)
+		return -EINVAL;
+
+	ret = scmi_pinctrl_validate_id(ph, selector, type);
+	if (ret)
+		return ret;
+
+	ret = ph->xops->xfer_get_init(ph, PINCTRL_ATTRIBUTES, sizeof(*tx),
+				      sizeof(*rx), &t);
+	if (ret)
+		return ret;
+
+	tx = t->tx.buf;
+	rx = t->rx.buf;
+	tx->identifier = cpu_to_le32(selector);
+	tx->flags = cpu_to_le32(type);
+
+	ret = ph->xops->do_xfer(ph, t);
+	if (!ret) {
+		if (n_elems)
+			*n_elems = NUM_ELEMS(rx->attributes);
+
+		strscpy(name, rx->name, SCMI_SHORT_NAME_MAX_SIZE);
+
+		ext_name_flag = !!EXT_NAME_FLAG(rx->attributes);
+	}
+
+	ph->xops->xfer_put(ph, t);
+
+	if (ret)
+		return ret;
+	/*
+	 * If supported overwrite short name with the extended one;
+	 * on error just carry on and use already provided short name.
+	 */
+	if (ext_name_flag)
+		ret = ph->hops->extended_name_get(ph, PINCTRL_NAME_GET,
+						  selector, (u32 *)&type, name,
+						  SCMI_MAX_STR_SIZE);
+	return ret;
+}
+
+struct scmi_pinctrl_ipriv {
+	u32 selector;
+	enum scmi_pinctrl_selector_type type;
+	u32 *array;
+};
+
+static void iter_pinctrl_assoc_prepare_message(void *message,
+					       u32 desc_index,
+					       const void *priv)
+{
+	struct scmi_msg_pinctrl_list_assoc *msg = message;
+	const struct scmi_pinctrl_ipriv *p = priv;
+
+	msg->identifier = cpu_to_le32(p->selector);
+	msg->flags = cpu_to_le32(p->type);
+	msg->index = cpu_to_le32(desc_index);
+}
+
+static int iter_pinctrl_assoc_update_state(struct scmi_iterator_state *st,
+					   const void *response, void *priv)
+{
+	const struct scmi_resp_pinctrl_list_assoc *r = response;
+
+	st->num_returned = RETURNED(r->flags);
+	st->num_remaining = REMAINING(r->flags);
+
+	return 0;
+}
+
+static int
+iter_pinctrl_assoc_process_response(const struct scmi_protocol_handle *ph,
+				    const void *response,
+				    struct scmi_iterator_state *st, void *priv)
+{
+	const struct scmi_resp_pinctrl_list_assoc *r = response;
+	struct scmi_pinctrl_ipriv *p = priv;
+
+	p->array[st->desc_index + st->loop_idx] =
+		le16_to_cpu(r->array[st->loop_idx]);
+
+	return 0;
+}
+
+static int scmi_pinctrl_list_associations(const struct scmi_protocol_handle *ph,
+					  u32 selector,
+					  enum scmi_pinctrl_selector_type type,
+					  u16 size, u32 *array)
+{
+	int ret;
+	void *iter;
+	struct scmi_iterator_ops ops = {
+		.prepare_message = iter_pinctrl_assoc_prepare_message,
+		.update_state = iter_pinctrl_assoc_update_state,
+		.process_response = iter_pinctrl_assoc_process_response,
+	};
+	struct scmi_pinctrl_ipriv ipriv = {
+		.selector = selector,
+		.type = type,
+		.array = array,
+	};
+
+	if (!array || !size || type == PIN_TYPE)
+		return -EINVAL;
+
+	ret = scmi_pinctrl_validate_id(ph, selector, type);
+	if (ret)
+		return ret;
+
+	iter = ph->hops->iter_response_init(ph, &ops, size,
+					    PINCTRL_LIST_ASSOCIATIONS,
+					    sizeof(struct scmi_msg_pinctrl_list_assoc),
+					    &ipriv);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	return ph->hops->iter_response_run(iter);
+}
+
+struct scmi_settings_get_ipriv {
+	u32 selector;
+	enum scmi_pinctrl_selector_type type;
+	bool get_all;
+	unsigned int *nr_configs;
+	enum scmi_pinctrl_conf_type *config_types;
+	u32 *config_values;
+};
+
+static void
+iter_pinctrl_settings_get_prepare_message(void *message, u32 desc_index,
+					  const void *priv)
+{
+	struct scmi_msg_settings_get *msg = message;
+	const struct scmi_settings_get_ipriv *p = priv;
+	u32 attributes;
+
+	attributes = FIELD_PREP(SELECTOR_MASK, p->type);
+
+	if (p->get_all) {
+		attributes |= FIELD_PREP(CONFIG_FLAG_MASK, 1) |
+			FIELD_PREP(SKIP_CONFIGS_MASK, desc_index);
+	} else {
+		attributes |= FIELD_PREP(CONFIG_TYPE_MASK, p->config_types[0]);
+	}
+
+	msg->attributes = cpu_to_le32(attributes);
+	msg->identifier = cpu_to_le32(p->selector);
+}
+
+static int
+iter_pinctrl_settings_get_update_state(struct scmi_iterator_state *st,
+				       const void *response, void *priv)
+{
+	const struct scmi_resp_settings_get *r = response;
+	struct scmi_settings_get_ipriv *p = priv;
+
+	if (p->get_all) {
+		st->num_returned = le32_get_bits(r->num_configs, GENMASK(7, 0));
+		st->num_remaining = le32_get_bits(r->num_configs, GENMASK(31, 24));
+	} else {
+		st->num_returned = 1;
+		st->num_remaining = 0;
+	}
+
+	return 0;
+}
+
+static int
+iter_pinctrl_settings_get_process_response(const struct scmi_protocol_handle *ph,
+					   const void *response,
+					   struct scmi_iterator_state *st,
+					   void *priv)
+{
+	const struct scmi_resp_settings_get *r = response;
+	struct scmi_settings_get_ipriv *p = priv;
+	u32 type = le32_get_bits(r->configs[st->loop_idx * 2], GENMASK(7, 0));
+	u32 val = le32_to_cpu(r->configs[st->loop_idx * 2 + 1]);
+
+	if (p->get_all) {
+		p->config_types[st->desc_index + st->loop_idx] = type;
+	} else {
+		if (p->config_types[0] != type)
+			return -EINVAL;
+	}
+
+	p->config_values[st->desc_index + st->loop_idx] = val;
+	++*p->nr_configs;
+
+	return 0;
+}
+
+static int
+scmi_pinctrl_settings_get(const struct scmi_protocol_handle *ph, u32 selector,
+			  enum scmi_pinctrl_selector_type type,
+			  unsigned int *nr_configs,
+			  enum scmi_pinctrl_conf_type *config_types,
+			  u32 *config_values)
+{
+	int ret;
+	void *iter;
+	unsigned int max_configs = *nr_configs;
+	struct scmi_iterator_ops ops = {
+		.prepare_message = iter_pinctrl_settings_get_prepare_message,
+		.update_state = iter_pinctrl_settings_get_update_state,
+		.process_response = iter_pinctrl_settings_get_process_response,
+	};
+	struct scmi_settings_get_ipriv ipriv = {
+		.selector = selector,
+		.type = type,
+		.get_all = (max_configs > 1),
+		.nr_configs = nr_configs,
+		.config_types = config_types,
+		.config_values = config_values,
+	};
+
+	if (!config_types || !config_values || type == FUNCTION_TYPE)
+		return -EINVAL;
+
+	ret = scmi_pinctrl_validate_id(ph, selector, type);
+	if (ret)
+		return ret;
+
+	/* Prepare to count returned configs */
+	*nr_configs = 0;
+	iter = ph->hops->iter_response_init(ph, &ops, max_configs,
+					    PINCTRL_SETTINGS_GET,
+					    sizeof(struct scmi_msg_settings_get),
+					    &ipriv);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	return ph->hops->iter_response_run(iter);
+}
+
+static int scmi_pinctrl_settings_get_one(const struct scmi_protocol_handle *ph,
+					 u32 selector,
+					 enum scmi_pinctrl_selector_type type,
+					 enum scmi_pinctrl_conf_type config_type,
+					 u32 *config_value)
+{
+	unsigned int nr_configs = 1;
+
+	return scmi_pinctrl_settings_get(ph, selector, type, &nr_configs,
+					 &config_type, config_value);
+}
+
+static int scmi_pinctrl_settings_get_all(const struct scmi_protocol_handle *ph,
+					 u32 selector,
+					 enum scmi_pinctrl_selector_type type,
+					 unsigned int *nr_configs,
+					 enum scmi_pinctrl_conf_type *config_types,
+					 u32 *config_values)
+{
+	if (!nr_configs || *nr_configs == 0)
+		return -EINVAL;
+
+	return scmi_pinctrl_settings_get(ph, selector, type, nr_configs,
+					 config_types, config_values);
+}
+
+static int
+scmi_pinctrl_settings_conf(const struct scmi_protocol_handle *ph,
+			   u32 selector,
+			   enum scmi_pinctrl_selector_type type,
+			   u32 nr_configs,
+			   enum scmi_pinctrl_conf_type *config_type,
+			   u32 *config_value)
+{
+	struct scmi_xfer *t;
+	struct scmi_msg_settings_conf *tx;
+	u32 attributes;
+	int ret, i;
+	u32 configs_in_chunk, conf_num = 0;
+	u32 chunk;
+	int max_msg_size = ph->hops->get_max_msg_size(ph);
+
+	if (!config_type || !config_value || type == FUNCTION_TYPE)
+		return -EINVAL;
+
+	ret = scmi_pinctrl_validate_id(ph, selector, type);
+	if (ret)
+		return ret;
+
+	configs_in_chunk = (max_msg_size - sizeof(*tx)) / (sizeof(__le32) * 2);
+	while (conf_num < nr_configs) {
+		chunk = (nr_configs - conf_num > configs_in_chunk) ?
+			configs_in_chunk : nr_configs - conf_num;
+
+		ret = ph->xops->xfer_get_init(ph, PINCTRL_SETTINGS_CONFIGURE,
+					      sizeof(*tx) +
+					      chunk * 2 * sizeof(__le32), 0, &t);
+		if (ret)
+			break;
+
+		tx = t->tx.buf;
+		tx->identifier = cpu_to_le32(selector);
+		tx->function_id = cpu_to_le32(0xFFFFFFFF);
+		attributes = FIELD_PREP(GENMASK(1, 0), type) |
+			FIELD_PREP(GENMASK(9, 2), chunk);
+		tx->attributes = cpu_to_le32(attributes);
+
+		for (i = 0; i < chunk; i++) {
+			tx->configs[i * 2] =
+				cpu_to_le32(config_type[conf_num + i]);
+			tx->configs[i * 2 + 1] =
+				cpu_to_le32(config_value[conf_num + i]);
+		}
+
+		ret = ph->xops->do_xfer(ph, t);
+
+		ph->xops->xfer_put(ph, t);
+
+		if (ret)
+			break;
+
+		conf_num += chunk;
+	}
+
+	return ret;
+}
+
+static int scmi_pinctrl_function_select(const struct scmi_protocol_handle *ph,
+					u32 group,
+					enum scmi_pinctrl_selector_type type,
+					u32 function_id)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_settings_conf *tx;
+	u32 attributes;
+
+	ret = scmi_pinctrl_validate_id(ph, group, type);
+	if (ret)
+		return ret;
+
+	ret = ph->xops->xfer_get_init(ph, PINCTRL_SETTINGS_CONFIGURE,
+				      sizeof(*tx), 0, &t);
+	if (ret)
+		return ret;
+
+	tx = t->tx.buf;
+	tx->identifier = cpu_to_le32(group);
+	tx->function_id = cpu_to_le32(function_id);
+	attributes = FIELD_PREP(GENMASK(1, 0), type) | BIT(10);
+	tx->attributes = cpu_to_le32(attributes);
+
+	ret = ph->xops->do_xfer(ph, t);
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+static int scmi_pinctrl_request_free(const struct scmi_protocol_handle *ph,
+				     u32 identifier,
+				     enum scmi_pinctrl_selector_type type,
+				     enum scmi_pinctrl_protocol_cmd cmd)
+{
+	int ret;
+	struct scmi_xfer *t;
+	struct scmi_msg_request *tx;
+
+	if (type == FUNCTION_TYPE)
+		return -EINVAL;
+
+	if (cmd != PINCTRL_REQUEST && cmd != PINCTRL_RELEASE)
+		return -EINVAL;
+
+	ret = scmi_pinctrl_validate_id(ph, identifier, type);
+	if (ret)
+		return ret;
+
+	ret = ph->xops->xfer_get_init(ph, cmd, sizeof(*tx), 0, &t);
+	if (ret)
+		return ret;
+
+	tx = t->tx.buf;
+	tx->identifier = cpu_to_le32(identifier);
+	tx->flags = cpu_to_le32(type);
+
+	ret = ph->xops->do_xfer(ph, t);
+	ph->xops->xfer_put(ph, t);
+
+	return ret;
+}
+
+static int scmi_pinctrl_pin_request(const struct scmi_protocol_handle *ph,
+				    u32 pin)
+{
+	return scmi_pinctrl_request_free(ph, pin, PIN_TYPE, PINCTRL_REQUEST);
+}
+
+static int scmi_pinctrl_pin_free(const struct scmi_protocol_handle *ph, u32 pin)
+{
+	return scmi_pinctrl_request_free(ph, pin, PIN_TYPE, PINCTRL_RELEASE);
+}
+
+static int scmi_pinctrl_get_group_info(const struct scmi_protocol_handle *ph,
+				       u32 selector,
+				       struct scmi_group_info *group)
+{
+	int ret;
+
+	ret = scmi_pinctrl_attributes(ph, GROUP_TYPE, selector, group->name,
+				      &group->nr_pins);
+	if (ret)
+		return ret;
+
+	if (!group->nr_pins) {
+		dev_err(ph->dev, "Group %d has 0 elements", selector);
+		return -ENODATA;
+	}
+
+	group->group_pins = kmalloc_array(group->nr_pins,
+					  sizeof(*group->group_pins),
+					  GFP_KERNEL);
+	if (!group->group_pins)
+		return -ENOMEM;
+
+	ret = scmi_pinctrl_list_associations(ph, selector, GROUP_TYPE,
+					     group->nr_pins, group->group_pins);
+	if (ret) {
+		kfree(group->group_pins);
+		return ret;
+	}
+
+	group->present = true;
+	return 0;
+}
+
+static int scmi_pinctrl_get_group_name(const struct scmi_protocol_handle *ph,
+				       u32 selector, const char **name)
+{
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	if (!name)
+		return -EINVAL;
+
+	if (selector >= pi->nr_groups || pi->nr_groups == 0)
+		return -EINVAL;
+
+	if (!pi->groups[selector].present) {
+		int ret;
+
+		ret = scmi_pinctrl_get_group_info(ph, selector,
+						  &pi->groups[selector]);
+		if (ret)
+			return ret;
+	}
+
+	*name = pi->groups[selector].name;
+
+	return 0;
+}
+
+static int scmi_pinctrl_group_pins_get(const struct scmi_protocol_handle *ph,
+				       u32 selector, const u32 **pins,
+				       u32 *nr_pins)
+{
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	if (!pins || !nr_pins)
+		return -EINVAL;
+
+	if (selector >= pi->nr_groups || pi->nr_groups == 0)
+		return -EINVAL;
+
+	if (!pi->groups[selector].present) {
+		int ret;
+
+		ret = scmi_pinctrl_get_group_info(ph, selector,
+						  &pi->groups[selector]);
+		if (ret)
+			return ret;
+	}
+
+	*pins = pi->groups[selector].group_pins;
+	*nr_pins = pi->groups[selector].nr_pins;
+
+	return 0;
+}
+
+static int scmi_pinctrl_get_function_info(const struct scmi_protocol_handle *ph,
+					  u32 selector,
+					  struct scmi_function_info *func)
+{
+	int ret;
+
+	ret = scmi_pinctrl_attributes(ph, FUNCTION_TYPE, selector, func->name,
+				      &func->nr_groups);
+	if (ret)
+		return ret;
+
+	if (!func->nr_groups) {
+		dev_err(ph->dev, "Function %d has 0 elements", selector);
+		return -ENODATA;
+	}
+
+	func->groups = kmalloc_array(func->nr_groups, sizeof(*func->groups),
+				     GFP_KERNEL);
+	if (!func->groups)
+		return -ENOMEM;
+
+	ret = scmi_pinctrl_list_associations(ph, selector, FUNCTION_TYPE,
+					     func->nr_groups, func->groups);
+	if (ret) {
+		kfree(func->groups);
+		return ret;
+	}
+
+	func->present = true;
+	return 0;
+}
+
+static int scmi_pinctrl_get_function_name(const struct scmi_protocol_handle *ph,
+					  u32 selector, const char **name)
+{
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	if (!name)
+		return -EINVAL;
+
+	if (selector >= pi->nr_functions || pi->nr_functions == 0)
+		return -EINVAL;
+
+	if (!pi->functions[selector].present) {
+		int ret;
+
+		ret = scmi_pinctrl_get_function_info(ph, selector,
+						     &pi->functions[selector]);
+		if (ret)
+			return ret;
+	}
+
+	*name = pi->functions[selector].name;
+	return 0;
+}
+
+static int
+scmi_pinctrl_function_groups_get(const struct scmi_protocol_handle *ph,
+				 u32 selector, u32 *nr_groups,
+				 const u32 **groups)
+{
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	if (!groups || !nr_groups)
+		return -EINVAL;
+
+	if (selector >= pi->nr_functions || pi->nr_functions == 0)
+		return -EINVAL;
+
+	if (!pi->functions[selector].present) {
+		int ret;
+
+		ret = scmi_pinctrl_get_function_info(ph, selector,
+						     &pi->functions[selector]);
+		if (ret)
+			return ret;
+	}
+
+	*groups = pi->functions[selector].groups;
+	*nr_groups = pi->functions[selector].nr_groups;
+
+	return 0;
+}
+
+static int scmi_pinctrl_mux_set(const struct scmi_protocol_handle *ph,
+				u32 selector, u32 group)
+{
+	return scmi_pinctrl_function_select(ph, group, GROUP_TYPE, selector);
+}
+
+static int scmi_pinctrl_get_pin_info(const struct scmi_protocol_handle *ph,
+				     u32 selector, struct scmi_pin_info *pin)
+{
+	int ret;
+
+	if (!pin)
+		return -EINVAL;
+
+	ret = scmi_pinctrl_attributes(ph, PIN_TYPE, selector, pin->name, NULL);
+	if (ret)
+		return ret;
+
+	pin->present = true;
+	return 0;
+}
+
+static int scmi_pinctrl_get_pin_name(const struct scmi_protocol_handle *ph,
+				     u32 selector, const char **name)
+{
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	if (!name)
+		return -EINVAL;
+
+	if (selector >= pi->nr_pins)
+		return -EINVAL;
+
+	if (!pi->pins[selector].present) {
+		int ret;
+
+		ret = scmi_pinctrl_get_pin_info(ph, selector, &pi->pins[selector]);
+		if (ret)
+			return ret;
+	}
+
+	*name = pi->pins[selector].name;
+
+	return 0;
+}
+
+static int scmi_pinctrl_name_get(const struct scmi_protocol_handle *ph,
+				 u32 selector,
+				 enum scmi_pinctrl_selector_type type,
+				 const char **name)
+{
+	switch (type) {
+	case PIN_TYPE:
+		return scmi_pinctrl_get_pin_name(ph, selector, name);
+	case GROUP_TYPE:
+		return scmi_pinctrl_get_group_name(ph, selector, name);
+	case FUNCTION_TYPE:
+		return scmi_pinctrl_get_function_name(ph, selector, name);
+	default:
+		return -EINVAL;
+	}
+}
+
+static const struct scmi_pinctrl_proto_ops pinctrl_proto_ops = {
+	.count_get = scmi_pinctrl_count_get,
+	.name_get = scmi_pinctrl_name_get,
+	.group_pins_get = scmi_pinctrl_group_pins_get,
+	.function_groups_get = scmi_pinctrl_function_groups_get,
+	.mux_set = scmi_pinctrl_mux_set,
+	.settings_get_one = scmi_pinctrl_settings_get_one,
+	.settings_get_all = scmi_pinctrl_settings_get_all,
+	.settings_conf = scmi_pinctrl_settings_conf,
+	.pin_request = scmi_pinctrl_pin_request,
+	.pin_free = scmi_pinctrl_pin_free,
+};
+
+static int scmi_pinctrl_protocol_init(const struct scmi_protocol_handle *ph)
+{
+	int ret;
+	u32 version;
+	struct scmi_pinctrl_info *pinfo;
+
+	ret = ph->xops->version_get(ph, &version);
+	if (ret)
+		return ret;
+
+	dev_dbg(ph->dev, "Pinctrl Version %d.%d\n",
+		PROTOCOL_REV_MAJOR(version), PROTOCOL_REV_MINOR(version));
+
+	pinfo = devm_kzalloc(ph->dev, sizeof(*pinfo), GFP_KERNEL);
+	if (!pinfo)
+		return -ENOMEM;
+
+	ret = scmi_pinctrl_attributes_get(ph, pinfo);
+	if (ret)
+		return ret;
+
+	pinfo->pins = devm_kcalloc(ph->dev, pinfo->nr_pins,
+				   sizeof(*pinfo->pins), GFP_KERNEL);
+	if (!pinfo->pins)
+		return -ENOMEM;
+
+	pinfo->groups = devm_kcalloc(ph->dev, pinfo->nr_groups,
+				     sizeof(*pinfo->groups), GFP_KERNEL);
+	if (!pinfo->groups)
+		return -ENOMEM;
+
+	pinfo->functions = devm_kcalloc(ph->dev, pinfo->nr_functions,
+					sizeof(*pinfo->functions), GFP_KERNEL);
+	if (!pinfo->functions)
+		return -ENOMEM;
+
+	pinfo->version = version;
+
+	return ph->set_priv(ph, pinfo, version);
+}
+
+static int scmi_pinctrl_protocol_deinit(const struct scmi_protocol_handle *ph)
+{
+	int i;
+	struct scmi_pinctrl_info *pi = ph->get_priv(ph);
+
+	/* Free groups_pins allocated in scmi_pinctrl_get_group_info */
+	for (i = 0; i < pi->nr_groups; i++) {
+		if (pi->groups[i].present) {
+			kfree(pi->groups[i].group_pins);
+			pi->groups[i].present = false;
+		}
+	}
+
+	/* Free groups allocated in scmi_pinctrl_get_function_info */
+	for (i = 0; i < pi->nr_functions; i++) {
+		if (pi->functions[i].present) {
+			kfree(pi->functions[i].groups);
+			pi->functions[i].present = false;
+		}
+	}
+
+	return 0;
+}
+
+static const struct scmi_protocol scmi_pinctrl = {
+	.id = SCMI_PROTOCOL_PINCTRL,
+	.owner = THIS_MODULE,
+	.instance_init = &scmi_pinctrl_protocol_init,
+	.instance_deinit = &scmi_pinctrl_protocol_deinit,
+	.ops = &pinctrl_proto_ops,
+	.supported_version = SCMI_PROTOCOL_SUPPORTED_VERSION,
+};
+DEFINE_SCMI_PROTOCOL_REGISTER_UNREGISTER(pinctrl, scmi_pinctrl)
diff --git a/drivers/firmware/arm_scmi/protocols.h b/drivers/firmware/arm_scmi/protocols.h
index 314371dc0f65..8e95f53bd7b7 100644
--- a/drivers/firmware/arm_scmi/protocols.h
+++ b/drivers/firmware/arm_scmi/protocols.h
@@ -370,6 +370,7 @@ void __exit scmi_##name##_unregister(void)			\
 DECLARE_SCMI_REGISTER_UNREGISTER(base);
 DECLARE_SCMI_REGISTER_UNREGISTER(clock);
 DECLARE_SCMI_REGISTER_UNREGISTER(perf);
+DECLARE_SCMI_REGISTER_UNREGISTER(pinctrl);
 DECLARE_SCMI_REGISTER_UNREGISTER(power);
 DECLARE_SCMI_REGISTER_UNREGISTER(reset);
 DECLARE_SCMI_REGISTER_UNREGISTER(sensors);
diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h
index a3addb07e00a..3a9bb5b9a9e8 100644
--- a/include/linux/scmi_protocol.h
+++ b/include/linux/scmi_protocol.h
@@ -737,6 +737,89 @@ struct scmi_powercap_proto_ops {
 					  u32 *power_thresh_high);
 };
 
+enum scmi_pinctrl_selector_type {
+	PIN_TYPE = 0,
+	GROUP_TYPE,
+	FUNCTION_TYPE,
+};
+
+enum scmi_pinctrl_conf_type {
+	SCMI_PIN_DEFAULT = 0,
+	SCMI_PIN_BIAS_BUS_HOLD = 1,
+	SCMI_PIN_BIAS_DISABLE = 2,
+	SCMI_PIN_BIAS_HIGH_IMPEDANCE = 3,
+	SCMI_PIN_BIAS_PULL_UP = 4,
+	SCMI_PIN_BIAS_PULL_DEFAULT = 5,
+	SCMI_PIN_BIAS_PULL_DOWN = 6,
+	SCMI_PIN_DRIVE_OPEN_DRAIN = 7,
+	SCMI_PIN_DRIVE_OPEN_SOURCE = 8,
+	SCMI_PIN_DRIVE_PUSH_PULL = 9,
+	SCMI_PIN_DRIVE_STRENGTH = 10,
+	SCMI_PIN_INPUT_DEBOUNCE = 11,
+	SCMI_PIN_INPUT_MODE = 12,
+	SCMI_PIN_PULL_MODE = 13,
+	SCMI_PIN_INPUT_VALUE = 14,
+	SCMI_PIN_INPUT_SCHMITT = 15,
+	SCMI_PIN_LOW_POWER_MODE = 16,
+	SCMI_PIN_OUTPUT_MODE = 17,
+	SCMI_PIN_OUTPUT_VALUE = 18,
+	SCMI_PIN_POWER_SOURCE = 19,
+	SCMI_PIN_SLEW_RATE = 20,
+	SCMI_PIN_OEM_START = 192,
+	SCMI_PIN_OEM_END = 255,
+};
+
+/**
+ * struct scmi_pinctrl_proto_ops - represents the various operations provided
+ * by SCMI Pinctrl Protocol
+ *
+ * @count_get: returns count of the registered elements in given type
+ * @name_get: returns name by index of given type
+ * @group_pins_get: returns the set of pins, assigned to the specified group
+ * @function_groups_get: returns the set of groups, assigned to the specified
+ *	function
+ * @mux_set: set muxing function for groups of pins
+ * @settings_get_one: returns one configuration parameter for pin or group
+ *	specified by config_type
+ * @settings_get_all: returns all configuration parameters for pin or group
+ * @settings_conf: sets the configuration parameter for pin or group
+ * @pin_request: aquire pin before selecting mux setting
+ * @pin_free: frees pin, acquired by request_pin call
+ */
+struct scmi_pinctrl_proto_ops {
+	int (*count_get)(const struct scmi_protocol_handle *ph,
+			 enum scmi_pinctrl_selector_type type);
+	int (*name_get)(const struct scmi_protocol_handle *ph, u32 selector,
+			enum scmi_pinctrl_selector_type type,
+			const char **name);
+	int (*group_pins_get)(const struct scmi_protocol_handle *ph,
+			      u32 selector, const unsigned int **pins,
+			      unsigned int *nr_pins);
+	int (*function_groups_get)(const struct scmi_protocol_handle *ph,
+				   u32 selector, unsigned int *nr_groups,
+				   const unsigned int **groups);
+	int (*mux_set)(const struct scmi_protocol_handle *ph, u32 selector,
+		       u32 group);
+	int (*settings_get_one)(const struct scmi_protocol_handle *ph,
+				u32 selector,
+				enum scmi_pinctrl_selector_type type,
+				enum scmi_pinctrl_conf_type config_type,
+				u32 *config_value);
+	int (*settings_get_all)(const struct scmi_protocol_handle *ph,
+				u32 selector,
+				enum scmi_pinctrl_selector_type type,
+				unsigned int *nr_configs,
+				enum scmi_pinctrl_conf_type *config_types,
+				u32 *config_values);
+	int (*settings_conf)(const struct scmi_protocol_handle *ph,
+			     u32 selector, enum scmi_pinctrl_selector_type type,
+			     unsigned int nr_configs,
+			     enum scmi_pinctrl_conf_type *config_type,
+			     u32 *config_value);
+	int (*pin_request)(const struct scmi_protocol_handle *ph, u32 pin);
+	int (*pin_free)(const struct scmi_protocol_handle *ph, u32 pin);
+};
+
 /**
  * struct scmi_notify_ops  - represents notifications' operations provided by
  * SCMI core
@@ -842,6 +925,7 @@ enum scmi_std_protocol {
 	SCMI_PROTOCOL_RESET = 0x16,
 	SCMI_PROTOCOL_VOLTAGE = 0x17,
 	SCMI_PROTOCOL_POWERCAP = 0x18,
+	SCMI_PROTOCOL_PINCTRL = 0x19,
 };
 
 enum scmi_system_events {
-- 
cgit v1.2.3


From 8976bf1877ef4f902fae53fd0ad61b36d0c5a70c Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 17 Apr 2024 17:19:35 +0300
Subject: drm/dp: Add drm_dp_128b132b_supported()

Factor out a function to check for 128b/132b channel coding support used
by a follow-up patch in the patchset.

v2: s/drm_dp_uhbr_channel_coding_supported()/drm_dp128b132b_supported()
    (Jani)

Cc: dri-devel@lists.freedesktop.org
Cc: Jani Nikula <jani.nikula@intel.com>
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Reviewed-by: Manasi Navare <navaremanasi@chromium.org>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240417141936.457796-1-imre.deak@intel.com
---
 drivers/gpu/drm/i915/display/intel_dp.c | 2 +-
 include/drm/display/drm_dp_helper.h     | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
index 23808e9d41d5..e05e25cd4a94 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -225,7 +225,7 @@ static void intel_dp_set_dpcd_sink_rates(struct intel_dp *intel_dp)
 	 * Sink rates for 128b/132b. If set, sink should support all 8b/10b
 	 * rates and 10 Gbps.
 	 */
-	if (intel_dp->dpcd[DP_MAIN_LINK_CHANNEL_CODING] & DP_CAP_ANSI_128B132B) {
+	if (drm_dp_128b132b_supported(intel_dp->dpcd)) {
 		u8 uhbr_rates = 0;
 
 		BUILD_BUG_ON(ARRAY_SIZE(intel_dp->sink_rates) < ARRAY_SIZE(dp_rates) + 3);
diff --git a/include/drm/display/drm_dp_helper.h b/include/drm/display/drm_dp_helper.h
index 10147ae96326..8c6ccad0bfea 100644
--- a/include/drm/display/drm_dp_helper.h
+++ b/include/drm/display/drm_dp_helper.h
@@ -251,6 +251,12 @@ drm_dp_channel_coding_supported(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
 	return dpcd[DP_MAIN_LINK_CHANNEL_CODING] & DP_CAP_ANSI_8B10B;
 }
 
+static inline bool
+drm_dp_128b132b_supported(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
+{
+	return dpcd[DP_MAIN_LINK_CHANNEL_CODING] & DP_CAP_ANSI_128B132B;
+}
+
 static inline bool
 drm_dp_alternate_scrambler_reset_cap(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
 {
-- 
cgit v1.2.3


From ca5d9d78f0a2c595561bce7cce4b86b42ca7042f Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 17 Apr 2024 01:10:07 +0300
Subject: drm/dp_mst: Factor out drm_dp_mst_port_is_logical()

Factor out a function to check if an MST port is logical, used by a
follow-up i915 patch in the patchset.

v2: Move drm_dp_mst_aux_for_parent() forward declaration to the next
    patch. (Ankit)

Cc: Lyude Paul <lyude@redhat.com>
Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240416221010.376865-9-imre.deak@intel.com
---
 drivers/gpu/drm/display/drm_dp_mst_topology.c | 6 +++---
 include/drm/display/drm_dp_mst_helper.h       | 6 ++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index c193be3577f7..46b99d5fe008 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -2274,7 +2274,7 @@ drm_dp_mst_port_add_connector(struct drm_dp_mst_branch *mstb,
 
 	if (port->pdt != DP_PEER_DEVICE_NONE &&
 	    drm_dp_mst_is_end_device(port->pdt, port->mcs) &&
-	    port->port_num >= DP_MST_LOGICAL_PORT_0)
+	    drm_dp_mst_port_is_logical(port))
 		port->cached_edid = drm_edid_read_ddc(port->connector,
 						      &port->aux.ddc);
 
@@ -4219,7 +4219,7 @@ drm_dp_mst_detect_port(struct drm_connector *connector,
 	case DP_PEER_DEVICE_SST_SINK:
 		ret = connector_status_connected;
 		/* for logical ports - cache the EDID */
-		if (port->port_num >= DP_MST_LOGICAL_PORT_0 && !port->cached_edid)
+		if (drm_dp_mst_port_is_logical(port) && !port->cached_edid)
 			port->cached_edid = drm_edid_read_ddc(connector, &port->aux.ddc);
 		break;
 	case DP_PEER_DEVICE_DP_LEGACY_CONV:
@@ -5983,7 +5983,7 @@ static bool drm_dp_mst_is_virtual_dpcd(struct drm_dp_mst_port *port)
 		return false;
 
 	/* Virtual DP Sink (Internal Display Panel) */
-	if (port->port_num >= 8)
+	if (drm_dp_mst_port_is_logical(port))
 		return true;
 
 	/* DP-to-HDMI Protocol Converter */
diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 3c9e128c444a..48fd1b448149 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -949,6 +949,12 @@ int __must_check drm_dp_mst_root_conn_atomic_check(struct drm_connector_state *n
 void drm_dp_mst_get_port_malloc(struct drm_dp_mst_port *port);
 void drm_dp_mst_put_port_malloc(struct drm_dp_mst_port *port);
 
+static inline
+bool drm_dp_mst_port_is_logical(struct drm_dp_mst_port *port)
+{
+	return port->port_num >= DP_MST_LOGICAL_PORT_0;
+}
+
 struct drm_dp_aux *drm_dp_mst_dsc_aux_for_port(struct drm_dp_mst_port *port);
 
 static inline struct drm_dp_mst_topology_state *
-- 
cgit v1.2.3


From 93cb4094a7fd30a6e20b9781aafb94e3fffbe59a Mon Sep 17 00:00:00 2001
From: Imre Deak <imre.deak@intel.com>
Date: Wed, 17 Apr 2024 01:10:08 +0300
Subject: drm/dp_mst: Add drm_dp_mst_aux_for_parent()

Add a function to get the AUX device of the parent of an MST port, used
by a follow-up i915 patch in the patchset.

v2: Move drm_dp_mst_aux_for_parent() forward declaration to this patch
    (Ankit)

Cc: Lyude Paul <lyude@redhat.com>
Cc: dri-devel@lists.freedesktop.org
Reviewed-by: Ankit Nautiyal <ankit.k.nautiyal@intel.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Imre Deak <imre.deak@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240416221010.376865-10-imre.deak@intel.com
---
 drivers/gpu/drm/display/drm_dp_mst_topology.c | 16 ++++++++++++++++
 include/drm/display/drm_dp_mst_helper.h       |  1 +
 2 files changed, 17 insertions(+)

(limited to 'include')

diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index 46b99d5fe008..3577786b5db2 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -6010,6 +6010,22 @@ static bool drm_dp_mst_is_virtual_dpcd(struct drm_dp_mst_port *port)
 	return false;
 }
 
+/**
+ * drm_dp_mst_aux_for_parent() - Get the AUX device for an MST port's parent
+ * @port: MST port whose parent's AUX device is returned
+ *
+ * Return the AUX device for @port's parent or NULL if port's parent is the
+ * root port.
+ */
+struct drm_dp_aux *drm_dp_mst_aux_for_parent(struct drm_dp_mst_port *port)
+{
+	if (!port->parent || !port->parent->port_parent)
+		return NULL;
+
+	return &port->parent->port_parent->aux;
+}
+EXPORT_SYMBOL(drm_dp_mst_aux_for_parent);
+
 /**
  * drm_dp_mst_dsc_aux_for_port() - Find the correct aux for DSC
  * @port: The port to check. A leaf of the MST tree with an attached display.
diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 48fd1b448149..39796e0defdb 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -955,6 +955,7 @@ bool drm_dp_mst_port_is_logical(struct drm_dp_mst_port *port)
 	return port->port_num >= DP_MST_LOGICAL_PORT_0;
 }
 
+struct drm_dp_aux *drm_dp_mst_aux_for_parent(struct drm_dp_mst_port *port);
 struct drm_dp_aux *drm_dp_mst_dsc_aux_for_port(struct drm_dp_mst_port *port);
 
 static inline struct drm_dp_mst_topology_state *
-- 
cgit v1.2.3


From c23e2b7103090b05e4d567d8976f99926ea855e9 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Mon, 22 Jan 2024 15:53:11 -0800
Subject: KVM: Allow page-sized MMU caches to be initialized with custom 64-bit
 values

Add support to MMU caches for initializing a page with a custom 64-bit
value, e.g. to pre-fill an entire page table with non-zero PTE values.
The functionality will be used by x86 to support Intel's TDX, which needs
to set bit 63 in all non-present PTEs in order to prevent !PRESENT page
faults from getting reflected into the guest (Intel's EPT Violation #VE
architecture made the less than brilliant decision of having the per-PTE
behavior be opt-out instead of opt-in).

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
Message-Id: <5919f685f109a1b0ebc6bd8fc4536ee94bcc172d.1705965635.git.isaku.yamahata@intel.com>
Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 include/linux/kvm_types.h |  1 +
 virt/kvm/kvm_main.c       | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index d93f6522b2c3..827ecc0b7e10 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -86,6 +86,7 @@ struct gfn_to_pfn_cache {
 struct kvm_mmu_memory_cache {
 	gfp_t gfp_zero;
 	gfp_t gfp_custom;
+	u64 init_value;
 	struct kmem_cache *kmem_cache;
 	int capacity;
 	int nobjs;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 658581d4ad68..38b498669ef9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -401,12 +401,17 @@ static void kvm_flush_shadow_all(struct kvm *kvm)
 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 					       gfp_t gfp_flags)
 {
+	void *page;
+
 	gfp_flags |= mc->gfp_zero;
 
 	if (mc->kmem_cache)
 		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
-	else
-		return (void *)__get_free_page(gfp_flags);
+
+	page = (void *)__get_free_page(gfp_flags);
+	if (page && mc->init_value)
+		memset64(page, mc->init_value, PAGE_SIZE / sizeof(u64));
+	return page;
 }
 
 int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
@@ -421,6 +426,13 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity,
 		if (WARN_ON_ONCE(!capacity))
 			return -EIO;
 
+		/*
+		 * Custom init values can be used only for page allocations,
+		 * and obviously conflict with __GFP_ZERO.
+		 */
+		if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
+			return -EIO;
+
 		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
 		if (!mc->objects)
 			return -ENOMEM;
-- 
cgit v1.2.3


From 92f750d847c997ff4b0f04d83443af00fb729ba3 Mon Sep 17 00:00:00 2001
From: Chao Yu <chao@kernel.org>
Date: Tue, 16 Apr 2024 15:23:18 +0800
Subject: f2fs: convert f2fs__page tracepoint class to use folio

Convert f2fs__page tracepoint class() and its instances to use folio
and related functionality, and rename it to f2fs__folio().

Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/checkpoint.c        |  4 ++--
 fs/f2fs/data.c              | 10 +++++-----
 fs/f2fs/node.c              |  4 ++--
 include/trace/events/f2fs.h | 42 +++++++++++++++++++++---------------------
 4 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index eac698b8dd38..5d05a413f451 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -345,7 +345,7 @@ static int __f2fs_write_meta_page(struct page *page,
 {
 	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 
-	trace_f2fs_writepage(page, META);
+	trace_f2fs_writepage(page_folio(page), META);
 
 	if (unlikely(f2fs_cp_error(sbi))) {
 		if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
@@ -492,7 +492,7 @@ stop:
 static bool f2fs_dirty_meta_folio(struct address_space *mapping,
 		struct folio *folio)
 {
-	trace_f2fs_set_page_dirty(&folio->page, META);
+	trace_f2fs_set_page_dirty(folio, META);
 
 	if (!folio_test_uptodate(folio))
 		folio_mark_uptodate(folio);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 763f0952efdd..bee1e45f76b8 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2461,7 +2461,7 @@ static int f2fs_read_data_folio(struct file *file, struct folio *folio)
 	struct inode *inode = folio_file_mapping(folio)->host;
 	int ret = -EAGAIN;
 
-	trace_f2fs_readpage(&folio->page, DATA);
+	trace_f2fs_readpage(folio, DATA);
 
 	if (!f2fs_is_compress_backend_ready(inode)) {
 		folio_unlock(folio);
@@ -2710,7 +2710,7 @@ got_it:
 		} else {
 			set_inode_flag(inode, FI_UPDATE_WRITE);
 		}
-		trace_f2fs_do_write_data_page(fio->page, IPU);
+		trace_f2fs_do_write_data_page(page_folio(page), IPU);
 		return err;
 	}
 
@@ -2739,7 +2739,7 @@ got_it:
 
 	/* LFS mode write path */
 	f2fs_outplace_write_data(&dn, fio);
-	trace_f2fs_do_write_data_page(page, OPU);
+	trace_f2fs_do_write_data_page(page_folio(page), OPU);
 	set_inode_flag(inode, FI_APPEND_WRITE);
 out_writepage:
 	f2fs_put_dnode(&dn);
@@ -2786,7 +2786,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 		.last_block = last_block,
 	};
 
-	trace_f2fs_writepage(page, DATA);
+	trace_f2fs_writepage(page_folio(page), DATA);
 
 	/* we should bypass data pages to proceed the kworker jobs */
 	if (unlikely(f2fs_cp_error(sbi))) {
@@ -3760,7 +3760,7 @@ static bool f2fs_dirty_data_folio(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 
-	trace_f2fs_set_page_dirty(&folio->page, DATA);
+	trace_f2fs_set_page_dirty(folio, DATA);
 
 	if (!folio_test_uptodate(folio))
 		folio_mark_uptodate(folio);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3b9eb5693683..95cecf08cb37 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1624,7 +1624,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
 	};
 	unsigned int seq;
 
-	trace_f2fs_writepage(page, NODE);
+	trace_f2fs_writepage(page_folio(page), NODE);
 
 	if (unlikely(f2fs_cp_error(sbi))) {
 		/* keep node pages in remount-ro mode */
@@ -2171,7 +2171,7 @@ skip_write:
 static bool f2fs_dirty_node_folio(struct address_space *mapping,
 		struct folio *folio)
 {
-	trace_f2fs_set_page_dirty(&folio->page, NODE);
+	trace_f2fs_set_page_dirty(folio, NODE);
 
 	if (!folio_test_uptodate(folio))
 		folio_mark_uptodate(folio);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 7ed0fc430dc6..371ba28415f5 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -1304,11 +1304,11 @@ TRACE_EVENT(f2fs_write_end,
 		__entry->copied)
 );
 
-DECLARE_EVENT_CLASS(f2fs__page,
+DECLARE_EVENT_CLASS(f2fs__folio,
 
-	TP_PROTO(struct page *page, int type),
+	TP_PROTO(struct folio *folio, int type),
 
-	TP_ARGS(page, type),
+	TP_ARGS(folio, type),
 
 	TP_STRUCT__entry(
 		__field(dev_t,	dev)
@@ -1321,14 +1321,14 @@ DECLARE_EVENT_CLASS(f2fs__page,
 	),
 
 	TP_fast_assign(
-		__entry->dev	= page_file_mapping(page)->host->i_sb->s_dev;
-		__entry->ino	= page_file_mapping(page)->host->i_ino;
+		__entry->dev	= folio_file_mapping(folio)->host->i_sb->s_dev;
+		__entry->ino	= folio_file_mapping(folio)->host->i_ino;
 		__entry->type	= type;
 		__entry->dir	=
-			S_ISDIR(page_file_mapping(page)->host->i_mode);
-		__entry->index	= page->index;
-		__entry->dirty	= PageDirty(page);
-		__entry->uptodate = PageUptodate(page);
+			S_ISDIR(folio_file_mapping(folio)->host->i_mode);
+		__entry->index	= folio_index(folio);
+		__entry->dirty	= folio_test_dirty(folio);
+		__entry->uptodate = folio_test_uptodate(folio);
 	),
 
 	TP_printk("dev = (%d,%d), ino = %lu, %s, %s, index = %lu, "
@@ -1341,32 +1341,32 @@ DECLARE_EVENT_CLASS(f2fs__page,
 		__entry->uptodate)
 );
 
-DEFINE_EVENT(f2fs__page, f2fs_writepage,
+DEFINE_EVENT(f2fs__folio, f2fs_writepage,
 
-	TP_PROTO(struct page *page, int type),
+	TP_PROTO(struct folio *folio, int type),
 
-	TP_ARGS(page, type)
+	TP_ARGS(folio, type)
 );
 
-DEFINE_EVENT(f2fs__page, f2fs_do_write_data_page,
+DEFINE_EVENT(f2fs__folio, f2fs_do_write_data_page,
 
-	TP_PROTO(struct page *page, int type),
+	TP_PROTO(struct folio *folio, int type),
 
-	TP_ARGS(page, type)
+	TP_ARGS(folio, type)
 );
 
-DEFINE_EVENT(f2fs__page, f2fs_readpage,
+DEFINE_EVENT(f2fs__folio, f2fs_readpage,
 
-	TP_PROTO(struct page *page, int type),
+	TP_PROTO(struct folio *folio, int type),
 
-	TP_ARGS(page, type)
+	TP_ARGS(folio, type)
 );
 
-DEFINE_EVENT(f2fs__page, f2fs_set_page_dirty,
+DEFINE_EVENT(f2fs__folio, f2fs_set_page_dirty,
 
-	TP_PROTO(struct page *page, int type),
+	TP_PROTO(struct folio *folio, int type),
 
-	TP_ARGS(page, type)
+	TP_ARGS(folio, type)
 );
 
 TRACE_EVENT(f2fs_replace_atomic_write_block,
-- 
cgit v1.2.3


From a094de22e2efc2ec7f540d10d1edb7038f863925 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 19 Apr 2024 10:25:34 +0200
Subject: iio: buffer-dma: add iio_dmaengine_buffer_setup()

This brings the DMA buffer API more in line with what we have in the
triggered buffer. There's no need of having both
devm_iio_dmaengine_buffer_setup() and devm_iio_dmaengine_buffer_alloc().
Hence we introduce the new iio_dmaengine_buffer_setup() that together
with devm_iio_dmaengine_buffer_setup() should be all we need.

Note that as part of this change iio_dmaengine_buffer_alloc() is again
static and the axi-adc was updated accordingly.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240419-iio-backend-axi-dac-v4-1-5ca45b4de294@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/adc/adi-axi-adc.c                      | 16 +-------
 drivers/iio/buffer/industrialio-buffer-dmaengine.c | 48 +++++++++-------------
 include/linux/iio/buffer-dmaengine.h               |  5 ++-
 3 files changed, 24 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/adc/adi-axi-adc.c b/drivers/iio/adc/adi-axi-adc.c
index 4156639b3c8b..184b36dca6d0 100644
--- a/drivers/iio/adc/adi-axi-adc.c
+++ b/drivers/iio/adc/adi-axi-adc.c
@@ -124,26 +124,12 @@ static struct iio_buffer *axi_adc_request_buffer(struct iio_backend *back,
 						 struct iio_dev *indio_dev)
 {
 	struct adi_axi_adc_state *st = iio_backend_get_priv(back);
-	struct iio_buffer *buffer;
 	const char *dma_name;
-	int ret;
 
 	if (device_property_read_string(st->dev, "dma-names", &dma_name))
 		dma_name = "rx";
 
-	buffer = iio_dmaengine_buffer_alloc(st->dev, dma_name);
-	if (IS_ERR(buffer)) {
-		dev_err(st->dev, "Could not get DMA buffer, %ld\n",
-			PTR_ERR(buffer));
-		return ERR_CAST(buffer);
-	}
-
-	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
-	ret = iio_device_attach_buffer(indio_dev, buffer);
-	if (ret)
-		return ERR_PTR(ret);
-
-	return buffer;
+	return iio_dmaengine_buffer_setup(st->dev, indio_dev, dma_name);
 }
 
 static void axi_adc_free_buffer(struct iio_backend *back,
diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index a18c1da292af..97f3116566f5 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -159,7 +159,7 @@ static const struct iio_dev_attr *iio_dmaengine_buffer_attrs[] = {
  * Once done using the buffer iio_dmaengine_buffer_free() should be used to
  * release it.
  */
-struct iio_buffer *iio_dmaengine_buffer_alloc(struct device *dev,
+static struct iio_buffer *iio_dmaengine_buffer_alloc(struct device *dev,
 	const char *channel)
 {
 	struct dmaengine_buffer *dmaengine_buffer;
@@ -210,7 +210,6 @@ err_free:
 	kfree(dmaengine_buffer);
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_NS_GPL(iio_dmaengine_buffer_alloc, IIO_DMAENGINE_BUFFER);
 
 /**
  * iio_dmaengine_buffer_free() - Free dmaengine buffer
@@ -230,39 +229,33 @@ void iio_dmaengine_buffer_free(struct iio_buffer *buffer)
 }
 EXPORT_SYMBOL_NS_GPL(iio_dmaengine_buffer_free, IIO_DMAENGINE_BUFFER);
 
-static void __devm_iio_dmaengine_buffer_free(void *buffer)
-{
-	iio_dmaengine_buffer_free(buffer);
-}
-
-/**
- * devm_iio_dmaengine_buffer_alloc() - Resource-managed iio_dmaengine_buffer_alloc()
- * @dev: Parent device for the buffer
- * @channel: DMA channel name, typically "rx".
- *
- * This allocates a new IIO buffer which internally uses the DMAengine framework
- * to perform its transfers. The parent device will be used to request the DMA
- * channel.
- *
- * The buffer will be automatically de-allocated once the device gets destroyed.
- */
-static struct iio_buffer *devm_iio_dmaengine_buffer_alloc(struct device *dev,
-	const char *channel)
+struct iio_buffer *iio_dmaengine_buffer_setup(struct device *dev,
+					      struct iio_dev *indio_dev,
+					      const char *channel)
 {
 	struct iio_buffer *buffer;
 	int ret;
 
 	buffer = iio_dmaengine_buffer_alloc(dev, channel);
 	if (IS_ERR(buffer))
-		return buffer;
+		return ERR_CAST(buffer);
+
+	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
 
-	ret = devm_add_action_or_reset(dev, __devm_iio_dmaengine_buffer_free,
-				       buffer);
-	if (ret)
+	ret = iio_device_attach_buffer(indio_dev, buffer);
+	if (ret) {
+		iio_dmaengine_buffer_free(buffer);
 		return ERR_PTR(ret);
+	}
 
 	return buffer;
 }
+EXPORT_SYMBOL_NS_GPL(iio_dmaengine_buffer_setup, IIO_DMAENGINE_BUFFER);
+
+static void __devm_iio_dmaengine_buffer_free(void *buffer)
+{
+	iio_dmaengine_buffer_free(buffer);
+}
 
 /**
  * devm_iio_dmaengine_buffer_setup() - Setup a DMA buffer for an IIO device
@@ -281,13 +274,12 @@ int devm_iio_dmaengine_buffer_setup(struct device *dev,
 {
 	struct iio_buffer *buffer;
 
-	buffer = devm_iio_dmaengine_buffer_alloc(dev, channel);
+	buffer = iio_dmaengine_buffer_setup(dev, indio_dev, channel);
 	if (IS_ERR(buffer))
 		return PTR_ERR(buffer);
 
-	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
-
-	return iio_device_attach_buffer(indio_dev, buffer);
+	return devm_add_action_or_reset(dev, __devm_iio_dmaengine_buffer_free,
+					buffer);
 }
 EXPORT_SYMBOL_NS_GPL(devm_iio_dmaengine_buffer_setup, IIO_DMAENGINE_BUFFER);
 
diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h
index cbb8ba957fad..acb60f9a3fff 100644
--- a/include/linux/iio/buffer-dmaengine.h
+++ b/include/linux/iio/buffer-dmaengine.h
@@ -10,9 +10,10 @@
 struct iio_dev;
 struct device;
 
-struct iio_buffer *iio_dmaengine_buffer_alloc(struct device *dev,
-					      const char *channel);
 void iio_dmaengine_buffer_free(struct iio_buffer *buffer);
+struct iio_buffer *iio_dmaengine_buffer_setup(struct device *dev,
+					      struct iio_dev *indio_dev,
+					      const char *channel);
 int devm_iio_dmaengine_buffer_setup(struct device *dev,
 				    struct iio_dev *indio_dev,
 				    const char *channel);
-- 
cgit v1.2.3


From 04ae3b1a76b77f98a4a0c8ed2c544007334fc680 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 19 Apr 2024 10:25:35 +0200
Subject: iio: buffer-dma: Rename iio_dma_buffer_data_available()

Change its name to iio_dma_buffer_usage(), as this function can be used
both for the .data_available and the .space_available callbacks.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240419-iio-backend-axi-dac-v4-2-5ca45b4de294@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-dma.c       | 11 ++++++-----
 drivers/iio/buffer/industrialio-buffer-dmaengine.c |  2 +-
 include/linux/iio/buffer-dma.h                     |  2 +-
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index 5610ba67925e..404f9867bdc5 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -547,13 +547,14 @@ out_unlock:
 EXPORT_SYMBOL_GPL(iio_dma_buffer_read);
 
 /**
- * iio_dma_buffer_data_available() - DMA buffer data_available callback
+ * iio_dma_buffer_usage() - DMA buffer data_available and
+ * space_available callback
  * @buf: Buffer to check for data availability
  *
- * Should be used as the data_available callback for iio_buffer_access_ops
- * struct for DMA buffers.
+ * Should be used as the data_available and space_available callbacks for
+ * iio_buffer_access_ops struct for DMA buffers.
  */
-size_t iio_dma_buffer_data_available(struct iio_buffer *buf)
+size_t iio_dma_buffer_usage(struct iio_buffer *buf)
 {
 	struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buf);
 	struct iio_dma_buffer_block *block;
@@ -586,7 +587,7 @@ size_t iio_dma_buffer_data_available(struct iio_buffer *buf)
 
 	return data_available;
 }
-EXPORT_SYMBOL_GPL(iio_dma_buffer_data_available);
+EXPORT_SYMBOL_GPL(iio_dma_buffer_usage);
 
 /**
  * iio_dma_buffer_set_bytes_per_datum() - DMA buffer set_bytes_per_datum callback
diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index 97f3116566f5..df05d66afff9 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -117,7 +117,7 @@ static const struct iio_buffer_access_funcs iio_dmaengine_buffer_ops = {
 	.request_update = iio_dma_buffer_request_update,
 	.enable = iio_dma_buffer_enable,
 	.disable = iio_dma_buffer_disable,
-	.data_available = iio_dma_buffer_data_available,
+	.data_available = iio_dma_buffer_usage,
 	.release = iio_dmaengine_buffer_release,
 
 	.modes = INDIO_BUFFER_HARDWARE,
diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 18d3702fa95d..52a838ec0e57 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -132,7 +132,7 @@ int iio_dma_buffer_disable(struct iio_buffer *buffer,
 	struct iio_dev *indio_dev);
 int iio_dma_buffer_read(struct iio_buffer *buffer, size_t n,
 	char __user *user_buffer);
-size_t iio_dma_buffer_data_available(struct iio_buffer *buffer);
+size_t iio_dma_buffer_usage(struct iio_buffer *buffer);
 int iio_dma_buffer_set_bytes_per_datum(struct iio_buffer *buffer, size_t bpd);
 int iio_dma_buffer_set_length(struct iio_buffer *buffer, unsigned int length);
 int iio_dma_buffer_request_update(struct iio_buffer *buffer);
-- 
cgit v1.2.3


From fb09febafd160b7aefd9e61f710a0c50f0472403 Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 19 Apr 2024 10:25:36 +0200
Subject: iio: buffer-dma: Enable buffer write support

Adding write support to the buffer-dma code is easy - the write()
function basically needs to do the exact same thing as the read()
function: dequeue a block, read or write the data, enqueue the block
when entirely processed.

Therefore, the iio_buffer_dma_read() and the new iio_buffer_dma_write()
now both call a function iio_buffer_dma_io(), which will perform this
task.

Note that we preemptively reset block->bytes_used to the buffer's size
in iio_dma_buffer_request_update(), as in the future the
iio_dma_buffer_enqueue() function won't reset it.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240419-iio-backend-axi-dac-v4-3-5ca45b4de294@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-dma.c | 89 +++++++++++++++++++++++-----
 include/linux/iio/buffer-dma.h               |  2 +
 2 files changed, 75 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
index 404f9867bdc5..13b1a858969e 100644
--- a/drivers/iio/buffer/industrialio-buffer-dma.c
+++ b/drivers/iio/buffer/industrialio-buffer-dma.c
@@ -195,6 +195,18 @@ static void _iio_dma_buffer_block_done(struct iio_dma_buffer_block *block)
 		block->state = IIO_BLOCK_STATE_DONE;
 }
 
+static void iio_dma_buffer_queue_wake(struct iio_dma_buffer_queue *queue)
+{
+	__poll_t flags;
+
+	if (queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
+		flags = EPOLLIN | EPOLLRDNORM;
+	else
+		flags = EPOLLOUT | EPOLLWRNORM;
+
+	wake_up_interruptible_poll(&queue->buffer.pollq, flags);
+}
+
 /**
  * iio_dma_buffer_block_done() - Indicate that a block has been completed
  * @block: The completed block
@@ -212,7 +224,7 @@ void iio_dma_buffer_block_done(struct iio_dma_buffer_block *block)
 	spin_unlock_irqrestore(&queue->list_lock, flags);
 
 	iio_buffer_block_put_atomic(block);
-	wake_up_interruptible_poll(&queue->buffer.pollq, EPOLLIN | EPOLLRDNORM);
+	iio_dma_buffer_queue_wake(queue);
 }
 EXPORT_SYMBOL_GPL(iio_dma_buffer_block_done);
 
@@ -241,7 +253,7 @@ void iio_dma_buffer_block_list_abort(struct iio_dma_buffer_queue *queue,
 	}
 	spin_unlock_irqrestore(&queue->list_lock, flags);
 
-	wake_up_interruptible_poll(&queue->buffer.pollq, EPOLLIN | EPOLLRDNORM);
+	iio_dma_buffer_queue_wake(queue);
 }
 EXPORT_SYMBOL_GPL(iio_dma_buffer_block_list_abort);
 
@@ -335,8 +347,24 @@ int iio_dma_buffer_request_update(struct iio_buffer *buffer)
 			queue->fileio.blocks[i] = block;
 		}
 
-		block->state = IIO_BLOCK_STATE_QUEUED;
-		list_add_tail(&block->head, &queue->incoming);
+		/*
+		 * block->bytes_used may have been modified previously, e.g. by
+		 * iio_dma_buffer_block_list_abort(). Reset it here to the
+		 * block's so that iio_dma_buffer_io() will work.
+		 */
+		block->bytes_used = block->size;
+
+		/*
+		 * If it's an input buffer, mark the block as queued, and
+		 * iio_dma_buffer_enable() will submit it. Otherwise mark it as
+		 * done, which means it's ready to be dequeued.
+		 */
+		if (queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
+			block->state = IIO_BLOCK_STATE_QUEUED;
+			list_add_tail(&block->head, &queue->incoming);
+		} else {
+			block->state = IIO_BLOCK_STATE_DONE;
+		}
 	}
 
 out_unlock:
@@ -488,20 +516,12 @@ static struct iio_dma_buffer_block *iio_dma_buffer_dequeue(
 	return block;
 }
 
-/**
- * iio_dma_buffer_read() - DMA buffer read callback
- * @buffer: Buffer to read form
- * @n: Number of bytes to read
- * @user_buffer: Userspace buffer to copy the data to
- *
- * Should be used as the read callback for iio_buffer_access_ops
- * struct for DMA buffers.
- */
-int iio_dma_buffer_read(struct iio_buffer *buffer, size_t n,
-	char __user *user_buffer)
+static int iio_dma_buffer_io(struct iio_buffer *buffer, size_t n,
+			     char __user *user_buffer, bool is_from_user)
 {
 	struct iio_dma_buffer_queue *queue = iio_buffer_to_queue(buffer);
 	struct iio_dma_buffer_block *block;
+	void *addr;
 	int ret;
 
 	if (n < buffer->bytes_per_datum)
@@ -524,8 +544,13 @@ int iio_dma_buffer_read(struct iio_buffer *buffer, size_t n,
 	n = rounddown(n, buffer->bytes_per_datum);
 	if (n > block->bytes_used - queue->fileio.pos)
 		n = block->bytes_used - queue->fileio.pos;
+	addr = block->vaddr + queue->fileio.pos;
 
-	if (copy_to_user(user_buffer, block->vaddr + queue->fileio.pos, n)) {
+	if (is_from_user)
+		ret = copy_from_user(addr, user_buffer, n);
+	else
+		ret = copy_to_user(user_buffer, addr, n);
+	if (ret) {
 		ret = -EFAULT;
 		goto out_unlock;
 	}
@@ -544,8 +569,40 @@ out_unlock:
 
 	return ret;
 }
+
+/**
+ * iio_dma_buffer_read() - DMA buffer read callback
+ * @buffer: Buffer to read form
+ * @n: Number of bytes to read
+ * @user_buffer: Userspace buffer to copy the data to
+ *
+ * Should be used as the read callback for iio_buffer_access_ops
+ * struct for DMA buffers.
+ */
+int iio_dma_buffer_read(struct iio_buffer *buffer, size_t n,
+			char __user *user_buffer)
+{
+	return iio_dma_buffer_io(buffer, n, user_buffer, false);
+}
 EXPORT_SYMBOL_GPL(iio_dma_buffer_read);
 
+/**
+ * iio_dma_buffer_write() - DMA buffer write callback
+ * @buffer: Buffer to read form
+ * @n: Number of bytes to read
+ * @user_buffer: Userspace buffer to copy the data from
+ *
+ * Should be used as the write callback for iio_buffer_access_ops
+ * struct for DMA buffers.
+ */
+int iio_dma_buffer_write(struct iio_buffer *buffer, size_t n,
+			 const char __user *user_buffer)
+{
+	return iio_dma_buffer_io(buffer, n,
+				 (__force __user char *)user_buffer, true);
+}
+EXPORT_SYMBOL_GPL(iio_dma_buffer_write);
+
 /**
  * iio_dma_buffer_usage() - DMA buffer data_available and
  * space_available callback
diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h
index 52a838ec0e57..6e27e47077d5 100644
--- a/include/linux/iio/buffer-dma.h
+++ b/include/linux/iio/buffer-dma.h
@@ -132,6 +132,8 @@ int iio_dma_buffer_disable(struct iio_buffer *buffer,
 	struct iio_dev *indio_dev);
 int iio_dma_buffer_read(struct iio_buffer *buffer, size_t n,
 	char __user *user_buffer);
+int iio_dma_buffer_write(struct iio_buffer *buffer, size_t n,
+			 const char __user *user_buffer);
 size_t iio_dma_buffer_usage(struct iio_buffer *buffer);
 int iio_dma_buffer_set_bytes_per_datum(struct iio_buffer *buffer, size_t bpd);
 int iio_dma_buffer_set_length(struct iio_buffer *buffer, unsigned int length);
-- 
cgit v1.2.3


From c1b91566580c245cf1147745d174be5e059ace6b Mon Sep 17 00:00:00 2001
From: Paul Cercueil <paul@crapouillou.net>
Date: Fri, 19 Apr 2024 10:25:37 +0200
Subject: iio: buffer-dmaengine: Support specifying buffer direction

Update the devm_iio_dmaengine_buffer_setup() function to support
specifying the buffer direction.

Update the iio_dmaengine_buffer_submit() function to handle input
buffers as well as output buffers.

Signed-off-by: Paul Cercueil <paul@crapouillou.net>
Reviewed-by: Alexandru Ardelean <ardeleanalex@gmail.com>
Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240419-iio-backend-axi-dac-v4-4-5ca45b4de294@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/buffer/industrialio-buffer-dmaengine.c | 44 +++++++++++++++-------
 include/linux/iio/buffer-dmaengine.h               | 25 +++++++++---
 2 files changed, 49 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/buffer/industrialio-buffer-dmaengine.c b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
index df05d66afff9..951012651018 100644
--- a/drivers/iio/buffer/industrialio-buffer-dmaengine.c
+++ b/drivers/iio/buffer/industrialio-buffer-dmaengine.c
@@ -64,14 +64,25 @@ static int iio_dmaengine_buffer_submit_block(struct iio_dma_buffer_queue *queue,
 	struct dmaengine_buffer *dmaengine_buffer =
 		iio_buffer_to_dmaengine_buffer(&queue->buffer);
 	struct dma_async_tx_descriptor *desc;
+	enum dma_transfer_direction dma_dir;
+	size_t max_size;
 	dma_cookie_t cookie;
 
-	block->bytes_used = min(block->size, dmaengine_buffer->max_size);
-	block->bytes_used = round_down(block->bytes_used,
-			dmaengine_buffer->align);
+	max_size = min(block->size, dmaengine_buffer->max_size);
+	max_size = round_down(max_size, dmaengine_buffer->align);
+
+	if (queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
+		block->bytes_used = max_size;
+		dma_dir = DMA_DEV_TO_MEM;
+	} else {
+		dma_dir = DMA_MEM_TO_DEV;
+	}
+
+	if (!block->bytes_used || block->bytes_used > max_size)
+		return -EINVAL;
 
 	desc = dmaengine_prep_slave_single(dmaengine_buffer->chan,
-		block->phys_addr, block->bytes_used, DMA_DEV_TO_MEM,
+		block->phys_addr, block->bytes_used, dma_dir,
 		DMA_PREP_INTERRUPT);
 	if (!desc)
 		return -ENOMEM;
@@ -229,9 +240,10 @@ void iio_dmaengine_buffer_free(struct iio_buffer *buffer)
 }
 EXPORT_SYMBOL_NS_GPL(iio_dmaengine_buffer_free, IIO_DMAENGINE_BUFFER);
 
-struct iio_buffer *iio_dmaengine_buffer_setup(struct device *dev,
-					      struct iio_dev *indio_dev,
-					      const char *channel)
+struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev,
+						  struct iio_dev *indio_dev,
+						  const char *channel,
+						  enum iio_buffer_direction dir)
 {
 	struct iio_buffer *buffer;
 	int ret;
@@ -242,6 +254,8 @@ struct iio_buffer *iio_dmaengine_buffer_setup(struct device *dev,
 
 	indio_dev->modes |= INDIO_BUFFER_HARDWARE;
 
+	buffer->direction = dir;
+
 	ret = iio_device_attach_buffer(indio_dev, buffer);
 	if (ret) {
 		iio_dmaengine_buffer_free(buffer);
@@ -250,7 +264,7 @@ struct iio_buffer *iio_dmaengine_buffer_setup(struct device *dev,
 
 	return buffer;
 }
-EXPORT_SYMBOL_NS_GPL(iio_dmaengine_buffer_setup, IIO_DMAENGINE_BUFFER);
+EXPORT_SYMBOL_NS_GPL(iio_dmaengine_buffer_setup_ext, IIO_DMAENGINE_BUFFER);
 
 static void __devm_iio_dmaengine_buffer_free(void *buffer)
 {
@@ -258,30 +272,32 @@ static void __devm_iio_dmaengine_buffer_free(void *buffer)
 }
 
 /**
- * devm_iio_dmaengine_buffer_setup() - Setup a DMA buffer for an IIO device
+ * devm_iio_dmaengine_buffer_setup_ext() - Setup a DMA buffer for an IIO device
  * @dev: Parent device for the buffer
  * @indio_dev: IIO device to which to attach this buffer.
  * @channel: DMA channel name, typically "rx".
+ * @dir: Direction of buffer (in or out)
  *
  * This allocates a new IIO buffer with devm_iio_dmaengine_buffer_alloc()
  * and attaches it to an IIO device with iio_device_attach_buffer().
  * It also appends the INDIO_BUFFER_HARDWARE mode to the supported modes of the
  * IIO device.
  */
-int devm_iio_dmaengine_buffer_setup(struct device *dev,
-				    struct iio_dev *indio_dev,
-				    const char *channel)
+int devm_iio_dmaengine_buffer_setup_ext(struct device *dev,
+					struct iio_dev *indio_dev,
+					const char *channel,
+					enum iio_buffer_direction dir)
 {
 	struct iio_buffer *buffer;
 
-	buffer = iio_dmaengine_buffer_setup(dev, indio_dev, channel);
+	buffer = iio_dmaengine_buffer_setup_ext(dev, indio_dev, channel, dir);
 	if (IS_ERR(buffer))
 		return PTR_ERR(buffer);
 
 	return devm_add_action_or_reset(dev, __devm_iio_dmaengine_buffer_free,
 					buffer);
 }
-EXPORT_SYMBOL_NS_GPL(devm_iio_dmaengine_buffer_setup, IIO_DMAENGINE_BUFFER);
+EXPORT_SYMBOL_NS_GPL(devm_iio_dmaengine_buffer_setup_ext, IIO_DMAENGINE_BUFFER);
 
 MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
 MODULE_DESCRIPTION("DMA buffer for the IIO framework");
diff --git a/include/linux/iio/buffer-dmaengine.h b/include/linux/iio/buffer-dmaengine.h
index acb60f9a3fff..81d9a19aeb91 100644
--- a/include/linux/iio/buffer-dmaengine.h
+++ b/include/linux/iio/buffer-dmaengine.h
@@ -7,15 +7,28 @@
 #ifndef __IIO_DMAENGINE_H__
 #define __IIO_DMAENGINE_H__
 
+#include <linux/iio/buffer.h>
+
 struct iio_dev;
 struct device;
 
 void iio_dmaengine_buffer_free(struct iio_buffer *buffer);
-struct iio_buffer *iio_dmaengine_buffer_setup(struct device *dev,
-					      struct iio_dev *indio_dev,
-					      const char *channel);
-int devm_iio_dmaengine_buffer_setup(struct device *dev,
-				    struct iio_dev *indio_dev,
-				    const char *channel);
+struct iio_buffer *iio_dmaengine_buffer_setup_ext(struct device *dev,
+						  struct iio_dev *indio_dev,
+						  const char *channel,
+						  enum iio_buffer_direction dir);
+
+#define iio_dmaengine_buffer_setup(dev, indio_dev, channel)	\
+	iio_dmaengine_buffer_setup_ext(dev, indio_dev, channel,	\
+				       IIO_BUFFER_DIRECTION_IN)
+
+int devm_iio_dmaengine_buffer_setup_ext(struct device *dev,
+					struct iio_dev *indio_dev,
+					const char *channel,
+					enum iio_buffer_direction dir);
+
+#define devm_iio_dmaengine_buffer_setup(dev, indio_dev, channel)	\
+	devm_iio_dmaengine_buffer_setup_ext(dev, indio_dev, channel,	\
+					    IIO_BUFFER_DIRECTION_IN)
 
 #endif
-- 
cgit v1.2.3


From 87800c4342a29d4e1c378ce72d27e3976d094ffa Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 19 Apr 2024 10:25:41 +0200
Subject: iio: backend: add new functionality

This adds the needed backend ops for supporting a backend inerfacing
with an high speed dac. The new ops are:

* data_source_set();
* set_sampling_freq();
* extend_chan_spec();
* ext_info_set();
* ext_info_get().

Also to note the new helpers that are meant to be used by the backends
when extending an IIO channel (adding extended info):

* iio_backend_ext_info_set();
* iio_backend_ext_info_get().

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240419-iio-backend-axi-dac-v4-8-5ca45b4de294@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 179 +++++++++++++++++++++++++++++++++++++
 include/linux/iio/backend.h        |  49 ++++++++++
 2 files changed, 228 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index 2fea2bbbe47f..f08ed6d70ae5 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -43,10 +43,12 @@
 #include <linux/types.h>
 
 #include <linux/iio/backend.h>
+#include <linux/iio/iio.h>
 
 struct iio_backend {
 	struct list_head entry;
 	const struct iio_backend_ops *ops;
+	struct device *frontend_dev;
 	struct device *dev;
 	struct module *owner;
 	void *priv;
@@ -186,6 +188,44 @@ int iio_backend_data_format_set(struct iio_backend *back, unsigned int chan,
 }
 EXPORT_SYMBOL_NS_GPL(iio_backend_data_format_set, IIO_BACKEND);
 
+/**
+ * iio_backend_data_source_set - Select data source
+ * @back:	Backend device
+ * @chan:	Channel number
+ * @data:	Data source
+ *
+ * A given backend may have different sources to stream/sync data. This allows
+ * to choose that source.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_data_source_set(struct iio_backend *back, unsigned int chan,
+				enum iio_backend_data_source data)
+{
+	if (data >= IIO_BACKEND_DATA_SOURCE_MAX)
+		return -EINVAL;
+
+	return iio_backend_op_call(back, data_source_set, chan, data);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_data_source_set, IIO_BACKEND);
+
+/**
+ * iio_backend_set_sampling_freq - Set channel sampling rate
+ * @back:		Backend device
+ * @chan:		Channel number
+ * @sample_rate_hz:	Sample rate
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_set_sampling_freq(struct iio_backend *back, unsigned int chan,
+				  u64 sample_rate_hz)
+{
+	return iio_backend_op_call(back, set_sample_rate, chan, sample_rate_hz);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_set_sampling_freq, IIO_BACKEND);
+
 static void iio_backend_free_buffer(void *arg)
 {
 	struct iio_backend_buffer_pair *pair = arg;
@@ -231,6 +271,143 @@ int devm_iio_backend_request_buffer(struct device *dev,
 }
 EXPORT_SYMBOL_NS_GPL(devm_iio_backend_request_buffer, IIO_BACKEND);
 
+static struct iio_backend *iio_backend_from_indio_dev_parent(const struct device *dev)
+{
+	struct iio_backend *back = ERR_PTR(-ENODEV), *iter;
+
+	/*
+	 * We deliberately go through all backends even after finding a match.
+	 * The reason is that we want to catch frontend devices which have more
+	 * than one backend in which case returning the first we find is bogus.
+	 * For those cases, frontends need to explicitly define
+	 * get_iio_backend() in struct iio_info.
+	 */
+	guard(mutex)(&iio_back_lock);
+	list_for_each_entry(iter, &iio_back_list, entry) {
+		if (dev == iter->frontend_dev) {
+			if (!IS_ERR(back)) {
+				dev_warn(dev,
+					 "Multiple backends! get_iio_backend() needs to be implemented");
+				return ERR_PTR(-ENODEV);
+			}
+
+			back = iter;
+		}
+	}
+
+	return back;
+}
+
+/**
+ * iio_backend_ext_info_get - IIO ext_info read callback
+ * @indio_dev:	IIO device
+ * @private:	Data private to the driver
+ * @chan:	IIO channel
+ * @buf:	Buffer where to place the attribute data
+ *
+ * This helper is intended to be used by backends that extend an IIO channel
+ * (through iio_backend_extend_chan_spec()) with extended info. In that case,
+ * backends are not supposed to give their own callbacks (as they would not have
+ * a way to get the backend from indio_dev). This is the getter.
+ *
+ * RETURNS:
+ * Number of bytes written to buf, negative error number on failure.
+ */
+ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private,
+				 const struct iio_chan_spec *chan, char *buf)
+{
+	struct iio_backend *back;
+
+	/*
+	 * The below should work for the majority of the cases. It will not work
+	 * when one frontend has multiple backends in which case we'll need a
+	 * new callback in struct iio_info so we can directly request the proper
+	 * backend from the frontend. Anyways, let's only introduce new options
+	 * when really needed...
+	 */
+	back = iio_backend_from_indio_dev_parent(indio_dev->dev.parent);
+	if (IS_ERR(back))
+		return PTR_ERR(back);
+
+	return iio_backend_op_call(back, ext_info_get, private, chan, buf);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_ext_info_get, IIO_BACKEND);
+
+/**
+ * iio_backend_ext_info_set - IIO ext_info write callback
+ * @indio_dev:	IIO device
+ * @private:	Data private to the driver
+ * @chan:	IIO channel
+ * @buf:	Buffer holding the sysfs attribute
+ * @len:	Buffer length
+ *
+ * This helper is intended to be used by backends that extend an IIO channel
+ * (trough iio_backend_extend_chan_spec()) with extended info. In that case,
+ * backends are not supposed to give their own callbacks (as they would not have
+ * a way to get the backend from indio_dev). This is the setter.
+ *
+ * RETURNS:
+ * Buffer length on success, negative error number on failure.
+ */
+ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private,
+				 const struct iio_chan_spec *chan,
+				 const char *buf, size_t len)
+{
+	struct iio_backend *back;
+
+	back = iio_backend_from_indio_dev_parent(indio_dev->dev.parent);
+	if (IS_ERR(back))
+		return PTR_ERR(back);
+
+	return iio_backend_op_call(back, ext_info_set, private, chan, buf, len);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_ext_info_set, IIO_BACKEND);
+
+/**
+ * iio_backend_extend_chan_spec - Extend an IIO channel
+ * @indio_dev:	IIO device
+ * @back:	Backend device
+ * @chan:	IIO channel
+ *
+ * Some backends may have their own functionalities and hence capable of
+ * extending a frontend's channel.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_extend_chan_spec(struct iio_dev *indio_dev,
+				 struct iio_backend *back,
+				 struct iio_chan_spec *chan)
+{
+	const struct iio_chan_spec_ext_info *frontend_ext_info = chan->ext_info;
+	const struct iio_chan_spec_ext_info *back_ext_info;
+	int ret;
+
+	ret = iio_backend_op_call(back, extend_chan_spec, chan);
+	if (ret)
+		return ret;
+	/*
+	 * Let's keep things simple for now. Don't allow to overwrite the
+	 * frontend's extended info. If ever needed, we can support appending
+	 * it.
+	 */
+	if (frontend_ext_info && chan->ext_info != frontend_ext_info)
+		return -EOPNOTSUPP;
+	if (!chan->ext_info)
+		return 0;
+
+	/* Don't allow backends to get creative and force their own handlers */
+	for (back_ext_info = chan->ext_info; back_ext_info->name; back_ext_info++) {
+		if (back_ext_info->read != iio_backend_ext_info_get)
+			return -EINVAL;
+		if (back_ext_info->write != iio_backend_ext_info_set)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_extend_chan_spec, IIO_BACKEND);
+
 static void iio_backend_release(void *arg)
 {
 	struct iio_backend *back = arg;
@@ -263,6 +440,8 @@ static int __devm_iio_backend_get(struct device *dev, struct iio_backend *back)
 				     "Could not link to supplier(%s)\n",
 				     dev_name(back->dev));
 
+	back->frontend_dev = dev;
+
 	dev_dbg(dev, "Found backend(%s) device\n", dev_name(back->dev));
 
 	return 0;
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index a6d79381866e..9d144631134d 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -4,6 +4,7 @@
 
 #include <linux/types.h>
 
+struct iio_chan_spec;
 struct fwnode_handle;
 struct iio_backend;
 struct device;
@@ -15,6 +16,26 @@ enum iio_backend_data_type {
 	IIO_BACKEND_DATA_TYPE_MAX
 };
 
+enum iio_backend_data_source {
+	IIO_BACKEND_INTERNAL_CONTINUOS_WAVE,
+	IIO_BACKEND_EXTERNAL,
+	IIO_BACKEND_DATA_SOURCE_MAX
+};
+
+/**
+ * IIO_BACKEND_EX_INFO - Helper for an IIO extended channel attribute
+ * @_name:	Attribute name
+ * @_shared:	Whether the attribute is shared between all channels
+ * @_what:	Data private to the driver
+ */
+#define IIO_BACKEND_EX_INFO(_name, _shared, _what) {	\
+	.name = (_name),				\
+	.shared = (_shared),				\
+	.read =  iio_backend_ext_info_get,		\
+	.write = iio_backend_ext_info_set,		\
+	.private = (_what),				\
+}
+
 /**
  * struct iio_backend_data_fmt - Backend data format
  * @type:		Data type.
@@ -35,8 +56,13 @@ struct iio_backend_data_fmt {
  * @chan_enable:	Enable one channel.
  * @chan_disable:	Disable one channel.
  * @data_format_set:	Configure the data format for a specific channel.
+ * @data_source_set:	Configure the data source for a specific channel.
+ * @set_sample_rate:	Configure the sampling rate for a specific channel.
  * @request_buffer:	Request an IIO buffer.
  * @free_buffer:	Free an IIO buffer.
+ * @extend_chan_spec:	Extend an IIO channel.
+ * @ext_info_set:	Extended info setter.
+ * @ext_info_get:	Extended info getter.
  **/
 struct iio_backend_ops {
 	int (*enable)(struct iio_backend *back);
@@ -45,10 +71,21 @@ struct iio_backend_ops {
 	int (*chan_disable)(struct iio_backend *back, unsigned int chan);
 	int (*data_format_set)(struct iio_backend *back, unsigned int chan,
 			       const struct iio_backend_data_fmt *data);
+	int (*data_source_set)(struct iio_backend *back, unsigned int chan,
+			       enum iio_backend_data_source data);
+	int (*set_sample_rate)(struct iio_backend *back, unsigned int chan,
+			       u64 sample_rate_hz);
 	struct iio_buffer *(*request_buffer)(struct iio_backend *back,
 					     struct iio_dev *indio_dev);
 	void (*free_buffer)(struct iio_backend *back,
 			    struct iio_buffer *buffer);
+	int (*extend_chan_spec)(struct iio_backend *back,
+				struct iio_chan_spec *chan);
+	int (*ext_info_set)(struct iio_backend *back, uintptr_t private,
+			    const struct iio_chan_spec *chan,
+			    const char *buf, size_t len);
+	int (*ext_info_get)(struct iio_backend *back, uintptr_t private,
+			    const struct iio_chan_spec *chan, char *buf);
 };
 
 int iio_backend_chan_enable(struct iio_backend *back, unsigned int chan);
@@ -56,10 +93,22 @@ int iio_backend_chan_disable(struct iio_backend *back, unsigned int chan);
 int devm_iio_backend_enable(struct device *dev, struct iio_backend *back);
 int iio_backend_data_format_set(struct iio_backend *back, unsigned int chan,
 				const struct iio_backend_data_fmt *data);
+int iio_backend_data_source_set(struct iio_backend *back, unsigned int chan,
+				enum iio_backend_data_source data);
+int iio_backend_set_sampling_freq(struct iio_backend *back, unsigned int chan,
+				  u64 sample_rate_hz);
 int devm_iio_backend_request_buffer(struct device *dev,
 				    struct iio_backend *back,
 				    struct iio_dev *indio_dev);
+ssize_t iio_backend_ext_info_set(struct iio_dev *indio_dev, uintptr_t private,
+				 const struct iio_chan_spec *chan,
+				 const char *buf, size_t len);
+ssize_t iio_backend_ext_info_get(struct iio_dev *indio_dev, uintptr_t private,
+				 const struct iio_chan_spec *chan, char *buf);
 
+int iio_backend_extend_chan_spec(struct iio_dev *indio_dev,
+				 struct iio_backend *back,
+				 struct iio_chan_spec *chan);
 void *iio_backend_get_priv(const struct iio_backend *conv);
 struct iio_backend *devm_iio_backend_get(struct device *dev, const char *name);
 struct iio_backend *
-- 
cgit v1.2.3


From f592cc5794747b81e53b53dd6e80219ee25f0611 Mon Sep 17 00:00:00 2001
From: Maulik Shah <quic_mkshah@quicinc.com>
Date: Thu, 15 Feb 2024 10:55:44 +0530
Subject: soc: qcom: rpmh-rsc: Enhance check for VRM in-flight request

Each RPMh VRM accelerator resource has 3 or 4 contiguous 4-byte aligned
addresses associated with it. These control voltage, enable state, mode,
and in legacy targets, voltage headroom. The current in-flight request
checking logic looks for exact address matches. Requests for different
addresses of the same RPMh resource as thus not detected as in-flight.

Add new cmd-db API cmd_db_match_resource_addr() to enhance the in-flight
request check for VRM requests by ignoring the address offset.

This ensures that only one request is allowed to be in-flight for a given
VRM resource. This is needed to avoid scenarios where request commands are
carried out by RPMh hardware out-of-order leading to LDO regulator
over-current protection triggering.

Fixes: 658628e7ef78 ("drivers: qcom: rpmh-rsc: add RPMH controller for QCOM SoCs")
Cc: stable@vger.kernel.org
Reviewed-by: Konrad Dybcio <konrad.dybcio@linaro.org>
Tested-by: Elliot Berman <quic_eberman@quicinc.com> # sm8650-qrd
Signed-off-by: Maulik Shah <quic_mkshah@quicinc.com>
Link: https://lore.kernel.org/r/20240215-rpmh-rsc-fixes-v4-1-9cbddfcba05b@quicinc.com
Signed-off-by: Bjorn Andersson <andersson@kernel.org>
---
 drivers/soc/qcom/cmd-db.c   | 32 +++++++++++++++++++++++++++++++-
 drivers/soc/qcom/rpmh-rsc.c |  3 ++-
 include/soc/qcom/cmd-db.h   | 10 +++++++++-
 3 files changed, 42 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/qcom/cmd-db.c b/drivers/soc/qcom/cmd-db.c
index c344107bc36c..b4e613c34a5c 100644
--- a/drivers/soc/qcom/cmd-db.c
+++ b/drivers/soc/qcom/cmd-db.c
@@ -1,6 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2016-2018, 2020, The Linux Foundation. All rights reserved. */
+/*
+ * Copyright (c) 2016-2018, 2020, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
 
+#include <linux/bitfield.h>
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -17,6 +21,8 @@
 #define MAX_SLV_ID		8
 #define SLAVE_ID_MASK		0x7
 #define SLAVE_ID_SHIFT		16
+#define SLAVE_ID(addr)		FIELD_GET(GENMASK(19, 16), addr)
+#define VRM_ADDR(addr)		FIELD_GET(GENMASK(19, 4), addr)
 
 /**
  * struct entry_header: header for each entry in cmddb
@@ -220,6 +226,30 @@ const void *cmd_db_read_aux_data(const char *id, size_t *len)
 }
 EXPORT_SYMBOL_GPL(cmd_db_read_aux_data);
 
+/**
+ * cmd_db_match_resource_addr() - Compare if both Resource addresses are same
+ *
+ * @addr1: Resource address to compare
+ * @addr2: Resource address to compare
+ *
+ * Return: true if two addresses refer to the same resource, false otherwise
+ */
+bool cmd_db_match_resource_addr(u32 addr1, u32 addr2)
+{
+	/*
+	 * Each RPMh VRM accelerator resource has 3 or 4 contiguous 4-byte
+	 * aligned addresses associated with it. Ignore the offset to check
+	 * for VRM requests.
+	 */
+	if (addr1 == addr2)
+		return true;
+	else if (SLAVE_ID(addr1) == CMD_DB_HW_VRM && VRM_ADDR(addr1) == VRM_ADDR(addr2))
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(cmd_db_match_resource_addr);
+
 /**
  * cmd_db_read_slave_id - Get the slave ID for a given resource address
  *
diff --git a/drivers/soc/qcom/rpmh-rsc.c b/drivers/soc/qcom/rpmh-rsc.c
index c4c7aad957e6..561d8037b50a 100644
--- a/drivers/soc/qcom/rpmh-rsc.c
+++ b/drivers/soc/qcom/rpmh-rsc.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2023-2024, Qualcomm Innovation Center, Inc. All rights reserved.
  */
 
 #define pr_fmt(fmt) "%s " fmt, KBUILD_MODNAME
@@ -557,7 +558,7 @@ static int check_for_req_inflight(struct rsc_drv *drv, struct tcs_group *tcs,
 		for_each_set_bit(j, &curr_enabled, MAX_CMDS_PER_TCS) {
 			addr = read_tcs_cmd(drv, drv->regs[RSC_DRV_CMD_ADDR], i, j);
 			for (k = 0; k < msg->num_cmds; k++) {
-				if (addr == msg->cmds[k].addr)
+				if (cmd_db_match_resource_addr(msg->cmds[k].addr, addr))
 					return -EBUSY;
 			}
 		}
diff --git a/include/soc/qcom/cmd-db.h b/include/soc/qcom/cmd-db.h
index c8bb56e6852a..47a6cab75e63 100644
--- a/include/soc/qcom/cmd-db.h
+++ b/include/soc/qcom/cmd-db.h
@@ -1,5 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (c) 2016-2018, The Linux Foundation. All rights reserved. */
+/*
+ * Copyright (c) 2016-2018, The Linux Foundation. All rights reserved.
+ * Copyright (c) 2024, Qualcomm Innovation Center, Inc. All rights reserved.
+ */
 
 #ifndef __QCOM_COMMAND_DB_H__
 #define __QCOM_COMMAND_DB_H__
@@ -21,6 +24,8 @@ u32 cmd_db_read_addr(const char *resource_id);
 
 const void *cmd_db_read_aux_data(const char *resource_id, size_t *len);
 
+bool cmd_db_match_resource_addr(u32 addr1, u32 addr2);
+
 enum cmd_db_hw_type cmd_db_read_slave_id(const char *resource_id);
 
 int cmd_db_ready(void);
@@ -31,6 +36,9 @@ static inline u32 cmd_db_read_addr(const char *resource_id)
 static inline const void *cmd_db_read_aux_data(const char *resource_id, size_t *len)
 { return ERR_PTR(-ENODEV); }
 
+static inline bool cmd_db_match_resource_addr(u32 addr1, u32 addr2)
+{ return false; }
+
 static inline enum cmd_db_hw_type cmd_db_read_slave_id(const char *resource_id)
 { return -ENODEV; }
 
-- 
cgit v1.2.3


From bfa858f220ab8c950dd3e1310fee61950d0ecdae Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Thu, 18 Apr 2024 11:40:08 +0200
Subject: sysctl: treewide: constify ctl_table_header::ctl_table_arg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To be able to constify instances of struct ctl_tables it is necessary to
remove ways through which non-const versions are exposed from the
sysctl core.
One of these is the ctl_table_arg member of struct ctl_table_header.

Constify this reference as a prerequisite for the full constification of
struct ctl_table instances.
No functional change.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c                       | 2 +-
 include/linux/sysctl.h                  | 2 +-
 ipc/ipc_sysctl.c                        | 2 +-
 ipc/mq_sysctl.c                         | 2 +-
 kernel/ucount.c                         | 2 +-
 net/ax25/sysctl_net_ax25.c              | 2 +-
 net/bridge/br_netfilter_hooks.c         | 2 +-
 net/core/sysctl_net_core.c              | 2 +-
 net/ieee802154/6lowpan/reassembly.c     | 2 +-
 net/ipv4/devinet.c                      | 2 +-
 net/ipv4/ip_fragment.c                  | 2 +-
 net/ipv4/route.c                        | 2 +-
 net/ipv4/sysctl_net_ipv4.c              | 2 +-
 net/ipv4/xfrm4_policy.c                 | 2 +-
 net/ipv6/addrconf.c                     | 2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +-
 net/ipv6/reassembly.c                   | 2 +-
 net/ipv6/sysctl_net_ipv6.c              | 6 +++---
 net/ipv6/xfrm6_policy.c                 | 2 +-
 net/mpls/af_mpls.c                      | 4 ++--
 net/mptcp/ctrl.c                        | 2 +-
 net/netfilter/nf_conntrack_standalone.c | 2 +-
 net/netfilter/nf_log.c                  | 2 +-
 net/sctp/sysctl.c                       | 2 +-
 net/smc/smc_sysctl.c                    | 2 +-
 net/unix/sysctl_net_unix.c              | 2 +-
 net/xfrm/xfrm_sysctl.c                  | 2 +-
 27 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index bb95ce43cd97..66f8542f3b18 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1971,7 +1971,7 @@ static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
 static void vrf_netns_exit_sysctl(struct net *net)
 {
 	struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = nn_vrf->ctl_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(nn_vrf->ctl_hdr);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ee7d33b89e9e..9413241df962 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -182,7 +182,7 @@ struct ctl_table_header {
 		struct rcu_head rcu;
 	};
 	struct completion *unregistering;
-	struct ctl_table *ctl_table_arg;
+	const struct ctl_table *ctl_table_arg;
 	struct ctl_table_root *root;
 	struct ctl_table_set *set;
 	struct ctl_dir *parent;
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 45cb1dabce29..3c3755918d34 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -306,7 +306,7 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
 
 void retire_ipc_sysctls(struct ipc_namespace *ns)
 {
-	struct ctl_table *tbl;
+	const struct ctl_table *tbl;
 
 	tbl = ns->ipc_sysctls->ctl_table_arg;
 	unregister_sysctl_table(ns->ipc_sysctls);
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 21fba3a6edaf..69c709262f5a 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -160,7 +160,7 @@ bool setup_mq_sysctls(struct ipc_namespace *ns)
 
 void retire_mq_sysctls(struct ipc_namespace *ns)
 {
-	struct ctl_table *tbl;
+	const struct ctl_table *tbl;
 
 	tbl = ns->mq_sysctls->ctl_table_arg;
 	unregister_sysctl_table(ns->mq_sysctls);
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 4aa6166cb856..d9e283600f5c 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -119,7 +119,7 @@ bool setup_userns_sysctls(struct user_namespace *ns)
 void retire_userns_sysctls(struct user_namespace *ns)
 {
 #ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
+	const struct ctl_table *tbl;
 
 	tbl = ns->sysctls->ctl_table_arg;
 	unregister_sysctl_table(ns->sysctls);
diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c
index db66e11e7fe8..e0128dc9def3 100644
--- a/net/ax25/sysctl_net_ax25.c
+++ b/net/ax25/sysctl_net_ax25.c
@@ -171,7 +171,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
 void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev)
 {
 	struct ctl_table_header *header = ax25_dev->sysheader;
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	if (header) {
 		ax25_dev->sysheader = NULL;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index c683d8377862..7948a9e7542c 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1275,7 +1275,7 @@ static int br_netfilter_sysctl_init_net(struct net *net)
 static void br_netfilter_sysctl_exit_net(struct net *net,
 					 struct brnf_net *brnet)
 {
-	struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+	const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
 
 	unregister_net_sysctl_table(brnet->ctl_hdr);
 	if (!net_eq(net, &init_net))
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 6973dda3abda..903ab4a51c17 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -743,7 +743,7 @@ err_dup:
 
 static __net_exit void sysctl_core_net_exit(struct net *net)
 {
-	struct ctl_table *tbl;
+	const struct ctl_table *tbl;
 
 	tbl = net->core.sysctl_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->core.sysctl_hdr);
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 6dd960ec558c..2a983cf450da 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -399,7 +399,7 @@ err_alloc:
 
 static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
 		net_ieee802154_lowpan(net);
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 7a437f0d4190..7592f242336b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2749,7 +2749,7 @@ err_alloc_all:
 static __net_exit void devinet_exit_net(struct net *net)
 {
 #ifdef CONFIG_SYSCTL
-	struct ctl_table *tbl;
+	const struct ctl_table *tbl;
 
 	tbl = net->ipv4.forw_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->ipv4.forw_hdr);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index fb947d1613fe..534b98a0744a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -632,7 +632,7 @@ err_alloc:
 
 static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->ipv4.frags_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->ipv4.frags_hdr);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6133f1fd848e..2e29ab819967 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3576,7 +3576,7 @@ err_dup:
 
 static __net_exit void sysctl_route_net_exit(struct net *net)
 {
-	struct ctl_table *tbl;
+	const struct ctl_table *tbl;
 
 	tbl = net->ipv4.route_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->ipv4.route_hdr);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e4f16a7dcc1..ce5d19978a26 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1554,7 +1554,7 @@ err_alloc:
 
 static __net_exit void ipv4_sysctl_exit_net(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	kfree(net->ipv4.sysctl_local_reserved_ports);
 	table = net->ipv4.ipv4_hdr->ctl_table_arg;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c33bca2c3841..1dda59e0aeab 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -186,7 +186,7 @@ err_alloc:
 
 static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	if (!net->ipv4.xfrm4_hdr)
 		return;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 3a92d002abe4..9aa0900abfa1 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -7238,7 +7238,7 @@ out:
 static void __addrconf_sysctl_unregister(struct net *net,
 					 struct ipv6_devconf *p, int ifindex)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	if (!p->sysctl_header)
 		return;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index d0dcbaca1994..ce8c14d8aff5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -105,7 +105,7 @@ err_alloc:
 static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 {
 	struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = nf_frag->nf_frag_frags_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index acb4f119e11f..ee95cdcc8747 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -487,7 +487,7 @@ err_alloc:
 
 static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->ipv6.sysctl.frags_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr);
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 888676163e90..75de55f907b0 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -313,9 +313,9 @@ out_ipv6_table:
 
 static void __net_exit ipv6_sysctl_net_exit(struct net *net)
 {
-	struct ctl_table *ipv6_table;
-	struct ctl_table *ipv6_route_table;
-	struct ctl_table *ipv6_icmp_table;
+	const struct ctl_table *ipv6_table;
+	const struct ctl_table *ipv6_route_table;
+	const struct ctl_table *ipv6_icmp_table;
 
 	ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg;
 	ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 42fb6996b077..4891012b692f 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -218,7 +218,7 @@ err_alloc:
 
 static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	if (!net->ipv6.sysctl.xfrm6_hdr)
 		return;
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 0315b8deed3f..5d2012d1cf4a 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -1422,7 +1422,7 @@ static void mpls_dev_sysctl_unregister(struct net_device *dev,
 				       struct mpls_dev *mdev)
 {
 	struct net *net = dev_net(dev);
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	if (!mdev->sysctl)
 		return;
@@ -2690,7 +2690,7 @@ static void mpls_net_exit(struct net *net)
 {
 	struct mpls_route __rcu **platform_label;
 	size_t platform_labels;
-	struct ctl_table *table;
+	const struct ctl_table *table;
 	unsigned int index;
 
 	table = net->mpls.ctl->ctl_table_arg;
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 13fe0748dde8..8d661156ab8c 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -198,7 +198,7 @@ err_alloc:
 
 static void mptcp_pernet_del_table(struct mptcp_pernet *pernet)
 {
-	struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
+	const struct ctl_table *table = pernet->ctl_table_hdr->ctl_table_arg;
 
 	unregister_net_sysctl_table(pernet->ctl_table_hdr);
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0ee98ce5b816..bb9dea676ec1 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1122,7 +1122,7 @@ out_unregister_netfilter:
 static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = cnet->sysctl_header->ctl_table_arg;
 	unregister_net_sysctl_table(cnet->sysctl_header);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 370f8231385c..efedd2f13ac7 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -514,7 +514,7 @@ err_alloc:
 
 static void netfilter_log_sysctl_exit(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->nf.nf_log_dir_header->ctl_table_arg;
 	unregister_net_sysctl_table(net->nf.nf_log_dir_header);
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index f65d6f92afcb..25bdf17c7262 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -624,7 +624,7 @@ int sctp_sysctl_net_register(struct net *net)
 
 void sctp_sysctl_net_unregister(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->sctp.sysctl_header->ctl_table_arg;
 	unregister_net_sysctl_table(net->sctp.sysctl_header);
diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c
index a5946d1b9d60..4e8baa2e7ea4 100644
--- a/net/smc/smc_sysctl.c
+++ b/net/smc/smc_sysctl.c
@@ -133,7 +133,7 @@ err_alloc:
 
 void __net_exit smc_sysctl_net_exit(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->smc.smc_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->smc.smc_hdr);
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index 3e84b31c355a..44996af61999 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -52,7 +52,7 @@ err_alloc:
 
 void unix_sysctl_unregister(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->unx.ctl->ctl_table_arg;
 	unregister_net_sysctl_table(net->unx.ctl);
diff --git a/net/xfrm/xfrm_sysctl.c b/net/xfrm/xfrm_sysctl.c
index 7fdeafc838a7..e972930c292b 100644
--- a/net/xfrm/xfrm_sysctl.c
+++ b/net/xfrm/xfrm_sysctl.c
@@ -76,7 +76,7 @@ out_kmemdup:
 
 void __net_exit xfrm_sysctl_fini(struct net *net)
 {
-	struct ctl_table *table;
+	const struct ctl_table *table;
 
 	table = net->xfrm.sysctl_hdr->ctl_table_arg;
 	unregister_net_sysctl_table(net->xfrm.sysctl_hdr);
-- 
cgit v1.2.3


From 105aa4c65b76c3a344ca89a2d2dc96c84cca557f Mon Sep 17 00:00:00 2001
From: Ville Syrjälä <ville.syrjala@linux.intel.com>
Date: Thu, 18 Apr 2024 14:42:18 +0300
Subject: drm: Fix plane SIZE_HINTS property docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix the typos in the plane SIZE_HINTS kernel docs.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Fixes: 9677547d8362 ("drm: Introduce plane SIZE_HINTS property")
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240418114218.9162-1-ville.syrjala@linux.intel.com
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
---
 drivers/gpu/drm/drm_plane.c   | 2 +-
 include/drm/drm_mode_config.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c
index eecc24c54efd..57662a1fd345 100644
--- a/drivers/gpu/drm/drm_plane.c
+++ b/drivers/gpu/drm/drm_plane.c
@@ -1750,7 +1750,7 @@ int drm_plane_create_scaling_filter_property(struct drm_plane *plane,
 EXPORT_SYMBOL(drm_plane_create_scaling_filter_property);
 
 /**
- * drm_plane_add_size_hint_property - create a size hint property
+ * drm_plane_add_size_hints_property - create a size hints property
  *
  * @plane: drm plane
  * @hints: size hints
diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
index 06d7777a881f..8de3c9a5f61b 100644
--- a/include/drm/drm_mode_config.h
+++ b/include/drm/drm_mode_config.h
@@ -953,7 +953,7 @@ struct drm_mode_config {
 	struct drm_property *modifiers_property;
 
 	/**
-	 * @size_hints_propertty: Plane SIZE_HINTS property.
+	 * @size_hints_property: Plane SIZE_HINTS property.
 	 */
 	struct drm_property *size_hints_property;
 
-- 
cgit v1.2.3


From 977b07f769970aec97b907cfc93fb681ecffc9fe Mon Sep 17 00:00:00 2001
From: Peng Fan <peng.fan@nxp.com>
Date: Mon, 1 Apr 2024 21:28:15 +0800
Subject: dt-bindings: clock: add i.MX95 clock header

Add clock header for i.MX95 BLK CTL modules

Signed-off-by: Peng Fan <peng.fan@nxp.com>
Acked-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20240401-imx95-blk-ctl-v6-1-84d4eca1e759@nxp.com
Signed-off-by: Abel Vesa <abel.vesa@linaro.org>
---
 include/dt-bindings/clock/nxp,imx95-clock.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 include/dt-bindings/clock/nxp,imx95-clock.h

(limited to 'include')

diff --git a/include/dt-bindings/clock/nxp,imx95-clock.h b/include/dt-bindings/clock/nxp,imx95-clock.h
new file mode 100644
index 000000000000..782662c3e740
--- /dev/null
+++ b/include/dt-bindings/clock/nxp,imx95-clock.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only OR MIT */
+/*
+ * Copyright 2024 NXP
+ */
+
+#ifndef __DT_BINDINGS_CLOCK_IMX95_H
+#define __DT_BINDINGS_CLOCK_IMX95_H
+
+#define IMX95_CLK_VPUBLK_WAVE			0
+#define IMX95_CLK_VPUBLK_JPEG_ENC		1
+#define IMX95_CLK_VPUBLK_JPEG_DEC		2
+
+#define IMX95_CLK_CAMBLK_CSI2_FOR0		0
+#define IMX95_CLK_CAMBLK_CSI2_FOR1		1
+#define IMX95_CLK_CAMBLK_ISP_AXI		2
+#define IMX95_CLK_CAMBLK_ISP_PIXEL		3
+#define IMX95_CLK_CAMBLK_ISP			4
+
+#define IMX95_CLK_DISPMIX_LVDS_PHY_DIV		0
+#define IMX95_CLK_DISPMIX_LVDS_CH0_GATE		1
+#define IMX95_CLK_DISPMIX_LVDS_CH1_GATE		2
+#define IMX95_CLK_DISPMIX_PIX_DI0_GATE		3
+#define IMX95_CLK_DISPMIX_PIX_DI1_GATE		4
+
+#define IMX95_CLK_DISPMIX_ENG0_SEL		0
+#define IMX95_CLK_DISPMIX_ENG1_SEL		1
+
+#endif	/* __DT_BINDINGS_CLOCK_IMX95_H */
-- 
cgit v1.2.3


From e84c60032a39e3267f0b46175d5368da33e214a6 Mon Sep 17 00:00:00 2001
From: Anna-Maria Behnsen <anna-maria@linutronix.de>
Date: Wed, 17 Apr 2024 16:02:29 +0200
Subject: timerqueue: Remove never used function timerqueue_node_expires()

This function was introduced with commit 60bda037f1dd ("posix-cpu-timers:
Utilize timerqueue for storage") but never used. Remove it.

Signed-off-by: Anna-Maria Behnsen <anna-maria@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240417140229.19633-1-anna-maria@linutronix.de
---
 include/linux/timerqueue.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h
index 62973f7d4610..d306d9dd2207 100644
--- a/include/linux/timerqueue.h
+++ b/include/linux/timerqueue.h
@@ -37,11 +37,6 @@ static inline bool timerqueue_node_queued(struct timerqueue_node *node)
 	return !RB_EMPTY_NODE(&node->node);
 }
 
-static inline bool timerqueue_node_expires(struct timerqueue_node *node)
-{
-	return node->expires;
-}
-
 static inline void timerqueue_init_head(struct timerqueue_head *head)
 {
 	head->rb_root = RB_ROOT_CACHED;
-- 
cgit v1.2.3


From 35c8711c8fc4c16ad2749b8314da5829a493e28e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Mar 2024 07:31:52 -0700
Subject: io_uring/kbuf: add helpers for getting/peeking multiple buffers

Our provided buffer interface only allows selection of a single buffer.
Add an API that allows getting/peeking multiple buffers at the same time.

This is only implemented for the ring provided buffers. It could be added
for the legacy provided buffers as well, but since it's strongly
encouraged to use the new interface, let's keep it simpler and just
provide it for the new API. The legacy interface will always just select
a single buffer.

There are two new main functions:

io_buffers_select(), which selects up as many buffers as it can. The
caller supplies the iovec array, and io_buffers_select() may allocate a
bigger array if the 'out_len' being passed in is non-zero and bigger
than what fits in the provided iovec. Buffers grabbed with this helper
are permanently assigned.

io_buffers_peek(), which works like io_buffers_select(), except they can
be recycled, if needed. Callers using either of these functions should
call io_put_kbufs() rather than io_put_kbuf() at completion time. The
peek interface must be called with the ctx locked from peek to
completion.

This add a bit state for the request:

- REQ_F_BUFFERS_COMMIT, which means that the the buffers have been
  peeked and should be committed to the buffer ring head when they are
  put as part of completion. Prior to this, req->buf_list was cleared to
  NULL when committed.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   3 +
 io_uring/kbuf.c                | 157 ++++++++++++++++++++++++++++++++++++++++-
 io_uring/kbuf.h                |  53 +++++++++++---
 3 files changed, 201 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index c47f412cf18e..7a6b190c7da7 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -472,6 +472,7 @@ enum {
 	REQ_F_CAN_POLL_BIT,
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
+	REQ_F_BUFFERS_COMMIT_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -550,6 +551,8 @@ enum {
 	REQ_F_BL_EMPTY		= IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
 	/* don't recycle provided buffers for this request */
 	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
+	/* buffer ring head needs incrementing on put */
+	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 3846a055df44..d2945c9c812b 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -117,6 +117,27 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
 	return NULL;
 }
 
+static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
+				      struct io_buffer_list *bl,
+				      struct iovec *iov)
+{
+	void __user *buf;
+
+	buf = io_provided_buffer_select(req, len, bl);
+	if (unlikely(!buf))
+		return -ENOBUFS;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = *len;
+	return 0;
+}
+
+static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
+						__u16 head, __u16 mask)
+{
+	return &br->bufs[head & mask];
+}
+
 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 					  struct io_buffer_list *bl,
 					  unsigned int issue_flags)
@@ -132,11 +153,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 	if (head + 1 == tail)
 		req->flags |= REQ_F_BL_EMPTY;
 
-	head &= bl->mask;
-	buf = &br->bufs[head];
+	buf = io_ring_head_to_buf(br, head, bl->mask);
 	if (*len == 0 || *len > buf->len)
 		*len = buf->len;
-	req->flags |= REQ_F_BUFFER_RING;
+	req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
 	req->buf_list = bl;
 	req->buf_index = buf->bid;
 
@@ -151,6 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		 * the transfer completes (or if we get -EAGAIN and must poll of
 		 * retry).
 		 */
+		req->flags &= ~REQ_F_BUFFERS_COMMIT;
 		req->buf_list = NULL;
 		bl->head++;
 	}
@@ -177,6 +198,136 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 	return ret;
 }
 
+/* cap it at a reasonable 256, will be one page even for 4K */
+#define PEEK_MAX_IMPORT		256
+
+static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
+				struct io_buffer_list *bl)
+{
+	struct io_uring_buf_ring *br = bl->buf_ring;
+	struct iovec *iov = arg->iovs;
+	int nr_iovs = arg->nr_iovs;
+	__u16 nr_avail, tail, head;
+	struct io_uring_buf *buf;
+
+	tail = smp_load_acquire(&br->tail);
+	head = bl->head;
+	nr_avail = min_t(__u16, tail - head, UIO_MAXIOV);
+	if (unlikely(!nr_avail))
+		return -ENOBUFS;
+
+	buf = io_ring_head_to_buf(br, head, bl->mask);
+	if (arg->max_len) {
+		int needed;
+
+		needed = (arg->max_len + buf->len - 1) / buf->len;
+		needed = min(needed, PEEK_MAX_IMPORT);
+		if (nr_avail > needed)
+			nr_avail = needed;
+	}
+
+	/*
+	 * only alloc a bigger array if we know we have data to map, eg not
+	 * a speculative peek operation.
+	 */
+	if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) {
+		iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL);
+		if (unlikely(!iov))
+			return -ENOMEM;
+		if (arg->mode & KBUF_MODE_FREE)
+			kfree(arg->iovs);
+		arg->iovs = iov;
+		nr_iovs = nr_avail;
+	} else if (nr_avail < nr_iovs) {
+		nr_iovs = nr_avail;
+	}
+
+	/* set it to max, if not set, so we can use it unconditionally */
+	if (!arg->max_len)
+		arg->max_len = INT_MAX;
+
+	req->buf_index = buf->bid;
+	do {
+		/* truncate end piece, if needed */
+		if (buf->len > arg->max_len)
+			buf->len = arg->max_len;
+
+		iov->iov_base = u64_to_user_ptr(buf->addr);
+		iov->iov_len = buf->len;
+		iov++;
+
+		arg->out_len += buf->len;
+		arg->max_len -= buf->len;
+		if (!arg->max_len)
+			break;
+
+		buf = io_ring_head_to_buf(br, ++head, bl->mask);
+	} while (--nr_iovs);
+
+	if (head == tail)
+		req->flags |= REQ_F_BL_EMPTY;
+
+	req->flags |= REQ_F_BUFFER_RING;
+	req->buf_list = bl;
+	return iov - arg->iovs;
+}
+
+int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
+		      unsigned int issue_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_buffer_list *bl;
+	int ret = -ENOENT;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	bl = io_buffer_get_list(ctx, req->buf_index);
+	if (unlikely(!bl))
+		goto out_unlock;
+
+	if (bl->is_buf_ring) {
+		ret = io_ring_buffers_peek(req, arg, bl);
+		/*
+		 * Don't recycle these buffers if we need to go through poll.
+		 * Nobody else can use them anyway, and holding on to provided
+		 * buffers for a send/write operation would happen on the app
+		 * side anyway with normal buffers. Besides, we already
+		 * committed them, they cannot be put back in the queue.
+		 */
+		if (ret > 0) {
+			req->flags |= REQ_F_BL_NO_RECYCLE;
+			req->buf_list->head += ret;
+		}
+	} else {
+		ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
+	}
+out_unlock:
+	io_ring_submit_unlock(ctx, issue_flags);
+	return ret;
+}
+
+int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	struct io_buffer_list *bl;
+	int ret;
+
+	lockdep_assert_held(&ctx->uring_lock);
+
+	bl = io_buffer_get_list(ctx, req->buf_index);
+	if (unlikely(!bl))
+		return -ENOENT;
+
+	if (bl->is_buf_ring) {
+		ret = io_ring_buffers_peek(req, arg, bl);
+		if (ret > 0)
+			req->flags |= REQ_F_BUFFERS_COMMIT;
+		return ret;
+	}
+
+	/* don't support multiple buffer selections for legacy */
+	return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs);
+}
+
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
 			       struct io_buffer_list *bl, unsigned nbufs)
 {
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 5a9635ee0217..b90aca3a57fa 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -41,8 +41,26 @@ struct io_buffer {
 	__u16 bgid;
 };
 
+enum {
+	/* can alloc a bigger vec */
+	KBUF_MODE_EXPAND	= 1,
+	/* if bigger vec allocated, free old one */
+	KBUF_MODE_FREE		= 2,
+};
+
+struct buf_sel_arg {
+	struct iovec *iovs;
+	size_t out_len;
+	size_t max_len;
+	int nr_iovs;
+	int mode;
+};
+
 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 			      unsigned int issue_flags);
+int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
+		      unsigned int issue_flags);
+int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg);
 void io_destroy_buffers(struct io_ring_ctx *ctx);
 
 int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
@@ -75,7 +93,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
 	 */
 	if (req->buf_list) {
 		req->buf_index = req->buf_list->bgid;
-		req->flags &= ~REQ_F_BUFFER_RING;
+		req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT);
 		return true;
 	}
 	return false;
@@ -99,11 +117,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 	return false;
 }
 
-static inline void __io_put_kbuf_ring(struct io_kiocb *req)
+static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
 {
-	if (req->buf_list) {
-		req->buf_index = req->buf_list->bgid;
-		req->buf_list->head++;
+	struct io_buffer_list *bl = req->buf_list;
+
+	if (bl) {
+		if (req->flags & REQ_F_BUFFERS_COMMIT) {
+			bl->head += nr;
+			req->flags &= ~REQ_F_BUFFERS_COMMIT;
+		}
+		req->buf_index = bl->bgid;
 	}
 	req->flags &= ~REQ_F_BUFFER_RING;
 }
@@ -112,7 +135,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req,
 				      struct list_head *list)
 {
 	if (req->flags & REQ_F_BUFFER_RING) {
-		__io_put_kbuf_ring(req);
+		__io_put_kbuf_ring(req, 1);
 	} else {
 		req->buf_index = req->kbuf->bgid;
 		list_add(&req->kbuf->list, list);
@@ -130,8 +153,8 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
 	__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
 }
 
-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
-				       unsigned issue_flags)
+static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
+					  unsigned issue_flags)
 {
 	unsigned int ret;
 
@@ -140,9 +163,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req,
 
 	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
 	if (req->flags & REQ_F_BUFFER_RING)
-		__io_put_kbuf_ring(req);
+		__io_put_kbuf_ring(req, nbufs);
 	else
 		__io_put_kbuf(req, issue_flags);
 	return ret;
 }
+
+static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+				       unsigned issue_flags)
+{
+	return __io_put_kbufs(req, 1, issue_flags);
+}
+
+static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
+					unsigned issue_flags)
+{
+	return __io_put_kbufs(req, nbufs, issue_flags);
+}
 #endif
-- 
cgit v1.2.3


From a05d1f625c7aa681d8816bc0f10089289ad07aad Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Mar 2024 13:10:04 -0700
Subject: io_uring/net: support bundles for send

If IORING_OP_SEND is used with provided buffers, the caller may also
set IORING_RECVSEND_BUNDLE to turn it into a multi-buffer send. The idea
is that an application can fill outgoing buffers in a provided buffer
group, and then arm a single send that will service them all. Once
there are no more buffers to send, or if the requested length has
been sent, the request posts a single completion for all the buffers.

This only enables it for IORING_OP_SEND, IORING_OP_SENDMSG is coming
in a separate patch. However, this patch does do a lot of the prep
work that makes wiring up the sendmsg variant pretty trivial. They
share the prep side.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |   9 +++
 io_uring/net.c                | 145 +++++++++++++++++++++++++++++++++++++-----
 2 files changed, 137 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index a7f847543a7f..7f583927c908 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -351,11 +351,20 @@ enum io_uring_op {
  *				0 is reported if zerocopy was actually possible.
  *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
  *				(at least partially).
+ *
+ * IORING_RECVSEND_BUNDLE	Used with IOSQE_BUFFER_SELECT. If set, send will
+ *				grab as many buffers from the buffer group ID
+ *				given and send them all. The completion result
+ *				will be the number of buffers send, with the
+ *				starting buffer ID in cqe->flags as per usual
+ *				for provided buffer usage. The buffers will be
+ *				contigious from the starting buffer ID.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
 #define IORING_SEND_ZC_REPORT_USAGE	(1U << 3)
+#define IORING_RECVSEND_BUNDLE		(1U << 4)
 
 /*
  * cqe.res for IORING_CQE_F_NOTIF if
diff --git a/io_uring/net.c b/io_uring/net.c
index 13685d133582..3e326576254b 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -57,7 +57,7 @@ struct io_sr_msg {
 		struct user_msghdr __user	*umsg;
 		void __user			*buf;
 	};
-	unsigned			len;
+	int				len;
 	unsigned			done_io;
 	unsigned			msg_flags;
 	unsigned			nr_multishot_loops;
@@ -389,6 +389,8 @@ static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg)
 	return ret;
 }
 
+#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE)
+
 int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -407,11 +409,20 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->flags = READ_ONCE(sqe->ioprio);
-	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+	if (sr->flags & ~SENDMSG_FLAGS)
 		return -EINVAL;
 	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
+	if (sr->flags & IORING_RECVSEND_BUNDLE) {
+		if (req->opcode == IORING_OP_SENDMSG)
+			return -EINVAL;
+		if (!(req->flags & REQ_F_BUFFER_SELECT))
+			return -EINVAL;
+		sr->msg_flags |= MSG_WAITALL;
+		sr->buf_group = req->buf_index;
+		req->buf_list = NULL;
+	}
 
 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
@@ -427,6 +438,79 @@ static void io_req_msg_cleanup(struct io_kiocb *req,
 	io_netmsg_recycle(req, issue_flags);
 }
 
+/*
+ * For bundle completions, we need to figure out how many segments we consumed.
+ * A bundle could be using a single ITER_UBUF if that's all we mapped, or it
+ * could be using an ITER_IOVEC. If the latter, then if we consumed all of
+ * the segments, then it's a trivial questiont o answer. If we have residual
+ * data in the iter, then loop the segments to figure out how much we
+ * transferred.
+ */
+static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret)
+{
+	struct iovec *iov;
+	int nbufs;
+
+	/* no data is always zero segments, and a ubuf is always 1 segment */
+	if (ret <= 0)
+		return 0;
+	if (iter_is_ubuf(&kmsg->msg.msg_iter))
+		return 1;
+
+	iov = kmsg->free_iov;
+	if (!iov)
+		iov = &kmsg->fast_iov;
+
+	/* if all data was transferred, it's basic pointer math */
+	if (!iov_iter_count(&kmsg->msg.msg_iter))
+		return iter_iov(&kmsg->msg.msg_iter) - iov;
+
+	/* short transfer, count segments */
+	nbufs = 0;
+	do {
+		int this_len = min_t(int, iov[nbufs].iov_len, ret);
+
+		nbufs++;
+		ret -= this_len;
+	} while (ret);
+
+	return nbufs;
+}
+
+static inline bool io_send_finish(struct io_kiocb *req, int *ret,
+				  struct io_async_msghdr *kmsg,
+				  unsigned issue_flags)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	bool bundle_finished = *ret <= 0;
+	unsigned int cflags;
+
+	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
+		cflags = io_put_kbuf(req, issue_flags);
+		goto finish;
+	}
+
+	cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
+
+	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
+		goto finish;
+
+	/*
+	 * Fill CQE for this receive and see if we should keep trying to
+	 * receive from this socket.
+	 */
+	if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
+		io_mshot_prep_retry(req, kmsg);
+		return false;
+	}
+
+	/* Otherwise stop bundle and use the current result. */
+finish:
+	io_req_set_res(req, *ret, cflags);
+	*ret = IOU_OK;
+	return true;
+}
+
 int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -482,7 +566,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	struct io_async_msghdr *kmsg = req->async_data;
 	struct socket *sock;
-	unsigned int cflags;
 	unsigned flags;
 	int min_ret = 0;
 	int ret;
@@ -495,21 +578,47 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return -EAGAIN;
 
+	flags = sr->msg_flags;
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		flags |= MSG_DONTWAIT;
+
+retry_bundle:
 	if (io_do_buffer_select(req)) {
-		size_t len = sr->len;
-		void __user *buf;
+		struct buf_sel_arg arg = {
+			.iovs = &kmsg->fast_iov,
+			.max_len = min_not_zero(sr->len, INT_MAX),
+			.nr_iovs = 1,
+			.mode = KBUF_MODE_EXPAND,
+		};
+
+		if (kmsg->free_iov) {
+			arg.nr_iovs = kmsg->free_iov_nr;
+			arg.iovs = kmsg->free_iov;
+			arg.mode |= KBUF_MODE_FREE;
+		}
 
-		buf = io_buffer_select(req, &len, issue_flags);
-		if (unlikely(!buf))
-			return -ENOBUFS;
-		sr->buf = buf;
-		sr->len = len;
+		if (!(sr->flags & IORING_RECVSEND_BUNDLE))
+			arg.nr_iovs = 1;
+
+		ret = io_buffers_select(req, &arg, issue_flags);
+		if (unlikely(ret < 0))
+			return ret;
+
+		sr->len = arg.out_len;
+		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
+				arg.out_len);
+		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+			kmsg->free_iov_nr = ret;
+			kmsg->free_iov = arg.iovs;
+		}
 	}
 
-	flags = sr->msg_flags;
-	if (issue_flags & IO_URING_F_NONBLOCK)
-		flags |= MSG_DONTWAIT;
-	if (flags & MSG_WAITALL)
+	/*
+	 * If MSG_WAITALL is set, or this is a bundle send, then we need
+	 * the full amount. If just bundle is set, if we do a short send
+	 * then we complete the bundle sequence rather than continue on.
+	 */
+	if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE)
 		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
 
 	flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
@@ -534,10 +643,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 		ret += sr->done_io;
 	else if (sr->done_io)
 		ret = sr->done_io;
+
+	if (!io_send_finish(req, &ret, kmsg, issue_flags))
+		goto retry_bundle;
+
 	io_req_msg_cleanup(req, issue_flags);
-	cflags = io_put_kbuf(req, issue_flags);
-	io_req_set_res(req, ret, cflags);
-	return IOU_OK;
+	return ret;
 }
 
 static int io_recvmsg_mshot_prep(struct io_kiocb *req,
-- 
cgit v1.2.3


From 2f9c9515bdfde9e4df1f35782284074d3625ff8a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 5 Mar 2024 16:22:04 -0700
Subject: io_uring/net: support bundles for recv

If IORING_OP_RECV is used with provided buffers, the caller may also set
IORING_RECVSEND_BUNDLE to turn it into a multi-buffer recv. This grabs
buffers available and receives into them, posting a single completion for
all of it.

This can be used with multishot receive as well, or without it.

Now that both send and receive support bundles, add a feature flag for
it as well. If IORING_FEAT_RECVSEND_BUNDLE is set after registering the
ring, then the kernel supports bundles for recv and send.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  15 +++---
 io_uring/io_uring.c           |   3 +-
 io_uring/net.c                | 116 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 105 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 7f583927c908..f093cb2300d9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -352,13 +352,13 @@ enum io_uring_op {
  *				IORING_NOTIF_USAGE_ZC_COPIED if data was copied
  *				(at least partially).
  *
- * IORING_RECVSEND_BUNDLE	Used with IOSQE_BUFFER_SELECT. If set, send will
- *				grab as many buffers from the buffer group ID
- *				given and send them all. The completion result
- *				will be the number of buffers send, with the
- *				starting buffer ID in cqe->flags as per usual
- *				for provided buffer usage. The buffers will be
- *				contigious from the starting buffer ID.
+ * IORING_RECVSEND_BUNDLE	Used with IOSQE_BUFFER_SELECT. If set, send or
+ *				recv will grab as many buffers from the buffer
+ *				group ID given and send them all. The completion
+ *				result 	will be the number of buffers send, with
+ *				the starting buffer ID in cqe->flags as per
+ *				usual for provided buffer usage. The buffers
+ *				will be	contigious from the starting buffer ID.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
@@ -529,6 +529,7 @@ struct io_uring_params {
 #define IORING_FEAT_CQE_SKIP		(1U << 11)
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
+#define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
 
 /*
  * io_uring_register(2) opcodes and arguments
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c67ae6e36c4f..64845634d89f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3583,7 +3583,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 			IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED |
 			IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
 			IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
-			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING;
+			IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
+			IORING_FEAT_RECVSEND_BUNDLE;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
diff --git a/io_uring/net.c b/io_uring/net.c
index 3e326576254b..51c41d771c50 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -747,7 +747,8 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req)
 	return ret;
 }
 
-#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
+#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \
+			IORING_RECVSEND_BUNDLE)
 
 int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
@@ -761,21 +762,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->flags = READ_ONCE(sqe->ioprio);
-	if (sr->flags & ~(RECVMSG_FLAGS))
+	if (sr->flags & ~RECVMSG_FLAGS)
 		return -EINVAL;
 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	if (sr->msg_flags & MSG_ERRQUEUE)
 		req->flags |= REQ_F_CLEAR_POLLIN;
-	if (sr->flags & IORING_RECV_MULTISHOT) {
-		if (!(req->flags & REQ_F_BUFFER_SELECT))
-			return -EINVAL;
-		if (sr->msg_flags & MSG_WAITALL)
-			return -EINVAL;
-		if (req->opcode == IORING_OP_RECV && sr->len)
-			return -EINVAL;
-		req->flags |= REQ_F_APOLL_MULTISHOT;
+	if (req->flags & REQ_F_BUFFER_SELECT) {
 		/*
 		 * Store the buffer group for this multishot receive separately,
 		 * as if we end up doing an io-wq based issue that selects a
@@ -785,6 +779,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * restore it.
 		 */
 		sr->buf_group = req->buf_index;
+		req->buf_list = NULL;
+	}
+	if (sr->flags & IORING_RECV_MULTISHOT) {
+		if (!(req->flags & REQ_F_BUFFER_SELECT))
+			return -EINVAL;
+		if (sr->msg_flags & MSG_WAITALL)
+			return -EINVAL;
+		if (req->opcode == IORING_OP_RECV && sr->len)
+			return -EINVAL;
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+	}
+	if (sr->flags & IORING_RECVSEND_BUNDLE) {
+		if (req->opcode == IORING_OP_RECVMSG)
+			return -EINVAL;
 	}
 
 #ifdef CONFIG_COMPAT
@@ -805,19 +813,28 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 				  struct io_async_msghdr *kmsg,
 				  bool mshot_finished, unsigned issue_flags)
 {
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 	unsigned int cflags;
 
-	cflags = io_put_kbuf(req, issue_flags);
+	if (sr->flags & IORING_RECVSEND_BUNDLE)
+		cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
+				      issue_flags);
+	else
+		cflags = io_put_kbuf(req, issue_flags);
+
 	if (kmsg->msg.msg_inq > 0)
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
+	/* bundle with no more immediate buffers, we're done */
+	if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
+		goto finish;
+
 	/*
 	 * Fill CQE for this receive and see if we should keep trying to
 	 * receive from this socket.
 	 */
 	if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished &&
 	    io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) {
-		struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 		int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE;
 
 		io_mshot_prep_retry(req, kmsg);
@@ -837,6 +854,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 	}
 
 	/* Finish the request / stop multishot. */
+finish:
 	io_req_set_res(req, *ret, cflags);
 
 	if (issue_flags & IO_URING_F_MULTISHOT)
@@ -1020,6 +1038,69 @@ retry_multishot:
 	return ret;
 }
 
+static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
+			      size_t *len, unsigned int issue_flags)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
+	int ret;
+
+	/*
+	 * If the ring isn't locked, then don't use the peek interface
+	 * to grab multiple buffers as we will lock/unlock between
+	 * this selection and posting the buffers.
+	 */
+	if (!(issue_flags & IO_URING_F_UNLOCKED) &&
+	    sr->flags & IORING_RECVSEND_BUNDLE) {
+		struct buf_sel_arg arg = {
+			.iovs = &kmsg->fast_iov,
+			.nr_iovs = 1,
+			.mode = KBUF_MODE_EXPAND,
+		};
+
+		if (kmsg->free_iov) {
+			arg.nr_iovs = kmsg->free_iov_nr;
+			arg.iovs = kmsg->free_iov;
+			arg.mode |= KBUF_MODE_FREE;
+		}
+
+		if (kmsg->msg.msg_inq > 0)
+			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
+
+		ret = io_buffers_peek(req, &arg);
+		if (unlikely(ret < 0))
+			return ret;
+
+		/* special case 1 vec, can be a fast path */
+		if (ret == 1) {
+			sr->buf = arg.iovs[0].iov_base;
+			sr->len = arg.iovs[0].iov_len;
+			goto map_ubuf;
+		}
+		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
+				arg.out_len);
+		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+			kmsg->free_iov_nr = ret;
+			kmsg->free_iov = arg.iovs;
+		}
+	} else {
+		void __user *buf;
+
+		*len = sr->len;
+		buf = io_buffer_select(req, len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+		sr->buf = buf;
+		sr->len = *len;
+map_ubuf:
+		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
+				  &kmsg->msg.msg_iter);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	return 0;
+}
+
 int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
@@ -1044,17 +1125,10 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 
 retry_multishot:
 	if (io_do_buffer_select(req)) {
-		void __user *buf;
-
-		buf = io_buffer_select(req, &len, issue_flags);
-		if (!buf)
-			return -ENOBUFS;
-		sr->buf = buf;
-		sr->len = len;
-		ret = import_ubuf(ITER_DEST, sr->buf, sr->len,
-				  &kmsg->msg.msg_iter);
+		ret = io_recv_buf_select(req, kmsg, &len, issue_flags);
 		if (unlikely(ret))
 			goto out_free;
+		sr->buf = NULL;
 	}
 
 	kmsg->msg.msg_inq = -1;
-- 
cgit v1.2.3


From 96950929eb232038022abd961be46d492d7a6f0f Mon Sep 17 00:00:00 2001
From: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Date: Fri, 19 Apr 2024 12:05:36 +0530
Subject: drm/buddy: Implement tracking clear page feature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add tracking clear page feature.

- Driver should enable the DRM_BUDDY_CLEARED flag if it
  successfully clears the blocks in the free path. On the otherhand,
  DRM buddy marks each block as cleared.

- Track the available cleared pages size

- If driver requests cleared memory we prefer cleared memory
  but fallback to uncleared if we can't find the cleared blocks.
  when driver requests uncleared memory we try to use uncleared but
  fallback to cleared memory if necessary.

- When a block gets freed we clear it and mark the freed block as cleared,
  when there are buddies which are cleared as well we can merge them.
  Otherwise, we prefer to keep the blocks as separated.

- Add a function to support defragmentation.

v1:
  - Depends on the flag check DRM_BUDDY_CLEARED, enable the block as
    cleared. Else, reset the clear flag for each block in the list(Christian)
  - For merging the 2 cleared blocks compare as below,
    drm_buddy_is_clear(block) != drm_buddy_is_clear(buddy)(Christian)
  - Defragment the memory beginning from min_order
    till the required memory space is available.

v2: (Matthew)
  - Add a wrapper drm_buddy_free_list_internal for the freeing of blocks
    operation within drm buddy.
  - Write a macro block_incompatible() to allocate the required blocks.
  - Update the xe driver for the drm_buddy_free_list change in arguments.
  - add a warning if the two blocks are incompatible on
    defragmentation
  - call full defragmentation in the fini() function
  - place a condition to test if min_order is equal to 0
  - replace the list with safe_reverse() variant as we might
    remove the block from the list.

v3:
  - fix Gitlab user reported lockup issue.
  - Keep DRM_BUDDY_HEADER_CLEAR define sorted(Matthew)
  - modify to pass the root order instead max_order in fini()
    function(Matthew)
  - change bool 1 to true(Matthew)
  - add check if min_block_size is power of 2(Matthew)
  - modify the min_block_size datatype to u64(Matthew)

v4:
  - rename the function drm_buddy_defrag with __force_merge.
  - Include __force_merge directly in drm buddy file and remove
    the defrag use in amdgpu driver.
  - Remove list_empty() check(Matthew)
  - Remove unnecessary space, headers and placement of new variables(Matthew)
  - Add a unit test case(Matthew)

v5:
  - remove force merge support to actual range allocation and not to bail
    out when contains && split(Matthew)
  - add range support to force merge function.

v6:
  - modify the alloc_range() function clear page non merged blocks
    allocation(Matthew)
  - correct the list_insert function name(Matthew).

Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@amd.com>
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Matthew Auld <matthew.auld@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240419063538.11957-1-Arunpravin.PaneerSelvam@amd.com
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  |   6 +-
 drivers/gpu/drm/drm_buddy.c                   | 427 ++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_ttm_buddy_manager.c |   6 +-
 drivers/gpu/drm/tests/drm_buddy_test.c        |  28 +-
 drivers/gpu/drm/xe/xe_ttm_vram_mgr.c          |   4 +-
 include/drm/drm_buddy.h                       |  16 +-
 6 files changed, 363 insertions(+), 124 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 8db880244324..c0c851409241 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -571,7 +571,7 @@ static int amdgpu_vram_mgr_new(struct ttm_resource_manager *man,
 	return 0;
 
 error_free_blocks:
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 error_fini:
 	ttm_resource_fini(man, &vres->base);
@@ -604,7 +604,7 @@ static void amdgpu_vram_mgr_del(struct ttm_resource_manager *man,
 
 	amdgpu_vram_mgr_do_reserve(man);
 
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 
 	atomic64_sub(vis_usage, &mgr->vis_usage);
@@ -912,7 +912,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
 		kfree(rsv);
 
 	list_for_each_entry_safe(rsv, temp, &mgr->reserved_pages, blocks) {
-		drm_buddy_free_list(&mgr->mm, &rsv->allocated);
+		drm_buddy_free_list(&mgr->mm, &rsv->allocated, 0);
 		kfree(rsv);
 	}
 	if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 5ebdd6f8f36e..284ebae71cc4 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -57,6 +57,16 @@ static void list_insert_sorted(struct drm_buddy *mm,
 	__list_add(&block->link, node->link.prev, &node->link);
 }
 
+static void clear_reset(struct drm_buddy_block *block)
+{
+	block->header &= ~DRM_BUDDY_HEADER_CLEAR;
+}
+
+static void mark_cleared(struct drm_buddy_block *block)
+{
+	block->header |= DRM_BUDDY_HEADER_CLEAR;
+}
+
 static void mark_allocated(struct drm_buddy_block *block)
 {
 	block->header &= ~DRM_BUDDY_HEADER_STATE;
@@ -82,6 +92,133 @@ static void mark_split(struct drm_buddy_block *block)
 	list_del(&block->link);
 }
 
+static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2)
+{
+	return s1 <= e2 && e1 >= s2;
+}
+
+static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2)
+{
+	return s1 <= s2 && e1 >= e2;
+}
+
+static struct drm_buddy_block *
+__get_buddy(struct drm_buddy_block *block)
+{
+	struct drm_buddy_block *parent;
+
+	parent = block->parent;
+	if (!parent)
+		return NULL;
+
+	if (parent->left == block)
+		return parent->right;
+
+	return parent->left;
+}
+
+static unsigned int __drm_buddy_free(struct drm_buddy *mm,
+				     struct drm_buddy_block *block,
+				     bool force_merge)
+{
+	struct drm_buddy_block *parent;
+	unsigned int order;
+
+	while ((parent = block->parent)) {
+		struct drm_buddy_block *buddy;
+
+		buddy = __get_buddy(block);
+
+		if (!drm_buddy_block_is_free(buddy))
+			break;
+
+		if (!force_merge) {
+			/*
+			 * Check the block and its buddy clear state and exit
+			 * the loop if they both have the dissimilar state.
+			 */
+			if (drm_buddy_block_is_clear(block) !=
+			    drm_buddy_block_is_clear(buddy))
+				break;
+
+			if (drm_buddy_block_is_clear(block))
+				mark_cleared(parent);
+		}
+
+		list_del(&buddy->link);
+		if (force_merge && drm_buddy_block_is_clear(buddy))
+			mm->clear_avail -= drm_buddy_block_size(mm, buddy);
+
+		drm_block_free(mm, block);
+		drm_block_free(mm, buddy);
+
+		block = parent;
+	}
+
+	order = drm_buddy_block_order(block);
+	mark_free(mm, block);
+
+	return order;
+}
+
+static int __force_merge(struct drm_buddy *mm,
+			 u64 start,
+			 u64 end,
+			 unsigned int min_order)
+{
+	unsigned int order;
+	int i;
+
+	if (!min_order)
+		return -ENOMEM;
+
+	if (min_order > mm->max_order)
+		return -EINVAL;
+
+	for (i = min_order - 1; i >= 0; i--) {
+		struct drm_buddy_block *block, *prev;
+
+		list_for_each_entry_safe_reverse(block, prev, &mm->free_list[i], link) {
+			struct drm_buddy_block *buddy;
+			u64 block_start, block_end;
+
+			if (!block->parent)
+				continue;
+
+			block_start = drm_buddy_block_offset(block);
+			block_end = block_start + drm_buddy_block_size(mm, block) - 1;
+
+			if (!contains(start, end, block_start, block_end))
+				continue;
+
+			buddy = __get_buddy(block);
+			if (!drm_buddy_block_is_free(buddy))
+				continue;
+
+			WARN_ON(drm_buddy_block_is_clear(block) ==
+				drm_buddy_block_is_clear(buddy));
+
+			/*
+			 * If the prev block is same as buddy, don't access the
+			 * block in the next iteration as we would free the
+			 * buddy block as part of the free function.
+			 */
+			if (prev == buddy)
+				prev = list_prev_entry(prev, link);
+
+			list_del(&block->link);
+			if (drm_buddy_block_is_clear(block))
+				mm->clear_avail -= drm_buddy_block_size(mm, block);
+
+			order = __drm_buddy_free(mm, block, true);
+			if (order >= min_order)
+				return 0;
+		}
+	}
+
+	return -ENOMEM;
+}
+
 /**
  * drm_buddy_init - init memory manager
  *
@@ -186,11 +323,21 @@ EXPORT_SYMBOL(drm_buddy_init);
  */
 void drm_buddy_fini(struct drm_buddy *mm)
 {
+	u64 root_size, size;
+	unsigned int order;
 	int i;
 
+	size = mm->size;
+
 	for (i = 0; i < mm->n_roots; ++i) {
+		order = ilog2(size) - ilog2(mm->chunk_size);
+		__force_merge(mm, 0, size, order);
+
 		WARN_ON(!drm_buddy_block_is_free(mm->roots[i]));
 		drm_block_free(mm, mm->roots[i]);
+
+		root_size = mm->chunk_size << order;
+		size -= root_size;
 	}
 
 	WARN_ON(mm->avail != mm->size);
@@ -223,26 +370,17 @@ static int split_block(struct drm_buddy *mm,
 	mark_free(mm, block->left);
 	mark_free(mm, block->right);
 
+	if (drm_buddy_block_is_clear(block)) {
+		mark_cleared(block->left);
+		mark_cleared(block->right);
+		clear_reset(block);
+	}
+
 	mark_split(block);
 
 	return 0;
 }
 
-static struct drm_buddy_block *
-__get_buddy(struct drm_buddy_block *block)
-{
-	struct drm_buddy_block *parent;
-
-	parent = block->parent;
-	if (!parent)
-		return NULL;
-
-	if (parent->left == block)
-		return parent->right;
-
-	return parent->left;
-}
-
 /**
  * drm_get_buddy - get buddy address
  *
@@ -260,30 +398,6 @@ drm_get_buddy(struct drm_buddy_block *block)
 }
 EXPORT_SYMBOL(drm_get_buddy);
 
-static void __drm_buddy_free(struct drm_buddy *mm,
-			     struct drm_buddy_block *block)
-{
-	struct drm_buddy_block *parent;
-
-	while ((parent = block->parent)) {
-		struct drm_buddy_block *buddy;
-
-		buddy = __get_buddy(block);
-
-		if (!drm_buddy_block_is_free(buddy))
-			break;
-
-		list_del(&buddy->link);
-
-		drm_block_free(mm, block);
-		drm_block_free(mm, buddy);
-
-		block = parent;
-	}
-
-	mark_free(mm, block);
-}
-
 /**
  * drm_buddy_free_block - free a block
  *
@@ -295,42 +409,74 @@ void drm_buddy_free_block(struct drm_buddy *mm,
 {
 	BUG_ON(!drm_buddy_block_is_allocated(block));
 	mm->avail += drm_buddy_block_size(mm, block);
-	__drm_buddy_free(mm, block);
+	if (drm_buddy_block_is_clear(block))
+		mm->clear_avail += drm_buddy_block_size(mm, block);
+
+	__drm_buddy_free(mm, block, false);
 }
 EXPORT_SYMBOL(drm_buddy_free_block);
 
-/**
- * drm_buddy_free_list - free blocks
- *
- * @mm: DRM buddy manager
- * @objects: input list head to free blocks
- */
-void drm_buddy_free_list(struct drm_buddy *mm, struct list_head *objects)
+static void __drm_buddy_free_list(struct drm_buddy *mm,
+				  struct list_head *objects,
+				  bool mark_clear,
+				  bool mark_dirty)
 {
 	struct drm_buddy_block *block, *on;
 
+	WARN_ON(mark_dirty && mark_clear);
+
 	list_for_each_entry_safe(block, on, objects, link) {
+		if (mark_clear)
+			mark_cleared(block);
+		else if (mark_dirty)
+			clear_reset(block);
 		drm_buddy_free_block(mm, block);
 		cond_resched();
 	}
 	INIT_LIST_HEAD(objects);
 }
-EXPORT_SYMBOL(drm_buddy_free_list);
 
-static inline bool overlaps(u64 s1, u64 e1, u64 s2, u64 e2)
+static void drm_buddy_free_list_internal(struct drm_buddy *mm,
+					 struct list_head *objects)
 {
-	return s1 <= e2 && e1 >= s2;
+	/*
+	 * Don't touch the clear/dirty bit, since allocation is still internal
+	 * at this point. For example we might have just failed part of the
+	 * allocation.
+	 */
+	__drm_buddy_free_list(mm, objects, false, false);
 }
 
-static inline bool contains(u64 s1, u64 e1, u64 s2, u64 e2)
+/**
+ * drm_buddy_free_list - free blocks
+ *
+ * @mm: DRM buddy manager
+ * @objects: input list head to free blocks
+ * @flags: optional flags like DRM_BUDDY_CLEARED
+ */
+void drm_buddy_free_list(struct drm_buddy *mm,
+			 struct list_head *objects,
+			 unsigned int flags)
 {
-	return s1 <= s2 && e1 >= e2;
+	bool mark_clear = flags & DRM_BUDDY_CLEARED;
+
+	__drm_buddy_free_list(mm, objects, mark_clear, !mark_clear);
+}
+EXPORT_SYMBOL(drm_buddy_free_list);
+
+static bool block_incompatible(struct drm_buddy_block *block, unsigned int flags)
+{
+	bool needs_clear = flags & DRM_BUDDY_CLEAR_ALLOCATION;
+
+	return needs_clear != drm_buddy_block_is_clear(block);
 }
 
 static struct drm_buddy_block *
-alloc_range_bias(struct drm_buddy *mm,
-		 u64 start, u64 end,
-		 unsigned int order)
+__alloc_range_bias(struct drm_buddy *mm,
+		   u64 start, u64 end,
+		   unsigned int order,
+		   unsigned long flags,
+		   bool fallback)
 {
 	u64 req_size = mm->chunk_size << order;
 	struct drm_buddy_block *block;
@@ -379,6 +525,9 @@ alloc_range_bias(struct drm_buddy *mm,
 
 		if (contains(start, end, block_start, block_end) &&
 		    order == drm_buddy_block_order(block)) {
+			if (!fallback && block_incompatible(block, flags))
+				continue;
+
 			/*
 			 * Find the free block within the range.
 			 */
@@ -410,30 +559,57 @@ err_undo:
 	if (buddy &&
 	    (drm_buddy_block_is_free(block) &&
 	     drm_buddy_block_is_free(buddy)))
-		__drm_buddy_free(mm, block);
+		__drm_buddy_free(mm, block, false);
 	return ERR_PTR(err);
 }
 
 static struct drm_buddy_block *
-get_maxblock(struct drm_buddy *mm, unsigned int order)
+__drm_buddy_alloc_range_bias(struct drm_buddy *mm,
+			     u64 start, u64 end,
+			     unsigned int order,
+			     unsigned long flags)
+{
+	struct drm_buddy_block *block;
+	bool fallback = false;
+
+	block = __alloc_range_bias(mm, start, end, order,
+				   flags, fallback);
+	if (IS_ERR(block) && mm->clear_avail)
+		return __alloc_range_bias(mm, start, end, order,
+					  flags, !fallback);
+
+	return block;
+}
+
+static struct drm_buddy_block *
+get_maxblock(struct drm_buddy *mm, unsigned int order,
+	     unsigned long flags)
 {
-	struct drm_buddy_block *max_block = NULL, *node;
+	struct drm_buddy_block *max_block = NULL, *block = NULL;
 	unsigned int i;
 
 	for (i = order; i <= mm->max_order; ++i) {
-		if (!list_empty(&mm->free_list[i])) {
-			node = list_last_entry(&mm->free_list[i],
-					       struct drm_buddy_block,
-					       link);
-			if (!max_block) {
-				max_block = node;
+		struct drm_buddy_block *tmp_block;
+
+		list_for_each_entry_reverse(tmp_block, &mm->free_list[i], link) {
+			if (block_incompatible(tmp_block, flags))
 				continue;
-			}
 
-			if (drm_buddy_block_offset(node) >
-			    drm_buddy_block_offset(max_block)) {
-				max_block = node;
-			}
+			block = tmp_block;
+			break;
+		}
+
+		if (!block)
+			continue;
+
+		if (!max_block) {
+			max_block = block;
+			continue;
+		}
+
+		if (drm_buddy_block_offset(block) >
+		    drm_buddy_block_offset(max_block)) {
+			max_block = block;
 		}
 	}
 
@@ -450,11 +626,29 @@ alloc_from_freelist(struct drm_buddy *mm,
 	int err;
 
 	if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) {
-		block = get_maxblock(mm, order);
+		block = get_maxblock(mm, order, flags);
 		if (block)
 			/* Store the obtained block order */
 			tmp = drm_buddy_block_order(block);
 	} else {
+		for (tmp = order; tmp <= mm->max_order; ++tmp) {
+			struct drm_buddy_block *tmp_block;
+
+			list_for_each_entry_reverse(tmp_block, &mm->free_list[tmp], link) {
+				if (block_incompatible(tmp_block, flags))
+					continue;
+
+				block = tmp_block;
+				break;
+			}
+
+			if (block)
+				break;
+		}
+	}
+
+	if (!block) {
+		/* Fallback method */
 		for (tmp = order; tmp <= mm->max_order; ++tmp) {
 			if (!list_empty(&mm->free_list[tmp])) {
 				block = list_last_entry(&mm->free_list[tmp],
@@ -464,10 +658,10 @@ alloc_from_freelist(struct drm_buddy *mm,
 					break;
 			}
 		}
-	}
 
-	if (!block)
-		return ERR_PTR(-ENOSPC);
+		if (!block)
+			return ERR_PTR(-ENOSPC);
+	}
 
 	BUG_ON(!drm_buddy_block_is_free(block));
 
@@ -483,7 +677,7 @@ alloc_from_freelist(struct drm_buddy *mm,
 
 err_undo:
 	if (tmp != order)
-		__drm_buddy_free(mm, block);
+		__drm_buddy_free(mm, block, false);
 	return ERR_PTR(err);
 }
 
@@ -526,16 +720,18 @@ static int __alloc_range(struct drm_buddy *mm,
 		}
 
 		if (contains(start, end, block_start, block_end)) {
-			if (!drm_buddy_block_is_free(block)) {
+			if (drm_buddy_block_is_free(block)) {
+				mark_allocated(block);
+				total_allocated += drm_buddy_block_size(mm, block);
+				mm->avail -= drm_buddy_block_size(mm, block);
+				if (drm_buddy_block_is_clear(block))
+					mm->clear_avail -= drm_buddy_block_size(mm, block);
+				list_add_tail(&block->link, &allocated);
+				continue;
+			} else if (!mm->clear_avail) {
 				err = -ENOSPC;
 				goto err_free;
 			}
-
-			mark_allocated(block);
-			total_allocated += drm_buddy_block_size(mm, block);
-			mm->avail -= drm_buddy_block_size(mm, block);
-			list_add_tail(&block->link, &allocated);
-			continue;
 		}
 
 		if (!drm_buddy_block_is_split(block)) {
@@ -567,14 +763,14 @@ err_undo:
 	if (buddy &&
 	    (drm_buddy_block_is_free(block) &&
 	     drm_buddy_block_is_free(buddy)))
-		__drm_buddy_free(mm, block);
+		__drm_buddy_free(mm, block, false);
 
 err_free:
 	if (err == -ENOSPC && total_allocated_on_err) {
 		list_splice_tail(&allocated, blocks);
 		*total_allocated_on_err = total_allocated;
 	} else {
-		drm_buddy_free_list(mm, &allocated);
+		drm_buddy_free_list_internal(mm, &allocated);
 	}
 
 	return err;
@@ -640,11 +836,11 @@ static int __alloc_contig_try_harder(struct drm_buddy *mm,
 			list_splice(&blocks_lhs, blocks);
 			return 0;
 		} else if (err != -ENOSPC) {
-			drm_buddy_free_list(mm, blocks);
+			drm_buddy_free_list_internal(mm, blocks);
 			return err;
 		}
 		/* Free blocks for the next iteration */
-		drm_buddy_free_list(mm, blocks);
+		drm_buddy_free_list_internal(mm, blocks);
 	}
 
 	return -ENOSPC;
@@ -700,6 +896,8 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 	list_del(&block->link);
 	mark_free(mm, block);
 	mm->avail += drm_buddy_block_size(mm, block);
+	if (drm_buddy_block_is_clear(block))
+		mm->clear_avail += drm_buddy_block_size(mm, block);
 
 	/* Prevent recursively freeing this node */
 	parent = block->parent;
@@ -711,6 +909,8 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 	if (err) {
 		mark_allocated(block);
 		mm->avail -= drm_buddy_block_size(mm, block);
+		if (drm_buddy_block_is_clear(block))
+			mm->clear_avail -= drm_buddy_block_size(mm, block);
 		list_add(&block->link, blocks);
 	}
 
@@ -719,13 +919,28 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 }
 EXPORT_SYMBOL(drm_buddy_block_trim);
 
+static struct drm_buddy_block *
+__drm_buddy_alloc_blocks(struct drm_buddy *mm,
+			 u64 start, u64 end,
+			 unsigned int order,
+			 unsigned long flags)
+{
+	if (flags & DRM_BUDDY_RANGE_ALLOCATION)
+		/* Allocate traversing within the range */
+		return  __drm_buddy_alloc_range_bias(mm, start, end,
+						     order, flags);
+	else
+		/* Allocate from freelist */
+		return alloc_from_freelist(mm, order, flags);
+}
+
 /**
  * drm_buddy_alloc_blocks - allocate power-of-two blocks
  *
  * @mm: DRM buddy manager to allocate from
  * @start: start of the allowed range for this block
  * @end: end of the allowed range for this block
- * @size: size of the allocation
+ * @size: size of the allocation in bytes
  * @min_block_size: alignment of the allocation
  * @blocks: output list head to add allocated blocks
  * @flags: DRM_BUDDY_*_ALLOCATION flags
@@ -800,23 +1015,33 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 		BUG_ON(order < min_order);
 
 		do {
-			if (flags & DRM_BUDDY_RANGE_ALLOCATION)
-				/* Allocate traversing within the range */
-				block = alloc_range_bias(mm, start, end, order);
-			else
-				/* Allocate from freelist */
-				block = alloc_from_freelist(mm, order, flags);
-
+			block = __drm_buddy_alloc_blocks(mm, start,
+							 end,
+							 order,
+							 flags);
 			if (!IS_ERR(block))
 				break;
 
 			if (order-- == min_order) {
+				/* Try allocation through force merge method */
+				if (mm->clear_avail &&
+				    !__force_merge(mm, start, end, min_order)) {
+					block = __drm_buddy_alloc_blocks(mm, start,
+									 end,
+									 min_order,
+									 flags);
+					if (!IS_ERR(block)) {
+						order = min_order;
+						break;
+					}
+				}
+
+				/*
+				 * Try contiguous block allocation through
+				 * try harder method.
+				 */
 				if (flags & DRM_BUDDY_CONTIGUOUS_ALLOCATION &&
 				    !(flags & DRM_BUDDY_RANGE_ALLOCATION))
-					/*
-					 * Try contiguous block allocation through
-					 * try harder method
-					 */
 					return __alloc_contig_try_harder(mm,
 									 original_size,
 									 original_min_size,
@@ -828,6 +1053,8 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 
 		mark_allocated(block);
 		mm->avail -= drm_buddy_block_size(mm, block);
+		if (drm_buddy_block_is_clear(block))
+			mm->clear_avail -= drm_buddy_block_size(mm, block);
 		kmemleak_update_trace(block);
 		list_add_tail(&block->link, &allocated);
 
@@ -866,7 +1093,7 @@ int drm_buddy_alloc_blocks(struct drm_buddy *mm,
 	return 0;
 
 err_free:
-	drm_buddy_free_list(mm, &allocated);
+	drm_buddy_free_list_internal(mm, &allocated);
 	return err;
 }
 EXPORT_SYMBOL(drm_buddy_alloc_blocks);
@@ -899,8 +1126,8 @@ void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p)
 {
 	int order;
 
-	drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB\n",
-		   mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20);
+	drm_printf(p, "chunk_size: %lluKiB, total: %lluMiB, free: %lluMiB, clear_free: %lluMiB\n",
+		   mm->chunk_size >> 10, mm->size >> 20, mm->avail >> 20, mm->clear_avail >> 20);
 
 	for (order = mm->max_order; order >= 0; order--) {
 		struct drm_buddy_block *block;
diff --git a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c
index 0d735d5c2b35..942345548bc3 100644
--- a/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c
+++ b/drivers/gpu/drm/i915/i915_ttm_buddy_manager.c
@@ -126,7 +126,7 @@ static int i915_ttm_buddy_man_alloc(struct ttm_resource_manager *man,
 	return 0;
 
 err_free_blocks:
-	drm_buddy_free_list(mm, &bman_res->blocks);
+	drm_buddy_free_list(mm, &bman_res->blocks, 0);
 	mutex_unlock(&bman->lock);
 err_free_res:
 	ttm_resource_fini(man, &bman_res->base);
@@ -141,7 +141,7 @@ static void i915_ttm_buddy_man_free(struct ttm_resource_manager *man,
 	struct i915_ttm_buddy_manager *bman = to_buddy_manager(man);
 
 	mutex_lock(&bman->lock);
-	drm_buddy_free_list(&bman->mm, &bman_res->blocks);
+	drm_buddy_free_list(&bman->mm, &bman_res->blocks, 0);
 	bman->visible_avail += bman_res->used_visible_size;
 	mutex_unlock(&bman->lock);
 
@@ -345,7 +345,7 @@ int i915_ttm_buddy_man_fini(struct ttm_device *bdev, unsigned int type)
 	ttm_set_driver_manager(bdev, type, NULL);
 
 	mutex_lock(&bman->lock);
-	drm_buddy_free_list(mm, &bman->reserved);
+	drm_buddy_free_list(mm, &bman->reserved, 0);
 	drm_buddy_fini(mm);
 	bman->visible_avail += bman->visible_reserved;
 	WARN_ON_ONCE(bman->visible_avail != bman->visible_size);
diff --git a/drivers/gpu/drm/tests/drm_buddy_test.c b/drivers/gpu/drm/tests/drm_buddy_test.c
index e48863a44556..4621a860cb05 100644
--- a/drivers/gpu/drm/tests/drm_buddy_test.c
+++ b/drivers/gpu/drm/tests/drm_buddy_test.c
@@ -103,7 +103,7 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 							      DRM_BUDDY_RANGE_ALLOCATION),
 				       "buddy_alloc i failed with bias(%x-%x), size=%u, ps=%u\n",
 				       bias_start, bias_end, bias_size, bias_size);
-		drm_buddy_free_list(&mm, &tmp);
+		drm_buddy_free_list(&mm, &tmp, 0);
 
 		/* single page with internal round_up */
 		KUNIT_ASSERT_FALSE_MSG(test,
@@ -113,7 +113,7 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 							      DRM_BUDDY_RANGE_ALLOCATION),
 				       "buddy_alloc failed with bias(%x-%x), size=%u, ps=%u\n",
 				       bias_start, bias_end, ps, bias_size);
-		drm_buddy_free_list(&mm, &tmp);
+		drm_buddy_free_list(&mm, &tmp, 0);
 
 		/* random size within */
 		size = max(round_up(prandom_u32_state(&prng) % bias_rem, ps), ps);
@@ -153,14 +153,14 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 			 * unallocated, and ideally not always on the bias
 			 * boundaries.
 			 */
-			drm_buddy_free_list(&mm, &tmp);
+			drm_buddy_free_list(&mm, &tmp, 0);
 		} else {
 			list_splice_tail(&tmp, &allocated);
 		}
 	}
 
 	kfree(order);
-	drm_buddy_free_list(&mm, &allocated);
+	drm_buddy_free_list(&mm, &allocated, 0);
 	drm_buddy_fini(&mm);
 
 	/*
@@ -220,7 +220,7 @@ static void drm_test_buddy_alloc_range_bias(struct kunit *test)
 			      "buddy_alloc passed with bias(%x-%x), size=%u\n",
 			      bias_start, bias_end, ps);
 
-	drm_buddy_free_list(&mm, &allocated);
+	drm_buddy_free_list(&mm, &allocated, 0);
 	drm_buddy_fini(&mm);
 }
 
@@ -269,7 +269,7 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test)
 							   DRM_BUDDY_CONTIGUOUS_ALLOCATION),
 			       "buddy_alloc didn't error size=%lu\n", 3 * ps);
 
-	drm_buddy_free_list(&mm, &middle);
+	drm_buddy_free_list(&mm, &middle, 0);
 	KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
 							   3 * ps, ps, &allocated,
 							   DRM_BUDDY_CONTIGUOUS_ALLOCATION),
@@ -279,7 +279,7 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test)
 							   DRM_BUDDY_CONTIGUOUS_ALLOCATION),
 			       "buddy_alloc didn't error size=%lu\n", 2 * ps);
 
-	drm_buddy_free_list(&mm, &right);
+	drm_buddy_free_list(&mm, &right, 0);
 	KUNIT_ASSERT_TRUE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
 							   3 * ps, ps, &allocated,
 							   DRM_BUDDY_CONTIGUOUS_ALLOCATION),
@@ -294,7 +294,7 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test)
 							    DRM_BUDDY_CONTIGUOUS_ALLOCATION),
 			       "buddy_alloc hit an error size=%lu\n", 2 * ps);
 
-	drm_buddy_free_list(&mm, &left);
+	drm_buddy_free_list(&mm, &left, 0);
 	KUNIT_ASSERT_FALSE_MSG(test, drm_buddy_alloc_blocks(&mm, 0, mm_size,
 							    3 * ps, ps, &allocated,
 							    DRM_BUDDY_CONTIGUOUS_ALLOCATION),
@@ -306,7 +306,7 @@ static void drm_test_buddy_alloc_contiguous(struct kunit *test)
 
 	KUNIT_ASSERT_EQ(test, total, ps * 2 + ps * 3);
 
-	drm_buddy_free_list(&mm, &allocated);
+	drm_buddy_free_list(&mm, &allocated, 0);
 	drm_buddy_fini(&mm);
 }
 
@@ -375,7 +375,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test)
 							  top, max_order);
 	}
 
-	drm_buddy_free_list(&mm, &holes);
+	drm_buddy_free_list(&mm, &holes, 0);
 
 	/* Nothing larger than blocks of chunk_size now available */
 	for (order = 1; order <= max_order; order++) {
@@ -387,7 +387,7 @@ static void drm_test_buddy_alloc_pathological(struct kunit *test)
 	}
 
 	list_splice_tail(&holes, &blocks);
-	drm_buddy_free_list(&mm, &blocks);
+	drm_buddy_free_list(&mm, &blocks, 0);
 	drm_buddy_fini(&mm);
 }
 
@@ -482,7 +482,7 @@ static void drm_test_buddy_alloc_pessimistic(struct kunit *test)
 
 	list_del(&block->link);
 	drm_buddy_free_block(&mm, block);
-	drm_buddy_free_list(&mm, &blocks);
+	drm_buddy_free_list(&mm, &blocks, 0);
 	drm_buddy_fini(&mm);
 }
 
@@ -528,7 +528,7 @@ static void drm_test_buddy_alloc_optimistic(struct kunit *test)
 							   size, size, &tmp, flags),
 						  "buddy_alloc unexpectedly succeeded, it should be full!");
 
-	drm_buddy_free_list(&mm, &blocks);
+	drm_buddy_free_list(&mm, &blocks, 0);
 	drm_buddy_fini(&mm);
 }
 
@@ -563,7 +563,7 @@ static void drm_test_buddy_alloc_limit(struct kunit *test)
 						drm_buddy_block_size(&mm, block),
 						BIT_ULL(mm.max_order) * PAGE_SIZE);
 
-	drm_buddy_free_list(&mm, &allocated);
+	drm_buddy_free_list(&mm, &allocated, 0);
 	drm_buddy_fini(&mm);
 }
 
diff --git a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
index 115ec745e502..1ad678b62c4a 100644
--- a/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
+++ b/drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -196,7 +196,7 @@ static int xe_ttm_vram_mgr_new(struct ttm_resource_manager *man,
 	return 0;
 
 error_free_blocks:
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mutex_unlock(&mgr->lock);
 error_fini:
 	ttm_resource_fini(man, &vres->base);
@@ -214,7 +214,7 @@ static void xe_ttm_vram_mgr_del(struct ttm_resource_manager *man,
 	struct drm_buddy *mm = &mgr->mm;
 
 	mutex_lock(&mgr->lock);
-	drm_buddy_free_list(mm, &vres->blocks);
+	drm_buddy_free_list(mm, &vres->blocks, 0);
 	mgr->visible_avail += vres->used_visible_size;
 	mutex_unlock(&mgr->lock);
 
diff --git a/include/drm/drm_buddy.h b/include/drm/drm_buddy.h
index a5b39fc01003..82570f77e817 100644
--- a/include/drm/drm_buddy.h
+++ b/include/drm/drm_buddy.h
@@ -25,6 +25,8 @@
 #define DRM_BUDDY_RANGE_ALLOCATION		BIT(0)
 #define DRM_BUDDY_TOPDOWN_ALLOCATION		BIT(1)
 #define DRM_BUDDY_CONTIGUOUS_ALLOCATION		BIT(2)
+#define DRM_BUDDY_CLEAR_ALLOCATION		BIT(3)
+#define DRM_BUDDY_CLEARED			BIT(4)
 
 struct drm_buddy_block {
 #define DRM_BUDDY_HEADER_OFFSET GENMASK_ULL(63, 12)
@@ -32,8 +34,9 @@ struct drm_buddy_block {
 #define   DRM_BUDDY_ALLOCATED	   (1 << 10)
 #define   DRM_BUDDY_FREE	   (2 << 10)
 #define   DRM_BUDDY_SPLIT	   (3 << 10)
+#define DRM_BUDDY_HEADER_CLEAR  GENMASK_ULL(9, 9)
 /* Free to be used, if needed in the future */
-#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(9, 6)
+#define DRM_BUDDY_HEADER_UNUSED GENMASK_ULL(8, 6)
 #define DRM_BUDDY_HEADER_ORDER  GENMASK_ULL(5, 0)
 	u64 header;
 
@@ -86,6 +89,7 @@ struct drm_buddy {
 	u64 chunk_size;
 	u64 size;
 	u64 avail;
+	u64 clear_avail;
 };
 
 static inline u64
@@ -112,6 +116,12 @@ drm_buddy_block_is_allocated(struct drm_buddy_block *block)
 	return drm_buddy_block_state(block) == DRM_BUDDY_ALLOCATED;
 }
 
+static inline bool
+drm_buddy_block_is_clear(struct drm_buddy_block *block)
+{
+	return block->header & DRM_BUDDY_HEADER_CLEAR;
+}
+
 static inline bool
 drm_buddy_block_is_free(struct drm_buddy_block *block)
 {
@@ -150,7 +160,9 @@ int drm_buddy_block_trim(struct drm_buddy *mm,
 
 void drm_buddy_free_block(struct drm_buddy *mm, struct drm_buddy_block *block);
 
-void drm_buddy_free_list(struct drm_buddy *mm, struct list_head *objects);
+void drm_buddy_free_list(struct drm_buddy *mm,
+			 struct list_head *objects,
+			 unsigned int flags);
 
 void drm_buddy_print(struct drm_buddy *mm, struct drm_printer *p);
 void drm_buddy_block_print(struct drm_buddy *mm,
-- 
cgit v1.2.3


From 5625ca5640caa3fb797f155601d56379d260d6ba Mon Sep 17 00:00:00 2001
From: Mateusz Polchlopek <mateusz.polchlopek@intel.com>
Date: Fri, 19 Apr 2024 04:08:49 -0400
Subject: devlink: extend devlink_param *set pointer

Extend devlink_param *set function pointer to take extack as a param.
Sometimes it is needed to pass information to the end user from set
function. It is more proper to use for that netlink instead of passing
message to dmesg.

Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Mateusz Polchlopek <mateusz.polchlopek@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c        |  9 ++++++---
 drivers/net/ethernet/amd/pds_core/core.h                   |  3 ++-
 drivers/net/ethernet/amd/pds_core/devlink.c                |  3 ++-
 drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c          |  6 ++++--
 drivers/net/ethernet/intel/ice/devlink/devlink.c           | 12 ++++++------
 drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c    | 12 ++++++++----
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c  |  3 ++-
 drivers/net/ethernet/mellanox/mlx4/main.c                  |  6 ++++--
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c          |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fs_core.c          |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c         |  3 ++-
 drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c    |  3 ++-
 drivers/net/ethernet/netronome/nfp/devlink_param.c         |  3 ++-
 drivers/net/ethernet/qlogic/qed/qed_devlink.c              |  3 ++-
 drivers/net/ethernet/ti/am65-cpsw-nuss.c                   |  3 ++-
 drivers/net/ethernet/ti/cpsw_new.c                         |  6 ++++--
 drivers/net/wwan/iosm/iosm_ipc_devlink.c                   |  3 ++-
 include/net/devlink.h                                      |  3 ++-
 include/net/dsa.h                                          |  3 ++-
 net/devlink/param.c                                        |  7 ++++---
 net/dsa/devlink.c                                          |  3 ++-
 22 files changed, 66 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
index d2b8d26db968..215a1a8ba7e9 100644
--- a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
+++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c
@@ -4,7 +4,8 @@
 #include "otx2_cpt_devlink.h"
 
 static int otx2_cpt_dl_egrp_create(struct devlink *dl, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
 	struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
@@ -13,7 +14,8 @@ static int otx2_cpt_dl_egrp_create(struct devlink *dl, u32 id,
 }
 
 static int otx2_cpt_dl_egrp_delete(struct devlink *dl, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
 	struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
@@ -45,7 +47,8 @@ static int otx2_cpt_dl_t106_mode_get(struct devlink *dl, u32 id,
 }
 
 static int otx2_cpt_dl_t106_mode_set(struct devlink *dl, u32 id,
-				     struct devlink_param_gset_ctx *ctx)
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct otx2_cpt_devlink *cpt_dl = devlink_priv(dl);
 	struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf;
diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h
index a3e17a0c187a..14522d6d5f86 100644
--- a/drivers/net/ethernet/amd/pds_core/core.h
+++ b/drivers/net/ethernet/amd/pds_core/core.h
@@ -256,7 +256,8 @@ int pdsc_dl_flash_update(struct devlink *dl,
 int pdsc_dl_enable_get(struct devlink *dl, u32 id,
 		       struct devlink_param_gset_ctx *ctx);
 int pdsc_dl_enable_set(struct devlink *dl, u32 id,
-		       struct devlink_param_gset_ctx *ctx);
+		       struct devlink_param_gset_ctx *ctx,
+		       struct netlink_ext_ack *extack);
 int pdsc_dl_enable_validate(struct devlink *dl, u32 id,
 			    union devlink_param_value val,
 			    struct netlink_ext_ack *extack);
diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c
index 54864f27c87a..2681889162a2 100644
--- a/drivers/net/ethernet/amd/pds_core/devlink.c
+++ b/drivers/net/ethernet/amd/pds_core/devlink.c
@@ -37,7 +37,8 @@ int pdsc_dl_enable_get(struct devlink *dl, u32 id,
 }
 
 int pdsc_dl_enable_set(struct devlink *dl, u32 id,
-		       struct devlink_param_gset_ctx *ctx)
+		       struct devlink_param_gset_ctx *ctx,
+		       struct netlink_ext_ack *extack)
 {
 	struct pdsc *pdsc = devlink_priv(dl);
 	struct pdsc_viftype *vt_entry;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
index ae4529c043f0..d9ea6fa23923 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
@@ -1096,7 +1096,8 @@ static int bnxt_dl_nvm_param_get(struct devlink *dl, u32 id,
 }
 
 static int bnxt_dl_nvm_param_set(struct devlink *dl, u32 id,
-				 struct devlink_param_gset_ctx *ctx)
+				 struct devlink_param_gset_ctx *ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = bnxt_get_bp_from_dl(dl);
 	struct hwrm_nvm_set_variable_input *req;
@@ -1145,7 +1146,8 @@ static int bnxt_remote_dev_reset_get(struct devlink *dl, u32 id,
 }
 
 static int bnxt_remote_dev_reset_set(struct devlink *dl, u32 id,
-				     struct devlink_param_gset_ctx *ctx)
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct bnxt *bp = bnxt_get_bp_from_dl(dl);
 	int rc;
diff --git a/drivers/net/ethernet/intel/ice/devlink/devlink.c b/drivers/net/ethernet/intel/ice/devlink/devlink.c
index be9244bb8bbc..acbace240977 100644
--- a/drivers/net/ethernet/intel/ice/devlink/devlink.c
+++ b/drivers/net/ethernet/intel/ice/devlink/devlink.c
@@ -1144,9 +1144,9 @@ ice_devlink_enable_roce_get(struct devlink *devlink, u32 id,
 	return 0;
 }
 
-static int
-ice_devlink_enable_roce_set(struct devlink *devlink, u32 id,
-			    struct devlink_param_gset_ctx *ctx)
+static int ice_devlink_enable_roce_set(struct devlink *devlink, u32 id,
+				       struct devlink_param_gset_ctx *ctx,
+				       struct netlink_ext_ack *extack)
 {
 	struct ice_pf *pf = devlink_priv(devlink);
 	bool roce_ena = ctx->val.vbool;
@@ -1195,9 +1195,9 @@ ice_devlink_enable_iw_get(struct devlink *devlink, u32 id,
 	return 0;
 }
 
-static int
-ice_devlink_enable_iw_set(struct devlink *devlink, u32 id,
-			  struct devlink_param_gset_ctx *ctx)
+static int ice_devlink_enable_iw_set(struct devlink *devlink, u32 id,
+				     struct devlink_param_gset_ctx *ctx,
+				     struct netlink_ext_ack *extack)
 {
 	struct ice_pf *pf = devlink_priv(devlink);
 	bool iw_ena = ctx->val.vbool;
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
index 96c04f7d93f8..7498ab429963 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c
@@ -1202,7 +1202,8 @@ static int rvu_af_dl_dwrr_mtu_validate(struct devlink *devlink, u32 id,
 }
 
 static int rvu_af_dl_dwrr_mtu_set(struct devlink *devlink, u32 id,
-				  struct devlink_param_gset_ctx *ctx)
+				  struct devlink_param_gset_ctx *ctx,
+				  struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1256,7 +1257,8 @@ static int rvu_af_npc_exact_feature_get(struct devlink *devlink, u32 id,
 }
 
 static int rvu_af_npc_exact_feature_disable(struct devlink *devlink, u32 id,
-					    struct devlink_param_gset_ctx *ctx)
+					    struct devlink_param_gset_ctx *ctx,
+					    struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1310,7 +1312,8 @@ static int rvu_af_dl_npc_mcam_high_zone_percent_get(struct devlink *devlink, u32
 }
 
 static int rvu_af_dl_npc_mcam_high_zone_percent_set(struct devlink *devlink, u32 id,
-						    struct devlink_param_gset_ctx *ctx)
+						    struct devlink_param_gset_ctx *ctx,
+						    struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
@@ -1367,7 +1370,8 @@ static int rvu_af_dl_nix_maxlf_get(struct devlink *devlink, u32 id,
 }
 
 static int rvu_af_dl_nix_maxlf_set(struct devlink *devlink, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct rvu_devlink *rvu_dl = devlink_priv(devlink);
 	struct rvu *rvu = rvu_dl->rvu;
diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
index 4e1130496573..99ddf31269d9 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c
@@ -32,7 +32,8 @@ static int otx2_dl_mcam_count_validate(struct devlink *devlink, u32 id,
 }
 
 static int otx2_dl_mcam_count_set(struct devlink *devlink, u32 id,
-				  struct devlink_param_gset_ctx *ctx)
+				  struct devlink_param_gset_ctx *ctx,
+				  struct netlink_ext_ack *extack)
 {
 	struct otx2_devlink *otx2_dl = devlink_priv(devlink);
 	struct otx2_nic *pfvf = otx2_dl->pfvf;
diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 7b02ff61126d..98688e4dbec5 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -185,7 +185,8 @@ static int mlx4_devlink_ierr_reset_get(struct devlink *devlink, u32 id,
 }
 
 static int mlx4_devlink_ierr_reset_set(struct devlink *devlink, u32 id,
-				       struct devlink_param_gset_ctx *ctx)
+				       struct devlink_param_gset_ctx *ctx,
+				       struct netlink_ext_ack *extack)
 {
 	mlx4_internal_err_reset = ctx->val.vbool;
 	return 0;
@@ -202,7 +203,8 @@ static int mlx4_devlink_crdump_snapshot_get(struct devlink *devlink, u32 id,
 }
 
 static int mlx4_devlink_crdump_snapshot_set(struct devlink *devlink, u32 id,
-					    struct devlink_param_gset_ctx *ctx)
+					    struct devlink_param_gset_ctx *ctx,
+					    struct netlink_ext_ack *extack)
 {
 	struct mlx4_priv *priv = devlink_priv(devlink);
 	struct mlx4_dev *dev = &priv->dev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 1789800faaeb..17f78091ad30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1805,7 +1805,8 @@ err:
 }
 
 static int mlx5_devlink_esw_multiport_set(struct devlink *devlink, u32 id,
-					  struct devlink_param_gset_ctx *ctx)
+					  struct devlink_param_gset_ctx *ctx,
+					  struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index dfcc7e85e0a3..f378b8176e47 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2413,7 +2413,8 @@ err:
 }
 
 static int esw_port_metadata_set(struct devlink *devlink, u32 id,
-				 struct devlink_param_gset_ctx *ctx)
+				 struct devlink_param_gset_ctx *ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_eswitch *esw = dev->priv.eswitch;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
index cf085a478e3e..32cdacc34a0d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
@@ -3332,7 +3332,8 @@ static int mlx5_fs_mode_validate(struct devlink *devlink, u32 id,
 }
 
 static int mlx5_fs_mode_set(struct devlink *devlink, u32 id,
-			    struct devlink_param_gset_ctx *ctx)
+			    struct devlink_param_gset_ctx *ctx,
+			    struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	enum mlx5_flow_steering_mode mode;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 2911aa34a5be..979c49ae6b5c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -52,7 +52,8 @@ static void mlx5_set_fw_rst_ack(struct mlx5_core_dev *dev)
 }
 
 static int mlx5_fw_reset_enable_remote_dev_reset_set(struct devlink *devlink, u32 id,
-						     struct devlink_param_gset_ctx *ctx)
+						     struct devlink_param_gset_ctx *ctx,
+						     struct netlink_ext_ack *extack)
 {
 	struct mlx5_core_dev *dev = devlink_priv(devlink);
 	struct mlx5_fw_reset *fw_reset;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
index f20052776b3f..baedf0d45e85 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_tcam.c
@@ -1465,7 +1465,8 @@ mlxsw_sp_acl_tcam_region_rehash_intrvl_get(struct devlink *devlink, u32 id,
 
 static int
 mlxsw_sp_acl_tcam_region_rehash_intrvl_set(struct devlink *devlink, u32 id,
-					   struct devlink_param_gset_ctx *ctx)
+					   struct devlink_param_gset_ctx *ctx,
+					   struct netlink_ext_ack *extack)
 {
 	struct mlxsw_core *mlxsw_core = devlink_priv(devlink);
 	struct mlxsw_sp_acl_tcam_vregion *vregion;
diff --git a/drivers/net/ethernet/netronome/nfp/devlink_param.c b/drivers/net/ethernet/netronome/nfp/devlink_param.c
index a655f9e69a7b..0e1a3800f371 100644
--- a/drivers/net/ethernet/netronome/nfp/devlink_param.c
+++ b/drivers/net/ethernet/netronome/nfp/devlink_param.c
@@ -132,7 +132,8 @@ exit_close_nsp:
 
 static int
 nfp_devlink_param_u8_set(struct devlink *devlink, u32 id,
-			 struct devlink_param_gset_ctx *ctx)
+			 struct devlink_param_gset_ctx *ctx,
+			 struct netlink_ext_ack *extack)
 {
 	const struct nfp_devlink_param_u8_arg *arg;
 	struct nfp_pf *pf = devlink_priv(devlink);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
index dad8e617c393..1adc7fbb3f2f 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c
@@ -132,7 +132,8 @@ static int qed_dl_param_get(struct devlink *dl, u32 id,
 }
 
 static int qed_dl_param_set(struct devlink *dl, u32 id,
-			    struct devlink_param_gset_ctx *ctx)
+			    struct devlink_param_gset_ctx *ctx,
+			    struct netlink_ext_ack *extack)
 {
 	struct qed_devlink *qed_dl = devlink_priv(dl);
 	struct qed_dev *cdev;
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index a7dbbbe3ff5d..bf3df86554d1 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -3061,7 +3061,8 @@ static void am65_cpsw_init_port_switch_ale(struct am65_cpsw_port *port)
 }
 
 static int am65_cpsw_dl_switch_mode_set(struct devlink *dl, u32 id,
-					struct devlink_param_gset_ctx *ctx)
+					struct devlink_param_gset_ctx *ctx,
+					struct netlink_ext_ack *extack)
 {
 	struct am65_cpsw_devlink *dl_priv = devlink_priv(dl);
 	struct am65_cpsw_common *cpsw = dl_priv->common;
diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c
index 087dcb67505a..2baa198ebfa0 100644
--- a/drivers/net/ethernet/ti/cpsw_new.c
+++ b/drivers/net/ethernet/ti/cpsw_new.c
@@ -1625,7 +1625,8 @@ static int cpsw_dl_switch_mode_get(struct devlink *dl, u32 id,
 }
 
 static int cpsw_dl_switch_mode_set(struct devlink *dl, u32 id,
-				   struct devlink_param_gset_ctx *ctx)
+				   struct devlink_param_gset_ctx *ctx,
+				   struct netlink_ext_ack *extack)
 {
 	struct cpsw_devlink *dl_priv = devlink_priv(dl);
 	struct cpsw_common *cpsw = dl_priv->cpsw;
@@ -1762,7 +1763,8 @@ static int cpsw_dl_ale_ctrl_get(struct devlink *dl, u32 id,
 }
 
 static int cpsw_dl_ale_ctrl_set(struct devlink *dl, u32 id,
-				struct devlink_param_gset_ctx *ctx)
+				struct devlink_param_gset_ctx *ctx,
+				struct netlink_ext_ack *extack)
 {
 	struct cpsw_devlink *dl_priv = devlink_priv(dl);
 	struct cpsw_common *cpsw = dl_priv->cpsw;
diff --git a/drivers/net/wwan/iosm/iosm_ipc_devlink.c b/drivers/net/wwan/iosm/iosm_ipc_devlink.c
index 2fe724d623c0..bef6819986e9 100644
--- a/drivers/net/wwan/iosm/iosm_ipc_devlink.c
+++ b/drivers/net/wwan/iosm/iosm_ipc_devlink.c
@@ -33,7 +33,8 @@ static int ipc_devlink_get_param(struct devlink *dl, u32 id,
 
 /* Set the param values for the specific param ID's */
 static int ipc_devlink_set_param(struct devlink *dl, u32 id,
-				 struct devlink_param_gset_ctx *ctx)
+				 struct devlink_param_gset_ctx *ctx,
+				 struct netlink_ext_ack *extack)
 {
 	struct iosm_devlink *ipc_devlink = devlink_priv(dl);
 
diff --git a/include/net/devlink.h b/include/net/devlink.h
index d31769a116ce..35eb0f884386 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -483,7 +483,8 @@ struct devlink_param {
 	int (*get)(struct devlink *devlink, u32 id,
 		   struct devlink_param_gset_ctx *ctx);
 	int (*set)(struct devlink *devlink, u32 id,
-		   struct devlink_param_gset_ctx *ctx);
+		   struct devlink_param_gset_ctx *ctx,
+		   struct netlink_ext_ack *extack);
 	int (*validate)(struct devlink *devlink, u32 id,
 			union devlink_param_value val,
 			struct netlink_ext_ack *extack);
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 7edfd8de8882..a6ef7e4c503f 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -1258,7 +1258,8 @@ struct dsa_switch_ops {
 int dsa_devlink_param_get(struct devlink *dl, u32 id,
 			  struct devlink_param_gset_ctx *ctx);
 int dsa_devlink_param_set(struct devlink *dl, u32 id,
-			  struct devlink_param_gset_ctx *ctx);
+			  struct devlink_param_gset_ctx *ctx,
+			  struct netlink_ext_ack *extack);
 int dsa_devlink_params_register(struct dsa_switch *ds,
 				const struct devlink_param *params,
 				size_t params_count);
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 22bc3b500518..dcf0d1ccebba 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -158,11 +158,12 @@ static int devlink_param_get(struct devlink *devlink,
 
 static int devlink_param_set(struct devlink *devlink,
 			     const struct devlink_param *param,
-			     struct devlink_param_gset_ctx *ctx)
+			     struct devlink_param_gset_ctx *ctx,
+			     struct netlink_ext_ack *extack)
 {
 	if (!param->set)
 		return -EOPNOTSUPP;
-	return param->set(devlink, param->id, ctx);
+	return param->set(devlink, param->id, ctx, extack);
 }
 
 static int
@@ -571,7 +572,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
 			return -EOPNOTSUPP;
 		ctx.val = value;
 		ctx.cmode = cmode;
-		err = devlink_param_set(devlink, param, &ctx);
+		err = devlink_param_set(devlink, param, &ctx, info->extack);
 		if (err)
 			return err;
 	}
diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c
index 431bf52290a1..0aac887d0098 100644
--- a/net/dsa/devlink.c
+++ b/net/dsa/devlink.c
@@ -194,7 +194,8 @@ int dsa_devlink_param_get(struct devlink *dl, u32 id,
 EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
 
 int dsa_devlink_param_set(struct devlink *dl, u32 id,
-			  struct devlink_param_gset_ctx *ctx)
+			  struct devlink_param_gset_ctx *ctx,
+			  struct netlink_ext_ack *extack)
 {
 	struct dsa_switch *ds = dsa_devlink_to_ds(dl);
 
-- 
cgit v1.2.3


From 51da7f68edae38e81543d57fd71811f7481c0472 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 22 Apr 2024 10:03:13 -1000
Subject: workqueue: Use "@..." in function comment to describe variable length
 argument

Previously, it was using "remaining args" without leading "@" which isn't
valid. Let's follow snprintf()'s example and use "@...".

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
---
 include/linux/workqueue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2df1188c0f96..fb3993894536 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -473,7 +473,7 @@ void workqueue_softirq_dead(unsigned int cpu);
  * @fmt: printf format for the name of the workqueue
  * @flags: WQ_* flags
  * @max_active: max in-flight work items, 0 for default
- * remaining args: args for @fmt
+ * @...: args for @fmt
  *
  * For a per-cpu workqueue, @max_active limits the number of in-flight work
  * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be
-- 
cgit v1.2.3


From 20516d6e51dd9994afda8d556507cfbe7853384b Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 11 Apr 2024 13:46:14 -0300
Subject: x86: Stop using weak symbols for __iowrite32_copy()

Start switching iomap_copy routines over to use #define and arch provided
inline/macro functions instead of weak symbols.

Inline functions allow more compiler optimization and this is often a
driver hot path.

x86 has the only weak implementation for __iowrite32_copy(), so replace it
with a static inline containing the same single instruction inline
assembly. The compiler will generate the "mov edx,ecx" in a more optimal
way.

Remove iomap_copy_64.S

Link: https://lore.kernel.org/r/1-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 arch/x86/include/asm/io.h    | 17 +++++++++++++++++
 arch/x86/lib/Makefile        |  1 -
 arch/x86/lib/iomap_copy_64.S | 15 ---------------
 include/linux/io.h           |  5 ++++-
 lib/iomap_copy.c             |  6 +++---
 5 files changed, 24 insertions(+), 20 deletions(-)
 delete mode 100644 arch/x86/lib/iomap_copy_64.S

(limited to 'include')

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 294cd2a40818..4b99ed326b17 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t);
 #define memcpy_toio memcpy_toio
 #define memset_io memset_io
 
+#ifdef CONFIG_X86_64
+/*
+ * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for
+ * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy
+ * loop.
+ */
+static inline void __iowrite32_copy(void __iomem *to, const void *from,
+				    size_t count)
+{
+	asm volatile("rep ; movsl"
+		     : "=&c"(count), "=&D"(to), "=&S"(from)
+		     : "0"(count), "1"(to), "2"(from)
+		     : "memory");
+}
+#define __iowrite32_copy __iowrite32_copy
+#endif
+
 /*
  * ISA space is 'always mapped' on a typical x86 system, no need to
  * explicitly ioremap() it. The fact that the ISA IO space is mapped
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 6da73513f026..98583a9dbab3 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
         lib-y += atomic64_386_32.o
 endif
 else
-        obj-y += iomap_copy_64.o
 ifneq ($(CONFIG_GENERIC_CSUM),y)
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
 endif
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
deleted file mode 100644
index 6ff2f56cb0f7..000000000000
--- a/arch/x86/lib/iomap_copy_64.S
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright 2006 PathScale, Inc.  All Rights Reserved.
- */
-
-#include <linux/linkage.h>
-
-/*
- * override generic version in lib/iomap_copy.c
- */
-SYM_FUNC_START(__iowrite32_copy)
-	movl %edx,%ecx
-	rep movsl
-	RET
-SYM_FUNC_END(__iowrite32_copy)
diff --git a/include/linux/io.h b/include/linux/io.h
index 235ba7d80a8f..ce86120ce9d5 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -16,7 +16,10 @@
 struct device;
 struct resource;
 
-__visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
+#ifndef __iowrite32_copy
+void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
+#endif
+
 void __ioread32_copy(void *to, const void __iomem *from, size_t count);
 void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
 
diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c
index 5de7c04e05ef..8ddcbb53507d 100644
--- a/lib/iomap_copy.c
+++ b/lib/iomap_copy.c
@@ -16,9 +16,8 @@
  * time.  Order of access is not guaranteed, nor is a memory barrier
  * performed afterwards.
  */
-void __attribute__((weak)) __iowrite32_copy(void __iomem *to,
-					    const void *from,
-					    size_t count)
+#ifndef __iowrite32_copy
+void __iowrite32_copy(void __iomem *to, const void *from, size_t count)
 {
 	u32 __iomem *dst = to;
 	const u32 *src = from;
@@ -28,6 +27,7 @@ void __attribute__((weak)) __iowrite32_copy(void __iomem *to,
 		__raw_writel(*src++, dst++);
 }
 EXPORT_SYMBOL_GPL(__iowrite32_copy);
+#endif
 
 /**
  * __ioread32_copy - copy data from MMIO space, in 32-bit units
-- 
cgit v1.2.3


From e7bc47b16622d1016b3b77bbdb20fb9e213045f2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 11 Apr 2024 13:46:16 -0300
Subject: s390: Stop using weak symbols for __iowrite64_copy()

Complete switching the __iowriteXX_copy() routines over to use #define and
arch provided inline/macro functions instead of weak symbols.

S390 has an implementation that simply calls another memcpy
function. Inline this so the callers don't have to do two jumps.

Link: https://lore.kernel.org/r/3-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com
Acked-by: Niklas Schnelle <schnelle@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 arch/s390/include/asm/io.h | 7 +++++++
 arch/s390/pci/pci.c        | 6 ------
 include/linux/io.h         | 3 +++
 lib/iomap_copy.c           | 7 +++----
 4 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h
index 00704fc8a54b..0fbc992d7a5e 100644
--- a/arch/s390/include/asm/io.h
+++ b/arch/s390/include/asm/io.h
@@ -81,6 +81,13 @@ static inline void __iowrite32_copy(void __iomem *to, const void *from,
 }
 #define __iowrite32_copy __iowrite32_copy
 
+static inline void __iowrite64_copy(void __iomem *to, const void *from,
+				    size_t count)
+{
+	zpci_memcpy_toio(to, from, count * 8);
+}
+#define __iowrite64_copy __iowrite64_copy
+
 #endif /* CONFIG_PCI */
 
 #include <asm-generic/io.h>
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 26afde0d1ed3..0de0f6e405b5 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -250,12 +250,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
 	return 0;
 }
 
-/* combine single writes by using store-block insn */
-void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
-{
-	zpci_memcpy_toio(to, from, count * 8);
-}
-
 void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
 			   unsigned long prot)
 {
diff --git a/include/linux/io.h b/include/linux/io.h
index ce86120ce9d5..42e132808f00 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -21,7 +21,10 @@ void __iowrite32_copy(void __iomem *to, const void *from, size_t count);
 #endif
 
 void __ioread32_copy(void *to, const void __iomem *from, size_t count);
+
+#ifndef __iowrite64_copy
 void __iowrite64_copy(void __iomem *to, const void *from, size_t count);
+#endif
 
 #ifdef CONFIG_MMU
 int ioremap_page_range(unsigned long addr, unsigned long end,
diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c
index 8ddcbb53507d..2fd5712fb7c0 100644
--- a/lib/iomap_copy.c
+++ b/lib/iomap_copy.c
@@ -60,9 +60,8 @@ EXPORT_SYMBOL_GPL(__ioread32_copy);
  * time.  Order of access is not guaranteed, nor is a memory barrier
  * performed afterwards.
  */
-void __attribute__((weak)) __iowrite64_copy(void __iomem *to,
-					    const void *from,
-					    size_t count)
+#ifndef __iowrite64_copy
+void __iowrite64_copy(void __iomem *to, const void *from, size_t count)
 {
 #ifdef CONFIG_64BIT
 	u64 __iomem *dst = to;
@@ -75,5 +74,5 @@ void __attribute__((weak)) __iowrite64_copy(void __iomem *to,
 	__iowrite32_copy(to, from, count * 2);
 #endif
 }
-
 EXPORT_SYMBOL_GPL(__iowrite64_copy);
+#endif
-- 
cgit v1.2.3


From eca1e8a62888a31c8dcfad79457232594be40c1a Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Fri, 19 Apr 2024 11:04:42 +0300
Subject: net/mlx5e: Use DIM constants for CQ period mode parameter

Use core DIM CQ period mode enum values for the CQ parameter for the period
mode. Translate the value to the specific mlx5 device constant for the
selected period mode when creating a CQ. Avoid needing to translate mlx5
device constants to DIM constants for core DIM functionality.

Co-developed-by: Nabil S. Alramli <dev@nalramli.com>
Signed-off-by: Nabil S. Alramli <dev@nalramli.com>
Co-developed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240419080445.417574-3-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/dim.h   | 26 +++++++++++++++++
 .../net/ethernet/mellanox/mlx5/core/en/params.c    | 34 +++++++---------------
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 16 +++-------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 20 ++++---------
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  7 +++--
 include/linux/mlx5/mlx5_ifc.h                      |  4 +--
 6 files changed, 53 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h
index cd2cf647c85a..6411ae4c6b94 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h
@@ -4,11 +4,37 @@
 #ifndef __MLX5_EN_DIM_H__
 #define __MLX5_EN_DIM_H__
 
+#include <linux/dim.h>
 #include <linux/types.h>
+#include <linux/mlx5/mlx5_ifc.h>
 
 /* Forward declarations */
 struct work_struct;
 
+/* convert a boolean value for cqe mode to appropriate dim constant
+ * true  : DIM_CQ_PERIOD_MODE_START_FROM_CQE
+ * false : DIM_CQ_PERIOD_MODE_START_FROM_EQE
+ */
+static inline int mlx5e_dim_cq_period_mode(bool start_from_cqe)
+{
+	return start_from_cqe ? DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+		DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+}
+
+static inline enum mlx5_cq_period_mode
+mlx5e_cq_period_mode(enum dim_cq_period_mode cq_period_mode)
+{
+	switch (cq_period_mode) {
+	case DIM_CQ_PERIOD_MODE_START_FROM_EQE:
+		return MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	case DIM_CQ_PERIOD_MODE_START_FROM_CQE:
+		return MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
+	default:
+		WARN_ON_ONCE(true);
+		return MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	}
+}
+
 void mlx5e_rx_dim_work(struct work_struct *work);
 void mlx5e_tx_dim_work(struct work_struct *work);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index a3f31d9d527e..0424628405e0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -6,6 +6,7 @@
 #include "en/port.h"
 #include "en_accel/en_accel.h"
 #include "en_accel/ipsec.h"
+#include <linux/dim.h>
 #include <net/page_pool/types.h>
 #include <net/xdp_sock_drv.h>
 
@@ -520,7 +521,7 @@ static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
 	moder.cq_period_mode = cq_period_mode;
 	moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
 	moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
-	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
+	if (cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE)
 		moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE;
 
 	return moder;
@@ -533,39 +534,26 @@ static struct dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
 	moder.cq_period_mode = cq_period_mode;
 	moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
 	moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
-	if (cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE)
+	if (cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE)
 		moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
 
 	return moder;
 }
 
-static u8 mlx5_to_net_dim_cq_period_mode(u8 cq_period_mode)
-{
-	return cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE ?
-		DIM_CQ_PERIOD_MODE_START_FROM_CQE :
-		DIM_CQ_PERIOD_MODE_START_FROM_EQE;
-}
-
 void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode)
 {
-	if (params->tx_dim_enabled) {
-		u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode);
-
-		params->tx_cq_moderation = net_dim_get_def_tx_moderation(dim_period_mode);
-	} else {
+	if (params->tx_dim_enabled)
+		params->tx_cq_moderation = net_dim_get_def_tx_moderation(cq_period_mode);
+	else
 		params->tx_cq_moderation = mlx5e_get_def_tx_moderation(cq_period_mode);
-	}
 }
 
 void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode)
 {
-	if (params->rx_dim_enabled) {
-		u8 dim_period_mode = mlx5_to_net_dim_cq_period_mode(cq_period_mode);
-
-		params->rx_cq_moderation = net_dim_get_def_rx_moderation(dim_period_mode);
-	} else {
+	if (params->rx_dim_enabled)
+		params->rx_cq_moderation = net_dim_get_def_rx_moderation(cq_period_mode);
+	else
 		params->rx_cq_moderation = mlx5e_get_def_rx_moderation(cq_period_mode);
-	}
 }
 
 void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
@@ -573,7 +561,7 @@ void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
 	mlx5e_reset_tx_moderation(params, cq_period_mode);
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER,
 			params->tx_cq_moderation.cq_period_mode ==
-				MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
+				DIM_CQ_PERIOD_MODE_START_FROM_CQE);
 }
 
 void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
@@ -581,7 +569,7 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
 	mlx5e_reset_rx_moderation(params, cq_period_mode);
 	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER,
 			params->rx_cq_moderation.cq_period_mode ==
-				MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
+				DIM_CQ_PERIOD_MODE_START_FROM_CQE);
 }
 
 bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index 75bf7f3d9f25..c07785e675bc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -30,6 +30,7 @@
  * SOFTWARE.
  */
 
+#include <linux/dim.h>
 #include <linux/ethtool_netlink.h>
 
 #include "en.h"
@@ -627,15 +628,6 @@ mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coal
 	}
 }
 
-/* convert a boolean value of cq_mode to mlx5 period mode
- * true  : MLX5_CQ_PERIOD_MODE_START_FROM_CQE
- * false : MLX5_CQ_PERIOD_MODE_START_FROM_EQE
- */
-static int cqe_mode_to_period_mode(bool val)
-{
-	return val ? MLX5_CQ_PERIOD_MODE_START_FROM_CQE : MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
-}
-
 int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 			       struct ethtool_coalesce *coal,
 			       struct kernel_ethtool_coalesce *kernel_coal,
@@ -688,13 +680,13 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 	reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled;
 	reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled;
 
-	cq_period_mode = cqe_mode_to_period_mode(kernel_coal->use_cqe_mode_rx);
+	cq_period_mode = mlx5e_dim_cq_period_mode(kernel_coal->use_cqe_mode_rx);
 	if (cq_period_mode != rx_moder->cq_period_mode) {
 		mlx5e_set_rx_cq_mode_params(&new_params, cq_period_mode);
 		reset_rx = true;
 	}
 
-	cq_period_mode = cqe_mode_to_period_mode(kernel_coal->use_cqe_mode_tx);
+	cq_period_mode = mlx5e_dim_cq_period_mode(kernel_coal->use_cqe_mode_tx);
 	if (cq_period_mode != tx_moder->cq_period_mode) {
 		mlx5e_set_tx_cq_mode_params(&new_params, cq_period_mode);
 		reset_tx = true;
@@ -1915,7 +1907,7 @@ static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable,
 	if (enable && !MLX5_CAP_GEN(priv->mdev, cq_period_start_from_cqe))
 		return -EOPNOTSUPP;
 
-	cq_period_mode = cqe_mode_to_period_mode(enable);
+	cq_period_mode = mlx5e_dim_cq_period_mode(enable);
 
 	current_cq_period_mode = is_rx_cq ?
 		priv->channels.params.rx_cq_moderation.cq_period_mode :
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index cf529f07faf1..12d1f4548343 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -30,6 +30,7 @@
  * SOFTWARE.
  */
 
+#include <linux/dim.h>
 #include <net/tc_act/tc_gact.h>
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
@@ -962,15 +963,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 	}
 
 	INIT_WORK(&rq->dim.work, mlx5e_rx_dim_work);
-
-	switch (params->rx_cq_moderation.cq_period_mode) {
-	case MLX5_CQ_PERIOD_MODE_START_FROM_CQE:
-		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
-		break;
-	case MLX5_CQ_PERIOD_MODE_START_FROM_EQE:
-	default:
-		rq->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
-	}
+	rq->dim.mode = params->rx_cq_moderation.cq_period_mode;
 
 	return 0;
 
@@ -2090,7 +2083,7 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
 				  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
 
-	MLX5_SET(cqc,   cqc, cq_period_mode, param->cq_period_mode);
+	MLX5_SET(cqc,   cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode));
 	MLX5_SET(cqc,   cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
 	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
@@ -5059,13 +5052,12 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16
 	params->packet_merge.timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
 
 	/* CQ moderation params */
-	rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
-			MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
-			MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	rx_cq_period_mode =
+		mlx5e_dim_cq_period_mode(MLX5_CAP_GEN(mdev, cq_period_start_from_cqe));
 	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
 	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
 	mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
-	mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+	mlx5e_set_tx_cq_mode_params(params, DIM_CQ_PERIOD_MODE_START_FROM_EQE);
 
 	/* TX inline */
 	mlx5_query_min_inline(mdev, &params->tx_min_inline_mode);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 6acecf2e7cf6..309771300581 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -30,6 +30,7 @@
  * SOFTWARE.
  */
 
+#include <linux/dim.h>
 #include <linux/debugfs.h>
 #include <linux/mlx5/fs.h>
 #include <net/switchdev.h>
@@ -40,6 +41,7 @@
 
 #include "eswitch.h"
 #include "en.h"
+#include "en/dim.h"
 #include "en_rep.h"
 #include "en/params.h"
 #include "en/txrx.h"
@@ -836,9 +838,8 @@ static void mlx5e_build_rep_params(struct net_device *netdev)
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_params *params;
 
-	u8 cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
-					 MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
-					 MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+	u8 cq_period_mode =
+		mlx5e_dim_cq_period_mode(MLX5_CAP_GEN(mdev, cq_period_start_from_cqe));
 
 	params = &priv->channels.params;
 
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 35ffc9b9f241..8c7ddb22bf20 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -4385,10 +4385,10 @@ enum {
 	MLX5_CQC_ST_FIRED                                 = 0xa,
 };
 
-enum {
+enum mlx5_cq_period_mode {
 	MLX5_CQ_PERIOD_MODE_START_FROM_EQE = 0x0,
 	MLX5_CQ_PERIOD_MODE_START_FROM_CQE = 0x1,
-	MLX5_CQ_PERIOD_NUM_MODES
+	MLX5_CQ_PERIOD_NUM_MODES,
 };
 
 struct mlx5_ifc_cqc_bits {
-- 
cgit v1.2.3


From 445a25f6e1a2f6a132b06af6ede4f3c9b5f9af68 Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Fri, 19 Apr 2024 11:04:44 +0300
Subject: net/mlx5e: Support updating coalescing configuration without
 resetting channels

When CQE mode or DIM state is changed, gracefully reconfigure channels to
handle new configuration. Previously, would create new channels that would
reflect the changes rather than update the original channels.

Co-developed-by: Nabil S. Alramli <dev@nalramli.com>
Signed-off-by: Nabil S. Alramli <dev@nalramli.com>
Co-developed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240419080445.417574-5-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h       |  20 ++
 .../net/ethernet/mellanox/mlx5/core/en/channels.c  |  83 ++++++++
 .../net/ethernet/mellanox/mlx5/core/en/channels.h  |   4 +
 drivers/net/ethernet/mellanox/mlx5/core/en/dim.h   |   4 +
 .../net/ethernet/mellanox/mlx5/core/en/params.c    |  58 ------
 .../net/ethernet/mellanox/mlx5/core/en/params.h    |   5 -
 drivers/net/ethernet/mellanox/mlx5/core/en_dim.c   |  89 ++++++++-
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 141 ++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 210 +++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |   5 +-
 include/linux/mlx5/cq.h                            |   7 +-
 include/linux/mlx5/mlx5_ifc.h                      |   3 +-
 12 files changed, 460 insertions(+), 169 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c8c0a0614e7b..eb09778327cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -320,6 +320,8 @@ struct mlx5e_params {
 	bool scatter_fcs_en;
 	bool rx_dim_enabled;
 	bool tx_dim_enabled;
+	bool rx_moder_use_cqe_mode;
+	bool tx_moder_use_cqe_mode;
 	u32 pflags;
 	struct bpf_prog *xdp_prog;
 	struct mlx5e_xsk *xsk;
@@ -797,6 +799,10 @@ struct mlx5e_channel {
 	int                        cpu;
 	/* Sync between icosq recovery and XSK enable/disable. */
 	struct mutex               icosq_recovery_lock;
+
+	/* coalescing configuration */
+	struct dim_cq_moder        rx_cq_moder;
+	struct dim_cq_moder        tx_cq_moder;
 };
 
 struct mlx5e_ptp;
@@ -1040,6 +1046,11 @@ void mlx5e_close_rq(struct mlx5e_rq *rq);
 int mlx5e_create_rq(struct mlx5e_rq *rq, struct mlx5e_rq_param *param, u16 q_counter);
 void mlx5e_destroy_rq(struct mlx5e_rq *rq);
 
+bool mlx5e_reset_rx_moderation(struct dim_cq_moder *cq_moder, u8 cq_period_mode,
+			       bool dim_enabled);
+bool mlx5e_reset_rx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period_mode,
+					bool dim_enabled, bool keep_dim_state);
+
 struct mlx5e_sq_param;
 int mlx5e_open_xdpsq(struct mlx5e_channel *c, struct mlx5e_params *params,
 		     struct mlx5e_sq_param *param, struct xsk_buff_pool *xsk_pool,
@@ -1060,6 +1071,10 @@ int mlx5e_open_cq(struct mlx5_core_dev *mdev, struct dim_cq_moder moder,
 		  struct mlx5e_cq_param *param, struct mlx5e_create_cq_param *ccp,
 		  struct mlx5e_cq *cq);
 void mlx5e_close_cq(struct mlx5e_cq *cq);
+int mlx5e_modify_cq_period_mode(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+				u8 cq_period_mode);
+int mlx5e_modify_cq_moderation(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			       u16 cq_period, u16 cq_max_count, u8 cq_period_mode);
 
 int mlx5e_open_locked(struct net_device *netdev);
 int mlx5e_close_locked(struct net_device *netdev);
@@ -1118,6 +1133,11 @@ int mlx5e_create_sq_rdy(struct mlx5_core_dev *mdev,
 void mlx5e_tx_err_cqe_work(struct work_struct *recover_work);
 void mlx5e_close_txqsq(struct mlx5e_txqsq *sq);
 
+bool mlx5e_reset_tx_moderation(struct dim_cq_moder *cq_moder, u8 cq_period_mode,
+			       bool dim_enabled);
+bool mlx5e_reset_tx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period_mode,
+					bool dim_enabled, bool keep_dim_state);
+
 static inline bool mlx5_tx_swp_supported(struct mlx5_core_dev *mdev)
 {
 	return MLX5_CAP_ETH(mdev, swp) &&
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
index 874a1016623c..66e719e88503 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.c
@@ -3,6 +3,7 @@
 
 #include "channels.h"
 #include "en.h"
+#include "en/dim.h"
 #include "en/ptp.h"
 
 unsigned int mlx5e_channels_get_num(struct mlx5e_channels *chs)
@@ -55,3 +56,85 @@ bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn)
 	*rqn = c->rq.rqn;
 	return true;
 }
+
+int mlx5e_channels_rx_change_dim(struct mlx5e_channels *chs, bool enable)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++) {
+		int err = mlx5e_dim_rx_change(&chs->c[i]->rq, enable);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+int mlx5e_channels_tx_change_dim(struct mlx5e_channels *chs, bool enable)
+{
+	int i, tc;
+
+	for (i = 0; i < chs->num; i++) {
+		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&chs->params); tc++) {
+			int err = mlx5e_dim_tx_change(&chs->c[i]->sq[tc], enable);
+
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+int mlx5e_channels_rx_toggle_dim(struct mlx5e_channels *chs)
+{
+	int i;
+
+	for (i = 0; i < chs->num; i++) {
+		/* If dim is enabled for the channel, reset the dim state so the
+		 * collected statistics will be reset. This is useful for
+		 * supporting legacy interfaces that allow things like changing
+		 * the CQ period mode for all channels without disturbing
+		 * individual channel configurations.
+		 */
+		if (chs->c[i]->rq.dim) {
+			int err;
+
+			mlx5e_dim_rx_change(&chs->c[i]->rq, false);
+			err = mlx5e_dim_rx_change(&chs->c[i]->rq, true);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
+int mlx5e_channels_tx_toggle_dim(struct mlx5e_channels *chs)
+{
+	int i, tc;
+
+	for (i = 0; i < chs->num; i++) {
+		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&chs->params); tc++) {
+			int err;
+
+			/* If dim is enabled for the channel, reset the dim
+			 * state so the collected statistics will be reset. This
+			 * is useful for supporting legacy interfaces that allow
+			 * things like changing the CQ period mode for all
+			 * channels without disturbing individual channel
+			 * configurations.
+			 */
+			if (!chs->c[i]->sq[tc].dim)
+				continue;
+
+			mlx5e_dim_tx_change(&chs->c[i]->sq[tc], false);
+			err = mlx5e_dim_tx_change(&chs->c[i]->sq[tc], true);
+			if (err)
+				return err;
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
index 6715aa9383b9..eda80f8c6c02 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/channels.h
@@ -15,5 +15,9 @@ void mlx5e_channels_get_regular_rqn(struct mlx5e_channels *chs, unsigned int ix,
 void mlx5e_channels_get_xsk_rqn(struct mlx5e_channels *chs, unsigned int ix, u32 *rqn,
 				u32 *vhca_id);
 bool mlx5e_channels_get_ptp_rqn(struct mlx5e_channels *chs, u32 *rqn);
+int mlx5e_channels_rx_change_dim(struct mlx5e_channels *chs, bool enabled);
+int mlx5e_channels_tx_change_dim(struct mlx5e_channels *chs, bool enabled);
+int mlx5e_channels_rx_toggle_dim(struct mlx5e_channels *chs);
+int mlx5e_channels_tx_toggle_dim(struct mlx5e_channels *chs);
 
 #endif /* __MLX5_EN_CHANNELS_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h
index 6411ae4c6b94..110e2c6b7e51 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dim.h
@@ -9,6 +9,8 @@
 #include <linux/mlx5/mlx5_ifc.h>
 
 /* Forward declarations */
+struct mlx5e_rq;
+struct mlx5e_txqsq;
 struct work_struct;
 
 /* convert a boolean value for cqe mode to appropriate dim constant
@@ -37,5 +39,7 @@ mlx5e_cq_period_mode(enum dim_cq_period_mode cq_period_mode)
 
 void mlx5e_rx_dim_work(struct work_struct *work);
 void mlx5e_tx_dim_work(struct work_struct *work);
+int mlx5e_dim_rx_change(struct mlx5e_rq *rq, bool enabled);
+int mlx5e_dim_tx_change(struct mlx5e_txqsq *sq, bool enabled);
 
 #endif /* __MLX5_EN_DIM_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
index 0424628405e0..ec819dfc98be 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
@@ -514,64 +514,6 @@ int mlx5e_validate_params(struct mlx5_core_dev *mdev, struct mlx5e_params *param
 	return 0;
 }
 
-static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
-{
-	struct dim_cq_moder moder = {};
-
-	moder.cq_period_mode = cq_period_mode;
-	moder.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
-	moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC;
-	if (cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE)
-		moder.usec = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE;
-
-	return moder;
-}
-
-static struct dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
-{
-	struct dim_cq_moder moder = {};
-
-	moder.cq_period_mode = cq_period_mode;
-	moder.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS;
-	moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC;
-	if (cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE)
-		moder.usec = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
-
-	return moder;
-}
-
-void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode)
-{
-	if (params->tx_dim_enabled)
-		params->tx_cq_moderation = net_dim_get_def_tx_moderation(cq_period_mode);
-	else
-		params->tx_cq_moderation = mlx5e_get_def_tx_moderation(cq_period_mode);
-}
-
-void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode)
-{
-	if (params->rx_dim_enabled)
-		params->rx_cq_moderation = net_dim_get_def_rx_moderation(cq_period_mode);
-	else
-		params->rx_cq_moderation = mlx5e_get_def_rx_moderation(cq_period_mode);
-}
-
-void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
-{
-	mlx5e_reset_tx_moderation(params, cq_period_mode);
-	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_TX_CQE_BASED_MODER,
-			params->tx_cq_moderation.cq_period_mode ==
-				DIM_CQ_PERIOD_MODE_START_FROM_CQE);
-}
-
-void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
-{
-	mlx5e_reset_rx_moderation(params, cq_period_mode);
-	MLX5E_SET_PFLAG(params, MLX5E_PFLAG_RX_CQE_BASED_MODER,
-			params->rx_cq_moderation.cq_period_mode ==
-				DIM_CQ_PERIOD_MODE_START_FROM_CQE);
-}
-
 bool slow_pci_heuristic(struct mlx5_core_dev *mdev)
 {
 	u32 link_speed = 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
index 9a781f18b57f..749b2ec0436e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.h
@@ -77,11 +77,6 @@ u8 mlx5e_mpwrq_max_log_rq_pkts(struct mlx5_core_dev *mdev, u8 page_shift,
 
 /* Parameter calculations */
 
-void mlx5e_reset_tx_moderation(struct mlx5e_params *params, u8 cq_period_mode);
-void mlx5e_reset_rx_moderation(struct mlx5e_params *params, u8 cq_period_mode);
-void mlx5e_set_tx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode);
-void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode);
-
 bool slow_pci_heuristic(struct mlx5_core_dev *mdev);
 int mlx5e_mpwrq_validate_regular(struct mlx5_core_dev *mdev, struct mlx5e_params *params);
 int mlx5e_mpwrq_validate_xsk(struct mlx5_core_dev *mdev, struct mlx5e_params *params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
index 106a1f70dd9a..298bb74ec5e9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dim.c
@@ -37,7 +37,8 @@ static void
 mlx5e_complete_dim_work(struct dim *dim, struct dim_cq_moder moder,
 			struct mlx5_core_dev *mdev, struct mlx5_core_cq *mcq)
 {
-	mlx5_core_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts);
+	mlx5e_modify_cq_moderation(mdev, mcq, moder.usec, moder.pkts,
+				   mlx5e_cq_period_mode(moder.cq_period_mode));
 	dim->state = DIM_START_MEASURE;
 }
 
@@ -60,3 +61,89 @@ void mlx5e_tx_dim_work(struct work_struct *work)
 
 	mlx5e_complete_dim_work(dim, cur_moder, sq->cq.mdev, &sq->cq.mcq);
 }
+
+static struct dim *mlx5e_dim_enable(struct mlx5_core_dev *mdev,
+				    void (*work_fun)(struct work_struct *), int cpu,
+				    u8 cq_period_mode, struct mlx5_core_cq *mcq,
+				    void *queue)
+{
+	struct dim *dim;
+	int err;
+
+	dim = kvzalloc_node(sizeof(*dim), GFP_KERNEL, cpu_to_node(cpu));
+	if (!dim)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_WORK(&dim->work, work_fun);
+
+	dim->mode = cq_period_mode;
+	dim->priv = queue;
+
+	err = mlx5e_modify_cq_period_mode(mdev, mcq, dim->mode);
+	if (err) {
+		kvfree(dim);
+		return ERR_PTR(err);
+	}
+
+	return dim;
+}
+
+static void mlx5e_dim_disable(struct dim *dim)
+{
+	cancel_work_sync(&dim->work);
+	kvfree(dim);
+}
+
+int mlx5e_dim_rx_change(struct mlx5e_rq *rq, bool enable)
+{
+	if (enable == !!rq->dim)
+		return 0;
+
+	if (enable) {
+		struct mlx5e_channel *c = rq->channel;
+		struct dim *dim;
+
+		dim = mlx5e_dim_enable(rq->mdev, mlx5e_rx_dim_work, c->cpu,
+				       c->rx_cq_moder.cq_period_mode, &rq->cq.mcq, rq);
+		if (IS_ERR(dim))
+			return PTR_ERR(dim);
+
+		rq->dim = dim;
+
+		__set_bit(MLX5E_RQ_STATE_DIM, &rq->state);
+	} else {
+		__clear_bit(MLX5E_RQ_STATE_DIM, &rq->state);
+
+		mlx5e_dim_disable(rq->dim);
+		rq->dim = NULL;
+	}
+
+	return 0;
+}
+
+int mlx5e_dim_tx_change(struct mlx5e_txqsq *sq, bool enable)
+{
+	if (enable == !!sq->dim)
+		return 0;
+
+	if (enable) {
+		struct mlx5e_channel *c = sq->channel;
+		struct dim *dim;
+
+		dim = mlx5e_dim_enable(sq->mdev, mlx5e_tx_dim_work, c->cpu,
+				       c->tx_cq_moder.cq_period_mode, &sq->cq.mcq, sq);
+		if (IS_ERR(dim))
+			return PTR_ERR(dim);
+
+		sq->dim = dim;
+
+		__set_bit(MLX5E_SQ_STATE_DIM, &sq->state);
+	} else {
+		__clear_bit(MLX5E_SQ_STATE_DIM, &sq->state);
+
+		mlx5e_dim_disable(sq->dim);
+		sq->dim = NULL;
+	}
+
+	return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index c07785e675bc..c968874569cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -34,6 +34,7 @@
 #include <linux/ethtool_netlink.h>
 
 #include "en.h"
+#include "en/channels.h"
 #include "en/dim.h"
 #include "en/port.h"
 #include "en/params.h"
@@ -567,16 +568,13 @@ int mlx5e_ethtool_get_coalesce(struct mlx5e_priv *priv,
 	coal->rx_coalesce_usecs		= rx_moder->usec;
 	coal->rx_max_coalesced_frames	= rx_moder->pkts;
 	coal->use_adaptive_rx_coalesce	= priv->channels.params.rx_dim_enabled;
+	kernel_coal->use_cqe_mode_rx    = priv->channels.params.rx_moder_use_cqe_mode;
 
 	tx_moder = &priv->channels.params.tx_cq_moderation;
 	coal->tx_coalesce_usecs		= tx_moder->usec;
 	coal->tx_max_coalesced_frames	= tx_moder->pkts;
 	coal->use_adaptive_tx_coalesce	= priv->channels.params.tx_dim_enabled;
-
-	kernel_coal->use_cqe_mode_rx =
-		MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_RX_CQE_BASED_MODER);
-	kernel_coal->use_cqe_mode_tx =
-		MLX5E_GET_PFLAG(&priv->channels.params, MLX5E_PFLAG_TX_CQE_BASED_MODER);
+	kernel_coal->use_cqe_mode_tx    = priv->channels.params.tx_moder_use_cqe_mode;
 
 	return 0;
 }
@@ -595,7 +593,7 @@ static int mlx5e_get_coalesce(struct net_device *netdev,
 #define MLX5E_MAX_COAL_FRAMES		MLX5_MAX_CQ_COUNT
 
 static void
-mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal)
+mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct dim_cq_moder *moder)
 {
 	int tc;
 	int i;
@@ -603,28 +601,34 @@ mlx5e_set_priv_channels_tx_coalesce(struct mlx5e_priv *priv, struct ethtool_coal
 	for (i = 0; i < priv->channels.num; ++i) {
 		struct mlx5e_channel *c = priv->channels.c[i];
 		struct mlx5_core_dev *mdev = c->mdev;
+		enum mlx5_cq_period_mode mode;
+
+		mode = mlx5e_cq_period_mode(moder->cq_period_mode);
+		c->tx_cq_moder = *moder;
 
 		for (tc = 0; tc < c->num_tc; tc++) {
-			mlx5_core_modify_cq_moderation(mdev,
-						&c->sq[tc].cq.mcq,
-						coal->tx_coalesce_usecs,
-						coal->tx_max_coalesced_frames);
+			mlx5e_modify_cq_moderation(mdev, &c->sq[tc].cq.mcq,
+						   moder->usec, moder->pkts,
+						   mode);
 		}
 	}
 }
 
 static void
-mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct ethtool_coalesce *coal)
+mlx5e_set_priv_channels_rx_coalesce(struct mlx5e_priv *priv, struct dim_cq_moder *moder)
 {
 	int i;
 
 	for (i = 0; i < priv->channels.num; ++i) {
 		struct mlx5e_channel *c = priv->channels.c[i];
 		struct mlx5_core_dev *mdev = c->mdev;
+		enum mlx5_cq_period_mode mode;
+
+		mode = mlx5e_cq_period_mode(moder->cq_period_mode);
+		c->rx_cq_moder = *moder;
 
-		mlx5_core_modify_cq_moderation(mdev, &c->rq.cq.mcq,
-					       coal->rx_coalesce_usecs,
-					       coal->rx_max_coalesced_frames);
+		mlx5e_modify_cq_moderation(mdev, &c->rq.cq.mcq, moder->usec, moder->pkts,
+					   mode);
 	}
 }
 
@@ -635,13 +639,14 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 {
 	struct dim_cq_moder *rx_moder, *tx_moder;
 	struct mlx5_core_dev *mdev = priv->mdev;
+	bool rx_dim_enabled, tx_dim_enabled;
 	struct mlx5e_params new_params;
 	bool reset_rx, reset_tx;
-	bool reset = true;
 	u8 cq_period_mode;
 	int err = 0;
 
-	if (!MLX5_CAP_GEN(mdev, cq_moderation))
+	if (!MLX5_CAP_GEN(mdev, cq_moderation) ||
+	    !MLX5_CAP_GEN(mdev, cq_period_mode_modify))
 		return -EOPNOTSUPP;
 
 	if (coal->tx_coalesce_usecs > MLX5E_MAX_COAL_TIME ||
@@ -664,60 +669,70 @@ int mlx5e_ethtool_set_coalesce(struct mlx5e_priv *priv,
 		return -EOPNOTSUPP;
 	}
 
+	rx_dim_enabled = !!coal->use_adaptive_rx_coalesce;
+	tx_dim_enabled = !!coal->use_adaptive_tx_coalesce;
+
 	mutex_lock(&priv->state_lock);
 	new_params = priv->channels.params;
 
-	rx_moder          = &new_params.rx_cq_moderation;
-	rx_moder->usec    = coal->rx_coalesce_usecs;
-	rx_moder->pkts    = coal->rx_max_coalesced_frames;
-	new_params.rx_dim_enabled = !!coal->use_adaptive_rx_coalesce;
+	cq_period_mode = mlx5e_dim_cq_period_mode(kernel_coal->use_cqe_mode_rx);
+	reset_rx = mlx5e_reset_rx_channels_moderation(&priv->channels, cq_period_mode,
+						      rx_dim_enabled, false);
+	MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_CQE_BASED_MODER, cq_period_mode);
 
-	tx_moder          = &new_params.tx_cq_moderation;
-	tx_moder->usec    = coal->tx_coalesce_usecs;
-	tx_moder->pkts    = coal->tx_max_coalesced_frames;
-	new_params.tx_dim_enabled = !!coal->use_adaptive_tx_coalesce;
+	cq_period_mode = mlx5e_dim_cq_period_mode(kernel_coal->use_cqe_mode_tx);
+	reset_tx = mlx5e_reset_tx_channels_moderation(&priv->channels, cq_period_mode,
+						      tx_dim_enabled, false);
+	MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_CQE_BASED_MODER, cq_period_mode);
 
-	reset_rx = !!coal->use_adaptive_rx_coalesce != priv->channels.params.rx_dim_enabled;
-	reset_tx = !!coal->use_adaptive_tx_coalesce != priv->channels.params.tx_dim_enabled;
+	reset_rx |= rx_dim_enabled != new_params.rx_dim_enabled;
+	reset_tx |= tx_dim_enabled != new_params.tx_dim_enabled;
 
-	cq_period_mode = mlx5e_dim_cq_period_mode(kernel_coal->use_cqe_mode_rx);
-	if (cq_period_mode != rx_moder->cq_period_mode) {
-		mlx5e_set_rx_cq_mode_params(&new_params, cq_period_mode);
-		reset_rx = true;
-	}
+	/* Solely used for global ethtool get coalesce */
+	rx_moder = &new_params.rx_cq_moderation;
+	new_params.rx_dim_enabled = rx_dim_enabled;
+	new_params.rx_moder_use_cqe_mode = kernel_coal->use_cqe_mode_rx;
 
-	cq_period_mode = mlx5e_dim_cq_period_mode(kernel_coal->use_cqe_mode_tx);
-	if (cq_period_mode != tx_moder->cq_period_mode) {
-		mlx5e_set_tx_cq_mode_params(&new_params, cq_period_mode);
-		reset_tx = true;
-	}
+	tx_moder = &new_params.tx_cq_moderation;
+	new_params.tx_dim_enabled = tx_dim_enabled;
+	new_params.tx_moder_use_cqe_mode = kernel_coal->use_cqe_mode_tx;
 
 	if (reset_rx) {
-		u8 mode = MLX5E_GET_PFLAG(&new_params,
-					  MLX5E_PFLAG_RX_CQE_BASED_MODER);
+		mlx5e_channels_rx_change_dim(&priv->channels, false);
+		mlx5e_reset_rx_moderation(rx_moder, new_params.rx_moder_use_cqe_mode,
+					  rx_dim_enabled);
 
-		mlx5e_reset_rx_moderation(&new_params, mode);
+		mlx5e_set_priv_channels_rx_coalesce(priv, rx_moder);
+	} else if (!rx_dim_enabled) {
+		rx_moder->usec = coal->rx_coalesce_usecs;
+		rx_moder->pkts = coal->rx_max_coalesced_frames;
+
+		mlx5e_set_priv_channels_rx_coalesce(priv, rx_moder);
 	}
+
 	if (reset_tx) {
-		u8 mode = MLX5E_GET_PFLAG(&new_params,
-					  MLX5E_PFLAG_TX_CQE_BASED_MODER);
+		mlx5e_channels_tx_change_dim(&priv->channels, false);
+		mlx5e_reset_tx_moderation(tx_moder, new_params.tx_moder_use_cqe_mode,
+					  tx_dim_enabled);
 
-		mlx5e_reset_tx_moderation(&new_params, mode);
-	}
+		mlx5e_set_priv_channels_tx_coalesce(priv, tx_moder);
+	} else if (!tx_dim_enabled) {
+		tx_moder->usec = coal->tx_coalesce_usecs;
+		tx_moder->pkts = coal->tx_max_coalesced_frames;
 
-	/* If DIM state hasn't changed, it's possible to modify interrupt
-	 * moderation parameters on the fly, even if the channels are open.
-	 */
-	if (!reset_rx && !reset_tx && test_bit(MLX5E_STATE_OPENED, &priv->state)) {
-		if (!coal->use_adaptive_rx_coalesce)
-			mlx5e_set_priv_channels_rx_coalesce(priv, coal);
-		if (!coal->use_adaptive_tx_coalesce)
-			mlx5e_set_priv_channels_tx_coalesce(priv, coal);
-		reset = false;
+		mlx5e_set_priv_channels_tx_coalesce(priv, tx_moder);
 	}
 
-	err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, reset);
+	/* DIM enable/disable Rx and Tx channels */
+	err = mlx5e_channels_rx_change_dim(&priv->channels, rx_dim_enabled);
+	if (err)
+		goto state_unlock;
+	err = mlx5e_channels_tx_change_dim(&priv->channels, tx_dim_enabled);
+	if (err)
+		goto state_unlock;
 
+	err = mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, false);
+state_unlock:
 	mutex_unlock(&priv->state_lock);
 	return err;
 }
@@ -1917,12 +1932,22 @@ static int set_pflag_cqe_based_moder(struct net_device *netdev, bool enable,
 		return 0;
 
 	new_params = priv->channels.params;
-	if (is_rx_cq)
-		mlx5e_set_rx_cq_mode_params(&new_params, cq_period_mode);
-	else
-		mlx5e_set_tx_cq_mode_params(&new_params, cq_period_mode);
+	if (is_rx_cq) {
+		mlx5e_reset_rx_channels_moderation(&priv->channels, cq_period_mode,
+						   false, true);
+		mlx5e_channels_rx_toggle_dim(&priv->channels);
+		MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_RX_CQE_BASED_MODER,
+				cq_period_mode);
+	} else {
+		mlx5e_reset_tx_channels_moderation(&priv->channels, cq_period_mode,
+						   false, true);
+		mlx5e_channels_tx_toggle_dim(&priv->channels);
+		MLX5E_SET_PFLAG(&new_params, MLX5E_PFLAG_TX_CQE_BASED_MODER,
+				cq_period_mode);
+	}
 
-	return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, true);
+	/* Update pflags of existing channels without resetting them */
+	return mlx5e_safe_switch_params(priv, &new_params, NULL, NULL, false);
 }
 
 static int set_pflag_tx_cqe_based_moder(struct net_device *netdev, bool enable)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 8b4ecae0fd9f..3bd0695845c7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -962,20 +962,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params,
 		}
 	}
 
-	rq->dim = kvzalloc_node(sizeof(*rq->dim), GFP_KERNEL, node);
-	if (!rq->dim) {
-		err = -ENOMEM;
-		goto err_unreg_xdp_rxq_info;
-	}
-
-	rq->dim->priv = rq;
-	INIT_WORK(&rq->dim->work, mlx5e_rx_dim_work);
-	rq->dim->mode = params->rx_cq_moderation.cq_period_mode;
-
 	return 0;
 
-err_unreg_xdp_rxq_info:
-	xdp_rxq_info_unreg(&rq->xdp_rxq);
 err_destroy_page_pool:
 	page_pool_destroy(rq->page_pool);
 err_free_by_rq_type:
@@ -1304,8 +1292,21 @@ int mlx5e_open_rq(struct mlx5e_params *params, struct mlx5e_rq_param *param,
 	if (MLX5_CAP_ETH(mdev, cqe_checksum_full))
 		__set_bit(MLX5E_RQ_STATE_CSUM_FULL, &rq->state);
 
-	if (params->rx_dim_enabled)
-		__set_bit(MLX5E_RQ_STATE_DIM, &rq->state);
+	if (rq->channel && !params->rx_dim_enabled) {
+		rq->channel->rx_cq_moder = params->rx_cq_moderation;
+	} else if (rq->channel) {
+		u8 cq_period_mode;
+
+		cq_period_mode = params->rx_moder_use_cqe_mode ?
+					 DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+					 DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+		mlx5e_reset_rx_moderation(&rq->channel->rx_cq_moder, cq_period_mode,
+					  params->rx_dim_enabled);
+
+		err = mlx5e_dim_rx_change(rq, params->rx_dim_enabled);
+		if (err)
+			goto err_destroy_rq;
+	}
 
 	/* We disable csum_complete when XDP is enabled since
 	 * XDP programs might manipulate packets which will render
@@ -1351,7 +1352,8 @@ void mlx5e_deactivate_rq(struct mlx5e_rq *rq)
 
 void mlx5e_close_rq(struct mlx5e_rq *rq)
 {
-	cancel_work_sync(&rq->dim->work);
+	if (rq->dim)
+		cancel_work_sync(&rq->dim->work);
 	cancel_work_sync(&rq->recover_work);
 	mlx5e_destroy_rq(rq);
 	mlx5e_free_rx_descs(rq);
@@ -1626,20 +1628,9 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c,
 	err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
 	if (err)
 		goto err_sq_wq_destroy;
-	sq->dim = kvzalloc_node(sizeof(*sq->dim), GFP_KERNEL, cpu_to_node(c->cpu));
-	if (!sq->dim) {
-		err = -ENOMEM;
-		goto err_free_txqsq_db;
-	}
-
-	sq->dim->priv = sq;
-	INIT_WORK(&sq->dim->work, mlx5e_tx_dim_work);
-	sq->dim->mode = params->tx_cq_moderation.cq_period_mode;
 
 	return 0;
 
-err_free_txqsq_db:
-	mlx5e_free_txqsq_db(sq);
 err_sq_wq_destroy:
 	mlx5_wq_destroy(&sq->wq_ctrl);
 
@@ -1804,11 +1795,27 @@ int mlx5e_open_txqsq(struct mlx5e_channel *c, u32 tisn, int txq_ix,
 	if (tx_rate)
 		mlx5e_set_sq_maxrate(c->netdev, sq, tx_rate);
 
-	if (params->tx_dim_enabled)
-		sq->state |= BIT(MLX5E_SQ_STATE_DIM);
+	if (sq->channel && !params->tx_dim_enabled) {
+		sq->channel->tx_cq_moder = params->tx_cq_moderation;
+	} else if (sq->channel) {
+		u8 cq_period_mode;
+
+		cq_period_mode = params->tx_moder_use_cqe_mode ?
+					 DIM_CQ_PERIOD_MODE_START_FROM_CQE :
+					 DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+		mlx5e_reset_tx_moderation(&sq->channel->tx_cq_moder,
+					  cq_period_mode,
+					  params->tx_dim_enabled);
+
+		err = mlx5e_dim_tx_change(sq, params->tx_dim_enabled);
+		if (err)
+			goto err_destroy_sq;
+	}
 
 	return 0;
 
+err_destroy_sq:
+	mlx5e_destroy_sq(c->mdev, sq->sqn);
 err_free_txqsq:
 	mlx5e_free_txqsq(sq);
 
@@ -1860,7 +1867,8 @@ void mlx5e_close_txqsq(struct mlx5e_txqsq *sq)
 	struct mlx5_core_dev *mdev = sq->mdev;
 	struct mlx5_rate_limit rl = {0};
 
-	cancel_work_sync(&sq->dim->work);
+	if (sq->dim)
+		cancel_work_sync(&sq->dim->work);
 	cancel_work_sync(&sq->recover_work);
 	mlx5e_destroy_sq(mdev, sq->sqn);
 	if (sq->rate_limit) {
@@ -1879,6 +1887,49 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
 	mlx5e_reporter_tx_err_cqe(sq);
 }
 
+static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
+{
+	return (struct dim_cq_moder) {
+		.cq_period_mode = cq_period_mode,
+		.pkts = MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS,
+		.usec = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+				MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC_FROM_CQE :
+				MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_USEC,
+	};
+}
+
+bool mlx5e_reset_tx_moderation(struct dim_cq_moder *cq_moder, u8 cq_period_mode,
+			       bool dim_enabled)
+{
+	bool reset_needed = cq_moder->cq_period_mode != cq_period_mode;
+
+	if (dim_enabled)
+		*cq_moder = net_dim_get_def_tx_moderation(cq_period_mode);
+	else
+		*cq_moder = mlx5e_get_def_tx_moderation(cq_period_mode);
+
+	return reset_needed;
+}
+
+bool mlx5e_reset_tx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period_mode,
+					bool dim_enabled, bool keep_dim_state)
+{
+	bool reset = false;
+	int i, tc;
+
+	for (i = 0; i < chs->num; i++) {
+		for (tc = 0; tc < mlx5e_get_dcb_num_tc(&chs->params); tc++) {
+			if (keep_dim_state)
+				dim_enabled = !!chs->c[i]->sq[tc].dim;
+
+			reset |= mlx5e_reset_tx_moderation(&chs->c[i]->tx_cq_moder,
+							   cq_period_mode, dim_enabled);
+		}
+	}
+
+	return reset;
+}
+
 static int mlx5e_open_icosq(struct mlx5e_channel *c, struct mlx5e_params *params,
 			    struct mlx5e_sq_param *param, struct mlx5e_icosq *sq,
 			    work_func_t recover_work_func)
@@ -2102,7 +2153,8 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param)
 	mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
 				  (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
 
-	MLX5_SET(cqc,   cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode));
+	MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(param->cq_period_mode));
+
 	MLX5_SET(cqc,   cqc, c_eqn_or_apu_element, eqn);
 	MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
 	MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
@@ -2140,8 +2192,10 @@ int mlx5e_open_cq(struct mlx5_core_dev *mdev, struct dim_cq_moder moder,
 	if (err)
 		goto err_free_cq;
 
-	if (MLX5_CAP_GEN(mdev, cq_moderation))
-		mlx5_core_modify_cq_moderation(mdev, &cq->mcq, moder.usec, moder.pkts);
+	if (MLX5_CAP_GEN(mdev, cq_moderation) &&
+	    MLX5_CAP_GEN(mdev, cq_period_mode_modify))
+		mlx5e_modify_cq_moderation(mdev, &cq->mcq, moder.usec, moder.pkts,
+					   mlx5e_cq_period_mode(moder.cq_period_mode));
 	return 0;
 
 err_free_cq:
@@ -2156,6 +2210,40 @@ void mlx5e_close_cq(struct mlx5e_cq *cq)
 	mlx5e_free_cq(cq);
 }
 
+int mlx5e_modify_cq_period_mode(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+				u8 cq_period_mode)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {};
+	void *cqc;
+
+	MLX5_SET(modify_cq_in, in, cqn, cq->cqn);
+	cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, cq_period_mode, mlx5e_cq_period_mode(cq_period_mode));
+	MLX5_SET(modify_cq_in, in,
+		 modify_field_select_resize_field_select.modify_field_select.modify_field_select,
+		 MLX5_CQ_MODIFY_PERIOD_MODE);
+
+	return mlx5_core_modify_cq(dev, cq, in, sizeof(in));
+}
+
+int mlx5e_modify_cq_moderation(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			       u16 cq_period, u16 cq_max_count, u8 cq_period_mode)
+{
+	u32 in[MLX5_ST_SZ_DW(modify_cq_in)] = {};
+	void *cqc;
+
+	MLX5_SET(modify_cq_in, in, cqn, cq->cqn);
+	cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context);
+	MLX5_SET(cqc, cqc, cq_period, cq_period);
+	MLX5_SET(cqc, cqc, cq_max_count, cq_max_count);
+	MLX5_SET(cqc, cqc, cq_period_mode, cq_period_mode);
+	MLX5_SET(modify_cq_in, in,
+		 modify_field_select_resize_field_select.modify_field_select.modify_field_select,
+		 MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT | MLX5_CQ_MODIFY_PERIOD_MODE);
+
+	return mlx5_core_modify_cq(dev, cq, in, sizeof(in));
+}
+
 static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
 			     struct mlx5e_params *params,
 			     struct mlx5e_create_cq_param *ccp,
@@ -3973,6 +4061,47 @@ static int set_feature_rx_all(struct net_device *netdev, bool enable)
 	return mlx5_set_port_fcs(mdev, !enable);
 }
 
+static struct dim_cq_moder mlx5e_get_def_rx_moderation(u8 cq_period_mode)
+{
+	return (struct dim_cq_moder) {
+		.cq_period_mode = cq_period_mode,
+		.pkts = MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS,
+		.usec = cq_period_mode == DIM_CQ_PERIOD_MODE_START_FROM_CQE ?
+				MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE :
+				MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC,
+	};
+}
+
+bool mlx5e_reset_rx_moderation(struct dim_cq_moder *cq_moder, u8 cq_period_mode,
+			       bool dim_enabled)
+{
+	bool reset_needed = cq_moder->cq_period_mode != cq_period_mode;
+
+	if (dim_enabled)
+		*cq_moder = net_dim_get_def_rx_moderation(cq_period_mode);
+	else
+		*cq_moder = mlx5e_get_def_rx_moderation(cq_period_mode);
+
+	return reset_needed;
+}
+
+bool mlx5e_reset_rx_channels_moderation(struct mlx5e_channels *chs, u8 cq_period_mode,
+					bool dim_enabled, bool keep_dim_state)
+{
+	bool reset = false;
+	int i;
+
+	for (i = 0; i < chs->num; i++) {
+		if (keep_dim_state)
+			dim_enabled = !!chs->c[i]->rq.dim;
+
+		reset |= mlx5e_reset_rx_moderation(&chs->c[i]->rx_cq_moder,
+						   cq_period_mode, dim_enabled);
+	}
+
+	return reset;
+}
+
 static int mlx5e_set_rx_port_ts(struct mlx5_core_dev *mdev, bool enable)
 {
 	u32 in[MLX5_ST_SZ_DW(pcmr_reg)] = {};
@@ -5037,7 +5166,6 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16
 {
 	struct mlx5e_params *params = &priv->channels.params;
 	struct mlx5_core_dev *mdev = priv->mdev;
-	u8 rx_cq_period_mode;
 
 	params->sw_mtu = mtu;
 	params->hard_mtu = MLX5E_ETH_HARD_MTU;
@@ -5071,12 +5199,16 @@ void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16
 	params->packet_merge.timeout = mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
 
 	/* CQ moderation params */
-	rx_cq_period_mode =
-		mlx5e_dim_cq_period_mode(MLX5_CAP_GEN(mdev, cq_period_start_from_cqe));
-	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
-	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
-	mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
-	mlx5e_set_tx_cq_mode_params(params, DIM_CQ_PERIOD_MODE_START_FROM_EQE);
+	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation) &&
+				 MLX5_CAP_GEN(mdev, cq_period_mode_modify);
+	params->tx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation) &&
+				 MLX5_CAP_GEN(mdev, cq_period_mode_modify);
+	params->rx_moder_use_cqe_mode = !!MLX5_CAP_GEN(mdev, cq_period_start_from_cqe);
+	params->tx_moder_use_cqe_mode = false;
+	mlx5e_reset_rx_moderation(&params->rx_cq_moderation, params->rx_moder_use_cqe_mode,
+				  params->rx_dim_enabled);
+	mlx5e_reset_tx_moderation(&params->tx_cq_moderation, params->tx_moder_use_cqe_mode,
+				  params->tx_dim_enabled);
 
 	/* TX inline */
 	mlx5_query_min_inline(mdev, &params->tx_min_inline_mode);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 309771300581..6477b91ff512 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -838,9 +838,6 @@ static void mlx5e_build_rep_params(struct net_device *netdev)
 	struct mlx5_core_dev *mdev = priv->mdev;
 	struct mlx5e_params *params;
 
-	u8 cq_period_mode =
-		mlx5e_dim_cq_period_mode(MLX5_CAP_GEN(mdev, cq_period_start_from_cqe));
-
 	params = &priv->channels.params;
 
 	params->num_channels = MLX5E_REP_PARAMS_DEF_NUM_CHANNELS;
@@ -868,7 +865,7 @@ static void mlx5e_build_rep_params(struct net_device *netdev)
 
 	/* CQ moderation params */
 	params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
-	mlx5e_set_rx_cq_mode_params(params, cq_period_mode);
+	params->rx_moder_use_cqe_mode = !!MLX5_CAP_GEN(mdev, cq_period_start_from_cqe);
 
 	params->mqprio.num_tc       = 1;
 	if (rep->vport != MLX5_VPORT_UPLINK)
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
index cb15308b5cb0..991526039ccb 100644
--- a/include/linux/mlx5/cq.h
+++ b/include/linux/mlx5/cq.h
@@ -95,9 +95,10 @@ enum {
 };
 
 enum {
-	MLX5_CQ_MODIFY_PERIOD	= 1 << 0,
-	MLX5_CQ_MODIFY_COUNT	= 1 << 1,
-	MLX5_CQ_MODIFY_OVERRUN	= 1 << 2,
+	MLX5_CQ_MODIFY_PERIOD		= BIT(0),
+	MLX5_CQ_MODIFY_COUNT		= BIT(1),
+	MLX5_CQ_MODIFY_OVERRUN		= BIT(2),
+	MLX5_CQ_MODIFY_PERIOD_MODE	= BIT(4),
 };
 
 enum {
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 8c7ddb22bf20..f468763478ae 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1686,7 +1686,8 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         cq_oi[0x1];
 	u8         cq_resize[0x1];
 	u8         cq_moderation[0x1];
-	u8         reserved_at_223[0x3];
+	u8         cq_period_mode_modify[0x1];
+	u8         reserved_at_224[0x2];
 	u8         cq_eq_remap[0x1];
 	u8         pg[0x1];
 	u8         block_lb_mc[0x1];
-- 
cgit v1.2.3


From 7ab4f16f9e2440e797eae88812f800458e5879d2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 19 Apr 2024 12:08:39 +0100
Subject: net: extend ubuf_info callback to ops structure

We'll need to associate additional callbacks with ubuf_info, introduce
a structure holding ubuf_info callbacks. Apart from a more smarter
io_uring notification management introduced in next patches, it can be
used to generalise msg_zerocopy_put_abort() and also store
->sg_from_iter, which is currently passed in struct msghdr.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/all/a62015541de49c0e2a8a0377a1d5d0a5aeb07016.1713369317.git.asml.silence@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/tap.c                   |  2 +-
 drivers/net/tun.c                   |  2 +-
 drivers/net/xen-netback/common.h    |  5 ++---
 drivers/net/xen-netback/interface.c |  2 +-
 drivers/net/xen-netback/netback.c   | 11 ++++++++---
 drivers/vhost/net.c                 |  8 ++++++--
 include/linux/skbuff.h              | 19 +++++++++++--------
 io_uring/notif.c                    | 18 +++++++++++++-----
 net/core/skbuff.c                   | 16 ++++++++++------
 9 files changed, 53 insertions(+), 30 deletions(-)

(limited to 'include')

diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 9f0495e8df4d..bfdd3875fe86 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -754,7 +754,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control,
 		skb_zcopy_init(skb, msg_control);
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
-		uarg->callback(NULL, uarg, false);
+		uarg->ops->complete(NULL, uarg, false);
 	}
 
 	dev_queue_xmit(skb);
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 0b3f21cba552..b7401d990680 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1906,7 +1906,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb_zcopy_init(skb, msg_control);
 	} else if (msg_control) {
 		struct ubuf_info *uarg = msg_control;
-		uarg->callback(NULL, uarg, false);
+		uarg->ops->complete(NULL, uarg, false);
 	}
 
 	skb_reset_network_header(skb);
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 1fcbd83f7ff2..17421da139f2 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -390,9 +390,8 @@ bool xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb);
 
 void xenvif_carrier_on(struct xenvif *vif);
 
-/* Callback from stack when TX packet can be released */
-void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf,
-			      bool zerocopy_success);
+/* Callbacks from stack when TX packet can be released */
+extern const struct ubuf_info_ops xenvif_ubuf_ops;
 
 static inline pending_ring_idx_t nr_pending_reqs(struct xenvif_queue *queue)
 {
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 7cff90aa8d24..65db5f14465f 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -593,7 +593,7 @@ int xenvif_init_queue(struct xenvif_queue *queue)
 
 	for (i = 0; i < MAX_PENDING_REQS; i++) {
 		queue->pending_tx_info[i].callback_struct = (struct ubuf_info_msgzc)
-			{ { .callback = xenvif_zerocopy_callback },
+			{ { .ops = &xenvif_ubuf_ops },
 			  { { .ctx = NULL,
 			      .desc = i } } };
 		queue->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index ef76850d9bcd..bab7e43ea05b 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1156,7 +1156,7 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s
 	uarg = skb_shinfo(skb)->destructor_arg;
 	/* increase inflight counter to offset decrement in callback */
 	atomic_inc(&queue->inflight_packets);
-	uarg->callback(NULL, uarg, true);
+	uarg->ops->complete(NULL, uarg, true);
 	skb_shinfo(skb)->destructor_arg = NULL;
 
 	/* Fill the skb with the new (local) frags. */
@@ -1278,8 +1278,9 @@ static int xenvif_tx_submit(struct xenvif_queue *queue)
 	return work_done;
 }
 
-void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base,
-			      bool zerocopy_success)
+static void xenvif_zerocopy_callback(struct sk_buff *skb,
+				     struct ubuf_info *ubuf_base,
+				     bool zerocopy_success)
 {
 	unsigned long flags;
 	pending_ring_idx_t index;
@@ -1312,6 +1313,10 @@ void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base,
 	xenvif_skb_zerocopy_complete(queue);
 }
 
+const struct ubuf_info_ops xenvif_ubuf_ops = {
+	.complete = xenvif_zerocopy_callback,
+};
+
 static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue)
 {
 	struct gnttab_unmap_grant_ref *gop;
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index c64ded183f8d..f16279351db5 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -380,7 +380,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	}
 }
 
-static void vhost_zerocopy_callback(struct sk_buff *skb,
+static void vhost_zerocopy_complete(struct sk_buff *skb,
 				    struct ubuf_info *ubuf_base, bool success)
 {
 	struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
@@ -408,6 +408,10 @@ static void vhost_zerocopy_callback(struct sk_buff *skb,
 	rcu_read_unlock_bh();
 }
 
+static const struct ubuf_info_ops vhost_ubuf_ops = {
+	.complete = vhost_zerocopy_complete,
+};
+
 static inline unsigned long busy_clock(void)
 {
 	return local_clock() >> 10;
@@ -879,7 +883,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
-			ubuf->ubuf.callback = vhost_zerocopy_callback;
+			ubuf->ubuf.ops = &vhost_ubuf_ops;
 			ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG;
 			refcount_set(&ubuf->ubuf.refcnt, 1);
 			msg.msg_control = &ctl;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9d24aec064e8..a110e97e074a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -527,6 +527,11 @@ enum {
 #define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
 				 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
 
+struct ubuf_info_ops {
+	void (*complete)(struct sk_buff *, struct ubuf_info *,
+			 bool zerocopy_success);
+};
+
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
  * lower device, the skb last reference should be 0 when calling this.
@@ -536,8 +541,7 @@ enum {
  * The desc field is used to track userspace buffer index.
  */
 struct ubuf_info {
-	void (*callback)(struct sk_buff *, struct ubuf_info *,
-			 bool zerocopy_success);
+	const struct ubuf_info_ops *ops;
 	refcount_t refcnt;
 	u8 flags;
 };
@@ -1662,14 +1666,13 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
 }
 #endif
 
+extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;
+
 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 				       struct ubuf_info *uarg);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
-			   bool success);
-
 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
 			    struct sk_buff *skb, struct iov_iter *from,
 			    size_t length);
@@ -1757,13 +1760,13 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
 static inline void net_zcopy_put(struct ubuf_info *uarg)
 {
 	if (uarg)
-		uarg->callback(NULL, uarg, true);
+		uarg->ops->complete(NULL, uarg, true);
 }
 
 static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
 	if (uarg) {
-		if (uarg->callback == msg_zerocopy_callback)
+		if (uarg->ops == &msg_zerocopy_ubuf_ops)
 			msg_zerocopy_put_abort(uarg, have_uref);
 		else if (have_uref)
 			net_zcopy_put(uarg);
@@ -1777,7 +1780,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 
 	if (uarg) {
 		if (!skb_zcopy_is_nouarg(skb))
-			uarg->callback(skb, uarg, zerocopy_success);
+			uarg->ops->complete(skb, uarg, zerocopy_success);
 
 		skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
 	}
diff --git a/io_uring/notif.c b/io_uring/notif.c
index d3e703c37aba..204f0fc7d966 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -24,7 +24,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state
 	io_req_task_complete(notif, ts);
 }
 
-static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
+static void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg,
 				bool success)
 {
 	struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
@@ -45,19 +45,27 @@ static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
 		else if (!success && !nd->zc_copied)
 			WRITE_ONCE(nd->zc_copied, true);
 	}
-	io_tx_ubuf_callback(skb, uarg, success);
+	io_tx_ubuf_complete(skb, uarg, success);
 }
 
+static const struct ubuf_info_ops io_ubuf_ops = {
+	.complete = io_tx_ubuf_complete,
+};
+
+static const struct ubuf_info_ops io_ubuf_ops_ext = {
+	.complete = io_tx_ubuf_callback_ext,
+};
+
 void io_notif_set_extended(struct io_kiocb *notif)
 {
 	struct io_notif_data *nd = io_notif_to_data(notif);
 
-	if (nd->uarg.callback != io_tx_ubuf_callback_ext) {
+	if (nd->uarg.ops != &io_ubuf_ops_ext) {
 		nd->account_pages = 0;
 		nd->zc_report = false;
 		nd->zc_used = false;
 		nd->zc_copied = false;
-		nd->uarg.callback = io_tx_ubuf_callback_ext;
+		nd->uarg.ops = &io_ubuf_ops_ext;
 		notif->io_task_work.func = io_notif_complete_tw_ext;
 	}
 }
@@ -80,7 +88,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 
 	nd = io_notif_to_data(notif);
 	nd->uarg.flags = IO_NOTIF_UBUF_FLAGS;
-	nd->uarg.callback = io_tx_ubuf_callback;
+	nd->uarg.ops = &io_ubuf_ops;
 	refcount_set(&nd->uarg.refcnt, 1);
 	return notif;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b99127712e67..9d36750459f3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1708,7 +1708,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
 		return NULL;
 	}
 
-	uarg->ubuf.callback = msg_zerocopy_callback;
+	uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
 	uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
 	uarg->len = 1;
 	uarg->bytelen = size;
@@ -1734,7 +1734,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
 		u32 bytelen, next;
 
 		/* there might be non MSG_ZEROCOPY users */
-		if (uarg->callback != msg_zerocopy_callback)
+		if (uarg->ops != &msg_zerocopy_ubuf_ops)
 			return NULL;
 
 		/* realloc only when socket is locked (TCP, UDP cork),
@@ -1845,8 +1845,8 @@ release:
 	sock_put(sk);
 }
 
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
-			   bool success)
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+				  bool success)
 {
 	struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
 
@@ -1855,7 +1855,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 	if (refcount_dec_and_test(&uarg->refcnt))
 		__msg_zerocopy_callback(uarg_zc);
 }
-EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
 
 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 {
@@ -1865,10 +1864,15 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
 	uarg_to_msgzc(uarg)->len--;
 
 	if (have_uref)
-		msg_zerocopy_callback(NULL, uarg, true);
+		msg_zerocopy_complete(NULL, uarg, true);
 }
 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
 
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+	.complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
+
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 			     struct msghdr *msg, int len,
 			     struct ubuf_info *uarg)
-- 
cgit v1.2.3


From 65bada80dec1f2108a751644773b2120bd789934 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 19 Apr 2024 12:08:40 +0100
Subject: net: add callback for setting a ubuf_info to skb

At the moment an skb can only have one ubuf_info associated with it,
which might be a performance problem for zerocopy sends in cases like
TCP via io_uring. Add a callback for assigning ubuf_info to skb, this
way we will implement smarter assignment later like linking ubuf_info
together.

Note, it's an optional callback, which should be compatible with
skb_zcopy_set(), that's because the net stack might potentially decide
to clone an skb and take another reference to ubuf_info whenever it
wishes. Also, a correct implementation should always be able to bind to
an skb without prior ubuf_info, otherwise we could end up in a situation
when the send would not be able to progress.

Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/all/b7918aadffeb787c84c9e72e34c729dc04f3a45d.1713369317.git.asml.silence@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h |  2 ++
 net/core/skbuff.c      | 20 ++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a110e97e074a..ced69f37977f 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -530,6 +530,8 @@ enum {
 struct ubuf_info_ops {
 	void (*complete)(struct sk_buff *, struct ubuf_info *,
 			 bool zerocopy_success);
+	/* has to be compatible with skb_zcopy_set() */
+	int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg);
 };
 
 /*
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 9d36750459f3..4a0c10685cd2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1880,11 +1880,18 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 	struct ubuf_info *orig_uarg = skb_zcopy(skb);
 	int err, orig_len = skb->len;
 
-	/* An skb can only point to one uarg. This edge case happens when
-	 * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
-	 */
-	if (orig_uarg && uarg != orig_uarg)
-		return -EEXIST;
+	if (uarg->ops->link_skb) {
+		err = uarg->ops->link_skb(skb, uarg);
+		if (err)
+			return err;
+	} else {
+		/* An skb can only point to one uarg. This edge case happens
+		 * when TCP appends to an skb, but zerocopy_realloc triggered
+		 * a new alloc.
+		 */
+		if (orig_uarg && uarg != orig_uarg)
+			return -EEXIST;
+	}
 
 	err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
 	if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
@@ -1898,7 +1905,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
 		return err;
 	}
 
-	skb_zcopy_set(skb, uarg, NULL);
+	if (!uarg->ops->link_skb)
+		skb_zcopy_set(skb, uarg, NULL);
 	return skb->len - orig_len;
 }
 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
-- 
cgit v1.2.3


From 89d5d9e9500826cbd3b15ea7b6e8d9fae966f073 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 22 Apr 2024 17:48:50 +0300
Subject: counter: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240422144850.2031076-1-andriy.shevchenko@linux.intel.com
Signed-off-by: William Breathitt Gray <wbg@kernel.org>
---
 include/linux/counter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/counter.h b/include/linux/counter.h
index cd35d8574ee2..426b7d58a438 100644
--- a/include/linux/counter.h
+++ b/include/linux/counter.h
@@ -6,14 +6,15 @@
 #ifndef _COUNTER_H_
 #define _COUNTER_H_
 
+#include <linux/array_size.h>
 #include <linux/cdev.h>
 #include <linux/device.h>
-#include <linux/kernel.h>
 #include <linux/kfifo.h>
 #include <linux/mutex.h>
 #include <linux/spinlock_types.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+
 #include <uapi/linux/counter.h>
 
 struct counter_device;
-- 
cgit v1.2.3


From 96e20adc43c4f81e9163a5188cee75a6dd393e09 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Mon, 22 Apr 2024 09:38:33 +0300
Subject: regulator: change stubbed devm_regulator_get_enable to return Ok

The devm_regulator_get_enable() should be a 'call and forget' API,
meaning, when it is used to enable the regulators, the API does not
provide a handle to do any further control of the regulators. It gives
no real benefit to return an error from the stub if CONFIG_REGULATOR is
not set.

On the contrary, returning and error is causing problems to drivers when
hardware is such it works out just fine with no regulator control.
Returning an error forces drivers to specifically handle the case where
CONFIG_REGULATOR is not set, making the mere existence of the stub
questionalble. Furthermore, the stub of the regulator_enable() seems to
be returning Ok.

Change the stub implementation for the devm_regulator_get_enable() to
return Ok so drivers do not separately handle the case where the
CONFIG_REGULATOR is not set.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Reported-by: Aleksander Mazur <deweloper@wp.pl>
Suggested-by: Guenter Roeck <linux@roeck-us.net>
Fixes: da279e6965b3 ("regulator: Add devm helpers for get and enable")
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/ZiYF6d1V1vSPcsJS@drtxq0yyyyyyyyyyyyyby-3.rev.dnainternet.fi
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 4660582a3302..71232fb7dda3 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -320,7 +320,7 @@ devm_regulator_get_exclusive(struct device *dev, const char *id)
 
 static inline int devm_regulator_get_enable(struct device *dev, const char *id)
 {
-	return -ENODEV;
+	return 0;
 }
 
 static inline int devm_regulator_get_enable_optional(struct device *dev,
-- 
cgit v1.2.3


From 3249c68e3cdacc2da8fe811a54f8db43df124c0e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 22 Apr 2024 18:15:13 +0300
Subject: ASoC: soc.h: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240422151513.2052167-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 0376f7e4c15d..2a1b6c198547 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -11,20 +11,30 @@
 #define __LINUX_SND_SOC_H
 
 #include <linux/args.h>
+#include <linux/array_size.h>
+#include <linux/device.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/lockdep.h>
+#include <linux/log2.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
 #include <linux/of.h>
-#include <linux/platform_device.h>
 #include <linux/types.h>
-#include <linux/notifier.h>
 #include <linux/workqueue.h>
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/regmap.h>
-#include <linux/log2.h>
-#include <sound/core.h>
-#include <sound/pcm.h>
+
+#include <sound/ac97_codec.h>
 #include <sound/compress_driver.h>
 #include <sound/control.h>
-#include <sound/ac97_codec.h>
+#include <sound/core.h>
+#include <sound/pcm.h>
+
+struct module;
+struct platform_device;
+
+/* For the current users of sound/soc.h to avoid build issues */
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
 
 /*
  * Convenience kcontrol builders
@@ -412,7 +422,6 @@
 #define SOC_ENUM_SINGLE_VIRT_DECL(name, xtexts) \
 	const struct soc_enum name = SOC_ENUM_SINGLE_VIRT(ARRAY_SIZE(xtexts), xtexts)
 
-struct device_node;
 struct snd_jack;
 struct snd_soc_card;
 struct snd_soc_pcm_stream;
@@ -427,6 +436,7 @@ struct soc_enum;
 struct snd_soc_jack;
 struct snd_soc_jack_zone;
 struct snd_soc_jack_pin;
+
 #include <sound/soc-dapm.h>
 #include <sound/soc-dpcm.h>
 #include <sound/soc-topology.h>
-- 
cgit v1.2.3


From 118005713e35a1893c6ee47ab2926cca277737de Mon Sep 17 00:00:00 2001
From: Sourabh Jain <sourabhjain@linux.ibm.com>
Date: Tue, 26 Mar 2024 11:24:08 +0530
Subject: crash: forward memory_notify arg to arch crash hotplug handler

In the event of memory hotplug or online/offline events, the crash
memory hotplug notifier `crash_memhp_notifier()` receives a
`memory_notify` object but doesn't forward that object to the
generic and architecture-specific crash hotplug handler.

The `memory_notify` object contains the starting PFN (Page Frame Number)
and the number of pages in the hot-removed memory. This information is
necessary for architectures like PowerPC to update/recreate the kdump
image, specifically `elfcorehdr`.

So update the function signature of `crash_handle_hotplug_event()` and
`arch_crash_handle_hotplug_event()` to accept the `memory_notify` object
as an argument from crash memory hotplug notifier.

Since no such object is available in the case of CPU hotplug event, the
crash CPU hotplug notifier `crash_cpuhp_online()` passes NULL to the
crash hotplug handler.

Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Acked-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20240326055413.186534-2-sourabhjain@linux.ibm.com
---
 arch/x86/include/asm/kexec.h |  2 +-
 arch/x86/kernel/crash.c      |  4 +++-
 include/linux/crash_core.h   |  2 +-
 kernel/crash_core.c          | 14 +++++++-------
 4 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 91ca9a9ee3a2..cb1320ebbc23 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -207,7 +207,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image);
 extern void kdump_nmi_shootdown_cpus(void);
 
 #ifdef CONFIG_CRASH_HOTPLUG
-void arch_crash_handle_hotplug_event(struct kimage *image);
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
 #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index e74d0c4286c1..2a682fe86352 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -432,10 +432,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void)
 /**
  * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes
  * @image: a pointer to kexec_crash_image
+ * @arg: struct memory_notify handler for memory hotplug case and
+ *       NULL for CPU hotplug case.
  *
  * Prepare the new elfcorehdr and replace the existing elfcorehdr.
  */
-void arch_crash_handle_hotplug_event(struct kimage *image)
+void arch_crash_handle_hotplug_event(struct kimage *image, void *arg)
 {
 	void *elfbuf = NULL, *old_elfcorehdr;
 	unsigned long nr_mem_ranges;
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index d33352c2e386..647e928efee8 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -37,7 +37,7 @@ static inline void arch_kexec_unprotect_crashkres(void) { }
 
 
 #ifndef arch_crash_handle_hotplug_event
-static inline void arch_crash_handle_hotplug_event(struct kimage *image) { }
+static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { }
 #endif
 
 int crash_check_update_elfcorehdr(void);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 78b5dc7cee3a..70fa8111a9d6 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -534,7 +534,7 @@ int crash_check_update_elfcorehdr(void)
  * list of segments it checks (since the elfcorehdr changes and thus
  * would require an update to purgatory itself to update the digest).
  */
-static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
+static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg)
 {
 	struct kimage *image;
 
@@ -596,7 +596,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu)
 	image->hp_action = hp_action;
 
 	/* Now invoke arch-specific update handler */
-	arch_crash_handle_hotplug_event(image);
+	arch_crash_handle_hotplug_event(image, arg);
 
 	/* No longer handling a hotplug event */
 	image->hp_action = KEXEC_CRASH_HP_NONE;
@@ -612,17 +612,17 @@ out:
 	crash_hotplug_unlock();
 }
 
-static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v)
+static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg)
 {
 	switch (val) {
 	case MEM_ONLINE:
 		crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY,
-			KEXEC_CRASH_HP_INVALID_CPU);
+			KEXEC_CRASH_HP_INVALID_CPU, arg);
 		break;
 
 	case MEM_OFFLINE:
 		crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY,
-			KEXEC_CRASH_HP_INVALID_CPU);
+			KEXEC_CRASH_HP_INVALID_CPU, arg);
 		break;
 	}
 	return NOTIFY_OK;
@@ -635,13 +635,13 @@ static struct notifier_block crash_memhp_nb = {
 
 static int crash_cpuhp_online(unsigned int cpu)
 {
-	crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu);
+	crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL);
 	return 0;
 }
 
 static int crash_cpuhp_offline(unsigned int cpu)
 {
-	crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu);
+	crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 79365026f86948b52c3cb7bf099dded92c559b4c Mon Sep 17 00:00:00 2001
From: Sourabh Jain <sourabhjain@linux.ibm.com>
Date: Tue, 26 Mar 2024 11:24:09 +0530
Subject: crash: add a new kexec flag for hotplug support

Commit a72bbec70da2 ("crash: hotplug support for kexec_load()")
introduced a new kexec flag, `KEXEC_UPDATE_ELFCOREHDR`. Kexec tool uses
this flag to indicate to the kernel that it is safe to modify the
elfcorehdr of the kdump image loaded using the kexec_load system call.

However, it is possible that architectures may need to update kexec
segments other then elfcorehdr. For example, FDT (Flatten Device Tree)
on PowerPC. Introducing a new kexec flag for every new kexec segment
may not be a good solution. Hence, a generic kexec flag bit,
`KEXEC_CRASH_HOTPLUG_SUPPORT`, is introduced to share the CPU/Memory
hotplug support intent between the kexec tool and the kernel for the
kexec_load system call.

Now we have two kexec flags that enables crash hotplug support for
kexec_load system call. First is KEXEC_UPDATE_ELFCOREHDR (only used in
x86), and second is KEXEC_CRASH_HOTPLUG_SUPPORT (for all architectures).

To simplify the process of finding and reporting the crash hotplug
support the following changes are introduced.

1. Define arch specific function to process the kexec flags and
   determine crash hotplug support

2. Rename the @update_elfcorehdr member of struct kimage to
   @hotplug_support and populate it for both kexec_load and
   kexec_file_load syscalls, because architecture can update more than
   one kexec segment

3. Let generic function crash_check_hotplug_support report hotplug
   support for loaded kdump image based on value of @hotplug_support

To bring the x86 crash hotplug support in line with the above points,
the following changes have been made:

- Introduce the arch_crash_hotplug_support function to process kexec
  flags and determine crash hotplug support

- Remove the arch_crash_hotplug_[cpu|memory]_support functions

Signed-off-by: Sourabh Jain <sourabhjain@linux.ibm.com>
Acked-by: Baoquan He <bhe@redhat.com>
Acked-by: Hari Bathini <hbathini@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20240326055413.186534-3-sourabhjain@linux.ibm.com
---
 arch/x86/include/asm/kexec.h | 11 ++---------
 arch/x86/kernel/crash.c      | 28 +++++++++++++++++-----------
 drivers/base/cpu.c           |  2 +-
 drivers/base/memory.c        |  2 +-
 include/linux/crash_core.h   | 13 ++++++-------
 include/linux/kexec.h        | 11 +++++++----
 include/uapi/linux/kexec.h   |  1 +
 kernel/crash_core.c          | 15 ++++++---------
 kernel/kexec.c               |  4 ++--
 kernel/kexec_file.c          |  5 +++++
 10 files changed, 48 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index cb1320ebbc23..ae5482a2f0ca 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -210,15 +210,8 @@ extern void kdump_nmi_shootdown_cpus(void);
 void arch_crash_handle_hotplug_event(struct kimage *image, void *arg);
 #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event
 
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void);
-#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support
-#endif
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_crash_hotplug_memory_support(void);
-#define crash_hotplug_memory_support arch_crash_hotplug_memory_support
-#endif
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags);
+#define arch_crash_hotplug_support arch_crash_hotplug_support
 
 unsigned int arch_crash_get_elfcorehdr_size(void);
 #define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 2a682fe86352..f06501445cd9 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -402,20 +402,26 @@ int crash_load_segments(struct kimage *image)
 #undef pr_fmt
 #define pr_fmt(fmt) "crash hp: " fmt
 
-/* These functions provide the value for the sysfs crash_hotplug nodes */
-#ifdef CONFIG_HOTPLUG_CPU
-int arch_crash_hotplug_cpu_support(void)
+int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
 {
-	return crash_check_update_elfcorehdr();
-}
-#endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int arch_crash_hotplug_memory_support(void)
-{
-	return crash_check_update_elfcorehdr();
-}
+#ifdef CONFIG_KEXEC_FILE
+	if (image->file_mode)
+		return 1;
 #endif
+	/*
+	 * Initially, crash hotplug support for kexec_load was added
+	 * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this
+	 * functionality was expanded to accommodate multiple kexec
+	 * segment updates, leading to the introduction of the
+	 * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently,
+	 * when the kexec tool sends either of these flags, it indicates
+	 * that the required kexec segment (elfcorehdr) is excluded from
+	 * the SHA calculation.
+	 */
+	return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR ||
+		kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT);
+}
 
 unsigned int arch_crash_get_elfcorehdr_size(void)
 {
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 56fba44ba391..c61ecb0c2ae2 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -306,7 +306,7 @@ static ssize_t crash_hotplug_show(struct device *dev,
 				     struct device_attribute *attr,
 				     char *buf)
 {
-	return sysfs_emit(buf, "%d\n", crash_hotplug_cpu_support());
+	return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
 }
 static DEVICE_ATTR_ADMIN_RO(crash_hotplug);
 #endif
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c0436f46cfb7..67858eeb92ed 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -535,7 +535,7 @@ static DEVICE_ATTR_RW(auto_online_blocks);
 static ssize_t crash_hotplug_show(struct device *dev,
 				       struct device_attribute *attr, char *buf)
 {
-	return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support());
+	return sysfs_emit(buf, "%d\n", crash_check_hotplug_support());
 }
 static DEVICE_ATTR_RO(crash_hotplug);
 #endif
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 647e928efee8..44305336314e 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -40,14 +40,13 @@ static inline void arch_kexec_unprotect_crashkres(void) { }
 static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { }
 #endif
 
-int crash_check_update_elfcorehdr(void);
+int crash_check_hotplug_support(void);
 
-#ifndef crash_hotplug_cpu_support
-static inline int crash_hotplug_cpu_support(void) { return 0; }
-#endif
-
-#ifndef crash_hotplug_memory_support
-static inline int crash_hotplug_memory_support(void) { return 0; }
+#ifndef arch_crash_hotplug_support
+static inline int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags)
+{
+	return 0;
+}
 #endif
 
 #ifndef crash_get_elfcorehdr_size
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 060835bb82d5..5b93a5767413 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -319,8 +319,10 @@ struct kimage {
 	/* If set, we are using file mode kexec syscall */
 	unsigned int file_mode:1;
 #ifdef CONFIG_CRASH_HOTPLUG
-	/* If set, allow changes to elfcorehdr of kexec_load'd image */
-	unsigned int update_elfcorehdr:1;
+	/* If set, it is safe to update kexec segments that are
+	 * excluded from SHA calculation.
+	 */
+	unsigned int hotplug_support:1;
 #endif
 
 #ifdef ARCH_HAS_KIMAGE_ARCH
@@ -391,9 +393,10 @@ bool kexec_load_permitted(int kexec_image_type);
 
 /* List of defined/legal kexec flags */
 #ifndef CONFIG_KEXEC_JUMP
-#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR)
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT)
 #else
-#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR)
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \
+			KEXEC_CRASH_HOTPLUG_SUPPORT)
 #endif
 
 /* List of defined/legal kexec file flags */
diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
index c17bb096ea68..5ae1741ea8ea 100644
--- a/include/uapi/linux/kexec.h
+++ b/include/uapi/linux/kexec.h
@@ -13,6 +13,7 @@
 #define KEXEC_ON_CRASH		0x00000001
 #define KEXEC_PRESERVE_CONTEXT	0x00000002
 #define KEXEC_UPDATE_ELFCOREHDR	0x00000004
+#define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008
 #define KEXEC_ARCH_MASK		0xffff0000
 
 /*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 70fa8111a9d6..394db3ebe835 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -493,10 +493,10 @@ static DEFINE_MUTEX(__crash_hotplug_lock);
 
 /*
  * This routine utilized when the crash_hotplug sysfs node is read.
- * It reflects the kernel's ability/permission to update the crash
- * elfcorehdr directly.
+ * It reflects the kernel's ability/permission to update the kdump
+ * image directly.
  */
-int crash_check_update_elfcorehdr(void)
+int crash_check_hotplug_support(void)
 {
 	int rc = 0;
 
@@ -508,10 +508,7 @@ int crash_check_update_elfcorehdr(void)
 		return 0;
 	}
 	if (kexec_crash_image) {
-		if (kexec_crash_image->file_mode)
-			rc = 1;
-		else
-			rc = kexec_crash_image->update_elfcorehdr;
+		rc = kexec_crash_image->hotplug_support;
 	}
 	/* Release lock now that update complete */
 	kexec_unlock();
@@ -552,8 +549,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu,
 
 	image = kexec_crash_image;
 
-	/* Check that updating elfcorehdr is permitted */
-	if (!(image->file_mode || image->update_elfcorehdr))
+	/* Check that kexec segments update is permitted */
+	if (!image->hotplug_support)
 		goto out;
 
 	if (hp_action == KEXEC_CRASH_HP_ADD_CPU ||
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bab542fc1463..a6b3f96bb50c 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -135,8 +135,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
 		image->preserve_context = 1;
 
 #ifdef CONFIG_CRASH_HOTPLUG
-	if (flags & KEXEC_UPDATE_ELFCOREHDR)
-		image->update_elfcorehdr = 1;
+	if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags))
+		image->hotplug_support = 1;
 #endif
 
 	ret = machine_kexec_prepare(image);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 2d1db05fbf04..3d64290d24c9 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -376,6 +376,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
 	if (ret)
 		goto out;
 
+#ifdef CONFIG_CRASH_HOTPLUG
+	if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags))
+		image->hotplug_support = 1;
+#endif
+
 	ret = machine_kexec_prepare(image);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From b7c0e1ecee403a43abc89eb3e75672b01ff2ece9 Mon Sep 17 00:00:00 2001
From: Marco Pagani <marpagan@redhat.com>
Date: Fri, 19 Apr 2024 10:35:59 +0200
Subject: fpga: region: add owner module and take its refcount

The current implementation of the fpga region assumes that the low-level
module registers a driver for the parent device and uses its owner pointer
to take the module's refcount. This approach is problematic since it can
lead to a null pointer dereference while attempting to get the region
during programming if the parent device does not have a driver.

To address this problem, add a module owner pointer to the fpga_region
struct and use it to take the module's refcount. Modify the functions for
registering a region to take an additional owner module parameter and
rename them to avoid conflicts. Use the old function names for helper
macros that automatically set the module that registers the region as the
owner. This ensures compatibility with existing low-level control modules
and reduces the chances of registering a region without setting the owner.

Also, update the documentation to keep it consistent with the new interface
for registering an fpga region.

Fixes: 0fa20cdfcc1f ("fpga: fpga-region: device tree control for FPGA")
Suggested-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Suggested-by: Xu Yilun <yilun.xu@intel.com>
Reviewed-by: Russ Weight <russ.weight@linux.dev>
Signed-off-by: Marco Pagani <marpagan@redhat.com>
Acked-by: Xu Yilun <yilun.xu@intel.com>
Link: https://lore.kernel.org/r/20240419083601.77403-1-marpagan@redhat.com
Signed-off-by: Xu Yilun <yilun.xu@linux.intel.com>
---
 Documentation/driver-api/fpga/fpga-region.rst | 13 ++++++++-----
 drivers/fpga/fpga-region.c                    | 24 ++++++++++++++----------
 include/linux/fpga/fpga-region.h              | 13 ++++++++++---
 3 files changed, 32 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/Documentation/driver-api/fpga/fpga-region.rst b/Documentation/driver-api/fpga/fpga-region.rst
index dc55d60a0b4a..2d03b5fb7657 100644
--- a/Documentation/driver-api/fpga/fpga-region.rst
+++ b/Documentation/driver-api/fpga/fpga-region.rst
@@ -46,13 +46,16 @@ API to add a new FPGA region
 ----------------------------
 
 * struct fpga_region - The FPGA region struct
-* struct fpga_region_info - Parameter structure for fpga_region_register_full()
-* fpga_region_register_full() -  Create and register an FPGA region using the
+* struct fpga_region_info - Parameter structure for __fpga_region_register_full()
+* __fpga_region_register_full() -  Create and register an FPGA region using the
   fpga_region_info structure to provide the full flexibility of options
-* fpga_region_register() -  Create and register an FPGA region using standard
+* __fpga_region_register() -  Create and register an FPGA region using standard
   arguments
 * fpga_region_unregister() -  Unregister an FPGA region
 
+Helper macros ``fpga_region_register()`` and ``fpga_region_register_full()``
+automatically set the module that registers the FPGA region as the owner.
+
 The FPGA region's probe function will need to get a reference to the FPGA
 Manager it will be using to do the programming.  This usually would happen
 during the region's probe function.
@@ -82,10 +85,10 @@ following APIs to handle building or tearing down that list.
    :functions: fpga_region_info
 
 .. kernel-doc:: drivers/fpga/fpga-region.c
-   :functions: fpga_region_register_full
+   :functions: __fpga_region_register_full
 
 .. kernel-doc:: drivers/fpga/fpga-region.c
-   :functions: fpga_region_register
+   :functions: __fpga_region_register
 
 .. kernel-doc:: drivers/fpga/fpga-region.c
    :functions: fpga_region_unregister
diff --git a/drivers/fpga/fpga-region.c b/drivers/fpga/fpga-region.c
index b364a929425c..753cd142503e 100644
--- a/drivers/fpga/fpga-region.c
+++ b/drivers/fpga/fpga-region.c
@@ -53,7 +53,7 @@ static struct fpga_region *fpga_region_get(struct fpga_region *region)
 	}
 
 	get_device(dev);
-	if (!try_module_get(dev->parent->driver->owner)) {
+	if (!try_module_get(region->ops_owner)) {
 		put_device(dev);
 		mutex_unlock(&region->mutex);
 		return ERR_PTR(-ENODEV);
@@ -75,7 +75,7 @@ static void fpga_region_put(struct fpga_region *region)
 
 	dev_dbg(dev, "put\n");
 
-	module_put(dev->parent->driver->owner);
+	module_put(region->ops_owner);
 	put_device(dev);
 	mutex_unlock(&region->mutex);
 }
@@ -181,14 +181,16 @@ static struct attribute *fpga_region_attrs[] = {
 ATTRIBUTE_GROUPS(fpga_region);
 
 /**
- * fpga_region_register_full - create and register an FPGA Region device
+ * __fpga_region_register_full - create and register an FPGA Region device
  * @parent: device parent
  * @info: parameters for FPGA Region
+ * @owner: module containing the get_bridges function
  *
  * Return: struct fpga_region or ERR_PTR()
  */
 struct fpga_region *
-fpga_region_register_full(struct device *parent, const struct fpga_region_info *info)
+__fpga_region_register_full(struct device *parent, const struct fpga_region_info *info,
+			    struct module *owner)
 {
 	struct fpga_region *region;
 	int id, ret = 0;
@@ -213,6 +215,7 @@ fpga_region_register_full(struct device *parent, const struct fpga_region_info *
 	region->compat_id = info->compat_id;
 	region->priv = info->priv;
 	region->get_bridges = info->get_bridges;
+	region->ops_owner = owner;
 
 	mutex_init(&region->mutex);
 	INIT_LIST_HEAD(&region->bridge_list);
@@ -241,13 +244,14 @@ err_free:
 
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(fpga_region_register_full);
+EXPORT_SYMBOL_GPL(__fpga_region_register_full);
 
 /**
- * fpga_region_register - create and register an FPGA Region device
+ * __fpga_region_register - create and register an FPGA Region device
  * @parent: device parent
  * @mgr: manager that programs this region
  * @get_bridges: optional function to get bridges to a list
+ * @owner: module containing the get_bridges function
  *
  * This simple version of the register function should be sufficient for most users.
  * The fpga_region_register_full() function is available for users that need to
@@ -256,17 +260,17 @@ EXPORT_SYMBOL_GPL(fpga_region_register_full);
  * Return: struct fpga_region or ERR_PTR()
  */
 struct fpga_region *
-fpga_region_register(struct device *parent, struct fpga_manager *mgr,
-		     int (*get_bridges)(struct fpga_region *))
+__fpga_region_register(struct device *parent, struct fpga_manager *mgr,
+		       int (*get_bridges)(struct fpga_region *), struct module *owner)
 {
 	struct fpga_region_info info = { 0 };
 
 	info.mgr = mgr;
 	info.get_bridges = get_bridges;
 
-	return fpga_region_register_full(parent, &info);
+	return __fpga_region_register_full(parent, &info, owner);
 }
-EXPORT_SYMBOL_GPL(fpga_region_register);
+EXPORT_SYMBOL_GPL(__fpga_region_register);
 
 /**
  * fpga_region_unregister - unregister an FPGA region
diff --git a/include/linux/fpga/fpga-region.h b/include/linux/fpga/fpga-region.h
index 9d4d32909340..5fbc05fe70a6 100644
--- a/include/linux/fpga/fpga-region.h
+++ b/include/linux/fpga/fpga-region.h
@@ -36,6 +36,7 @@ struct fpga_region_info {
  * @mgr: FPGA manager
  * @info: FPGA image info
  * @compat_id: FPGA region id for compatibility check.
+ * @ops_owner: module containing the get_bridges function
  * @priv: private data
  * @get_bridges: optional function to get bridges to a list
  */
@@ -46,6 +47,7 @@ struct fpga_region {
 	struct fpga_manager *mgr;
 	struct fpga_image_info *info;
 	struct fpga_compat_id *compat_id;
+	struct module *ops_owner;
 	void *priv;
 	int (*get_bridges)(struct fpga_region *region);
 };
@@ -58,12 +60,17 @@ fpga_region_class_find(struct device *start, const void *data,
 
 int fpga_region_program_fpga(struct fpga_region *region);
 
+#define fpga_region_register_full(parent, info) \
+	__fpga_region_register_full(parent, info, THIS_MODULE)
 struct fpga_region *
-fpga_region_register_full(struct device *parent, const struct fpga_region_info *info);
+__fpga_region_register_full(struct device *parent, const struct fpga_region_info *info,
+			    struct module *owner);
 
+#define fpga_region_register(parent, mgr, get_bridges) \
+	__fpga_region_register(parent, mgr, get_bridges, THIS_MODULE)
 struct fpga_region *
-fpga_region_register(struct device *parent, struct fpga_manager *mgr,
-		     int (*get_bridges)(struct fpga_region *));
+__fpga_region_register(struct device *parent, struct fpga_manager *mgr,
+		       int (*get_bridges)(struct fpga_region *), struct module *owner);
 void fpga_region_unregister(struct fpga_region *region);
 
 #endif /* _FPGA_REGION_H */
-- 
cgit v1.2.3


From ade17653411284e8f7d07279097039f8b84f85d3 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 22 Feb 2024 15:41:10 +0000
Subject: soc: mediatek: cmdq: Add parameter shift_pa to cmdq_pkt_jump()

In original design, cmdq_pkt_jump() call cmdq_get_shift_pa() every
time to get shift_pa. But the shift_pa is constant value for each
SoC, so client driver just need to call cmdq_get_shift_pa() once
and pass shift_pa to cmdq_pkt_jump() to prevent frequent function
call.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Link: https://lore.kernel.org/r/20240222154120.16959-3-chunkuang.hu@kernel.org
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 5 ++---
 include/linux/soc/mediatek/mtk-cmdq.h  | 6 ++++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 0b2e5690dacf..3380e56dd69b 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -393,14 +393,13 @@ int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
 }
 EXPORT_SYMBOL(cmdq_pkt_assign);
 
-int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr)
+int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 {
 	struct cmdq_instruction inst = {};
 
 	inst.op = CMDQ_CODE_JUMP;
 	inst.offset = CMDQ_JUMP_ABSOLUTE;
-	inst.value = addr >>
-		cmdq_get_shift_pa(((struct cmdq_client *)pkt->cl)->chan);
+	inst.value = addr >> shift_pa;
 	return cmdq_pkt_append_command(pkt, inst);
 }
 EXPORT_SYMBOL(cmdq_pkt_jump);
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 649955d2cf5c..72adfd867cd9 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -253,10 +253,12 @@ int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value);
  *		     a physical address which should contains more instruction.
  * @pkt:        the CMDQ packet
  * @addr:       physical address of target instruction buffer
+ * @shift_pa:	shift bits of physical address in CMDQ instruction. This value
+ *		is got by cmdq_get_shift_pa().
  *
  * Return: 0 for success; else the error code is returned
  */
-int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr);
+int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa);
 
 /**
  * cmdq_pkt_finalize() - Append EOC and jump command to pkt.
@@ -374,7 +376,7 @@ static inline int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
 	return -EINVAL;
 }
 
-static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr)
+static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 {
 	return -EINVAL;
 }
-- 
cgit v1.2.3


From 7218be3b6fcf00d1a3a6867aa1be6b9ab91cce45 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 22 Feb 2024 15:41:11 +0000
Subject: soc: mediatek: cmdq: Rename cmdq_pkt_jump() to cmdq_pkt_jump_abs()

In order to distinguish absolute jump and relative jump,
cmdq_pkt_jump() append absolute jump command, so rename it to
cmdq_pkt_jump_abs().

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240222154120.16959-4-chunkuang.hu@kernel.org
[Angelo: Added temporary wrapper to avoid build breakage]
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c |  4 ++--
 include/linux/soc/mediatek/mtk-cmdq.h  | 22 +++++++++++++++++-----
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 3380e56dd69b..38d9077725d2 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -393,7 +393,7 @@ int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
 }
 EXPORT_SYMBOL(cmdq_pkt_assign);
 
-int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
+int cmdq_pkt_jump_abs(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 {
 	struct cmdq_instruction inst = {};
 
@@ -402,7 +402,7 @@ int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 	inst.value = addr >> shift_pa;
 	return cmdq_pkt_append_command(pkt, inst);
 }
-EXPORT_SYMBOL(cmdq_pkt_jump);
+EXPORT_SYMBOL(cmdq_pkt_jump_abs);
 
 int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 {
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 72adfd867cd9..961f4c521067 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -248,17 +248,24 @@ int cmdq_pkt_poll_mask(struct cmdq_pkt *pkt, u8 subsys,
 int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value);
 
 /**
- * cmdq_pkt_jump() - Append jump command to the CMDQ packet, ask GCE
- *		     to execute an instruction that change current thread PC to
- *		     a physical address which should contains more instruction.
+ * cmdq_pkt_jump_abs() - Append jump command to the CMDQ packet, ask GCE
+ *			 to execute an instruction that change current thread
+ *			 PC to a absolute physical address which should
+ *			 contains more instruction.
  * @pkt:        the CMDQ packet
- * @addr:       physical address of target instruction buffer
+ * @addr:       absolute physical address of target instruction buffer
  * @shift_pa:	shift bits of physical address in CMDQ instruction. This value
  *		is got by cmdq_get_shift_pa().
  *
  * Return: 0 for success; else the error code is returned
  */
-int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa);
+int cmdq_pkt_jump_abs(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa);
+
+/* This wrapper has to be removed after all users migrated to jump_abs */
+static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
+{
+	return cmdq_pkt_jump_abs(pkt, addr, shift_pa);
+}
 
 /**
  * cmdq_pkt_finalize() - Append EOC and jump command to pkt.
@@ -376,6 +383,11 @@ static inline int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
 	return -EINVAL;
 }
 
+static inline int cmdq_pkt_jump_abs(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
+{
+	return -EINVAL;
+}
+
 static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 698cdcb19579f9de005698f9dfb660ab6ec65d32 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 22 Feb 2024 15:41:12 +0000
Subject: soc: mediatek: cmdq: Add cmdq_pkt_jump_rel() helper function

cmdq_pkt_jump_rel() append relative jump command to the packet.
Relative jump change PC to the target address with offset from
current PC.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240222154120.16959-5-chunkuang.hu@kernel.org
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 10 ++++++++++
 include/linux/soc/mediatek/mtk-cmdq.h  | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 38d9077725d2..678db09983d4 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -404,6 +404,16 @@ int cmdq_pkt_jump_abs(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 }
 EXPORT_SYMBOL(cmdq_pkt_jump_abs);
 
+int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa)
+{
+	struct cmdq_instruction inst = { {0} };
+
+	inst.op = CMDQ_CODE_JUMP;
+	inst.value = (u32)offset >> shift_pa;
+	return cmdq_pkt_append_command(pkt, inst);
+}
+EXPORT_SYMBOL(cmdq_pkt_jump_rel);
+
 int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 {
 	struct cmdq_instruction inst = { {0} };
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 961f4c521067..cadb6a157607 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -267,6 +267,20 @@ static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_
 	return cmdq_pkt_jump_abs(pkt, addr, shift_pa);
 }
 
+/**
+ * cmdq_pkt_jump_rel() - Append jump command to the CMDQ packet, ask GCE
+ *			 to execute an instruction that change current thread
+ *			 PC to a physical address with relative offset. The
+ *			 target address should contains more instruction.
+ * @pkt:	the CMDQ packet
+ * @offset:	relative offset of target instruction buffer from current PC.
+ * @shift_pa:	shift bits of physical address in CMDQ instruction. This value
+ *		is got by cmdq_get_shift_pa().
+ *
+ * Return: 0 for success; else the error code is returned
+ */
+int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa);
+
 /**
  * cmdq_pkt_finalize() - Append EOC and jump command to pkt.
  * @pkt:	the CMDQ packet
@@ -393,6 +407,11 @@ static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_
 	return -EINVAL;
 }
 
+static inline int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa)
+{
+	return -EINVAL;
+}
+
 static inline int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 3d86ced95d8c09d11c172f542edd40f58a4098fa Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 22 Feb 2024 15:41:13 +0000
Subject: soc: mediatek: cmdq: Add cmdq_pkt_eoc() helper function

cmdq_pkt_eoc() append eoc command to CMDQ packet. eoc command
would ask GCE to generate IRQ. It's usually appended to the end
of packet to notify all command in the packet is done.

Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Link: https://lore.kernel.org/r/20240222154120.16959-6-chunkuang.hu@kernel.org
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 10 ++++++++++
 include/linux/soc/mediatek/mtk-cmdq.h  | 19 +++++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 678db09983d4..766dbafaef62 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -414,6 +414,16 @@ int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa)
 }
 EXPORT_SYMBOL(cmdq_pkt_jump_rel);
 
+int cmdq_pkt_eoc(struct cmdq_pkt *pkt)
+{
+	struct cmdq_instruction inst = { {0} };
+
+	inst.op = CMDQ_CODE_EOC;
+	inst.value = CMDQ_EOC_IRQ_EN;
+	return cmdq_pkt_append_command(pkt, inst);
+}
+EXPORT_SYMBOL(cmdq_pkt_eoc);
+
 int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 {
 	struct cmdq_instruction inst = { {0} };
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index cadb6a157607..921d631361d2 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -281,6 +281,20 @@ static inline int cmdq_pkt_jump(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_
  */
 int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_pa);
 
+/**
+ * cmdq_pkt_eoc() - Append EOC and ask GCE to generate an IRQ at end of execution
+ * @pkt:	The CMDQ packet
+ *
+ * Appends an End Of Code (EOC) command to the CMDQ packet and asks the GCE
+ * to generate an interrupt at the end of the execution of all commands in
+ * the pipeline.
+ * The EOC command is usually appended to the end of the pipeline to notify
+ * that all commands are done.
+ *
+ * Return: 0 for success or negative error number
+ */
+int cmdq_pkt_eoc(struct cmdq_pkt *pkt);
+
 /**
  * cmdq_pkt_finalize() - Append EOC and jump command to pkt.
  * @pkt:	the CMDQ packet
@@ -412,6 +426,11 @@ static inline int cmdq_pkt_jump_rel(struct cmdq_pkt *pkt, s32 offset, u8 shift_p
 	return -EINVAL;
 }
 
+static inline int cmdq_pkt_eoc(struct cmdq_pkt *pkt)
+{
+	return -EINVAL;
+}
+
 static inline int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 9935af96a72735e8e86aaaf06b031b014109ebc1 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 22 Feb 2024 15:41:14 +0000
Subject: soc: mediatek: cmdq: Remove cmdq_pkt_flush_async() helper function

cmdq_pkt_flush_async() is not used by all client drivers (MediaTek
drm driver and MediaTek mdp3 driver), so remove it.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Link: https://lore.kernel.org/r/20240222154120.16959-7-chunkuang.hu@kernel.org
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 15 ---------------
 include/linux/soc/mediatek/mtk-cmdq.h  | 18 ------------------
 2 files changed, 33 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 766dbafaef62..bbe41302210a 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -446,19 +446,4 @@ int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 }
 EXPORT_SYMBOL(cmdq_pkt_finalize);
 
-int cmdq_pkt_flush_async(struct cmdq_pkt *pkt)
-{
-	int err;
-	struct cmdq_client *client = (struct cmdq_client *)pkt->cl;
-
-	err = mbox_send_message(client->chan, pkt);
-	if (err < 0)
-		return err;
-	/* We can send next packet immediately, so just call txdone. */
-	mbox_client_txdone(client->chan, 0);
-
-	return 0;
-}
-EXPORT_SYMBOL(cmdq_pkt_flush_async);
-
 MODULE_LICENSE("GPL v2");
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 921d631361d2..e79dc8c4ad41 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -303,19 +303,6 @@ int cmdq_pkt_eoc(struct cmdq_pkt *pkt);
  */
 int cmdq_pkt_finalize(struct cmdq_pkt *pkt);
 
-/**
- * cmdq_pkt_flush_async() - trigger CMDQ to asynchronously execute the CMDQ
- *                          packet and call back at the end of done packet
- * @pkt:	the CMDQ packet
- *
- * Return: 0 for success; else the error code is returned
- *
- * Trigger CMDQ to asynchronously execute the CMDQ packet and call back
- * at the end of done packet. Note that this is an ASYNC function. When the
- * function returned, it may or may not be finished.
- */
-int cmdq_pkt_flush_async(struct cmdq_pkt *pkt);
-
 #else /* IS_ENABLED(CONFIG_MTK_CMDQ) */
 
 static inline int cmdq_dev_get_client_reg(struct device *dev,
@@ -436,11 +423,6 @@ static inline int cmdq_pkt_finalize(struct cmdq_pkt *pkt)
 	return -EINVAL;
 }
 
-static inline int cmdq_pkt_flush_async(struct cmdq_pkt *pkt)
-{
-	return -EINVAL;
-}
-
 #endif /* IS_ENABLED(CONFIG_MTK_CMDQ) */
 
 #endif	/* __MTK_CMDQ_H__ */
-- 
cgit v1.2.3


From b81b2d5534fc72f70d808c72b19f0e3f9df32a90 Mon Sep 17 00:00:00 2001
From: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Date: Thu, 22 Feb 2024 15:41:15 +0000
Subject: soc: mediatek: cmdq: Refine cmdq_pkt_create() and cmdq_pkt_destroy()

cmdq_pkt_create() and cmdq_pkt_destroy() is not suitable for
client drivers so each client driver has implement its own
function. This refinement would pass struct cmdq_pkt pointer into
cmdq_pkt_create(). In addition, client driver has the struct
cmdq_client information, so it's not necessary to store this
information in struct cmdq_pkt. After this refinement, client
drivers could use these helper funciton instead of implementing
its own version.

Signed-off-by: Chun-Kuang Hu <chunkuang.hu@kernel.org>
Link: https://lore.kernel.org/r/20240222154120.16959-8-chunkuang.hu@kernel.org
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 24 +++++++-----------------
 include/linux/soc/mediatek/mtk-cmdq.h  | 14 ++++++++------
 2 files changed, 15 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index bbe41302210a..1a1f1b260a06 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -106,22 +106,16 @@ void cmdq_mbox_destroy(struct cmdq_client *client)
 }
 EXPORT_SYMBOL(cmdq_mbox_destroy);
 
-struct cmdq_pkt *cmdq_pkt_create(struct cmdq_client *client, size_t size)
+int cmdq_pkt_create(struct cmdq_client *client, struct cmdq_pkt *pkt, size_t size)
 {
-	struct cmdq_pkt *pkt;
 	struct device *dev;
 	dma_addr_t dma_addr;
 
-	pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
-	if (!pkt)
-		return ERR_PTR(-ENOMEM);
 	pkt->va_base = kzalloc(size, GFP_KERNEL);
-	if (!pkt->va_base) {
-		kfree(pkt);
-		return ERR_PTR(-ENOMEM);
-	}
+	if (!pkt->va_base)
+		return -ENOMEM;
+
 	pkt->buf_size = size;
-	pkt->cl = (void *)client;
 
 	dev = client->chan->mbox->dev;
 	dma_addr = dma_map_single(dev, pkt->va_base, pkt->buf_size,
@@ -129,24 +123,20 @@ struct cmdq_pkt *cmdq_pkt_create(struct cmdq_client *client, size_t size)
 	if (dma_mapping_error(dev, dma_addr)) {
 		dev_err(dev, "dma map failed, size=%u\n", (u32)(u64)size);
 		kfree(pkt->va_base);
-		kfree(pkt);
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 	}
 
 	pkt->pa_base = dma_addr;
 
-	return pkt;
+	return 0;
 }
 EXPORT_SYMBOL(cmdq_pkt_create);
 
-void cmdq_pkt_destroy(struct cmdq_pkt *pkt)
+void cmdq_pkt_destroy(struct cmdq_client *client, struct cmdq_pkt *pkt)
 {
-	struct cmdq_client *client = (struct cmdq_client *)pkt->cl;
-
 	dma_unmap_single(client->chan->mbox->dev, pkt->pa_base, pkt->buf_size,
 			 DMA_TO_DEVICE);
 	kfree(pkt->va_base);
-	kfree(pkt);
 }
 EXPORT_SYMBOL(cmdq_pkt_destroy);
 
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index e79dc8c4ad41..e5335fb22a3b 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -62,17 +62,19 @@ void cmdq_mbox_destroy(struct cmdq_client *client);
 /**
  * cmdq_pkt_create() - create a CMDQ packet
  * @client:	the CMDQ mailbox client
+ * @pkt:	the CMDQ packet
  * @size:	required CMDQ buffer size
  *
- * Return: CMDQ packet pointer
+ * Return: 0 for success; else the error code is returned
  */
-struct cmdq_pkt *cmdq_pkt_create(struct cmdq_client *client, size_t size);
+int cmdq_pkt_create(struct cmdq_client *client, struct cmdq_pkt *pkt, size_t size);
 
 /**
  * cmdq_pkt_destroy() - destroy the CMDQ packet
+ * @client:	the CMDQ mailbox client
  * @pkt:	the CMDQ packet
  */
-void cmdq_pkt_destroy(struct cmdq_pkt *pkt);
+void cmdq_pkt_destroy(struct cmdq_client *client, struct cmdq_pkt *pkt);
 
 /**
  * cmdq_pkt_write() - append write command to the CMDQ packet
@@ -318,12 +320,12 @@ static inline struct cmdq_client *cmdq_mbox_create(struct device *dev, int index
 
 static inline void cmdq_mbox_destroy(struct cmdq_client *client) { }
 
-static inline  struct cmdq_pkt *cmdq_pkt_create(struct cmdq_client *client, size_t size)
+static inline int cmdq_pkt_create(struct cmdq_client *client, struct cmdq_pkt *pkt, size_t size)
 {
-	return ERR_PTR(-EINVAL);
+	return -EINVAL;
 }
 
-static inline void cmdq_pkt_destroy(struct cmdq_pkt *pkt) { }
+static inline void cmdq_pkt_destroy(struct cmdq_client *client, struct cmdq_pkt *pkt) { }
 
 static inline int cmdq_pkt_write(struct cmdq_pkt *pkt, u8 subsys, u16 offset, u32 value)
 {
-- 
cgit v1.2.3


From 49ddaa495f8b60c8c4022ebe540024dd870e4dd1 Mon Sep 17 00:00:00 2001
From: "Jason-JH.Lin" <jason-jh.lin@mediatek.com>
Date: Thu, 7 Mar 2024 09:34:55 +0800
Subject: soc: mediatek: mtk-cmdq: Add specific purpose register definitions
 for GCE

Add specific purpose register definitions for GCE, so CMDQ users can
use them as a buffer to store data.

Signed-off-by: Jason-JH.Lin <jason-jh.lin@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240307013458.23550-2-jason-jh.lin@mediatek.com
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 include/linux/soc/mediatek/mtk-cmdq.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index e5335fb22a3b..042c61f29dc1 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -14,6 +14,15 @@
 #define CMDQ_ADDR_HIGH(addr)	((u32)(((addr) >> 16) & GENMASK(31, 0)))
 #define CMDQ_ADDR_LOW(addr)	((u16)(addr) | BIT(1))
 
+/*
+ * Every cmdq thread has its own SPRs (Specific Purpose Registers),
+ * so there are 4 * N (threads) SPRs in GCE that shares the same indexes below.
+ */
+#define CMDQ_THR_SPR_IDX0	(0)
+#define CMDQ_THR_SPR_IDX1	(1)
+#define CMDQ_THR_SPR_IDX2	(2)
+#define CMDQ_THR_SPR_IDX3	(3)
+
 struct cmdq_pkt;
 
 struct cmdq_client_reg {
-- 
cgit v1.2.3


From 263801f8e5a159cff2dfbc2a1d9cf9f1afecfa81 Mon Sep 17 00:00:00 2001
From: "Jason-JH.Lin" <jason-jh.lin@mediatek.com>
Date: Thu, 7 Mar 2024 09:34:56 +0800
Subject: soc: mediatek: mtk-cmdq: Add cmdq_pkt_mem_move() function

Add cmdq_pkt_mem_move() function to support CMDQ user making
an instruction for moving a value from a source address to a
destination address.

Signed-off-by: Jason-JH.Lin <jason-jh.lin@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240307013458.23550-3-jason-jh.lin@mediatek.com
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 26 ++++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk-cmdq.h  | 12 ++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 1a1f1b260a06..4cd1a25866a9 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -290,6 +290,32 @@ int cmdq_pkt_write_s_mask_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx,
 }
 EXPORT_SYMBOL(cmdq_pkt_write_s_mask_value);
 
+int cmdq_pkt_mem_move(struct cmdq_pkt *pkt, dma_addr_t src_addr, dma_addr_t dst_addr)
+{
+	const u16 high_addr_reg_idx  = CMDQ_THR_SPR_IDX0;
+	const u16 value_reg_idx = CMDQ_THR_SPR_IDX1;
+	int ret;
+
+	/* read the value of src_addr into high_addr_reg_idx */
+	ret = cmdq_pkt_assign(pkt, high_addr_reg_idx, CMDQ_ADDR_HIGH(src_addr));
+	if (ret < 0)
+		return ret;
+	ret = cmdq_pkt_read_s(pkt, high_addr_reg_idx, CMDQ_ADDR_LOW(src_addr), value_reg_idx);
+	if (ret < 0)
+		return ret;
+
+	/* write the value of value_reg_idx into dst_addr */
+	ret = cmdq_pkt_assign(pkt, high_addr_reg_idx, CMDQ_ADDR_HIGH(dst_addr));
+	if (ret < 0)
+		return ret;
+	ret = cmdq_pkt_write_s(pkt, high_addr_reg_idx, CMDQ_ADDR_LOW(dst_addr), value_reg_idx);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(cmdq_pkt_mem_move);
+
 int cmdq_pkt_wfe(struct cmdq_pkt *pkt, u16 event, bool clear)
 {
 	struct cmdq_instruction inst = { {0} };
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 042c61f29dc1..167c8fecac30 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -184,6 +184,18 @@ int cmdq_pkt_write_s_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx,
 int cmdq_pkt_write_s_mask_value(struct cmdq_pkt *pkt, u8 high_addr_reg_idx,
 				u16 addr_low, u32 value, u32 mask);
 
+/**
+ * cmdq_pkt_mem_move() - append memory move command to the CMDQ packet
+ * @pkt:	the CMDQ packet
+ * @src_addr:	source address
+ * @dst_addr:	destination address
+ *
+ * Appends a CMDQ command to copy the value found in `src_addr` to `dst_addr`.
+ *
+ * Return: 0 for success; else the error code is returned
+ */
+int cmdq_pkt_mem_move(struct cmdq_pkt *pkt, dma_addr_t src_addr, dma_addr_t dst_addr);
+
 /**
  * cmdq_pkt_wfe() - append wait for event command to the CMDQ packet
  * @pkt:	the CMDQ packet
-- 
cgit v1.2.3


From 400e2fa8058e7032974a892c7336b3ea20e24cfd Mon Sep 17 00:00:00 2001
From: "Jason-JH.Lin" <jason-jh.lin@mediatek.com>
Date: Thu, 7 Mar 2024 09:34:57 +0800
Subject: soc: mediatek: mtk-cmdq: Add cmdq_pkt_poll_addr() function

Add cmdq_pkt_poll_addr function to support CMDQ user making
an instruction for polling a specific address of hardware rigster
to check the value with or without mask.

POLL is a legacy operation in GCE, so it does not support SPR and
CMDQ_CODE_LOGIC. To support polling the register address which doesn't
have the subsys id, CMDQ users need to make an instruction with GPR and
CMDQ_CODE_MASK operation to move the register address to be poll into GPR.
Then users can make an POLL instruction with GPR to poll the register
address assigned in previous instruction.

Signed-off-by: Jason-JH.Lin <jason-jh.lin@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240307013458.23550-4-jason-jh.lin@mediatek.com
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 49 ++++++++++++++++++++++++++++++++++
 include/linux/soc/mediatek/mtk-cmdq.h  | 21 +++++++++++++++
 2 files changed, 70 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index 4cd1a25866a9..ee55992f6491 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -12,6 +12,8 @@
 
 #define CMDQ_WRITE_ENABLE_MASK	BIT(0)
 #define CMDQ_POLL_ENABLE_MASK	BIT(0)
+/* dedicate the last GPR_R15 to assign the register address to be poll */
+#define CMDQ_POLL_ADDR_GPR	(15)
 #define CMDQ_EOC_IRQ_EN		BIT(0)
 #define CMDQ_REG_TYPE		1
 #define CMDQ_JUMP_RELATIVE	0
@@ -397,6 +399,53 @@ int cmdq_pkt_poll_mask(struct cmdq_pkt *pkt, u8 subsys,
 }
 EXPORT_SYMBOL(cmdq_pkt_poll_mask);
 
+int cmdq_pkt_poll_addr(struct cmdq_pkt *pkt, dma_addr_t addr, u32 value, u32 mask)
+{
+	struct cmdq_instruction inst = { {0} };
+	u8 use_mask = 0;
+	int ret;
+
+	/*
+	 * Append an MASK instruction to set the mask for following POLL instruction
+	 * which enables use_mask bit.
+	 */
+	if (mask != GENMASK(31, 0)) {
+		inst.op = CMDQ_CODE_MASK;
+		inst.mask = ~mask;
+		ret = cmdq_pkt_append_command(pkt, inst);
+		if (ret < 0)
+			return ret;
+		use_mask = CMDQ_POLL_ENABLE_MASK;
+	}
+
+	/*
+	 * POLL is an legacy operation in GCE and it does not support SPR and CMDQ_CODE_LOGIC,
+	 * so it can not use cmdq_pkt_assign to keep polling register address to SPR.
+	 * If user wants to poll a register address which doesn't have a subsys id,
+	 * user needs to use GPR and CMDQ_CODE_MASK to move polling register address to GPR.
+	 */
+	inst.op = CMDQ_CODE_MASK;
+	inst.dst_t = CMDQ_REG_TYPE;
+	inst.sop = CMDQ_POLL_ADDR_GPR;
+	inst.value = addr;
+	ret = cmdq_pkt_append_command(pkt, inst);
+	if (ret < 0)
+		return ret;
+
+	/* Append POLL instruction to poll the register address assign to GPR previously. */
+	inst.op = CMDQ_CODE_POLL;
+	inst.dst_t = CMDQ_REG_TYPE;
+	inst.sop = CMDQ_POLL_ADDR_GPR;
+	inst.offset = use_mask;
+	inst.value = value;
+	ret = cmdq_pkt_append_command(pkt, inst);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+EXPORT_SYMBOL(cmdq_pkt_poll_addr);
+
 int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
 {
 	struct cmdq_instruction inst = {};
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index 167c8fecac30..f59f42e46ef1 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -270,6 +270,22 @@ int cmdq_pkt_poll_mask(struct cmdq_pkt *pkt, u8 subsys,
  */
 int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value);
 
+/**
+ * cmdq_pkt_poll_addr() - Append blocking POLL command to CMDQ packet
+ * @pkt:	the CMDQ packet
+ * @addr:	the hardware register address
+ * @value:	the specified target register value
+ * @mask:	the specified target register mask
+ *
+ * Appends a polling (POLL) command to the CMDQ packet and asks the GCE
+ * to execute an instruction that checks for the specified `value` (with
+ * or without `mask`) to appear in the specified hardware register `addr`.
+ * All GCE threads will be blocked by this instruction.
+ *
+ * Return: 0 for success or negative error code
+ */
+int cmdq_pkt_poll_addr(struct cmdq_pkt *pkt, dma_addr_t addr, u32 value, u32 mask);
+
 /**
  * cmdq_pkt_jump_abs() - Append jump command to the CMDQ packet, ask GCE
  *			 to execute an instruction that change current thread
@@ -421,6 +437,11 @@ static inline int cmdq_pkt_assign(struct cmdq_pkt *pkt, u16 reg_idx, u32 value)
 	return -EINVAL;
 }
 
+static inline int cmdq_pkt_poll_addr(struct cmdq_pkt *pkt, dma_addr_t addr, u32 value, u32 mask)
+{
+	return -EINVAL;
+}
+
 static inline int cmdq_pkt_jump_abs(struct cmdq_pkt *pkt, dma_addr_t addr, u8 shift_pa)
 {
 	return -EINVAL;
-- 
cgit v1.2.3


From 69ff68332dc5005539ac37c5c85444aaaec7c914 Mon Sep 17 00:00:00 2001
From: "Jason-JH.Lin" <jason-jh.lin@mediatek.com>
Date: Thu, 7 Mar 2024 09:34:58 +0800
Subject: soc: mediatek: mtk-cmdq: Add cmdq_pkt_acquire_event() function

Add cmdq_pkt_acquire_event() function to support CMDQ user making
an instruction for acquiring event.

CMDQ users can use cmdq_pkt_acquire_event() as `mutex_lock`
and cmdq_pkt_clear_event() as `mutex_unlock` to protect the global
resource modified instructions between them.

cmdq_pkt_acquire_event() would wait for event to be cleared.
After event is cleared by cmdq_pkt_clear_event() in other GCE threads,
cmdq_pkt_acquire_event() would set event and keep executing next
instruction. So the mutex would work like this:

    cmdq_pkt_acquire_event() /* mutex lock */

    /* critical secton instructions that modified global resource */

    cmdq_pkt_clear_event() /* mutex unlock */

Prevent the critical section instructions from being affected by other
GCE threads.

Signed-off-by: Jason-JH.Lin <jason-jh.lin@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240307013458.23550-5-jason-jh.lin@mediatek.com
Signed-off-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
---
 drivers/soc/mediatek/mtk-cmdq-helper.c | 15 +++++++++++++++
 include/linux/soc/mediatek/mtk-cmdq.h  | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/soc/mediatek/mtk-cmdq-helper.c b/drivers/soc/mediatek/mtk-cmdq-helper.c
index ee55992f6491..d1ca4b21587e 100644
--- a/drivers/soc/mediatek/mtk-cmdq-helper.c
+++ b/drivers/soc/mediatek/mtk-cmdq-helper.c
@@ -334,6 +334,21 @@ int cmdq_pkt_wfe(struct cmdq_pkt *pkt, u16 event, bool clear)
 }
 EXPORT_SYMBOL(cmdq_pkt_wfe);
 
+int cmdq_pkt_acquire_event(struct cmdq_pkt *pkt, u16 event)
+{
+	struct cmdq_instruction inst = {};
+
+	if (event >= CMDQ_MAX_EVENT)
+		return -EINVAL;
+
+	inst.op = CMDQ_CODE_WFE;
+	inst.value = CMDQ_WFE_UPDATE | CMDQ_WFE_UPDATE_VALUE | CMDQ_WFE_WAIT;
+	inst.event = event;
+
+	return cmdq_pkt_append_command(pkt, inst);
+}
+EXPORT_SYMBOL(cmdq_pkt_acquire_event);
+
 int cmdq_pkt_clear_event(struct cmdq_pkt *pkt, u16 event)
 {
 	struct cmdq_instruction inst = { {0} };
diff --git a/include/linux/soc/mediatek/mtk-cmdq.h b/include/linux/soc/mediatek/mtk-cmdq.h
index f59f42e46ef1..d4a8e34505e6 100644
--- a/include/linux/soc/mediatek/mtk-cmdq.h
+++ b/include/linux/soc/mediatek/mtk-cmdq.h
@@ -206,6 +206,21 @@ int cmdq_pkt_mem_move(struct cmdq_pkt *pkt, dma_addr_t src_addr, dma_addr_t dst_
  */
 int cmdq_pkt_wfe(struct cmdq_pkt *pkt, u16 event, bool clear);
 
+/**
+ * cmdq_pkt_acquire_event() - append acquire event command to the CMDQ packet
+ * @pkt:	the CMDQ packet
+ * @event:	the desired event to be acquired
+ *
+ * User can use cmdq_pkt_acquire_event() as `mutex_lock` and cmdq_pkt_clear_event()
+ * as `mutex_unlock` to protect some `critical section` instructions between them.
+ * cmdq_pkt_acquire_event() would wait for event to be cleared.
+ * After event is cleared by cmdq_pkt_clear_event in other GCE threads,
+ * cmdq_pkt_acquire_event() would set event and keep executing next instruction.
+ *
+ * Return: 0 for success; else the error code is returned
+ */
+int cmdq_pkt_acquire_event(struct cmdq_pkt *pkt, u16 event);
+
 /**
  * cmdq_pkt_clear_event() - append clear event command to the CMDQ packet
  * @pkt:	the CMDQ packet
-- 
cgit v1.2.3


From a2ca202350f97bced8dac4cbfbb3cbf855973e57 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <npitre@baylibre.com>
Date: Mon, 1 Apr 2024 23:25:40 -0400
Subject: dt-bindings: thermal: mediatek: Add LVTS thermal controller
 definition for MT8186

Add LVTS thermal controller definition for MT8186.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Acked-by: Krzysztof Kozlowski <krzk@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20240402032729.2736685-7-nico@fluxnic.net
---
 .../devicetree/bindings/thermal/mediatek,lvts-thermal.yaml     |  2 ++
 include/dt-bindings/thermal/mediatek,lvts-thermal.h            | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml b/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml
index e6665af52ee6..4173bae53032 100644
--- a/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml
+++ b/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml
@@ -19,6 +19,7 @@ properties:
   compatible:
     enum:
       - mediatek,mt7988-lvts-ap
+      - mediatek,mt8186-lvts
       - mediatek,mt8192-lvts-ap
       - mediatek,mt8192-lvts-mcu
       - mediatek,mt8195-lvts-ap
@@ -75,6 +76,7 @@ allOf:
         compatible:
           contains:
             enum:
+              - mediatek,mt8186-lvts
               - mediatek,mt8195-lvts-ap
               - mediatek,mt8195-lvts-mcu
     then:
diff --git a/include/dt-bindings/thermal/mediatek,lvts-thermal.h b/include/dt-bindings/thermal/mediatek,lvts-thermal.h
index 997e2f55128a..433d298826d3 100644
--- a/include/dt-bindings/thermal/mediatek,lvts-thermal.h
+++ b/include/dt-bindings/thermal/mediatek,lvts-thermal.h
@@ -16,6 +16,16 @@
 #define MT7988_ETHWARP_0	6
 #define MT7988_ETHWARP_1	7
 
+#define MT8186_LITTLE_CPU0	0
+#define MT8186_LITTLE_CPU1	1
+#define MT8186_LITTLE_CPU2	2
+#define MT8186_CAM		3
+#define MT8186_BIG_CPU0	4
+#define MT8186_BIG_CPU1	5
+#define MT8186_NNA		6
+#define MT8186_ADSP		7
+#define MT8186_MFG		8
+
 #define MT8195_MCU_BIG_CPU0     0
 #define MT8195_MCU_BIG_CPU1     1
 #define MT8195_MCU_BIG_CPU2     2
-- 
cgit v1.2.3


From 78c88534e5e1f8c5334c380b626c6ee91b62161b Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <npitre@baylibre.com>
Date: Mon, 1 Apr 2024 23:25:46 -0400
Subject: dt-bindings: thermal: mediatek: Add LVTS thermal controller
 definition for MT8188

Add LVTS thermal controller definition for MT8188.

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Link: https://lore.kernel.org/r/20240402032729.2736685-13-nico@fluxnic.net
---
 .../bindings/thermal/mediatek,lvts-thermal.yaml          |  4 ++++
 include/dt-bindings/thermal/mediatek,lvts-thermal.h      | 16 ++++++++++++++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml b/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml
index 4173bae53032..331cf4e662e3 100644
--- a/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml
+++ b/Documentation/devicetree/bindings/thermal/mediatek,lvts-thermal.yaml
@@ -20,6 +20,8 @@ properties:
     enum:
       - mediatek,mt7988-lvts-ap
       - mediatek,mt8186-lvts
+      - mediatek,mt8188-lvts-ap
+      - mediatek,mt8188-lvts-mcu
       - mediatek,mt8192-lvts-ap
       - mediatek,mt8192-lvts-mcu
       - mediatek,mt8195-lvts-ap
@@ -61,6 +63,8 @@ allOf:
         compatible:
           contains:
             enum:
+              - mediatek,mt8188-lvts-ap
+              - mediatek,mt8188-lvts-mcu
               - mediatek,mt8192-lvts-ap
               - mediatek,mt8192-lvts-mcu
     then:
diff --git a/include/dt-bindings/thermal/mediatek,lvts-thermal.h b/include/dt-bindings/thermal/mediatek,lvts-thermal.h
index 433d298826d3..bf95309d2525 100644
--- a/include/dt-bindings/thermal/mediatek,lvts-thermal.h
+++ b/include/dt-bindings/thermal/mediatek,lvts-thermal.h
@@ -26,6 +26,22 @@
 #define MT8186_ADSP		7
 #define MT8186_MFG		8
 
+#define MT8188_MCU_LITTLE_CPU0	0
+#define MT8188_MCU_LITTLE_CPU1	1
+#define MT8188_MCU_LITTLE_CPU2	2
+#define MT8188_MCU_LITTLE_CPU3	3
+#define MT8188_MCU_BIG_CPU0	4
+#define MT8188_MCU_BIG_CPU1	5
+
+#define MT8188_AP_APU		0
+#define MT8188_AP_GPU1		1
+#define MT8188_AP_GPU2		2
+#define MT8188_AP_SOC1		3
+#define MT8188_AP_SOC2		4
+#define MT8188_AP_SOC3		5
+#define MT8188_AP_CAM1		6
+#define MT8188_AP_CAM2		7
+
 #define MT8195_MCU_BIG_CPU0     0
 #define MT8195_MCU_BIG_CPU1     1
 #define MT8195_MCU_BIG_CPU2     2
-- 
cgit v1.2.3


From a8f59e5a5deaf3e99a8b7252e96cee9af67858a9 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Sat, 20 Apr 2024 16:58:11 +0900
Subject: block: use a per disk workqueue for zone write plugging

A zone write plug BIO work function blk_zone_wplug_bio_work() calls
submit_bio_noacct_nocheck() to execute the next unplugged BIO. This
function may block. So executing zone plugs BIO works using the block
layer global kblockd workqueue can potentially lead to preformance or
latency issues as the number of concurrent work for a workqueue is
limited to WQ_DFL_ACTIVE (256).
1) For a system with a large number of zoned disks, issuing write
   requests to otherwise unused zones may be delayed wiating for a work
   thread to become available.
2) Requeue operations which use kblockd but are independent of zone
   write plugging may alsoi end up being delayed.

To avoid these potential performance issues, create a workqueue per
zoned device to execute zone plugs BIO work. The workqueue max active
parameter is set to the maximum number of zone write plugs allocated
with the zone write plug mempool. This limit is equal to the maximum
number of open zones of the disk and defaults to 128 for disks that do
not have a limit on the number of open zones.

Fixes: dd291d77cc90 ("block: Introduce zone write plugging")
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240420075811.1276893-3-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-zoned.c      | 32 ++++++++++++++++++++++++--------
 include/linux/blkdev.h |  1 +
 2 files changed, 25 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 3befebe6b319..3a796420f240 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1129,7 +1129,7 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
 	/* Schedule submission of the next plugged BIO if we have one. */
 	if (!bio_list_empty(&zwplug->bio_list)) {
 		spin_unlock_irqrestore(&zwplug->lock, flags);
-		kblockd_schedule_work(&zwplug->bio_work);
+		queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
 		return;
 	}
 
@@ -1332,7 +1332,7 @@ static void disk_zone_wplug_handle_error(struct gendisk *disk,
 	/* Restart BIO submission if we still have any BIO left. */
 	if (!bio_list_empty(&zwplug->bio_list)) {
 		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
-		kblockd_schedule_work(&zwplug->bio_work);
+		queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
 		goto unlock;
 	}
 
@@ -1409,14 +1409,25 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 
 	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
 						sizeof(struct blk_zone_wplug));
-	if (!disk->zone_wplugs_pool) {
-		kfree(disk->zone_wplugs_hash);
-		disk->zone_wplugs_hash = NULL;
-		disk->zone_wplugs_hash_bits = 0;
-		return -ENOMEM;
-	}
+	if (!disk->zone_wplugs_pool)
+		goto free_hash;
+
+	disk->zone_wplugs_wq =
+		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
+				pool_size, disk->disk_name);
+	if (!disk->zone_wplugs_wq)
+		goto destroy_pool;
 
 	return 0;
+
+destroy_pool:
+	mempool_destroy(disk->zone_wplugs_pool);
+	disk->zone_wplugs_pool = NULL;
+free_hash:
+	kfree(disk->zone_wplugs_hash);
+	disk->zone_wplugs_hash = NULL;
+	disk->zone_wplugs_hash_bits = 0;
+	return -ENOMEM;
 }
 
 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
@@ -1447,6 +1458,11 @@ void disk_free_zone_resources(struct gendisk *disk)
 {
 	cancel_work_sync(&disk->zone_wplugs_work);
 
+	if (disk->zone_wplugs_wq) {
+		destroy_workqueue(disk->zone_wplugs_wq);
+		disk->zone_wplugs_wq = NULL;
+	}
+
 	disk_destroy_zone_wplugs_hash_table(disk);
 
 	/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ce8ba08e318..d22eb41a05b8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -191,6 +191,7 @@ struct gendisk {
 	struct hlist_head       *zone_wplugs_hash;
 	struct list_head        zone_wplugs_err_list;
 	struct work_struct	zone_wplugs_work;
+	struct workqueue_struct *zone_wplugs_wq;
 #endif /* CONFIG_BLK_DEV_ZONED */
 
 #if IS_ENABLED(CONFIG_CDROM)
-- 
cgit v1.2.3


From 06fe8fd6808562971637c6b133c806bcf49097ad Mon Sep 17 00:00:00 2001
From: Nipun Gupta <nipun.gupta@amd.com>
Date: Tue, 23 Apr 2024 16:40:20 +0530
Subject: genirq/msi: Add MSI allocation helper and export MSI functions

MSI functions for allocation and free can be directly used by
the device drivers without any wrapper provided by bus drivers.
So export these MSI functions.

Also, add a wrapper API to allocate MSIs providing only the
number of interrupts rather than range for simpler driver usage.

Signed-off-by: Nipun Gupta <nipun.gupta@amd.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/r/20240423111021.1686144-1-nipun.gupta@amd.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/msi.h | 6 ++++++
 kernel/irq/msi.c    | 2 ++
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 26d07e23052e..765a65581a66 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -676,6 +676,12 @@ int platform_device_msi_init_and_alloc_irqs(struct device *dev, unsigned int nve
 void platform_device_msi_free_irqs_all(struct device *dev);
 
 bool msi_device_has_isolated_msi(struct device *dev);
+
+static inline int msi_domain_alloc_irqs(struct device *dev, unsigned int domid, int nirqs)
+{
+	return msi_domain_alloc_irqs_range(dev, domid, 0, nirqs - 1);
+}
+
 #else /* CONFIG_GENERIC_MSI_IRQ */
 static inline bool msi_device_has_isolated_msi(struct device *dev)
 {
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index f90952ebc494..2024f89baea4 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -1434,6 +1434,7 @@ int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
 	msi_unlock_descs(dev);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range);
 
 /**
  * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
@@ -1680,6 +1681,7 @@ void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
 	msi_domain_free_irqs_range_locked(dev, domid, first, last);
 	msi_unlock_descs(dev);
 }
+EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all);
 
 /**
  * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
-- 
cgit v1.2.3


From ff33132605c1a0acea59e4c523cb7c6fabe856b2 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Tue, 23 Apr 2024 14:38:28 +0300
Subject: regulator: change devm_regulator_get_enable_optional() stub to return
 Ok

The devm_regulator_get_enable_optional() should be a 'call and forget'
API, meaning, when it is used to enable the regulators, the API does not
provide a handle to do any further control of the regulators. It gives
no real benefit to return an error from the stub if CONFIG_REGULATOR is
not set.

On the contrary, returning an error is causing problems to drivers when
hardware is such it works out just fine with no regulator control.
Returning an error forces drivers to specifically handle the case where
CONFIG_REGULATOR is not set, making the mere existence of the stub
questionalble.

Change the stub implementation for the
devm_regulator_get_enable_optional() to return Ok so drivers do not
separately handle the case where the CONFIG_REGULATOR is not set.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Fixes: da279e6965b3 ("regulator: Add devm helpers for get and enable")
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/ZiedtOE00Zozd3XO@fedora
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regulator/consumer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 71232fb7dda3..ed180ca419da 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -326,7 +326,7 @@ static inline int devm_regulator_get_enable(struct device *dev, const char *id)
 static inline int devm_regulator_get_enable_optional(struct device *dev,
 						     const char *id)
 {
-	return -ENODEV;
+	return 0;
 }
 
 static inline struct regulator *__must_check
-- 
cgit v1.2.3


From d56b63cf0c0f71e1b2e04dd8220b408f049e67ff Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Sat, 20 Apr 2024 11:09:05 +0200
Subject: bpf: add support for bpf_wq user type

Mostly a copy/paste from the bpf_timer API, without the initialization
and free, as they will be done in a separate patch.

Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-5-6c986a5a741f@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h      | 11 ++++++++++-
 include/uapi/linux/bpf.h |  4 ++++
 kernel/bpf/btf.c         | 17 +++++++++++++++++
 kernel/bpf/syscall.c     |  6 +++++-
 kernel/bpf/verifier.c    |  9 +++++++++
 5 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5034c1b4ded7..c7dcfd395555 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -185,7 +185,7 @@ struct bpf_map_ops {
 
 enum {
 	/* Support at most 10 fields in a BTF type */
-	BTF_FIELDS_MAX	   = 10,
+	BTF_FIELDS_MAX	   = 11,
 };
 
 enum btf_field_type {
@@ -202,6 +202,7 @@ enum btf_field_type {
 	BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE,
 	BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD,
 	BPF_REFCOUNT   = (1 << 9),
+	BPF_WORKQUEUE  = (1 << 10),
 };
 
 typedef void (*btf_dtor_kfunc_t)(void *);
@@ -238,6 +239,7 @@ struct btf_record {
 	u32 field_mask;
 	int spin_lock_off;
 	int timer_off;
+	int wq_off;
 	int refcount_off;
 	struct btf_field fields[];
 };
@@ -312,6 +314,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type)
 		return "bpf_spin_lock";
 	case BPF_TIMER:
 		return "bpf_timer";
+	case BPF_WORKQUEUE:
+		return "bpf_wq";
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 		return "kptr";
@@ -340,6 +344,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type)
 		return sizeof(struct bpf_spin_lock);
 	case BPF_TIMER:
 		return sizeof(struct bpf_timer);
+	case BPF_WORKQUEUE:
+		return sizeof(struct bpf_wq);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
@@ -367,6 +373,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type)
 		return __alignof__(struct bpf_spin_lock);
 	case BPF_TIMER:
 		return __alignof__(struct bpf_timer);
+	case BPF_WORKQUEUE:
+		return __alignof__(struct bpf_wq);
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
@@ -406,6 +414,7 @@ static inline void bpf_obj_init_field(const struct btf_field *field, void *addr)
 		/* RB_ROOT_CACHED 0-inits, no need to do anything after memset */
 	case BPF_SPIN_LOCK:
 	case BPF_TIMER:
+	case BPF_WORKQUEUE:
 	case BPF_KPTR_UNREF:
 	case BPF_KPTR_REF:
 	case BPF_KPTR_PERCPU:
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index cee0a7915c08..e4ae83550fb3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -7306,6 +7306,10 @@ struct bpf_timer {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
 
+struct bpf_wq {
+	__u64 __opaque[2];
+} __attribute__((aligned(8)));
+
 struct bpf_dynptr {
 	__u64 __opaque[2];
 } __attribute__((aligned(8)));
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 6d46cee47ae3..8291fbfd27b1 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3464,6 +3464,15 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
 			goto end;
 		}
 	}
+	if (field_mask & BPF_WORKQUEUE) {
+		if (!strcmp(name, "bpf_wq")) {
+			if (*seen_mask & BPF_WORKQUEUE)
+				return -E2BIG;
+			*seen_mask |= BPF_WORKQUEUE;
+			type = BPF_WORKQUEUE;
+			goto end;
+		}
+	}
 	field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head");
 	field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
 	field_mask_test_name(BPF_RB_ROOT,   "bpf_rb_root");
@@ -3515,6 +3524,7 @@ static int btf_find_struct_field(const struct btf *btf,
 		switch (field_type) {
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
+		case BPF_WORKQUEUE:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
 		case BPF_REFCOUNT:
@@ -3582,6 +3592,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
 		switch (field_type) {
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
+		case BPF_WORKQUEUE:
 		case BPF_LIST_NODE:
 		case BPF_RB_NODE:
 		case BPF_REFCOUNT:
@@ -3816,6 +3827,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 
 	rec->spin_lock_off = -EINVAL;
 	rec->timer_off = -EINVAL;
+	rec->wq_off = -EINVAL;
 	rec->refcount_off = -EINVAL;
 	for (i = 0; i < cnt; i++) {
 		field_type_size = btf_field_type_size(info_arr[i].type);
@@ -3846,6 +3858,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
 			/* Cache offset for faster lookup at runtime */
 			rec->timer_off = rec->fields[i].offset;
 			break;
+		case BPF_WORKQUEUE:
+			WARN_ON_ONCE(rec->wq_off >= 0);
+			/* Cache offset for faster lookup at runtime */
+			rec->wq_off = rec->fields[i].offset;
+			break;
 		case BPF_REFCOUNT:
 			WARN_ON_ONCE(rec->refcount_off >= 0);
 			/* Cache offset for faster lookup at runtime */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7d392ec83655..0848e4141b00 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -559,6 +559,7 @@ void btf_record_free(struct btf_record *rec)
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
+		case BPF_WORKQUEUE:
 			/* Nothing to release */
 			break;
 		default:
@@ -608,6 +609,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
 		case BPF_SPIN_LOCK:
 		case BPF_TIMER:
 		case BPF_REFCOUNT:
+		case BPF_WORKQUEUE:
 			/* Nothing to acquire */
 			break;
 		default:
@@ -679,6 +681,8 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 		case BPF_TIMER:
 			bpf_timer_cancel_and_free(field_ptr);
 			break;
+		case BPF_WORKQUEUE:
+			break;
 		case BPF_KPTR_UNREF:
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
 			break;
@@ -1085,7 +1089,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 
 	map->record = btf_parse_fields(btf, value_type,
 				       BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
-				       BPF_RB_ROOT | BPF_REFCOUNT,
+				       BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
 				       map->value_size);
 	if (!IS_ERR_OR_NULL(map->record)) {
 		int i;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5a7e34e83a5b..89490a95b120 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1838,6 +1838,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
 			 */
 			if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
 				reg->map_uid = reg->id;
+			if (btf_record_has_field(map->inner_map_meta->record, BPF_WORKQUEUE))
+				reg->map_uid = reg->id;
 		} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
 			reg->type = PTR_TO_XDP_SOCK;
 		} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -18141,6 +18143,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		}
 	}
 
+	if (btf_record_has_field(map->record, BPF_WORKQUEUE)) {
+		if (is_tracing_prog_type(prog_type)) {
+			verbose(env, "tracing progs cannot use bpf_wq yet\n");
+			return -EINVAL;
+		}
+	}
+
 	if ((bpf_prog_is_offloaded(prog->aux) || bpf_map_is_offloaded(map)) &&
 	    !bpf_offload_prog_map_match(prog, map)) {
 		verbose(env, "offload device mismatch between prog and map\n");
-- 
cgit v1.2.3


From 246331e3f1eac905170a923f0ec76725c2558232 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Sat, 20 Apr 2024 11:09:09 +0200
Subject: bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps

Currently bpf_wq_cancel_and_free() is just a placeholder as there is
no memory allocation for bpf_wq just yet.

Again, duplication of the bpf_timer approach

Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-9-6c986a5a741f@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/arraymap.c | 18 ++++++++++-------
 kernel/bpf/hashtab.c  | 55 ++++++++++++++++++++++++++++++++++++++++-----------
 kernel/bpf/helpers.c  |  8 ++++++++
 kernel/bpf/syscall.c  |  9 +++++++++
 5 files changed, 73 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c7dcfd395555..64bf0cf3ee95 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -534,6 +534,7 @@ static inline void zero_map_value(struct bpf_map *map, void *dst)
 void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 			   bool lock_src);
 void bpf_timer_cancel_and_free(void *timer);
+void bpf_wq_cancel_and_free(void *timer);
 void bpf_list_head_free(const struct btf_field *field, void *list_head,
 			struct bpf_spin_lock *spin_lock);
 void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
@@ -2204,6 +2205,7 @@ void bpf_map_free_record(struct bpf_map *map);
 struct btf_record *btf_record_dup(const struct btf_record *rec);
 bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b);
 void bpf_obj_free_timer(const struct btf_record *rec, void *obj);
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj);
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj);
 void __bpf_obj_drop_impl(void *p, const struct btf_record *rec, bool percpu);
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 8c1e6d7654bb..580d07b15471 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -428,17 +428,21 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
 	return (void *)round_down((unsigned long)array, PAGE_SIZE);
 }
 
-static void array_map_free_timers(struct bpf_map *map)
+static void array_map_free_timers_wq(struct bpf_map *map)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	int i;
 
-	/* We don't reset or free fields other than timer on uref dropping to zero. */
-	if (!btf_record_has_field(map->record, BPF_TIMER))
-		return;
+	/* We don't reset or free fields other than timer and workqueue
+	 * on uref dropping to zero.
+	 */
+	if (btf_record_has_field(map->record, BPF_TIMER))
+		for (i = 0; i < array->map.max_entries; i++)
+			bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
 
-	for (i = 0; i < array->map.max_entries; i++)
-		bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
+	if (btf_record_has_field(map->record, BPF_WORKQUEUE))
+		for (i = 0; i < array->map.max_entries; i++)
+			bpf_obj_free_workqueue(map->record, array_map_elem_ptr(array, i));
 }
 
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -782,7 +786,7 @@ const struct bpf_map_ops array_map_ops = {
 	.map_alloc = array_map_alloc,
 	.map_free = array_map_free,
 	.map_get_next_key = array_map_get_next_key,
-	.map_release_uref = array_map_free_timers,
+	.map_release_uref = array_map_free_timers_wq,
 	.map_lookup_elem = array_map_lookup_elem,
 	.map_update_elem = array_map_update_elem,
 	.map_delete_elem = array_map_delete_elem,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c3e79a0b9361..0179183c543a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
 	}
 }
 
+static void htab_free_prealloced_wq(struct bpf_htab *htab)
+{
+	u32 num_entries = htab->map.max_entries;
+	int i;
+
+	if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE))
+		return;
+	if (htab_has_extra_elems(htab))
+		num_entries += num_possible_cpus();
+
+	for (i = 0; i < num_entries; i++) {
+		struct htab_elem *elem;
+
+		elem = get_htab_elem(htab, i);
+		bpf_obj_free_workqueue(htab->map.record,
+				       elem->key + round_up(htab->map.key_size, 8));
+		cond_resched();
+	}
+}
+
 static void htab_free_prealloced_fields(struct bpf_htab *htab)
 {
 	u32 num_entries = htab->map.max_entries;
@@ -1495,7 +1515,7 @@ static void delete_all_elements(struct bpf_htab *htab)
 	migrate_enable();
 }
 
-static void htab_free_malloced_timers(struct bpf_htab *htab)
+static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer)
 {
 	int i;
 
@@ -1507,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
 
 		hlist_nulls_for_each_entry(l, n, head, hash_node) {
 			/* We only free timer on uref dropping to zero */
-			bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
+			if (is_timer)
+				bpf_obj_free_timer(htab->map.record,
+						   l->key + round_up(htab->map.key_size, 8));
+			else
+				bpf_obj_free_workqueue(htab->map.record,
+						       l->key + round_up(htab->map.key_size, 8));
 		}
 		cond_resched_rcu();
 	}
 	rcu_read_unlock();
 }
 
-static void htab_map_free_timers(struct bpf_map *map)
+static void htab_map_free_timers_and_wq(struct bpf_map *map)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 
-	/* We only free timer on uref dropping to zero */
-	if (!btf_record_has_field(htab->map.record, BPF_TIMER))
-		return;
-	if (!htab_is_prealloc(htab))
-		htab_free_malloced_timers(htab);
-	else
-		htab_free_prealloced_timers(htab);
+	/* We only free timer and workqueue on uref dropping to zero */
+	if (btf_record_has_field(htab->map.record, BPF_TIMER)) {
+		if (!htab_is_prealloc(htab))
+			htab_free_malloced_timers_or_wq(htab, true);
+		else
+			htab_free_prealloced_timers(htab);
+	}
+	if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) {
+		if (!htab_is_prealloc(htab))
+			htab_free_malloced_timers_or_wq(htab, false);
+		else
+			htab_free_prealloced_wq(htab);
+	}
 }
 
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -2260,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
-	.map_release_uref = htab_map_free_timers,
+	.map_release_uref = htab_map_free_timers_and_wq,
 	.map_lookup_elem = htab_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
 	.map_update_elem = htab_map_update_elem,
@@ -2281,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
 	.map_alloc = htab_map_alloc,
 	.map_free = htab_map_free,
 	.map_get_next_key = htab_map_get_next_key,
-	.map_release_uref = htab_map_free_timers,
+	.map_release_uref = htab_map_free_timers_and_wq,
 	.map_lookup_elem = htab_lru_map_lookup_elem,
 	.map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem,
 	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 0b8ba6c81994..fe827f6e5234 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1468,6 +1468,14 @@ void bpf_timer_cancel_and_free(void *val)
 	kfree_rcu(t, cb.rcu);
 }
 
+/* This function is called by map_delete/update_elem for individual element and
+ * by ops->map_release_uref when the user space reference to a map reaches zero.
+ */
+void bpf_wq_cancel_and_free(void *val)
+{
+	BTF_TYPE_EMIT(struct bpf_wq);
+}
+
 BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
 {
 	unsigned long *kptr = map_value;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0848e4141b00..63e368337483 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -661,6 +661,13 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
 	bpf_timer_cancel_and_free(obj + rec->timer_off);
 }
 
+void bpf_obj_free_workqueue(const struct btf_record *rec, void *obj)
+{
+	if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_WORKQUEUE)))
+		return;
+	bpf_wq_cancel_and_free(obj + rec->wq_off);
+}
+
 void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 {
 	const struct btf_field *fields;
@@ -682,6 +689,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
 			bpf_timer_cancel_and_free(field_ptr);
 			break;
 		case BPF_WORKQUEUE:
+			bpf_wq_cancel_and_free(field_ptr);
 			break;
 		case BPF_KPTR_UNREF:
 			WRITE_ONCE(*(u64 *)field_ptr, 0);
@@ -1119,6 +1127,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
 				}
 				break;
 			case BPF_TIMER:
+			case BPF_WORKQUEUE:
 				if (map->map_type != BPF_MAP_TYPE_HASH &&
 				    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
 				    map->map_type != BPF_MAP_TYPE_ARRAY) {
-- 
cgit v1.2.3


From 81f1d7a583fa1fa14f0c4e6140d34b5e3d08d227 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Sat, 20 Apr 2024 11:09:13 +0200
Subject: bpf: wq: add bpf_wq_set_callback_impl

To support sleepable async callbacks, we need to tell push_async_cb()
whether the cb is sleepable or not.

The verifier now detects that we are in bpf_wq_set_callback_impl and
can allow a sleepable callback to happen.

Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-13-6c986a5a741f@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/helpers.c         | 15 +++++++++++
 kernel/bpf/verifier.c        | 60 +++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 70 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 36d19cd32eb5..9db35530c878 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -426,6 +426,7 @@ struct bpf_verifier_state {
 	 * while they are still in use.
 	 */
 	bool used_as_loop_entry;
+	bool in_sleepable;
 
 	/* first and last insn idx of this verifier state */
 	u32 first_insn_idx;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1fc002cfe2b1..dd2eaaf77c93 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2711,6 +2711,20 @@ __bpf_kfunc int bpf_wq_init(struct bpf_wq *wq, void *p__map, unsigned int flags)
 	return __bpf_async_init(async, map, flags, BPF_ASYNC_TYPE_WQ);
 }
 
+__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
+					 int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
+					 unsigned int flags,
+					 void *aux__ign)
+{
+	struct bpf_prog_aux *aux = (struct bpf_prog_aux *)aux__ign;
+	struct bpf_async_kern *async = (struct bpf_async_kern *)wq;
+
+	if (flags)
+		return -EINVAL;
+
+	return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -2789,6 +2803,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_size)
 BTF_ID_FLAGS(func, bpf_dynptr_clone)
 BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
 BTF_ID_FLAGS(func, bpf_wq_init)
+BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cc20fda907fb..9715c88cc025 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -501,8 +501,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
 }
 
 static bool is_sync_callback_calling_kfunc(u32 btf_id);
+static bool is_async_callback_calling_kfunc(u32 btf_id);
+static bool is_callback_calling_kfunc(u32 btf_id);
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
 
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id);
+
 static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
 {
 	return func_id == BPF_FUNC_for_each_map_elem ||
@@ -530,7 +534,8 @@ static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
 
 static bool is_async_callback_calling_insn(struct bpf_insn *insn)
 {
-	return bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm);
+	return (bpf_helper_call(insn) && is_async_callback_calling_function(insn->imm)) ||
+	       (bpf_pseudo_kfunc_call(insn) && is_async_callback_calling_kfunc(insn->imm));
 }
 
 static bool is_may_goto_insn(struct bpf_insn *insn)
@@ -1429,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	}
 	dst_state->speculative = src->speculative;
 	dst_state->active_rcu_lock = src->active_rcu_lock;
+	dst_state->in_sleepable = src->in_sleepable;
 	dst_state->curframe = src->curframe;
 	dst_state->active_lock.ptr = src->active_lock.ptr;
 	dst_state->active_lock.id = src->active_lock.id;
@@ -2404,7 +2410,7 @@ static void init_func_state(struct bpf_verifier_env *env,
 /* Similar to push_stack(), but for async callbacks */
 static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 						int insn_idx, int prev_insn_idx,
-						int subprog)
+						int subprog, bool is_sleepable)
 {
 	struct bpf_verifier_stack_elem *elem;
 	struct bpf_func_state *frame;
@@ -2431,6 +2437,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
 	 * Initialize it similar to do_check_common().
 	 */
 	elem->st.branches = 1;
+	elem->st.in_sleepable = is_sleepable;
 	frame = kzalloc(sizeof(*frame), GFP_KERNEL);
 	if (!frame)
 		goto err;
@@ -5278,7 +5285,8 @@ bad_type:
 
 static bool in_sleepable(struct bpf_verifier_env *env)
 {
-	return env->prog->sleepable;
+	return env->prog->sleepable ||
+	       (env->cur_state && env->cur_state->in_sleepable);
 }
 
 /* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
@@ -9513,7 +9521,7 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
 	 */
 	env->subprog_info[subprog].is_cb = true;
 	if (bpf_pseudo_kfunc_call(insn) &&
-	    !is_sync_callback_calling_kfunc(insn->imm)) {
+	    !is_callback_calling_kfunc(insn->imm)) {
 		verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
 			func_id_name(insn->imm), insn->imm);
 		return -EFAULT;
@@ -9527,10 +9535,11 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins
 	if (is_async_callback_calling_insn(insn)) {
 		struct bpf_verifier_state *async_cb;
 
-		/* there is no real recursion here. timer callbacks are async */
+		/* there is no real recursion here. timer and workqueue callbacks are async */
 		env->subprog_info[subprog].is_async_cb = true;
 		async_cb = push_async_cb(env, env->subprog_info[subprog].start,
-					 insn_idx, subprog);
+					 insn_idx, subprog,
+					 is_bpf_wq_set_callback_impl_kfunc(insn->imm));
 		if (!async_cb)
 			return -EFAULT;
 		callee = async_cb->frame[0];
@@ -11017,6 +11026,7 @@ enum special_kfunc_type {
 	KF_bpf_percpu_obj_new_impl,
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
+	KF_bpf_wq_set_callback_impl,
 	KF_bpf_iter_css_task_new,
 };
 
@@ -11041,6 +11051,7 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #endif
@@ -11069,6 +11080,7 @@ BTF_ID(func, bpf_dynptr_clone)
 BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
+BTF_ID(func, bpf_wq_set_callback_impl)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #else
@@ -11402,12 +11414,28 @@ static bool is_sync_callback_calling_kfunc(u32 btf_id)
 	return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
 }
 
+static bool is_async_callback_calling_kfunc(u32 btf_id)
+{
+	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
 static bool is_bpf_throw_kfunc(struct bpf_insn *insn)
 {
 	return bpf_pseudo_kfunc_call(insn) && insn->off == 0 &&
 	       insn->imm == special_kfunc_list[KF_bpf_throw];
 }
 
+static bool is_bpf_wq_set_callback_impl_kfunc(u32 btf_id)
+{
+	return btf_id == special_kfunc_list[KF_bpf_wq_set_callback_impl];
+}
+
+static bool is_callback_calling_kfunc(u32 btf_id)
+{
+	return is_sync_callback_calling_kfunc(btf_id) ||
+	       is_async_callback_calling_kfunc(btf_id);
+}
+
 static bool is_rbtree_lock_required_kfunc(u32 btf_id)
 {
 	return is_bpf_rbtree_api_kfunc(btf_id);
@@ -12219,6 +12247,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		}
 	}
 
+	if (is_bpf_wq_set_callback_impl_kfunc(meta.func_id)) {
+		err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+					 set_timer_callback_state);
+		if (err) {
+			verbose(env, "kfunc %s#%d failed callback verification\n",
+				func_name, meta.func_id);
+			return err;
+		}
+	}
+
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
 
@@ -16968,6 +17006,9 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
+	if (old->in_sleepable != cur->in_sleepable)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -19639,6 +19680,13 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
+	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
+		struct bpf_insn ld_addrs[2] = { BPF_LD_IMM64(BPF_REG_4, (long)env->prog->aux) };
+
+		insn_buf[0] = ld_addrs[0];
+		insn_buf[1] = ld_addrs[1];
+		insn_buf[2] = *insn;
+		*cnt = 3;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 520713a93d550406dae14d49cdb8778d70cecdfd Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Fri, 15 Mar 2024 19:11:30 +0100
Subject: sysctl: treewide: drop unused argument
 ctl_table_root::set_ownership(table)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the 'table' argument from set_ownership as it is never used. This
change is a step towards putting "struct ctl_table" into .rodata and
eventually having sysctl core only use "const struct ctl_table".

The patch was created with the following coccinelle script:

  @@
  identifier func, head, table, uid, gid;
  @@

  void func(
    struct ctl_table_header *head,
  - struct ctl_table *table,
    kuid_t *uid, kgid_t *gid)
  { ... }

No additional occurrences of 'set_ownership' were found after doing a
tree-wide search.

Reviewed-by: Joel Granados <j.granados@samsung.com>
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Joel Granados <j.granados@samsung.com>
---
 fs/proc/proc_sysctl.c  | 2 +-
 include/linux/sysctl.h | 1 -
 ipc/ipc_sysctl.c       | 3 +--
 ipc/mq_sysctl.c        | 3 +--
 net/sysctl_net.c       | 1 -
 5 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 37cde0efee57..ed3a41ed9705 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -480,7 +480,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	}
 
 	if (root->set_ownership)
-		root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+		root->set_ownership(head, &inode->i_uid, &inode->i_gid);
 	else {
 		inode->i_uid = GLOBAL_ROOT_UID;
 		inode->i_gid = GLOBAL_ROOT_GID;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ee7d33b89e9e..60333a6b9370 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -205,7 +205,6 @@ struct ctl_table_root {
 	struct ctl_table_set default_set;
 	struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
 	void (*set_ownership)(struct ctl_table_header *head,
-			      struct ctl_table *table,
 			      kuid_t *uid, kgid_t *gid);
 	int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
 };
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 45cb1dabce29..1a5085e5b178 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -192,7 +192,6 @@ static int set_is_seen(struct ctl_table_set *set)
 }
 
 static void ipc_set_ownership(struct ctl_table_header *head,
-			      struct ctl_table *table,
 			      kuid_t *uid, kgid_t *gid)
 {
 	struct ipc_namespace *ns =
@@ -224,7 +223,7 @@ static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *tabl
 		kuid_t ns_root_uid;
 		kgid_t ns_root_gid;
 
-		ipc_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+		ipc_set_ownership(head, &ns_root_uid, &ns_root_gid);
 
 		if (uid_eq(current_euid(), ns_root_uid))
 			mode >>= 6;
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 21fba3a6edaf..6bb1c5397c69 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -78,7 +78,6 @@ static int set_is_seen(struct ctl_table_set *set)
 }
 
 static void mq_set_ownership(struct ctl_table_header *head,
-			     struct ctl_table *table,
 			     kuid_t *uid, kgid_t *gid)
 {
 	struct ipc_namespace *ns =
@@ -97,7 +96,7 @@ static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table
 	kuid_t ns_root_uid;
 	kgid_t ns_root_gid;
 
-	mq_set_ownership(head, table, &ns_root_uid, &ns_root_gid);
+	mq_set_ownership(head, &ns_root_uid, &ns_root_gid);
 
 	if (uid_eq(current_euid(), ns_root_uid))
 		mode >>= 6;
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index 051ed5f6fc93..a0a7a79991f9 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -54,7 +54,6 @@ static int net_ctl_permissions(struct ctl_table_header *head,
 }
 
 static void net_ctl_set_ownership(struct ctl_table_header *head,
-				  struct ctl_table *table,
 				  kuid_t *uid, kgid_t *gid)
 {
 	struct net *net = container_of(head->set, struct net, sysctls);
-- 
cgit v1.2.3


From 795f90c6f13c30484ff10355a6775979f57f78cb Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Fri, 15 Mar 2024 19:11:31 +0100
Subject: sysctl: treewide: constify argument
 ctl_table_root::permissions(table)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The permissions callback should not modify the ctl_table. Enforce this
expectation via the typesystem. This is a step to put "struct ctl_table"
into .rodata throughout the kernel.

The patch was created with the following coccinelle script:

  @@
  identifier func, head, ctl;
  @@

  int func(
    struct ctl_table_header *head,
  - struct ctl_table *ctl)
  + const struct ctl_table *ctl)
  { ... }

(insert_entry() from fs/proc/proc_sysctl.c is a false-positive)

No additional occurrences of '.permissions =' were found after a
tree-wide search for places missed by the conccinelle script.

Reviewed-by: Joel Granados <j.granados@samsung.com>
Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Joel Granados <j.granados@samsung.com>
---
 include/linux/sysctl.h | 2 +-
 ipc/ipc_sysctl.c       | 2 +-
 ipc/mq_sysctl.c        | 2 +-
 kernel/ucount.c        | 2 +-
 net/sysctl_net.c       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 60333a6b9370..f9214de0490c 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -206,7 +206,7 @@ struct ctl_table_root {
 	struct ctl_table_set *(*lookup)(struct ctl_table_root *root);
 	void (*set_ownership)(struct ctl_table_header *head,
 			      kuid_t *uid, kgid_t *gid);
-	int (*permissions)(struct ctl_table_header *head, struct ctl_table *table);
+	int (*permissions)(struct ctl_table_header *head, const struct ctl_table *table);
 };
 
 #define register_sysctl(path, table)	\
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 1a5085e5b178..19b2a67aef40 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -204,7 +204,7 @@ static void ipc_set_ownership(struct ctl_table_header *head,
 	*gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
 }
 
-static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table)
 {
 	int mode = table->mode;
 
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index 6bb1c5397c69..43c0825da9e8 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -90,7 +90,7 @@ static void mq_set_ownership(struct ctl_table_header *head,
 	*gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
 }
 
-static int mq_permissions(struct ctl_table_header *head, struct ctl_table *table)
+static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table)
 {
 	int mode = table->mode;
 	kuid_t ns_root_uid;
diff --git a/kernel/ucount.c b/kernel/ucount.c
index e196da0204dc..4d5b9c12c014 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -38,7 +38,7 @@ static int set_is_seen(struct ctl_table_set *set)
 }
 
 static int set_permissions(struct ctl_table_header *head,
-				  struct ctl_table *table)
+			   const struct ctl_table *table)
 {
 	struct user_namespace *user_ns =
 		container_of(head->set, struct user_namespace, set);
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index a0a7a79991f9..f5017012a049 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -40,7 +40,7 @@ static int is_seen(struct ctl_table_set *set)
 
 /* Return standard mode bits for table entry. */
 static int net_ctl_permissions(struct ctl_table_header *head,
-			       struct ctl_table *table)
+			       const struct ctl_table *table)
 {
 	struct net *net = container_of(head->set, struct net, sysctls);
 
-- 
cgit v1.2.3


From 4a7b29f6509470a7e63dd67e516268fbef226194 Mon Sep 17 00:00:00 2001
From: Thomas Weißschuh <linux@weissschuh.net>
Date: Fri, 22 Mar 2024 18:05:57 +0100
Subject: sysctl: move sysctl type to ctl_table_header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the SYSCTL_TABLE_TYPE_{DEFAULT,PERMANENTLY_EMPTY} enums from
ctl_table to ctl_table_header.
Removing the mutable member is necessary to constify static instances
of struct ctl_table.

Move the initialization of the sysctl_mount_point type into
init_header() where all the other header fields are also initialized.

As a side-effect the memory usage of the sysctl core is reduced.
Each ctl_table_header instance can manage multiple ctl_table instances
and is only allocated when the table is actually registered.
This saves 8 bytes of memory per ctl_table on 64bit, 4 due to the enum
field itself and 4 due to padding.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
Signed-off-by: Joel Granados <j.granados@samsung.com>
---
 fs/proc/proc_sysctl.c  | 10 ++++++----
 include/linux/sysctl.h | 22 +++++++++++-----------
 2 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 3f8abb6d2ad5..a847b0bc63c4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,7 @@ static const struct inode_operations proc_sys_dir_operations;
 
 /* Support for permanently empty directories */
 static struct ctl_table sysctl_mount_point[] = {
-	{.type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY }
+	{ }
 };
 
 /**
@@ -49,11 +49,11 @@ struct ctl_table_header *register_sysctl_mount_point(const char *path)
 EXPORT_SYMBOL(register_sysctl_mount_point);
 
 #define sysctl_is_perm_empty_ctl_header(hptr)		\
-	(hptr->ctl_table[0].type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+	(hptr->type == SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
 #define sysctl_set_perm_empty_ctl_header(hptr)		\
-	(hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
+	(hptr->type = SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY)
 #define sysctl_clear_perm_empty_ctl_header(hptr)	\
-	(hptr->ctl_table[0].type = SYSCTL_TABLE_TYPE_DEFAULT)
+	(hptr->type = SYSCTL_TABLE_TYPE_DEFAULT)
 
 void proc_sys_poll_notify(struct ctl_table_poll *poll)
 {
@@ -208,6 +208,8 @@ static void init_header(struct ctl_table_header *head,
 			node++;
 		}
 	}
+	if (table == sysctl_mount_point)
+		sysctl_set_perm_empty_ctl_header(head);
 }
 
 static void erase_header(struct ctl_table_header *head)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index f9214de0490c..47bd28ffa88f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -137,17 +137,6 @@ struct ctl_table {
 	void *data;
 	int maxlen;
 	umode_t mode;
-	/**
-	 * enum type - Enumeration to differentiate between ctl target types
-	 * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations
-	 * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently
-	 *                                       empty directory target to serve
-	 *                                       as mount point.
-	 */
-	enum {
-		SYSCTL_TABLE_TYPE_DEFAULT,
-		SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY
-	} type;
 	proc_handler *proc_handler;	/* Callback for text formatting */
 	struct ctl_table_poll *poll;
 	void *extra1;
@@ -188,6 +177,17 @@ struct ctl_table_header {
 	struct ctl_dir *parent;
 	struct ctl_node *node;
 	struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */
+	/**
+	 * enum type - Enumeration to differentiate between ctl target types
+	 * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations
+	 * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently
+	 *                                       empty directory target to serve
+	 *                                       as mount point.
+	 */
+	enum {
+		SYSCTL_TABLE_TYPE_DEFAULT,
+		SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY,
+	} type;
 };
 
 struct ctl_dir {
-- 
cgit v1.2.3


From 75d659317bb136d849a10a30ffe7e0462c982d29 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 26 Mar 2024 10:16:12 +0100
Subject: cpufreq: Add a cpufreq pressure feedback for the scheduler

Provide to the scheduler a feedback about the temporary max available
capacity. Unlike arch_update_thermal_pressure(), this doesn't need to be
filtered as the pressure will happen for dozens of ms or more.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://lore.kernel.org/r/20240326091616.3696851-2-vincent.guittot@linaro.org
---
 drivers/cpufreq/cpufreq.c | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/cpufreq.h   | 10 ++++++++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 66e10a19d76a..1de8bd105934 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2582,6 +2582,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
+
+/**
+ * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
+ * @policy: cpufreq policy of the CPUs.
+ *
+ * Update the value of cpufreq pressure for all @cpus in the policy.
+ */
+static void cpufreq_update_pressure(struct cpufreq_policy *policy)
+{
+	unsigned long max_capacity, capped_freq, pressure;
+	u32 max_freq;
+	int cpu;
+
+	cpu = cpumask_first(policy->related_cpus);
+	max_freq = arch_scale_freq_ref(cpu);
+	capped_freq = policy->max;
+
+	/*
+	 * Handle properly the boost frequencies, which should simply clean
+	 * the cpufreq pressure value.
+	 */
+	if (max_freq <= capped_freq) {
+		pressure = 0;
+	} else {
+		max_capacity = arch_scale_cpu_capacity(cpu);
+		pressure = max_capacity -
+			   mult_frac(max_capacity, capped_freq, max_freq);
+	}
+
+	for_each_cpu(cpu, policy->related_cpus)
+		WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
+}
+
 /**
  * cpufreq_set_policy - Modify cpufreq policy parameters.
  * @policy: Policy object to modify.
@@ -2637,6 +2671,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 	policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
 	trace_cpu_frequency_limits(policy);
 
+	cpufreq_update_pressure(policy);
+
 	policy->cached_target_freq = UINT_MAX;
 
 	pr_debug("new min and max freqs are %u - %u kHz\n",
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 9956afb9acc2..20f7e98ee8af 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
 void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
 bool has_target_index(void);
+
+DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+	return READ_ONCE(per_cpu(cpufreq_pressure, cpu));
+}
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
@@ -264,6 +270,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
 }
 static inline void disable_cpufreq(void) { }
 static inline void cpufreq_update_limits(unsigned int cpu) { }
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+	return 0;
+}
 #endif
 
 #ifdef CONFIG_CPU_FREQ_STAT
-- 
cgit v1.2.3


From d4dbc991714eefcbd8d54a3204bd77a0a52bd32d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 26 Mar 2024 10:16:15 +0100
Subject: sched/cpufreq: Rename arch_update_thermal_pressure() =>
 arch_update_hw_pressure()

Now that cpufreq provides a pressure value to the scheduler, rename
arch_update_thermal_pressure into HW pressure to reflect that it returns
a pressure applied by HW (i.e. with a high frequency change) and not
always related to thermal mitigation but also generated by max current
limitation as an example. Such high frequency signal needs filtering to be
smoothed and provide an value that reflects the average available capacity
into the scheduler time scale.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Reviewed-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lore.kernel.org/r/20240326091616.3696851-5-vincent.guittot@linaro.org
---
 arch/arm/include/asm/topology.h         |  6 +++---
 arch/arm64/include/asm/topology.h       |  6 +++---
 drivers/base/arch_topology.c            | 26 +++++++++++++-------------
 drivers/cpufreq/qcom-cpufreq-hw.c       |  4 ++--
 include/linux/arch_topology.h           |  8 ++++----
 include/linux/sched/topology.h          |  8 ++++----
 include/trace/events/hw_pressure.h      | 29 +++++++++++++++++++++++++++++
 include/trace/events/sched.h            |  2 +-
 include/trace/events/thermal_pressure.h | 29 -----------------------------
 init/Kconfig                            | 12 ++++++------
 kernel/sched/core.c                     |  8 ++++----
 kernel/sched/fair.c                     | 16 ++++++++--------
 kernel/sched/pelt.c                     | 18 +++++++++---------
 kernel/sched/pelt.h                     | 16 ++++++++--------
 kernel/sched/sched.h                    | 10 +++++-----
 15 files changed, 99 insertions(+), 99 deletions(-)
 create mode 100644 include/trace/events/hw_pressure.h
 delete mode 100644 include/trace/events/thermal_pressure.h

(limited to 'include')

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 853c4f81ba4a..ad36b6570067 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -22,9 +22,9 @@
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure	topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressure	topology_update_hw_pressure
 
 #else
 
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index a323b109b9c4..0f6ef432fb84 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure	topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressure	topology_update_hw_pressure
 
 #include <asm-generic/topology.h>
 
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 024b78a0cfc1..0248912ff687 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,7 +22,7 @@
 #include <linux/units.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/thermal_pressure.h>
+#include <trace/events/hw_pressure.h>
 
 static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
 static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 	per_cpu(cpu_scale, cpu) = capacity;
 }
 
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+DEFINE_PER_CPU(unsigned long, hw_pressure);
 
 /**
- * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * topology_update_hw_pressure() - Update HW pressure for CPUs
  * @cpus        : The related CPUs for which capacity has been reduced
  * @capped_freq : The maximum allowed frequency that CPUs can run at
  *
- * Update the value of thermal pressure for all @cpus in the mask. The
+ * Update the value of HW pressure for all @cpus in the mask. The
  * cpumask should include all (online+offline) affected CPUs, to avoid
  * operating on stale data when hot-plug is used for some CPUs. The
  * @capped_freq reflects the currently allowed max CPUs frequency due to
- * thermal capping. It might be also a boost frequency value, which is bigger
+ * HW capping. It might be also a boost frequency value, which is bigger
  * than the internal 'capacity_freq_ref' max frequency. In such case the
  * pressure value should simply be removed, since this is an indication that
- * there is no thermal throttling. The @capped_freq must be provided in kHz.
+ * there is no HW throttling. The @capped_freq must be provided in kHz.
  */
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
 				      unsigned long capped_freq)
 {
-	unsigned long max_capacity, capacity, th_pressure;
+	unsigned long max_capacity, capacity, hw_pressure;
 	u32 max_freq;
 	int cpu;
 
@@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
 
 	/*
 	 * Handle properly the boost frequencies, which should simply clean
-	 * the thermal pressure value.
+	 * the HW pressure value.
 	 */
 	if (max_freq <= capped_freq)
 		capacity = max_capacity;
 	else
 		capacity = mult_frac(max_capacity, capped_freq, max_freq);
 
-	th_pressure = max_capacity - capacity;
+	hw_pressure = max_capacity - capacity;
 
-	trace_thermal_pressure_update(cpu, th_pressure);
+	trace_hw_pressure_update(cpu, hw_pressure);
 
 	for_each_cpu(cpu, cpus)
-		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+		WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
 }
-EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
+EXPORT_SYMBOL_GPL(topology_update_hw_pressure);
 
 static ssize_t cpu_capacity_show(struct device *dev,
 				 struct device_attribute *attr,
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 70b0f21968a0..ec8df5496a0c 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
 
 	throttled_freq = freq_hz / HZ_PER_KHZ;
 
-	/* Update thermal pressure (the boost frequencies are accepted) */
-	arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
+	/* Update HW pressure (the boost frequencies are accepted) */
+	arch_update_hw_pressure(policy->related_cpus, throttled_freq);
 
 	/*
 	 * In the unlikely case policy is unregistered do not enable
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index a63d61ca55af..b721f360d759 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -60,14 +60,14 @@ void topology_scale_freq_tick(void);
 void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
 void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
 
-DECLARE_PER_CPU(unsigned long, thermal_pressure);
+DECLARE_PER_CPU(unsigned long, hw_pressure);
 
-static inline unsigned long topology_get_thermal_pressure(int cpu)
+static inline unsigned long topology_get_hw_pressure(int cpu)
 {
-	return per_cpu(thermal_pressure, cpu);
+	return per_cpu(hw_pressure, cpu);
 }
 
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
 				      unsigned long capped_freq);
 
 struct cpu_topology {
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c8fe9bab981b..4237daa5ac7a 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -270,17 +270,17 @@ unsigned long arch_scale_cpu_capacity(int cpu)
 }
 #endif
 
-#ifndef arch_scale_thermal_pressure
+#ifndef arch_scale_hw_pressure
 static __always_inline
-unsigned long arch_scale_thermal_pressure(int cpu)
+unsigned long arch_scale_hw_pressure(int cpu)
 {
 	return 0;
 }
 #endif
 
-#ifndef arch_update_thermal_pressure
+#ifndef arch_update_hw_pressure
 static __always_inline
-void arch_update_thermal_pressure(const struct cpumask *cpus,
+void arch_update_hw_pressure(const struct cpumask *cpus,
 				  unsigned long capped_frequency)
 { }
 #endif
diff --git a/include/trace/events/hw_pressure.h b/include/trace/events/hw_pressure.h
new file mode 100644
index 000000000000..b9cd68854128
--- /dev/null
+++ b/include/trace/events/hw_pressure.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hw_pressure
+
+#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_THERMAL_PRESSURE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(hw_pressure_update,
+	TP_PROTO(int cpu, unsigned long hw_pressure),
+	TP_ARGS(cpu, hw_pressure),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, hw_pressure)
+		__field(int, cpu)
+	),
+
+	TP_fast_assign(
+		__entry->hw_pressure = hw_pressure;
+		__entry->cpu = cpu;
+	),
+
+	TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure)
+);
+#endif /* _TRACE_THERMAL_PRESSURE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dbb01b4b7451..d115d64c4011 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp,
 	TP_PROTO(struct rq *rq),
 	TP_ARGS(rq));
 
-DECLARE_TRACE(pelt_thermal_tp,
+DECLARE_TRACE(pelt_hw_tp,
 	TP_PROTO(struct rq *rq),
 	TP_ARGS(rq));
 
diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/thermal_pressure.h
deleted file mode 100644
index b68680201360..000000000000
--- a/include/trace/events/thermal_pressure.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM thermal_pressure
-
-#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_THERMAL_PRESSURE_H
-
-#include <linux/tracepoint.h>
-
-TRACE_EVENT(thermal_pressure_update,
-	TP_PROTO(int cpu, unsigned long thermal_pressure),
-	TP_ARGS(cpu, thermal_pressure),
-
-	TP_STRUCT__entry(
-		__field(unsigned long, thermal_pressure)
-		__field(int, cpu)
-	),
-
-	TP_fast_assign(
-		__entry->thermal_pressure = thermal_pressure;
-		__entry->cpu = cpu;
-	),
-
-	TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure)
-);
-#endif /* _TRACE_THERMAL_PRESSURE_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/init/Kconfig b/init/Kconfig
index aa02aec6aa7d..f0c9117962ec 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -547,24 +547,24 @@ config HAVE_SCHED_AVG_IRQ
 	depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
 	depends on SMP
 
-config SCHED_THERMAL_PRESSURE
+config SCHED_HW_PRESSURE
 	bool
 	default y if ARM && ARM_CPU_TOPOLOGY
 	default y if ARM64
 	depends on SMP
 	depends on CPU_FREQ_THERMAL
 	help
-	  Select this option to enable thermal pressure accounting in the
-	  scheduler. Thermal pressure is the value conveyed to the scheduler
+	  Select this option to enable HW pressure accounting in the
+	  scheduler. HW pressure is the value conveyed to the scheduler
 	  that reflects the reduction in CPU compute capacity resulted from
-	  thermal throttling. Thermal throttling occurs when the performance of
-	  a CPU is capped due to high operating temperatures.
+	  HW throttling. HW throttling occurs when the performance of
+	  a CPU is capped due to high operating temperatures as an example.
 
 	  If selected, the scheduler will be able to balance tasks accordingly,
 	  i.e. put less load on throttled CPUs than on non/less throttled ones.
 
 	  This requires the architecture to implement
-	  arch_update_thermal_pressure() and arch_scale_thermal_pressure().
+	  arch_update_hw_pressure() and arch_scale_thermal_pressure().
 
 config BSD_PROCESS_ACCT
 	bool "BSD Process Accounting"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0621e4ee31de..67a8302c3131 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -108,7 +108,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
-EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -5668,7 +5668,7 @@ void sched_tick(void)
 	struct rq *rq = cpu_rq(cpu);
 	struct task_struct *curr = rq->curr;
 	struct rq_flags rf;
-	unsigned long thermal_pressure;
+	unsigned long hw_pressure;
 	u64 resched_latency;
 
 	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@ -5679,8 +5679,8 @@ void sched_tick(void)
 	rq_lock(rq, &rf);
 
 	update_rq_clock(rq);
-	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
-	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+	update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
 	curr->sched_class->task_tick(rq, curr, 0);
 	if (sched_feat(LATENCY_WARN))
 		resched_latency = cpu_resched_latency(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 19199c119829..eef39ae3efcf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,7 +78,7 @@ static unsigned int normalized_sysctl_sched_base_slice	= 750000ULL;
 
 const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;
 
-int sched_thermal_decay_shift;
+int sched_hw_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
 	int _shift = 0;
@@ -86,7 +86,7 @@ static int __init setup_sched_thermal_decay_shift(char *str)
 	if (kstrtoint(str, 0, &_shift))
 		pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
 
-	sched_thermal_decay_shift = clamp(_shift, 0, 10);
+	sched_hw_decay_shift = clamp(_shift, 0, 10);
 	return 1;
 }
 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -4969,7 +4969,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu)
 {
 	unsigned long capacity = arch_scale_cpu_capacity(cpu);
 
-	capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+	capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
 
 	return capacity;
 }
@@ -5002,7 +5002,7 @@ static inline int util_fits_cpu(unsigned long util,
 	 * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
 	 * should fit a little cpu even if there's some pressure.
 	 *
-	 * Only exception is for thermal pressure since it has a direct impact
+	 * Only exception is for HW or cpufreq pressure since it has a direct impact
 	 * on available OPP of the system.
 	 *
 	 * We honour it for uclamp_min only as a drop in performance level
@@ -9324,7 +9324,7 @@ static inline bool others_have_blocked(struct rq *rq)
 	if (cpu_util_dl(rq))
 		return true;
 
-	if (thermal_load_avg(rq))
+	if (hw_load_avg(rq))
 		return true;
 
 	if (cpu_util_irq(rq))
@@ -9354,7 +9354,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
 {
 	const struct sched_class *curr_class;
 	u64 now = rq_clock_pelt(rq);
-	unsigned long thermal_pressure;
+	unsigned long hw_pressure;
 	bool decayed;
 
 	/*
@@ -9363,11 +9363,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
 	 */
 	curr_class = rq->curr->sched_class;
 
-	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+	hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
 
 	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
 		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
-		  update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+		  update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
 		  update_irq_load_avg(rq, 0);
 
 	if (others_have_blocked(rq))
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 3a96da25b67c..ef00382de595 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 	return 0;
 }
 
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#ifdef CONFIG_SCHED_HW_PRESSURE
 /*
- * thermal:
+ * hardware:
  *
  *   load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
  *
  *   util_avg and runnable_load_avg are not supported and meaningless.
  *
  * Unlike rt/dl utilization tracking that track time spent by a cpu
- * running a rt/dl task through util_avg, the average thermal pressure is
- * tracked through load_avg. This is because thermal pressure signal is
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
  * time weighted "delta" capacity unlike util_avg which is binary.
  * "delta capacity" =  actual capacity  -
- *			capped capacity a cpu due to a thermal event.
+ *			capped capacity a cpu due to a HW event.
  */
 
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
 {
-	if (___update_load_sum(now, &rq->avg_thermal,
+	if (___update_load_sum(now, &rq->avg_hw,
 			       capacity,
 			       capacity,
 			       capacity)) {
-		___update_load_avg(&rq->avg_thermal, 1);
-		trace_pelt_thermal_tp(rq);
+		___update_load_avg(&rq->avg_hw, 1);
+		trace_pelt_hw_tp(rq);
 		return 1;
 	}
 
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9e1083465fbc..2150062949d4 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
 int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
 int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
 
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
 
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
 {
-	return READ_ONCE(rq->avg_thermal.load_avg);
+	return READ_ONCE(rq->avg_hw.load_avg);
 }
 #else
 static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
 {
 	return 0;
 }
 
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
 {
 	return 0;
 }
@@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
 }
 
 static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
 {
 	return 0;
 }
 
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
 {
 	return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7c39dbf31f75..993edb02fb0d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1108,8 +1108,8 @@ struct rq {
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 	struct sched_avg	avg_irq;
 #endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-	struct sched_avg	avg_thermal;
+#ifdef CONFIG_SCHED_HW_PRESSURE
+	struct sched_avg	avg_hw;
 #endif
 	u64			idle_stamp;
 	u64			avg_idle;
@@ -1561,11 +1561,11 @@ static inline u64 rq_clock_task(struct rq *rq)
  *	3			256
  *	4			512
  */
-extern int sched_thermal_decay_shift;
+extern int sched_hw_decay_shift;
 
-static inline u64 rq_clock_thermal(struct rq *rq)
+static inline u64 rq_clock_hw(struct rq *rq)
 {
-	return rq_clock_task(rq) >> sched_thermal_decay_shift;
+	return rq_clock_task(rq) >> sched_hw_decay_shift;
 }
 
 static inline void rq_clock_skip_update(struct rq *rq)
-- 
cgit v1.2.3


From c661050f93d3fd37a33c06041bb18a89688de7d2 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Mon, 22 Apr 2024 05:38:56 -0700
Subject: net: create a dummy net_device allocator

It is impossible to use init_dummy_netdev together with alloc_netdev()
as the 'setup' argument.

This is because alloc_netdev() initializes some fields in the net_device
structure, and later init_dummy_netdev() memzero them all. This causes
some problems as reported here:

	https://lore.kernel.org/all/20240322082336.49f110cc@kernel.org/

Split the init_dummy_netdev() function in two. Create a new function called
init_dummy_netdev_core() that does not memzero the net_device structure.
Then have init_dummy_netdev() memzero-ing and calling
init_dummy_netdev_core(), keeping the old behaviour.

init_dummy_netdev_core() is the new function that could be called as an
argument for alloc_netdev().

Also, create a helper to allocate and initialize dummy net devices,
leveraging init_dummy_netdev_core() as the setup argument. This function
basically simplify the allocation of dummy devices, by allocating and
initializing it. Freeing the device continue to be done through
free_netdev()

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  3 +++
 net/core/dev.c            | 56 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 41 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d45f330d083d..f849e7d110ed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4519,6 +4519,9 @@ static inline void netif_addr_unlock_bh(struct net_device *dev)
 
 void ether_setup(struct net_device *dev);
 
+/* Allocate dummy net_device */
+struct net_device *alloc_netdev_dummy(int sizeof_priv);
+
 /* Support for loadable net-drivers */
 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 				    unsigned char name_assign_type,
diff --git a/net/core/dev.c b/net/core/dev.c
index 62b39d6b1d8f..e09aa3785c15 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -10420,25 +10420,12 @@ err_free_name:
 }
 EXPORT_SYMBOL(register_netdevice);
 
-/**
- *	init_dummy_netdev	- init a dummy network device for NAPI
- *	@dev: device to init
- *
- *	This takes a network device structure and initializes the minimum
- *	amount of fields so it can be used to schedule NAPI polls without
- *	registering a full blown interface. This is to be used by drivers
- *	that need to tie several hardware interfaces to a single NAPI
- *	poll scheduler due to HW limitations.
+/* Initialize the core of a dummy net device.
+ * This is useful if you are calling this function after alloc_netdev(),
+ * since it does not memset the net_device fields.
  */
-void init_dummy_netdev(struct net_device *dev)
+static void init_dummy_netdev_core(struct net_device *dev)
 {
-	/* Clear everything. Note we don't initialize spinlocks
-	 * as they aren't supposed to be taken by any of the
-	 * NAPI code and this dummy netdev is supposed to be
-	 * only ever used for NAPI polls
-	 */
-	memset(dev, 0, sizeof(struct net_device));
-
 	/* make sure we BUG if trying to hit standard
 	 * register/unregister code path
 	 */
@@ -10459,8 +10446,28 @@ void init_dummy_netdev(struct net_device *dev)
 	 * its refcount.
 	 */
 }
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
 
+/**
+ *	init_dummy_netdev	- init a dummy network device for NAPI
+ *	@dev: device to init
+ *
+ *	This takes a network device structure and initializes the minimum
+ *	amount of fields so it can be used to schedule NAPI polls without
+ *	registering a full blown interface. This is to be used by drivers
+ *	that need to tie several hardware interfaces to a single NAPI
+ *	poll scheduler due to HW limitations.
+ */
+void init_dummy_netdev(struct net_device *dev)
+{
+	/* Clear everything. Note we don't initialize spinlocks
+	 * as they aren't supposed to be taken by any of the
+	 * NAPI code and this dummy netdev is supposed to be
+	 * only ever used for NAPI polls
+	 */
+	memset(dev, 0, sizeof(struct net_device));
+	init_dummy_netdev_core(dev);
+}
+EXPORT_SYMBOL_GPL(init_dummy_netdev);
 
 /**
  *	register_netdev	- register a network device
@@ -11080,6 +11087,19 @@ void free_netdev(struct net_device *dev)
 }
 EXPORT_SYMBOL(free_netdev);
 
+/**
+ * alloc_netdev_dummy - Allocate and initialize a dummy net device.
+ * @sizeof_priv: size of private data to allocate space for
+ *
+ * Return: the allocated net_device on success, NULL otherwise
+ */
+struct net_device *alloc_netdev_dummy(int sizeof_priv)
+{
+	return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
+			    init_dummy_netdev_core);
+}
+EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
+
 /**
  *	synchronize_net -  Synchronize with packet receive processing
  *
-- 
cgit v1.2.3


From e8293395b9caa57e38cc05b9102d4ec7835b0a0f Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Wed, 24 Apr 2024 10:24:20 +0800
Subject: coresight: Remove duplicate linux/amba/bus.h header

./include/linux/coresight.h: linux/amba/bus.h is included more than once.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8869
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Reviewed-by: James Clark <james.clark@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Link: https://lore.kernel.org/r/20240424022420.58516-1-jiapeng.chong@linux.alibaba.com
---
 include/linux/coresight.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/coresight.h b/include/linux/coresight.h
index 653f1712eb77..f09ace92176e 100644
--- a/include/linux/coresight.h
+++ b/include/linux/coresight.h
@@ -12,7 +12,6 @@
 #include <linux/io.h>
 #include <linux/perf_event.h>
 #include <linux/sched.h>
-#include <linux/amba/bus.h>
 #include <linux/platform_device.h>
 
 /* Peripheral id registers (0xFD0-0xFEC) */
-- 
cgit v1.2.3


From 0efc5990bca540b8d438fda23db3a72efa733eb0 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 9 Apr 2024 19:31:50 -0700
Subject: string.h: Introduce memtostr() and memtostr_pad()

Another ambiguous use of strncpy() is to copy from strings that may not
be NUL-terminated. These cases depend on having the destination buffer
be explicitly larger than the source buffer's maximum size, having
the size of the copy exactly match the source buffer's maximum size,
and for the destination buffer to get explicitly NUL terminated.

This usually happens when parsing protocols or hardware character arrays
that are not guaranteed to be NUL-terminated. The code pattern is
effectively this:

	char dest[sizeof(src) + 1];

	strncpy(dest, src, sizeof(src));
	dest[sizeof(dest) - 1] = '\0';

In practice it usually looks like:

struct from_hardware {
	...
	char name[HW_NAME_SIZE] __nonstring;
	...
};

	struct from_hardware *p = ...;
	char name[HW_NAME_SIZE + 1];

	strncpy(name, p->name, HW_NAME_SIZE);
	name[NW_NAME_SIZE] = '\0';

This cannot be replaced with:

	strscpy(name, p->name, sizeof(name));

because p->name is smaller and not NUL-terminated, so FORTIFY will
trigger when strnlen(p->name, sizeof(name)) is used. And it cannot be
replaced with:

	strscpy(name, p->name, sizeof(p->name));

because then "name" may contain a 1 character early truncation of
p->name.

Provide an unambiguous interface for converting a maybe not-NUL-terminated
string to a NUL-terminated string, with compile-time buffer size checking
so that it can never fail at runtime: memtostr() and memtostr_pad(). Also
add KUnit tests for both.

Link: https://lore.kernel.org/r/20240410023155.2100422-1-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/string.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/string_kunit.c     | 26 ++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 9ba8b4597009..86aa6cd35167 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -422,6 +422,55 @@ void memcpy_and_pad(void *dest, size_t dest_len, const void *src, size_t count,
 	memcpy(dest, src, strnlen(src, min(_src_len, _dest_len)));	\
 } while (0)
 
+/**
+ * memtostr - Copy a possibly non-NUL-term string to a NUL-term string
+ * @dest: Pointer to destination NUL-terminates string
+ * @src: Pointer to character array (likely marked as __nonstring)
+ *
+ * This is a replacement for strncpy() uses where the source is not
+ * a NUL-terminated string.
+ *
+ * Note that sizes of @dest and @src must be known at compile-time.
+ */
+#define memtostr(dest, src)	do {					\
+	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
+	const size_t _src_chars = strnlen(src, _src_len);		\
+	const size_t _copy_len = min(_dest_len - 1, _src_chars);	\
+									\
+	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
+		     !__builtin_constant_p(_src_len) ||			\
+		     _dest_len == 0 || _dest_len == (size_t)-1 ||	\
+		     _src_len == 0 || _src_len == (size_t)-1);		\
+	memcpy(dest, src, _copy_len);					\
+	dest[_copy_len] = '\0';						\
+} while (0)
+
+/**
+ * memtostr_pad - Copy a possibly non-NUL-term string to a NUL-term string
+ *                with NUL padding in the destination
+ * @dest: Pointer to destination NUL-terminates string
+ * @src: Pointer to character array (likely marked as __nonstring)
+ *
+ * This is a replacement for strncpy() uses where the source is not
+ * a NUL-terminated string.
+ *
+ * Note that sizes of @dest and @src must be known at compile-time.
+ */
+#define memtostr_pad(dest, src)		do {				\
+	const size_t _dest_len = __builtin_object_size(dest, 1);	\
+	const size_t _src_len = __builtin_object_size(src, 1);		\
+	const size_t _src_chars = strnlen(src, _src_len);		\
+	const size_t _copy_len = min(_dest_len - 1, _src_chars);	\
+									\
+	BUILD_BUG_ON(!__builtin_constant_p(_dest_len) ||		\
+		     !__builtin_constant_p(_src_len) ||			\
+		     _dest_len == 0 || _dest_len == (size_t)-1 ||	\
+		     _src_len == 0 || _src_len == (size_t)-1);		\
+	memcpy(dest, src, _copy_len);					\
+	memset(&dest[_copy_len], 0, _dest_len - _copy_len);		\
+} while (0)
+
 /**
  * memset_after - Set a value after a struct member to the end of a struct
  *
diff --git a/lib/string_kunit.c b/lib/string_kunit.c
index de4eae91403f..fadad46c1b05 100644
--- a/lib/string_kunit.c
+++ b/lib/string_kunit.c
@@ -524,6 +524,31 @@ static void string_test_strlcat(struct kunit *test)
 	KUNIT_EXPECT_STREQ(test, dest, "fourABE");
 }
 
+static void string_test_memtostr(struct kunit *test)
+{
+	char nonstring[7] = { 'a', 'b', 'c', 'd', 'e', 'f', 'g' };
+	char nonstring_small[3] = { 'a', 'b', 'c' };
+	char dest[sizeof(nonstring) + 1];
+
+	/* Copy in a non-NUL-terminated string into exactly right-sized dest. */
+	KUNIT_EXPECT_EQ(test, sizeof(dest), sizeof(nonstring) + 1);
+	memset(dest, 'X', sizeof(dest));
+	memtostr(dest, nonstring);
+	KUNIT_EXPECT_STREQ(test, dest, "abcdefg");
+	memset(dest, 'X', sizeof(dest));
+	memtostr(dest, nonstring_small);
+	KUNIT_EXPECT_STREQ(test, dest, "abc");
+	KUNIT_EXPECT_EQ(test, dest[7], 'X');
+
+	memset(dest, 'X', sizeof(dest));
+	memtostr_pad(dest, nonstring);
+	KUNIT_EXPECT_STREQ(test, dest, "abcdefg");
+	memset(dest, 'X', sizeof(dest));
+	memtostr_pad(dest, nonstring_small);
+	KUNIT_EXPECT_STREQ(test, dest, "abc");
+	KUNIT_EXPECT_EQ(test, dest[7], '\0');
+}
+
 static struct kunit_case string_test_cases[] = {
 	KUNIT_CASE(string_test_memset16),
 	KUNIT_CASE(string_test_memset32),
@@ -543,6 +568,7 @@ static struct kunit_case string_test_cases[] = {
 	KUNIT_CASE(string_test_strcat),
 	KUNIT_CASE(string_test_strncat),
 	KUNIT_CASE(string_test_strlcat),
+	KUNIT_CASE(string_test_memtostr),
 	{}
 };
 
-- 
cgit v1.2.3


From fc7566ad0a826cdc8886c5dbbb39ce72a0dc6333 Mon Sep 17 00:00:00 2001
From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Date: Wed, 24 Apr 2024 03:13:14 +0000
Subject: bpf: Introduce bpf_preempt_[disable,enable] kfuncs

Introduce two new BPF kfuncs, bpf_preempt_disable and
bpf_preempt_enable. These kfuncs allow disabling preemption in BPF
programs. Nesting is allowed, since the intended use cases includes
building native BPF spin locks without kernel helper involvement. Apart
from that, this can be used to per-CPU data structures for cases where
programs (or userspace) may preempt one or the other. Currently, while
per-CPU access is stable, whether it will be consistent is not
guaranteed, as only migration is disabled for BPF programs.

Global functions are disallowed from being called, but support for them
will be added as a follow up not just preempt kfuncs, but rcu_read_lock
kfuncs as well. Static subprog calls are permitted. Sleepable helpers
and kfuncs are disallowed in non-preemptible regions.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Link: https://lore.kernel.org/r/20240424031315.2757363-2-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/helpers.c         | 12 ++++++++
 kernel/bpf/verifier.c        | 71 ++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 82 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 9db35530c878..50aa87f8d77f 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -421,6 +421,7 @@ struct bpf_verifier_state {
 	struct bpf_active_lock active_lock;
 	bool speculative;
 	bool active_rcu_lock;
+	u32 active_preempt_lock;
 	/* If this state was ever pointed-to by other state's loop_entry field
 	 * this flag would be set to true. Used to avoid freeing such states
 	 * while they are still in use.
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5f61ffa7505a..14c401b78e12 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2734,6 +2734,16 @@ __bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
 	return __bpf_async_set_callback(async, callback_fn, aux, flags, BPF_ASYNC_TYPE_WQ);
 }
 
+__bpf_kfunc void bpf_preempt_disable(void)
+{
+	preempt_disable();
+}
+
+__bpf_kfunc void bpf_preempt_enable(void)
+{
+	preempt_enable();
+}
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -2814,6 +2824,8 @@ BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
 BTF_ID_FLAGS(func, bpf_wq_init)
 BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
 BTF_ID_FLAGS(func, bpf_wq_start)
+BTF_ID_FLAGS(func, bpf_preempt_disable)
+BTF_ID_FLAGS(func, bpf_preempt_enable)
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9715c88cc025..8312c7a0ee19 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1434,6 +1434,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	}
 	dst_state->speculative = src->speculative;
 	dst_state->active_rcu_lock = src->active_rcu_lock;
+	dst_state->active_preempt_lock = src->active_preempt_lock;
 	dst_state->in_sleepable = src->in_sleepable;
 	dst_state->curframe = src->curframe;
 	dst_state->active_lock.ptr = src->active_lock.ptr;
@@ -9599,6 +9600,13 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			return -EINVAL;
 		}
 
+		/* Only global subprogs cannot be called with preemption disabled. */
+		if (env->cur_state->active_preempt_lock) {
+			verbose(env, "global function calls are not allowed with preemption disabled,\n"
+				     "use static function instead\n");
+			return -EINVAL;
+		}
+
 		if (err) {
 			verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
 				subprog, sub_name);
@@ -10285,6 +10293,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
 			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
 	}
 
+	if (env->cur_state->active_preempt_lock) {
+		if (fn->might_sleep) {
+			verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
+				func_id_name(func_id), func_id);
+			return -EINVAL;
+		}
+
+		if (in_sleepable(env) && is_storage_get_function(func_id))
+			env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+	}
+
 	meta.func_id = func_id;
 	/* check args */
 	for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -11027,6 +11046,8 @@ enum special_kfunc_type {
 	KF_bpf_percpu_obj_drop_impl,
 	KF_bpf_throw,
 	KF_bpf_wq_set_callback_impl,
+	KF_bpf_preempt_disable,
+	KF_bpf_preempt_enable,
 	KF_bpf_iter_css_task_new,
 };
 
@@ -11081,6 +11102,8 @@ BTF_ID(func, bpf_percpu_obj_new_impl)
 BTF_ID(func, bpf_percpu_obj_drop_impl)
 BTF_ID(func, bpf_throw)
 BTF_ID(func, bpf_wq_set_callback_impl)
+BTF_ID(func, bpf_preempt_disable)
+BTF_ID(func, bpf_preempt_enable)
 #ifdef CONFIG_CGROUPS
 BTF_ID(func, bpf_iter_css_task_new)
 #else
@@ -11107,6 +11130,16 @@ static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
 	return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
 }
 
+static bool is_kfunc_bpf_preempt_disable(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->func_id == special_kfunc_list[KF_bpf_preempt_disable];
+}
+
+static bool is_kfunc_bpf_preempt_enable(struct bpf_kfunc_call_arg_meta *meta)
+{
+	return meta->func_id == special_kfunc_list[KF_bpf_preempt_enable];
+}
+
 static enum kfunc_ptr_arg_type
 get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
 		       struct bpf_kfunc_call_arg_meta *meta,
@@ -12195,11 +12228,11 @@ static int check_return_code(struct bpf_verifier_env *env, int regno, const char
 static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    int *insn_idx_p)
 {
-	const struct btf_type *t, *ptr_type;
+	bool sleepable, rcu_lock, rcu_unlock, preempt_disable, preempt_enable;
 	u32 i, nargs, ptr_type_id, release_ref_obj_id;
 	struct bpf_reg_state *regs = cur_regs(env);
 	const char *func_name, *ptr_type_name;
-	bool sleepable, rcu_lock, rcu_unlock;
+	const struct btf_type *t, *ptr_type;
 	struct bpf_kfunc_call_arg_meta meta;
 	struct bpf_insn_aux_data *insn_aux;
 	int err, insn_idx = *insn_idx_p;
@@ -12260,6 +12293,9 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 	rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
 	rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
 
+	preempt_disable = is_kfunc_bpf_preempt_disable(&meta);
+	preempt_enable = is_kfunc_bpf_preempt_enable(&meta);
+
 	if (env->cur_state->active_rcu_lock) {
 		struct bpf_func_state *state;
 		struct bpf_reg_state *reg;
@@ -12292,6 +12328,22 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 		return -EINVAL;
 	}
 
+	if (env->cur_state->active_preempt_lock) {
+		if (preempt_disable) {
+			env->cur_state->active_preempt_lock++;
+		} else if (preempt_enable) {
+			env->cur_state->active_preempt_lock--;
+		} else if (sleepable) {
+			verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
+			return -EACCES;
+		}
+	} else if (preempt_disable) {
+		env->cur_state->active_preempt_lock++;
+	} else if (preempt_enable) {
+		verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
+		return -EINVAL;
+	}
+
 	/* In case of release function, we get register number of refcounted
 	 * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
 	 */
@@ -15439,6 +15491,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return -EINVAL;
 	}
 
+	if (env->cur_state->active_preempt_lock) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_preempt_disable-ed region\n");
+		return -EINVAL;
+	}
+
 	if (regs[ctx_reg].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -17006,6 +17063,9 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->active_rcu_lock != cur->active_rcu_lock)
 		return false;
 
+	if (old->active_preempt_lock != cur->active_preempt_lock)
+		return false;
+
 	if (old->in_sleepable != cur->in_sleepable)
 		return false;
 
@@ -17957,6 +18017,13 @@ process_bpf_exit_full:
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_preempt_lock && !env->cur_state->curframe) {
+					verbose(env, "%d bpf_preempt_enable%s missing\n",
+						env->cur_state->active_preempt_lock,
+						env->cur_state->active_preempt_lock == 1 ? " is" : "(s) are");
+					return -EINVAL;
+				}
+
 				/* We must do check_reference_leak here before
 				 * prepare_func_exit to handle the case when
 				 * state->curframe > 0, it may be a callback
-- 
cgit v1.2.3


From 306ec721d043bbe5e818d59fbb37c28d999b5d8b Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 18 Apr 2024 13:36:07 +0200
Subject: net: intel: introduce {, Intel} Ethernet common library

Not a secret there's a ton of code duplication between two and more Intel
ethernet modules.

Before introducing new changes, which would need to be copied over again,
start decoupling the already existing duplicate functionality into a new
module, which will be shared between several Intel Ethernet drivers.
Add the lookup table which converts 8/10-bit hardware packet type into
a parsed bitfield structure for easy checking packet format parameters,
such as payload level, IP version, etc. This is currently used by i40e,
ice and iavf and it's all the same in all three drivers.
The only difference introduced in this implementation is that instead of
defining a 256 (or 1024 in case of ice) element array, add unlikely()
condition to limit the input to 154 (current maximum non-reserved packet
type). There's no reason to waste 600 (or even 3600) bytes only to not
hurt very unlikely exception packets.
The hash computation function now takes payload level directly as a
pkt_hash_type. There's a couple cases when non-IP ptypes are marked as
L3 payload and in the previous versions their hash level would be 2, not
3. But skb_set_hash() only sees difference between L4 and non-L4, thus
this won't change anything at all.
The module is behind the hidden Kconfig symbol, which the drivers will
select when needed. The exports are behind 'LIBIE' namespace to limit
the scope of the functions.

Not that non-HW-specific symbols will live in yet another module,
libeth. This is done to easily distinguish pretty generic code ready
for reusing by any other vendor and/or for moving the layer up from
the code useful in Intel's 1-100G drivers only.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/Kconfig               |   7 +
 drivers/net/ethernet/intel/Makefile              |   3 +
 drivers/net/ethernet/intel/i40e/i40e_common.c    | 253 ------------------
 drivers/net/ethernet/intel/i40e/i40e_main.c      |   1 +
 drivers/net/ethernet/intel/i40e/i40e_prototype.h |   7 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c      |  72 ++---
 drivers/net/ethernet/intel/i40e/i40e_type.h      |  88 -------
 drivers/net/ethernet/intel/iavf/iavf_common.c    | 253 ------------------
 drivers/net/ethernet/intel/iavf/iavf_main.c      |   1 +
 drivers/net/ethernet/intel/iavf/iavf_prototype.h |   7 -
 drivers/net/ethernet/intel/iavf/iavf_txrx.c      |  70 ++---
 drivers/net/ethernet/intel/iavf/iavf_type.h      |  88 -------
 drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h   | 320 -----------------------
 drivers/net/ethernet/intel/ice/ice_main.c        |   1 +
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c    | 111 ++------
 drivers/net/ethernet/intel/libeth/Kconfig        |   8 +
 drivers/net/ethernet/intel/libeth/Makefile       |   6 +
 drivers/net/ethernet/intel/libeth/rx.c           |  52 ++++
 drivers/net/ethernet/intel/libie/Kconfig         |  10 +
 drivers/net/ethernet/intel/libie/Makefile        |   6 +
 drivers/net/ethernet/intel/libie/rx.c            | 124 +++++++++
 include/linux/net/intel/libie/rx.h               |  33 +++
 include/net/libeth/rx.h                          | 125 +++++++++
 23 files changed, 426 insertions(+), 1220 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/libeth/Kconfig
 create mode 100644 drivers/net/ethernet/intel/libeth/Makefile
 create mode 100644 drivers/net/ethernet/intel/libeth/rx.c
 create mode 100644 drivers/net/ethernet/intel/libie/Kconfig
 create mode 100644 drivers/net/ethernet/intel/libie/Makefile
 create mode 100644 drivers/net/ethernet/intel/libie/rx.c
 create mode 100644 include/linux/net/intel/libie/rx.h
 create mode 100644 include/net/libeth/rx.h

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 6e7901e12699..e0287fbd501d 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -16,6 +16,9 @@ config NET_VENDOR_INTEL
 
 if NET_VENDOR_INTEL
 
+source "drivers/net/ethernet/intel/libeth/Kconfig"
+source "drivers/net/ethernet/intel/libie/Kconfig"
+
 config E100
 	tristate "Intel(R) PRO/100+ support"
 	depends on PCI
@@ -225,6 +228,7 @@ config I40E
 	depends on PTP_1588_CLOCK_OPTIONAL
 	depends on PCI
 	select AUXILIARY_BUS
+	select LIBIE
 	select NET_DEVLINK
 	help
 	  This driver supports Intel(R) Ethernet Controller XL710 Family of
@@ -253,6 +257,8 @@ config I40E_DCB
 # so that CONFIG_IAVF symbol will always mirror the state of CONFIG_I40EVF
 config IAVF
 	tristate
+	select LIBIE
+
 config I40EVF
 	tristate "Intel(R) Ethernet Adaptive Virtual Function support"
 	select IAVF
@@ -283,6 +289,7 @@ config ICE
 	depends on GNSS || GNSS = n
 	select AUXILIARY_BUS
 	select DIMLIB
+	select LIBIE
 	select NET_DEVLINK
 	select PLDMFW
 	select DPLL
diff --git a/drivers/net/ethernet/intel/Makefile b/drivers/net/ethernet/intel/Makefile
index dacb481ee5b1..04c844ef4964 100644
--- a/drivers/net/ethernet/intel/Makefile
+++ b/drivers/net/ethernet/intel/Makefile
@@ -3,6 +3,9 @@
 # Makefile for the Intel network device drivers.
 #
 
+obj-$(CONFIG_LIBETH) += libeth/
+obj-$(CONFIG_LIBIE) += libie/
+
 obj-$(CONFIG_E100) += e100.o
 obj-$(CONFIG_E1000) += e1000/
 obj-$(CONFIG_E1000E) += e1000e/
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c
index de6ca6295742..e8031f1a9b4f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -381,259 +381,6 @@ int i40e_aq_set_rss_key(struct i40e_hw *hw,
 	return i40e_aq_get_set_rss_key(hw, vsi_id, key, true);
 }
 
-/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the
- * hardware to a bit-field that can be used by SW to more easily determine the
- * packet type.
- *
- * Macros are used to shorten the table lines and make this table human
- * readable.
- *
- * We store the PTYPE in the top byte of the bit field - this is just so that
- * we can check that the table doesn't have a row missing, as the index into
- * the table should be the PTYPE.
- *
- * Typical work flow:
- *
- * IF NOT i40e_ptype_lookup[ptype].known
- * THEN
- *      Packet is unknown
- * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP
- *      Use the rest of the fields to look at the tunnels, inner protocols, etc
- * ELSE
- *      Use the enum i40e_rx_l2_ptype to decode the packet type
- * ENDIF
- */
-
-/* macro to make the table lines short, use explicit indexing with [PTYPE] */
-#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\
-	[PTYPE] = { \
-		1, \
-		I40E_RX_PTYPE_OUTER_##OUTER_IP, \
-		I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \
-		I40E_RX_PTYPE_##OUTER_FRAG, \
-		I40E_RX_PTYPE_TUNNEL_##T, \
-		I40E_RX_PTYPE_TUNNEL_END_##TE, \
-		I40E_RX_PTYPE_##TEF, \
-		I40E_RX_PTYPE_INNER_PROT_##I, \
-		I40E_RX_PTYPE_PAYLOAD_LAYER_##PL }
-
-#define I40E_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }
-
-/* shorter macros makes the table fit but are terse */
-#define I40E_RX_PTYPE_NOF		I40E_RX_PTYPE_NOT_FRAG
-#define I40E_RX_PTYPE_FRG		I40E_RX_PTYPE_FRAG
-#define I40E_RX_PTYPE_INNER_PROT_TS	I40E_RX_PTYPE_INNER_PROT_TIMESYNC
-
-/* Lookup table mapping in the 8-bit HW PTYPE to the bit field for decoding */
-struct i40e_rx_ptype_decoded i40e_ptype_lookup[BIT(8)] = {
-	/* L2 Packet types */
-	I40E_PTT_UNUSED_ENTRY(0),
-	I40E_PTT(1,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	I40E_PTT(2,  L2, NONE, NOF, NONE, NONE, NOF, TS,   PAY2),
-	I40E_PTT(3,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	I40E_PTT_UNUSED_ENTRY(4),
-	I40E_PTT_UNUSED_ENTRY(5),
-	I40E_PTT(6,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	I40E_PTT(7,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	I40E_PTT_UNUSED_ENTRY(8),
-	I40E_PTT_UNUSED_ENTRY(9),
-	I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),
-	I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-
-	/* Non Tunneled IPv4 */
-	I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(25),
-	I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP,  PAY4),
-	I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4),
-	I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4),
-
-	/* IPv4 --> IPv4 */
-	I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(32),
-	I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 --> IPv6 */
-	I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(39),
-	I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT */
-	I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),
-
-	/* IPv4 --> GRE/NAT --> IPv4 */
-	I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(47),
-	I40E_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT --> IPv6 */
-	I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(54),
-	I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT --> MAC */
-	I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),
-
-	/* IPv4 --> GRE/NAT --> MAC --> IPv4 */
-	I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(62),
-	I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT -> MAC --> IPv6 */
-	I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(69),
-	I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT --> MAC/VLAN */
-	I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),
-
-	/* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */
-	I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(77),
-	I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */
-	I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(84),
-	I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
-
-	/* Non Tunneled IPv6 */
-	I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3),
-	I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(91),
-	I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP,  PAY4),
-	I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4),
-	I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4),
-
-	/* IPv6 --> IPv4 */
-	I40E_PTT(95,  IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(96,  IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(97,  IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(98),
-	I40E_PTT(99,  IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> IPv6 */
-	I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(105),
-	I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT */
-	I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),
-
-	/* IPv6 --> GRE/NAT -> IPv4 */
-	I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(113),
-	I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> IPv6 */
-	I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(120),
-	I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC */
-	I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),
-
-	/* IPv6 --> GRE/NAT -> MAC -> IPv4 */
-	I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(128),
-	I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC -> IPv6 */
-	I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(135),
-	I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC/VLAN */
-	I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),
-
-	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */
-	I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),
-	I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),
-	I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(143),
-	I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),
-	I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),
-	I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */
-	I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),
-	I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),
-	I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),
-	I40E_PTT_UNUSED_ENTRY(150),
-	I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),
-	I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),
-	I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
-
-	/* unused entries */
-	[154 ... 255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }
-};
-
 /**
  * i40e_init_shared_code - Initialize the shared code
  * @hw: pointer to hardware structure
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 0bdcdea0be3e..164afd8ce5a0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -100,6 +100,7 @@ MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all), Debug mask (0x8XXXXXXX
 
 MODULE_AUTHOR("Intel Corporation, <e1000-devel@lists.sourceforge.net>");
 MODULE_DESCRIPTION("Intel(R) Ethernet Connection XL710 Network Driver");
+MODULE_IMPORT_NS(LIBIE);
 MODULE_LICENSE("GPL v2");
 
 static struct workqueue_struct *i40e_wq;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index ce1f11b8ad65..5a0699ca7ce5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -371,13 +371,6 @@ void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status);
 
 int i40e_set_mac_type(struct i40e_hw *hw);
 
-extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[];
-
-static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype)
-{
-	return i40e_ptype_lookup[ptype];
-}
-
 /**
  * i40e_virtchnl_link_speed - Convert AdminQ link_speed to virtchnl definition
  * @link_speed: the speed to convert
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 64d198ed166b..bc9e766d88cb 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2,6 +2,7 @@
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/bpf_trace.h>
+#include <linux/net/intel/libie/rx.h>
 #include <linux/prefetch.h>
 #include <linux/sctp.h>
 #include <net/mpls.h>
@@ -1741,38 +1742,30 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
 				    struct sk_buff *skb,
 				    union i40e_rx_desc *rx_desc)
 {
-	struct i40e_rx_ptype_decoded decoded;
+	struct libeth_rx_pt decoded;
 	u32 rx_error, rx_status;
 	bool ipv4, ipv6;
 	u8 ptype;
 	u64 qword;
 
-	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
-	ptype = FIELD_GET(I40E_RXD_QW1_PTYPE_MASK, qword);
-	rx_error = FIELD_GET(I40E_RXD_QW1_ERROR_MASK, qword);
-	rx_status = FIELD_GET(I40E_RXD_QW1_STATUS_MASK, qword);
-	decoded = decode_rx_desc_ptype(ptype);
-
 	skb->ip_summed = CHECKSUM_NONE;
 
-	skb_checksum_none_assert(skb);
+	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+	ptype = FIELD_GET(I40E_RXD_QW1_PTYPE_MASK, qword);
 
-	/* Rx csum enabled and ip headers found? */
-	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
+	decoded = libie_rx_pt_parse(ptype);
+	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded))
 		return;
 
+	rx_error = FIELD_GET(I40E_RXD_QW1_ERROR_MASK, qword);
+	rx_status = FIELD_GET(I40E_RXD_QW1_STATUS_MASK, qword);
+
 	/* did the hardware decode the packet and checksum? */
 	if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT)))
 		return;
 
-	/* both known and outer_ip must be set for the below code to work */
-	if (!(decoded.known && decoded.outer_ip))
-		return;
-
-	ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
-	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4);
-	ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) &&
-	       (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6);
+	ipv4 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV4;
+	ipv6 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV6;
 
 	if (ipv4 &&
 	    (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) |
@@ -1800,49 +1793,16 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
 	 * we need to bump the checksum level by 1 to reflect the fact that
 	 * we are indicating we validated the inner checksum.
 	 */
-	if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT)
+	if (decoded.tunnel_type >= LIBETH_RX_PT_TUNNEL_IP_GRENAT)
 		skb->csum_level = 1;
 
-	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
-	switch (decoded.inner_prot) {
-	case I40E_RX_PTYPE_INNER_PROT_TCP:
-	case I40E_RX_PTYPE_INNER_PROT_UDP:
-	case I40E_RX_PTYPE_INNER_PROT_SCTP:
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		fallthrough;
-	default:
-		break;
-	}
-
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	return;
 
 checksum_fail:
 	vsi->back->hw_csum_rx_error++;
 }
 
-/**
- * i40e_ptype_to_htype - get a hash type
- * @ptype: the ptype value from the descriptor
- *
- * Returns a hash type to be used by skb_set_hash
- **/
-static inline int i40e_ptype_to_htype(u8 ptype)
-{
-	struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
-
-	if (!decoded.known)
-		return PKT_HASH_TYPE_NONE;
-
-	if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
-	    decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4)
-		return PKT_HASH_TYPE_L4;
-	else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP &&
-		 decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3)
-		return PKT_HASH_TYPE_L3;
-	else
-		return PKT_HASH_TYPE_L2;
-}
-
 /**
  * i40e_rx_hash - set the hash value in the skb
  * @ring: descriptor ring
@@ -1855,17 +1815,19 @@ static inline void i40e_rx_hash(struct i40e_ring *ring,
 				struct sk_buff *skb,
 				u8 rx_ptype)
 {
+	struct libeth_rx_pt decoded;
 	u32 hash;
 	const __le64 rss_mask =
 		cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH <<
 			    I40E_RX_DESC_STATUS_FLTSTAT_SHIFT);
 
-	if (!(ring->netdev->features & NETIF_F_RXHASH))
+	decoded = libie_rx_pt_parse(rx_ptype);
+	if (!libeth_rx_pt_has_hash(ring->netdev, decoded))
 		return;
 
 	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
 		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
-		skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype));
+		libeth_rx_pt_set_hash(skb, hash, decoded);
 	}
 }
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h
index d9031499697e..28568e126850 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -745,94 +745,6 @@ enum i40e_rx_desc_error_l3l4e_fcoe_masks {
 #define I40E_RXD_QW1_PTYPE_SHIFT	30
 #define I40E_RXD_QW1_PTYPE_MASK		(0xFFULL << I40E_RXD_QW1_PTYPE_SHIFT)
 
-/* Packet type non-ip values */
-enum i40e_rx_l2_ptype {
-	I40E_RX_PTYPE_L2_RESERVED			= 0,
-	I40E_RX_PTYPE_L2_MAC_PAY2			= 1,
-	I40E_RX_PTYPE_L2_TIMESYNC_PAY2			= 2,
-	I40E_RX_PTYPE_L2_FIP_PAY2			= 3,
-	I40E_RX_PTYPE_L2_OUI_PAY2			= 4,
-	I40E_RX_PTYPE_L2_MACCNTRL_PAY2			= 5,
-	I40E_RX_PTYPE_L2_LLDP_PAY2			= 6,
-	I40E_RX_PTYPE_L2_ECP_PAY2			= 7,
-	I40E_RX_PTYPE_L2_EVB_PAY2			= 8,
-	I40E_RX_PTYPE_L2_QCN_PAY2			= 9,
-	I40E_RX_PTYPE_L2_EAPOL_PAY2			= 10,
-	I40E_RX_PTYPE_L2_ARP				= 11,
-	I40E_RX_PTYPE_L2_FCOE_PAY3			= 12,
-	I40E_RX_PTYPE_L2_FCOE_FCDATA_PAY3		= 13,
-	I40E_RX_PTYPE_L2_FCOE_FCRDY_PAY3		= 14,
-	I40E_RX_PTYPE_L2_FCOE_FCRSP_PAY3		= 15,
-	I40E_RX_PTYPE_L2_FCOE_FCOTHER_PA		= 16,
-	I40E_RX_PTYPE_L2_FCOE_VFT_PAY3			= 17,
-	I40E_RX_PTYPE_L2_FCOE_VFT_FCDATA		= 18,
-	I40E_RX_PTYPE_L2_FCOE_VFT_FCRDY			= 19,
-	I40E_RX_PTYPE_L2_FCOE_VFT_FCRSP			= 20,
-	I40E_RX_PTYPE_L2_FCOE_VFT_FCOTHER		= 21,
-	I40E_RX_PTYPE_GRENAT4_MAC_PAY3			= 58,
-	I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4	= 87,
-	I40E_RX_PTYPE_GRENAT6_MAC_PAY3			= 124,
-	I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4	= 153
-};
-
-struct i40e_rx_ptype_decoded {
-	u32 known:1;
-	u32 outer_ip:1;
-	u32 outer_ip_ver:1;
-	u32 outer_frag:1;
-	u32 tunnel_type:3;
-	u32 tunnel_end_prot:2;
-	u32 tunnel_end_frag:1;
-	u32 inner_prot:4;
-	u32 payload_layer:3;
-};
-
-enum i40e_rx_ptype_outer_ip {
-	I40E_RX_PTYPE_OUTER_L2	= 0,
-	I40E_RX_PTYPE_OUTER_IP	= 1
-};
-
-enum i40e_rx_ptype_outer_ip_ver {
-	I40E_RX_PTYPE_OUTER_NONE	= 0,
-	I40E_RX_PTYPE_OUTER_IPV4	= 0,
-	I40E_RX_PTYPE_OUTER_IPV6	= 1
-};
-
-enum i40e_rx_ptype_outer_fragmented {
-	I40E_RX_PTYPE_NOT_FRAG	= 0,
-	I40E_RX_PTYPE_FRAG	= 1
-};
-
-enum i40e_rx_ptype_tunnel_type {
-	I40E_RX_PTYPE_TUNNEL_NONE		= 0,
-	I40E_RX_PTYPE_TUNNEL_IP_IP		= 1,
-	I40E_RX_PTYPE_TUNNEL_IP_GRENAT		= 2,
-	I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC	= 3,
-	I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN	= 4,
-};
-
-enum i40e_rx_ptype_tunnel_end_prot {
-	I40E_RX_PTYPE_TUNNEL_END_NONE	= 0,
-	I40E_RX_PTYPE_TUNNEL_END_IPV4	= 1,
-	I40E_RX_PTYPE_TUNNEL_END_IPV6	= 2,
-};
-
-enum i40e_rx_ptype_inner_prot {
-	I40E_RX_PTYPE_INNER_PROT_NONE		= 0,
-	I40E_RX_PTYPE_INNER_PROT_UDP		= 1,
-	I40E_RX_PTYPE_INNER_PROT_TCP		= 2,
-	I40E_RX_PTYPE_INNER_PROT_SCTP		= 3,
-	I40E_RX_PTYPE_INNER_PROT_ICMP		= 4,
-	I40E_RX_PTYPE_INNER_PROT_TIMESYNC	= 5
-};
-
-enum i40e_rx_ptype_payload_layer {
-	I40E_RX_PTYPE_PAYLOAD_LAYER_NONE	= 0,
-	I40E_RX_PTYPE_PAYLOAD_LAYER_PAY2	= 1,
-	I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3	= 2,
-	I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4	= 3,
-};
-
 #define I40E_RXD_QW1_LENGTH_PBUF_SHIFT	38
 #define I40E_RXD_QW1_LENGTH_PBUF_MASK	(0x3FFFULL << \
 					 I40E_RXD_QW1_LENGTH_PBUF_SHIFT)
diff --git a/drivers/net/ethernet/intel/iavf/iavf_common.c b/drivers/net/ethernet/intel/iavf/iavf_common.c
index 5a25233a89d5..aa751ce3425b 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_common.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_common.c
@@ -432,259 +432,6 @@ enum iavf_status iavf_aq_set_rss_key(struct iavf_hw *hw, u16 vsi_id,
 	return iavf_aq_get_set_rss_key(hw, vsi_id, key, true);
 }
 
-/* The iavf_ptype_lookup table is used to convert from the 8-bit ptype in the
- * hardware to a bit-field that can be used by SW to more easily determine the
- * packet type.
- *
- * Macros are used to shorten the table lines and make this table human
- * readable.
- *
- * We store the PTYPE in the top byte of the bit field - this is just so that
- * we can check that the table doesn't have a row missing, as the index into
- * the table should be the PTYPE.
- *
- * Typical work flow:
- *
- * IF NOT iavf_ptype_lookup[ptype].known
- * THEN
- *      Packet is unknown
- * ELSE IF iavf_ptype_lookup[ptype].outer_ip == IAVF_RX_PTYPE_OUTER_IP
- *      Use the rest of the fields to look at the tunnels, inner protocols, etc
- * ELSE
- *      Use the enum iavf_rx_l2_ptype to decode the packet type
- * ENDIF
- */
-
-/* macro to make the table lines short, use explicit indexing with [PTYPE] */
-#define IAVF_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\
-	[PTYPE] = { \
-		1, \
-		IAVF_RX_PTYPE_OUTER_##OUTER_IP, \
-		IAVF_RX_PTYPE_OUTER_##OUTER_IP_VER, \
-		IAVF_RX_PTYPE_##OUTER_FRAG, \
-		IAVF_RX_PTYPE_TUNNEL_##T, \
-		IAVF_RX_PTYPE_TUNNEL_END_##TE, \
-		IAVF_RX_PTYPE_##TEF, \
-		IAVF_RX_PTYPE_INNER_PROT_##I, \
-		IAVF_RX_PTYPE_PAYLOAD_LAYER_##PL }
-
-#define IAVF_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }
-
-/* shorter macros makes the table fit but are terse */
-#define IAVF_RX_PTYPE_NOF		IAVF_RX_PTYPE_NOT_FRAG
-#define IAVF_RX_PTYPE_FRG		IAVF_RX_PTYPE_FRAG
-#define IAVF_RX_PTYPE_INNER_PROT_TS	IAVF_RX_PTYPE_INNER_PROT_TIMESYNC
-
-/* Lookup table mapping the 8-bit HW PTYPE to the bit field for decoding */
-struct iavf_rx_ptype_decoded iavf_ptype_lookup[BIT(8)] = {
-	/* L2 Packet types */
-	IAVF_PTT_UNUSED_ENTRY(0),
-	IAVF_PTT(1,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	IAVF_PTT(2,  L2, NONE, NOF, NONE, NONE, NOF, TS,   PAY2),
-	IAVF_PTT(3,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	IAVF_PTT_UNUSED_ENTRY(4),
-	IAVF_PTT_UNUSED_ENTRY(5),
-	IAVF_PTT(6,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	IAVF_PTT(7,  L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	IAVF_PTT_UNUSED_ENTRY(8),
-	IAVF_PTT_UNUSED_ENTRY(9),
-	IAVF_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),
-	IAVF_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),
-	IAVF_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3),
-
-	/* Non Tunneled IPv4 */
-	IAVF_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(25),
-	IAVF_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP,  PAY4),
-	IAVF_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4),
-	IAVF_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4),
-
-	/* IPv4 --> IPv4 */
-	IAVF_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(32),
-	IAVF_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 --> IPv6 */
-	IAVF_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(39),
-	IAVF_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT */
-	IAVF_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),
-
-	/* IPv4 --> GRE/NAT --> IPv4 */
-	IAVF_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(47),
-	IAVF_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT --> IPv6 */
-	IAVF_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(54),
-	IAVF_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT --> MAC */
-	IAVF_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),
-
-	/* IPv4 --> GRE/NAT --> MAC --> IPv4 */
-	IAVF_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(62),
-	IAVF_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT -> MAC --> IPv6 */
-	IAVF_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(69),
-	IAVF_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv4 --> GRE/NAT --> MAC/VLAN */
-	IAVF_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),
-
-	/* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */
-	IAVF_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(77),
-	IAVF_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */
-	IAVF_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(84),
-	IAVF_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
-
-	/* Non Tunneled IPv6 */
-	IAVF_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3),
-	IAVF_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(91),
-	IAVF_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP,  PAY4),
-	IAVF_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4),
-	IAVF_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4),
-
-	/* IPv6 --> IPv4 */
-	IAVF_PTT(95,  IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(96,  IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(97,  IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(98),
-	IAVF_PTT(99,  IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> IPv6 */
-	IAVF_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(105),
-	IAVF_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT */
-	IAVF_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),
-
-	/* IPv6 --> GRE/NAT -> IPv4 */
-	IAVF_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(113),
-	IAVF_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> IPv6 */
-	IAVF_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(120),
-	IAVF_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC */
-	IAVF_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),
-
-	/* IPv6 --> GRE/NAT -> MAC -> IPv4 */
-	IAVF_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(128),
-	IAVF_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC -> IPv6 */
-	IAVF_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(135),
-	IAVF_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC/VLAN */
-	IAVF_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),
-
-	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */
-	IAVF_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),
-	IAVF_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),
-	IAVF_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(143),
-	IAVF_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),
-	IAVF_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),
-	IAVF_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),
-
-	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */
-	IAVF_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),
-	IAVF_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),
-	IAVF_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),
-	IAVF_PTT_UNUSED_ENTRY(150),
-	IAVF_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),
-	IAVF_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),
-	IAVF_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
-
-	/* unused entries */
-	[154 ... 255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }
-};
-
 /**
  * iavf_aq_send_msg_to_pf
  * @hw: pointer to the hardware structure
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 13361a780ece..d6cbe5022815 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -45,6 +45,7 @@ MODULE_DEVICE_TABLE(pci, iavf_pci_tbl);
 MODULE_ALIAS("i40evf");
 MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
 MODULE_DESCRIPTION("Intel(R) Ethernet Adaptive Virtual Function Network Driver");
+MODULE_IMPORT_NS(LIBIE);
 MODULE_LICENSE("GPL v2");
 
 static const struct net_device_ops iavf_netdev_ops;
diff --git a/drivers/net/ethernet/intel/iavf/iavf_prototype.h b/drivers/net/ethernet/intel/iavf/iavf_prototype.h
index 4a48e6171405..48c3901381b4 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_prototype.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_prototype.h
@@ -45,13 +45,6 @@ enum iavf_status iavf_aq_set_rss_lut(struct iavf_hw *hw, u16 seid,
 enum iavf_status iavf_aq_set_rss_key(struct iavf_hw *hw, u16 seid,
 				     struct iavf_aqc_get_set_rss_key_data *key);
 
-extern struct iavf_rx_ptype_decoded iavf_ptype_lookup[];
-
-static inline struct iavf_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype)
-{
-	return iavf_ptype_lookup[ptype];
-}
-
 void iavf_vf_parse_hw_config(struct iavf_hw *hw,
 			     struct virtchnl_vf_resource *msg);
 enum iavf_status iavf_aq_send_msg_to_pf(struct iavf_hw *hw,
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 32bb604a1382..33ec01d0ed67 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -2,6 +2,7 @@
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/bitfield.h>
+#include <linux/net/intel/libie/rx.h>
 #include <linux/prefetch.h>
 
 #include "iavf.h"
@@ -982,38 +983,30 @@ static void iavf_rx_checksum(struct iavf_vsi *vsi,
 			     struct sk_buff *skb,
 			     union iavf_rx_desc *rx_desc)
 {
-	struct iavf_rx_ptype_decoded decoded;
+	struct libeth_rx_pt decoded;
 	u32 rx_error, rx_status;
 	bool ipv4, ipv6;
 	u8 ptype;
 	u64 qword;
 
-	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
-	ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword);
-	rx_error = FIELD_GET(IAVF_RXD_QW1_ERROR_MASK, qword);
-	rx_status = FIELD_GET(IAVF_RXD_QW1_STATUS_MASK, qword);
-	decoded = decode_rx_desc_ptype(ptype);
-
 	skb->ip_summed = CHECKSUM_NONE;
 
-	skb_checksum_none_assert(skb);
+	qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
+	ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword);
 
-	/* Rx csum enabled and ip headers found? */
-	if (!(vsi->netdev->features & NETIF_F_RXCSUM))
+	decoded = libie_rx_pt_parse(ptype);
+	if (!libeth_rx_pt_has_checksum(vsi->netdev, decoded))
 		return;
 
+	rx_error = FIELD_GET(IAVF_RXD_QW1_ERROR_MASK, qword);
+	rx_status = FIELD_GET(IAVF_RXD_QW1_STATUS_MASK, qword);
+
 	/* did the hardware decode the packet and checksum? */
 	if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT)))
 		return;
 
-	/* both known and outer_ip must be set for the below code to work */
-	if (!(decoded.known && decoded.outer_ip))
-		return;
-
-	ipv4 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) &&
-	       (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV4);
-	ipv6 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) &&
-	       (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV6);
+	ipv4 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV4;
+	ipv6 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV6;
 
 	if (ipv4 &&
 	    (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) |
@@ -1037,46 +1030,13 @@ static void iavf_rx_checksum(struct iavf_vsi *vsi,
 	if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT))
 		return;
 
-	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
-	switch (decoded.inner_prot) {
-	case IAVF_RX_PTYPE_INNER_PROT_TCP:
-	case IAVF_RX_PTYPE_INNER_PROT_UDP:
-	case IAVF_RX_PTYPE_INNER_PROT_SCTP:
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		fallthrough;
-	default:
-		break;
-	}
-
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	return;
 
 checksum_fail:
 	vsi->back->hw_csum_rx_error++;
 }
 
-/**
- * iavf_ptype_to_htype - get a hash type
- * @ptype: the ptype value from the descriptor
- *
- * Returns a hash type to be used by skb_set_hash
- **/
-static int iavf_ptype_to_htype(u8 ptype)
-{
-	struct iavf_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype);
-
-	if (!decoded.known)
-		return PKT_HASH_TYPE_NONE;
-
-	if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP &&
-	    decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4)
-		return PKT_HASH_TYPE_L4;
-	else if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP &&
-		 decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3)
-		return PKT_HASH_TYPE_L3;
-	else
-		return PKT_HASH_TYPE_L2;
-}
-
 /**
  * iavf_rx_hash - set the hash value in the skb
  * @ring: descriptor ring
@@ -1089,17 +1049,19 @@ static void iavf_rx_hash(struct iavf_ring *ring,
 			 struct sk_buff *skb,
 			 u8 rx_ptype)
 {
+	struct libeth_rx_pt decoded;
 	u32 hash;
 	const __le64 rss_mask =
 		cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH <<
 			    IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT);
 
-	if (!(ring->netdev->features & NETIF_F_RXHASH))
+	decoded = libie_rx_pt_parse(rx_ptype);
+	if (!libeth_rx_pt_has_hash(ring->netdev, decoded))
 		return;
 
 	if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) {
 		hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss);
-		skb_set_hash(skb, hash, iavf_ptype_to_htype(rx_ptype));
+		libeth_rx_pt_set_hash(skb, hash, decoded);
 	}
 }
 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h
index 2b6a207fa441..23ded4fcd94f 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_type.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_type.h
@@ -327,94 +327,6 @@ enum iavf_rx_desc_error_l3l4e_fcoe_masks {
 #define IAVF_RXD_QW1_PTYPE_SHIFT	30
 #define IAVF_RXD_QW1_PTYPE_MASK		(0xFFULL << IAVF_RXD_QW1_PTYPE_SHIFT)
 
-/* Packet type non-ip values */
-enum iavf_rx_l2_ptype {
-	IAVF_RX_PTYPE_L2_RESERVED			= 0,
-	IAVF_RX_PTYPE_L2_MAC_PAY2			= 1,
-	IAVF_RX_PTYPE_L2_TIMESYNC_PAY2			= 2,
-	IAVF_RX_PTYPE_L2_FIP_PAY2			= 3,
-	IAVF_RX_PTYPE_L2_OUI_PAY2			= 4,
-	IAVF_RX_PTYPE_L2_MACCNTRL_PAY2			= 5,
-	IAVF_RX_PTYPE_L2_LLDP_PAY2			= 6,
-	IAVF_RX_PTYPE_L2_ECP_PAY2			= 7,
-	IAVF_RX_PTYPE_L2_EVB_PAY2			= 8,
-	IAVF_RX_PTYPE_L2_QCN_PAY2			= 9,
-	IAVF_RX_PTYPE_L2_EAPOL_PAY2			= 10,
-	IAVF_RX_PTYPE_L2_ARP				= 11,
-	IAVF_RX_PTYPE_L2_FCOE_PAY3			= 12,
-	IAVF_RX_PTYPE_L2_FCOE_FCDATA_PAY3		= 13,
-	IAVF_RX_PTYPE_L2_FCOE_FCRDY_PAY3		= 14,
-	IAVF_RX_PTYPE_L2_FCOE_FCRSP_PAY3		= 15,
-	IAVF_RX_PTYPE_L2_FCOE_FCOTHER_PA		= 16,
-	IAVF_RX_PTYPE_L2_FCOE_VFT_PAY3			= 17,
-	IAVF_RX_PTYPE_L2_FCOE_VFT_FCDATA		= 18,
-	IAVF_RX_PTYPE_L2_FCOE_VFT_FCRDY			= 19,
-	IAVF_RX_PTYPE_L2_FCOE_VFT_FCRSP			= 20,
-	IAVF_RX_PTYPE_L2_FCOE_VFT_FCOTHER		= 21,
-	IAVF_RX_PTYPE_GRENAT4_MAC_PAY3			= 58,
-	IAVF_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4	= 87,
-	IAVF_RX_PTYPE_GRENAT6_MAC_PAY3			= 124,
-	IAVF_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4	= 153
-};
-
-struct iavf_rx_ptype_decoded {
-	u32 known:1;
-	u32 outer_ip:1;
-	u32 outer_ip_ver:1;
-	u32 outer_frag:1;
-	u32 tunnel_type:3;
-	u32 tunnel_end_prot:2;
-	u32 tunnel_end_frag:1;
-	u32 inner_prot:4;
-	u32 payload_layer:3;
-};
-
-enum iavf_rx_ptype_outer_ip {
-	IAVF_RX_PTYPE_OUTER_L2	= 0,
-	IAVF_RX_PTYPE_OUTER_IP	= 1
-};
-
-enum iavf_rx_ptype_outer_ip_ver {
-	IAVF_RX_PTYPE_OUTER_NONE	= 0,
-	IAVF_RX_PTYPE_OUTER_IPV4	= 0,
-	IAVF_RX_PTYPE_OUTER_IPV6	= 1
-};
-
-enum iavf_rx_ptype_outer_fragmented {
-	IAVF_RX_PTYPE_NOT_FRAG	= 0,
-	IAVF_RX_PTYPE_FRAG	= 1
-};
-
-enum iavf_rx_ptype_tunnel_type {
-	IAVF_RX_PTYPE_TUNNEL_NONE		= 0,
-	IAVF_RX_PTYPE_TUNNEL_IP_IP		= 1,
-	IAVF_RX_PTYPE_TUNNEL_IP_GRENAT		= 2,
-	IAVF_RX_PTYPE_TUNNEL_IP_GRENAT_MAC	= 3,
-	IAVF_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN	= 4,
-};
-
-enum iavf_rx_ptype_tunnel_end_prot {
-	IAVF_RX_PTYPE_TUNNEL_END_NONE	= 0,
-	IAVF_RX_PTYPE_TUNNEL_END_IPV4	= 1,
-	IAVF_RX_PTYPE_TUNNEL_END_IPV6	= 2,
-};
-
-enum iavf_rx_ptype_inner_prot {
-	IAVF_RX_PTYPE_INNER_PROT_NONE		= 0,
-	IAVF_RX_PTYPE_INNER_PROT_UDP		= 1,
-	IAVF_RX_PTYPE_INNER_PROT_TCP		= 2,
-	IAVF_RX_PTYPE_INNER_PROT_SCTP		= 3,
-	IAVF_RX_PTYPE_INNER_PROT_ICMP		= 4,
-	IAVF_RX_PTYPE_INNER_PROT_TIMESYNC	= 5
-};
-
-enum iavf_rx_ptype_payload_layer {
-	IAVF_RX_PTYPE_PAYLOAD_LAYER_NONE	= 0,
-	IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY2	= 1,
-	IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3	= 2,
-	IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4	= 3,
-};
-
 #define IAVF_RXD_QW1_LENGTH_PBUF_SHIFT	38
 #define IAVF_RXD_QW1_LENGTH_PBUF_MASK	(0x3FFFULL << \
 					 IAVF_RXD_QW1_LENGTH_PBUF_SHIFT)
diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
index d384ddfcb83e..611577ebc29d 100644
--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -160,64 +160,6 @@ struct ice_fltr_desc {
 				(0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S)
 #define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES	0x1ULL
 
-struct ice_rx_ptype_decoded {
-	u32 known:1;
-	u32 outer_ip:1;
-	u32 outer_ip_ver:2;
-	u32 outer_frag:1;
-	u32 tunnel_type:3;
-	u32 tunnel_end_prot:2;
-	u32 tunnel_end_frag:1;
-	u32 inner_prot:4;
-	u32 payload_layer:3;
-};
-
-enum ice_rx_ptype_outer_ip {
-	ICE_RX_PTYPE_OUTER_L2	= 0,
-	ICE_RX_PTYPE_OUTER_IP	= 1,
-};
-
-enum ice_rx_ptype_outer_ip_ver {
-	ICE_RX_PTYPE_OUTER_NONE	= 0,
-	ICE_RX_PTYPE_OUTER_IPV4	= 1,
-	ICE_RX_PTYPE_OUTER_IPV6	= 2,
-};
-
-enum ice_rx_ptype_outer_fragmented {
-	ICE_RX_PTYPE_NOT_FRAG	= 0,
-	ICE_RX_PTYPE_FRAG	= 1,
-};
-
-enum ice_rx_ptype_tunnel_type {
-	ICE_RX_PTYPE_TUNNEL_NONE		= 0,
-	ICE_RX_PTYPE_TUNNEL_IP_IP		= 1,
-	ICE_RX_PTYPE_TUNNEL_IP_GRENAT		= 2,
-	ICE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC	= 3,
-	ICE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN	= 4,
-};
-
-enum ice_rx_ptype_tunnel_end_prot {
-	ICE_RX_PTYPE_TUNNEL_END_NONE	= 0,
-	ICE_RX_PTYPE_TUNNEL_END_IPV4	= 1,
-	ICE_RX_PTYPE_TUNNEL_END_IPV6	= 2,
-};
-
-enum ice_rx_ptype_inner_prot {
-	ICE_RX_PTYPE_INNER_PROT_NONE		= 0,
-	ICE_RX_PTYPE_INNER_PROT_UDP		= 1,
-	ICE_RX_PTYPE_INNER_PROT_TCP		= 2,
-	ICE_RX_PTYPE_INNER_PROT_SCTP		= 3,
-	ICE_RX_PTYPE_INNER_PROT_ICMP		= 4,
-	ICE_RX_PTYPE_INNER_PROT_TIMESYNC	= 5,
-};
-
-enum ice_rx_ptype_payload_layer {
-	ICE_RX_PTYPE_PAYLOAD_LAYER_NONE	= 0,
-	ICE_RX_PTYPE_PAYLOAD_LAYER_PAY2	= 1,
-	ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3	= 2,
-	ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4	= 3,
-};
-
 /* Rx Flex Descriptor
  * This descriptor is used instead of the legacy version descriptor when
  * ice_rlan_ctx.adv_desc is set
@@ -651,266 +593,4 @@ struct ice_tlan_ctx {
 	u8 int_q_state;	/* width not needed - internal - DO NOT WRITE!!! */
 };
 
-/* The ice_ptype_lkup table is used to convert from the 10-bit ptype in the
- * hardware to a bit-field that can be used by SW to more easily determine the
- * packet type.
- *
- * Macros are used to shorten the table lines and make this table human
- * readable.
- *
- * We store the PTYPE in the top byte of the bit field - this is just so that
- * we can check that the table doesn't have a row missing, as the index into
- * the table should be the PTYPE.
- *
- * Typical work flow:
- *
- * IF NOT ice_ptype_lkup[ptype].known
- * THEN
- *      Packet is unknown
- * ELSE IF ice_ptype_lkup[ptype].outer_ip == ICE_RX_PTYPE_OUTER_IP
- *      Use the rest of the fields to look at the tunnels, inner protocols, etc
- * ELSE
- *      Use the enum ice_rx_l2_ptype to decode the packet type
- * ENDIF
- */
-#define ICE_PTYPES								\
-	/* L2 Packet types */							\
-	ICE_PTT_UNUSED_ENTRY(0),						\
-	ICE_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2),			\
-	ICE_PTT_UNUSED_ENTRY(2),						\
-	ICE_PTT_UNUSED_ENTRY(3),						\
-	ICE_PTT_UNUSED_ENTRY(4),						\
-	ICE_PTT_UNUSED_ENTRY(5),						\
-	ICE_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),			\
-	ICE_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),			\
-	ICE_PTT_UNUSED_ENTRY(8),						\
-	ICE_PTT_UNUSED_ENTRY(9),						\
-	ICE_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),		\
-	ICE_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE),		\
-	ICE_PTT_UNUSED_ENTRY(12),						\
-	ICE_PTT_UNUSED_ENTRY(13),						\
-	ICE_PTT_UNUSED_ENTRY(14),						\
-	ICE_PTT_UNUSED_ENTRY(15),						\
-	ICE_PTT_UNUSED_ENTRY(16),						\
-	ICE_PTT_UNUSED_ENTRY(17),						\
-	ICE_PTT_UNUSED_ENTRY(18),						\
-	ICE_PTT_UNUSED_ENTRY(19),						\
-	ICE_PTT_UNUSED_ENTRY(20),						\
-	ICE_PTT_UNUSED_ENTRY(21),						\
-										\
-	/* Non Tunneled IPv4 */							\
-	ICE_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3),		\
-	ICE_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3),		\
-	ICE_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(25),						\
-	ICE_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP,  PAY4),		\
-	ICE_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4),		\
-	ICE_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4),		\
-										\
-	/* IPv4 --> IPv4 */							\
-	ICE_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3),		\
-	ICE_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3),		\
-	ICE_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(32),						\
-	ICE_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),		\
-	ICE_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),		\
-	ICE_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),		\
-										\
-	/* IPv4 --> IPv6 */							\
-	ICE_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3),		\
-	ICE_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3),		\
-	ICE_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(39),						\
-	ICE_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),		\
-	ICE_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),		\
-	ICE_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),		\
-										\
-	/* IPv4 --> GRE/NAT */							\
-	ICE_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),		\
-										\
-	/* IPv4 --> GRE/NAT --> IPv4 */						\
-	ICE_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),		\
-	ICE_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),		\
-	ICE_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(47),						\
-	ICE_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),		\
-	ICE_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),		\
-	ICE_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),		\
-										\
-	/* IPv4 --> GRE/NAT --> IPv6 */						\
-	ICE_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),		\
-	ICE_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),		\
-	ICE_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(54),						\
-	ICE_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),		\
-	ICE_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),		\
-	ICE_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),		\
-										\
-	/* IPv4 --> GRE/NAT --> MAC */						\
-	ICE_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),	\
-										\
-	/* IPv4 --> GRE/NAT --> MAC --> IPv4 */					\
-	ICE_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),	\
-	ICE_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),	\
-	ICE_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(62),						\
-	ICE_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),	\
-	ICE_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),	\
-	ICE_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),	\
-										\
-	/* IPv4 --> GRE/NAT -> MAC --> IPv6 */					\
-	ICE_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),	\
-	ICE_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),	\
-	ICE_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(69),						\
-	ICE_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),	\
-	ICE_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),	\
-	ICE_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),	\
-										\
-	/* IPv4 --> GRE/NAT --> MAC/VLAN */					\
-	ICE_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),	\
-										\
-	/* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */				\
-	ICE_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),	\
-	ICE_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),	\
-	ICE_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(77),						\
-	ICE_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),	\
-	ICE_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),	\
-	ICE_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),	\
-										\
-	/* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */				\
-	ICE_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),	\
-	ICE_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),	\
-	ICE_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(84),						\
-	ICE_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),	\
-	ICE_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),	\
-	ICE_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),	\
-										\
-	/* Non Tunneled IPv6 */							\
-	ICE_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3),		\
-	ICE_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3),		\
-	ICE_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(91),						\
-	ICE_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP,  PAY4),		\
-	ICE_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4),		\
-	ICE_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4),		\
-										\
-	/* IPv6 --> IPv4 */							\
-	ICE_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3),		\
-	ICE_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3),		\
-	ICE_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(98),						\
-	ICE_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP,  PAY4),		\
-	ICE_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4),		\
-	ICE_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4),		\
-										\
-	/* IPv6 --> IPv6 */							\
-	ICE_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3),		\
-	ICE_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3),		\
-	ICE_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(105),						\
-	ICE_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP,  PAY4),		\
-	ICE_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4),		\
-	ICE_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4),		\
-										\
-	/* IPv6 --> GRE/NAT */							\
-	ICE_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3),		\
-										\
-	/* IPv6 --> GRE/NAT -> IPv4 */						\
-	ICE_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3),		\
-	ICE_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3),		\
-	ICE_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(113),						\
-	ICE_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP,  PAY4),		\
-	ICE_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4),		\
-	ICE_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4),		\
-										\
-	/* IPv6 --> GRE/NAT -> IPv6 */						\
-	ICE_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3),		\
-	ICE_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3),		\
-	ICE_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP,  PAY4),		\
-	ICE_PTT_UNUSED_ENTRY(120),						\
-	ICE_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP,  PAY4),		\
-	ICE_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4),		\
-	ICE_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4),		\
-										\
-	/* IPv6 --> GRE/NAT -> MAC */						\
-	ICE_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3),	\
-										\
-	/* IPv6 --> GRE/NAT -> MAC -> IPv4 */					\
-	ICE_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3),	\
-	ICE_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3),	\
-	ICE_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(128),						\
-	ICE_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP,  PAY4),	\
-	ICE_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4),	\
-	ICE_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4),	\
-										\
-	/* IPv6 --> GRE/NAT -> MAC -> IPv6 */					\
-	ICE_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3),	\
-	ICE_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3),	\
-	ICE_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(135),						\
-	ICE_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP,  PAY4),	\
-	ICE_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4),	\
-	ICE_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4),	\
-										\
-	/* IPv6 --> GRE/NAT -> MAC/VLAN */					\
-	ICE_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3),	\
-										\
-	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */				\
-	ICE_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3),	\
-	ICE_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3),	\
-	ICE_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(143),						\
-	ICE_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP,  PAY4),	\
-	ICE_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4),	\
-	ICE_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4),	\
-										\
-	/* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */				\
-	ICE_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3),	\
-	ICE_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3),	\
-	ICE_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP,  PAY4),	\
-	ICE_PTT_UNUSED_ENTRY(150),						\
-	ICE_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP,  PAY4),	\
-	ICE_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4),	\
-	ICE_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4),
-
-#define ICE_NUM_DEFINED_PTYPES	154
-
-/* macro to make the table lines short, use explicit indexing with [PTYPE] */
-#define ICE_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\
-	[PTYPE] = { \
-		1, \
-		ICE_RX_PTYPE_OUTER_##OUTER_IP, \
-		ICE_RX_PTYPE_OUTER_##OUTER_IP_VER, \
-		ICE_RX_PTYPE_##OUTER_FRAG, \
-		ICE_RX_PTYPE_TUNNEL_##T, \
-		ICE_RX_PTYPE_TUNNEL_END_##TE, \
-		ICE_RX_PTYPE_##TEF, \
-		ICE_RX_PTYPE_INNER_PROT_##I, \
-		ICE_RX_PTYPE_PAYLOAD_LAYER_##PL }
-
-#define ICE_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }
-
-/* shorter macros makes the table fit but are terse */
-#define ICE_RX_PTYPE_NOF		ICE_RX_PTYPE_NOT_FRAG
-#define ICE_RX_PTYPE_FRG		ICE_RX_PTYPE_FRAG
-
-/* Lookup table mapping in the 10-bit HW PTYPE to the bit field for decoding */
-static const struct ice_rx_ptype_decoded ice_ptype_lkup[BIT(10)] = {
-	ICE_PTYPES
-
-	/* unused entries */
-	[ICE_NUM_DEFINED_PTYPES ... 1023] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }
-};
-
-static inline struct ice_rx_ptype_decoded ice_decode_rx_desc_ptype(u16 ptype)
-{
-	return ice_ptype_lkup[ptype];
-}
-
-
 #endif /* _ICE_LAN_TX_RX_H_ */
diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
index 2b173638b877..f55b57daec83 100644
--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -37,6 +37,7 @@ static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation.";
 
 MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
 MODULE_DESCRIPTION(DRV_SUMMARY);
+MODULE_IMPORT_NS(LIBIE);
 MODULE_LICENSE("GPL v2");
 MODULE_FIRMWARE(ICE_DDP_PKG_FILE);
 
diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
index df072ce767b1..2719f0e20933 100644
--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c
@@ -2,6 +2,7 @@
 /* Copyright (c) 2019, Intel Corporation. */
 
 #include <linux/filter.h>
+#include <linux/net/intel/libie/rx.h>
 
 #include "ice_txrx_lib.h"
 #include "ice_eswitch.h"
@@ -38,30 +39,6 @@ void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val)
 	}
 }
 
-/**
- * ice_ptype_to_htype - get a hash type
- * @ptype: the ptype value from the descriptor
- *
- * Returns appropriate hash type (such as PKT_HASH_TYPE_L2/L3/L4) to be used by
- * skb_set_hash based on PTYPE as parsed by HW Rx pipeline and is part of
- * Rx desc.
- */
-static enum pkt_hash_types ice_ptype_to_htype(u16 ptype)
-{
-	struct ice_rx_ptype_decoded decoded = ice_decode_rx_desc_ptype(ptype);
-
-	if (!decoded.known)
-		return PKT_HASH_TYPE_NONE;
-	if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4)
-		return PKT_HASH_TYPE_L4;
-	if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3)
-		return PKT_HASH_TYPE_L3;
-	if (decoded.outer_ip == ICE_RX_PTYPE_OUTER_L2)
-		return PKT_HASH_TYPE_L2;
-
-	return PKT_HASH_TYPE_NONE;
-}
-
 /**
  * ice_get_rx_hash - get RX hash value from descriptor
  * @rx_desc: specific descriptor
@@ -91,14 +68,16 @@ ice_rx_hash_to_skb(const struct ice_rx_ring *rx_ring,
 		   const union ice_32b_rx_flex_desc *rx_desc,
 		   struct sk_buff *skb, u16 rx_ptype)
 {
+	struct libeth_rx_pt decoded;
 	u32 hash;
 
-	if (!(rx_ring->netdev->features & NETIF_F_RXHASH))
+	decoded = libie_rx_pt_parse(rx_ptype);
+	if (!libeth_rx_pt_has_hash(rx_ring->netdev, decoded))
 		return;
 
 	hash = ice_get_rx_hash(rx_desc);
 	if (likely(hash))
-		skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype));
+		libeth_rx_pt_set_hash(skb, hash, decoded);
 }
 
 /**
@@ -114,34 +93,26 @@ static void
 ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb,
 	    union ice_32b_rx_flex_desc *rx_desc, u16 ptype)
 {
-	struct ice_rx_ptype_decoded decoded;
+	struct libeth_rx_pt decoded;
 	u16 rx_status0, rx_status1;
 	bool ipv4, ipv6;
 
-	rx_status0 = le16_to_cpu(rx_desc->wb.status_error0);
-	rx_status1 = le16_to_cpu(rx_desc->wb.status_error1);
-
-	decoded = ice_decode_rx_desc_ptype(ptype);
-
 	/* Start with CHECKSUM_NONE and by default csum_level = 0 */
 	skb->ip_summed = CHECKSUM_NONE;
-	skb_checksum_none_assert(skb);
 
-	/* check if Rx checksum is enabled */
-	if (!(ring->netdev->features & NETIF_F_RXCSUM))
+	decoded = libie_rx_pt_parse(ptype);
+	if (!libeth_rx_pt_has_checksum(ring->netdev, decoded))
 		return;
 
+	rx_status0 = le16_to_cpu(rx_desc->wb.status_error0);
+	rx_status1 = le16_to_cpu(rx_desc->wb.status_error1);
+
 	/* check if HW has decoded the packet and checksum */
 	if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S)))
 		return;
 
-	if (!(decoded.known && decoded.outer_ip))
-		return;
-
-	ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) &&
-	       (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4);
-	ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) &&
-	       (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6);
+	ipv4 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV4;
+	ipv6 = libeth_rx_pt_get_ip_ver(decoded) == LIBETH_RX_PT_OUTER_IPV6;
 
 	if (ipv4 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) {
 		ring->vsi->back->hw_rx_eipe_error++;
@@ -169,19 +140,10 @@ ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb,
 	 * we need to bump the checksum level by 1 to reflect the fact that
 	 * we are indicating we validated the inner checksum.
 	 */
-	if (decoded.tunnel_type >= ICE_RX_PTYPE_TUNNEL_IP_GRENAT)
+	if (decoded.tunnel_type >= LIBETH_RX_PT_TUNNEL_IP_GRENAT)
 		skb->csum_level = 1;
 
-	/* Only report checksum unnecessary for TCP, UDP, or SCTP */
-	switch (decoded.inner_prot) {
-	case ICE_RX_PTYPE_INNER_PROT_TCP:
-	case ICE_RX_PTYPE_INNER_PROT_UDP:
-	case ICE_RX_PTYPE_INNER_PROT_SCTP:
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-		break;
-	default:
-		break;
-	}
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	return;
 
 checksum_fail:
@@ -536,42 +498,6 @@ static int ice_xdp_rx_hw_ts(const struct xdp_md *ctx, u64 *ts_ns)
 	return 0;
 }
 
-/* Define a ptype index -> XDP hash type lookup table.
- * It uses the same ptype definitions as ice_decode_rx_desc_ptype[],
- * avoiding possible copy-paste errors.
- */
-#undef ICE_PTT
-#undef ICE_PTT_UNUSED_ENTRY
-
-#define ICE_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\
-	[PTYPE] = XDP_RSS_L3_##OUTER_IP_VER | XDP_RSS_L4_##I | XDP_RSS_TYPE_##PL
-
-#define ICE_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = 0
-
-/* A few supplementary definitions for when XDP hash types do not coincide
- * with what can be generated from ptype definitions
- * by means of preprocessor concatenation.
- */
-#define XDP_RSS_L3_NONE		XDP_RSS_TYPE_NONE
-#define XDP_RSS_L4_NONE		XDP_RSS_TYPE_NONE
-#define XDP_RSS_TYPE_PAY2	XDP_RSS_TYPE_L2
-#define XDP_RSS_TYPE_PAY3	XDP_RSS_TYPE_NONE
-#define XDP_RSS_TYPE_PAY4	XDP_RSS_L4
-
-static const enum xdp_rss_hash_type
-ice_ptype_to_xdp_hash[ICE_NUM_DEFINED_PTYPES] = {
-	ICE_PTYPES
-};
-
-#undef XDP_RSS_L3_NONE
-#undef XDP_RSS_L4_NONE
-#undef XDP_RSS_TYPE_PAY2
-#undef XDP_RSS_TYPE_PAY3
-#undef XDP_RSS_TYPE_PAY4
-
-#undef ICE_PTT
-#undef ICE_PTT_UNUSED_ENTRY
-
 /**
  * ice_xdp_rx_hash_type - Get XDP-specific hash type from the RX descriptor
  * @eop_desc: End of Packet descriptor
@@ -579,12 +505,7 @@ ice_ptype_to_xdp_hash[ICE_NUM_DEFINED_PTYPES] = {
 static enum xdp_rss_hash_type
 ice_xdp_rx_hash_type(const union ice_32b_rx_flex_desc *eop_desc)
 {
-	u16 ptype = ice_get_ptype(eop_desc);
-
-	if (unlikely(ptype >= ICE_NUM_DEFINED_PTYPES))
-		return 0;
-
-	return ice_ptype_to_xdp_hash[ptype];
+	return libie_rx_pt_parse(ice_get_ptype(eop_desc)).hash_type;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig
new file mode 100644
index 000000000000..af970a63c227
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 Intel Corporation
+
+config LIBETH
+	tristate
+	help
+	  libeth is a common library containing routines shared between several
+	  drivers, but not yet promoted to the generic kernel API.
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile
new file mode 100644
index 000000000000..cb99203d1dd2
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 Intel Corporation
+
+obj-$(CONFIG_LIBETH)		+= libeth.o
+
+libeth-objs			+= rx.o
diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c
new file mode 100644
index 000000000000..879c4dfd6a4e
--- /dev/null
+++ b/drivers/net/ethernet/intel/libeth/rx.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation */
+
+#include <net/libeth/rx.h>
+
+/* Converting abstract packet type numbers into a software structure with
+ * the packet parameters to do O(1) lookup on Rx.
+ */
+
+static const u16 libeth_rx_pt_xdp_oip[] = {
+	[LIBETH_RX_PT_OUTER_L2]		= XDP_RSS_TYPE_NONE,
+	[LIBETH_RX_PT_OUTER_IPV4]	= XDP_RSS_L3_IPV4,
+	[LIBETH_RX_PT_OUTER_IPV6]	= XDP_RSS_L3_IPV6,
+};
+
+static const u16 libeth_rx_pt_xdp_iprot[] = {
+	[LIBETH_RX_PT_INNER_NONE]	= XDP_RSS_TYPE_NONE,
+	[LIBETH_RX_PT_INNER_UDP]	= XDP_RSS_L4_UDP,
+	[LIBETH_RX_PT_INNER_TCP]	= XDP_RSS_L4_TCP,
+	[LIBETH_RX_PT_INNER_SCTP]	= XDP_RSS_L4_SCTP,
+	[LIBETH_RX_PT_INNER_ICMP]	= XDP_RSS_L4_ICMP,
+	[LIBETH_RX_PT_INNER_TIMESYNC]	= XDP_RSS_TYPE_NONE,
+};
+
+static const u16 libeth_rx_pt_xdp_pl[] = {
+	[LIBETH_RX_PT_PAYLOAD_NONE]	= XDP_RSS_TYPE_NONE,
+	[LIBETH_RX_PT_PAYLOAD_L2]	= XDP_RSS_TYPE_NONE,
+	[LIBETH_RX_PT_PAYLOAD_L3]	= XDP_RSS_TYPE_NONE,
+	[LIBETH_RX_PT_PAYLOAD_L4]	= XDP_RSS_L4,
+};
+
+/**
+ * libeth_rx_pt_gen_hash_type - generate an XDP RSS hash type for a PT
+ * @pt: PT structure to evaluate
+ *
+ * Generates ```hash_type``` field with XDP RSS type values from the parsed
+ * packet parameters if they're obtained dynamically at runtime.
+ */
+void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt)
+{
+	pt->hash_type = 0;
+	pt->hash_type |= libeth_rx_pt_xdp_oip[pt->outer_ip];
+	pt->hash_type |= libeth_rx_pt_xdp_iprot[pt->inner_prot];
+	pt->hash_type |= libeth_rx_pt_xdp_pl[pt->payload_layer];
+}
+EXPORT_SYMBOL_NS_GPL(libeth_rx_pt_gen_hash_type, LIBETH);
+
+/* Module */
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Common Ethernet library");
+MODULE_LICENSE("GPL");
diff --git a/drivers/net/ethernet/intel/libie/Kconfig b/drivers/net/ethernet/intel/libie/Kconfig
new file mode 100644
index 000000000000..33aff6bc8f81
--- /dev/null
+++ b/drivers/net/ethernet/intel/libie/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 Intel Corporation
+
+config LIBIE
+	tristate
+	select LIBETH
+	help
+	  libie (Intel Ethernet library) is a common library built on top of
+	  libeth and containing vendor-specific routines shared between several
+	  Intel Ethernet drivers.
diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile
new file mode 100644
index 000000000000..bf42c5aeeedd
--- /dev/null
+++ b/drivers/net/ethernet/intel/libie/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2024 Intel Corporation
+
+obj-$(CONFIG_LIBIE)	+= libie.o
+
+libie-objs		+= rx.o
diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c
new file mode 100644
index 000000000000..38201ee1e891
--- /dev/null
+++ b/drivers/net/ethernet/intel/libie/rx.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (C) 2024 Intel Corporation */
+
+#include <linux/net/intel/libie/rx.h>
+
+/* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed
+ * bitfield struct.
+ */
+
+/* A few supplementary definitions for when XDP hash types do not coincide
+ * with what can be generated from ptype definitions by means of preprocessor
+ * concatenation.
+ */
+#define XDP_RSS_L3_L2			XDP_RSS_TYPE_NONE
+#define XDP_RSS_L4_NONE			XDP_RSS_TYPE_NONE
+#define XDP_RSS_L4_TIMESYNC		XDP_RSS_TYPE_NONE
+#define XDP_RSS_TYPE_L3			XDP_RSS_TYPE_NONE
+#define XDP_RSS_TYPE_L4			XDP_RSS_L4
+
+#define LIBIE_RX_PT(oip, ofrag, tun, tp, tefr, iprot, pl) {		   \
+		.outer_ip		= LIBETH_RX_PT_OUTER_##oip,	   \
+		.outer_frag		= LIBETH_RX_PT_##ofrag,		   \
+		.tunnel_type		= LIBETH_RX_PT_TUNNEL_IP_##tun,	   \
+		.tunnel_end_prot	= LIBETH_RX_PT_TUNNEL_END_##tp,	   \
+		.tunnel_end_frag	= LIBETH_RX_PT_##tefr,		   \
+		.inner_prot		= LIBETH_RX_PT_INNER_##iprot,	   \
+		.payload_layer		= LIBETH_RX_PT_PAYLOAD_##pl,	   \
+		.hash_type		= XDP_RSS_L3_##oip |		   \
+					  XDP_RSS_L4_##iprot |		   \
+					  XDP_RSS_TYPE_##pl,		   \
+	}
+
+#define LIBIE_RX_PT_UNUSED		{ }
+
+#define __LIBIE_RX_PT_L2(iprot, pl)					   \
+	LIBIE_RX_PT(L2, NOT_FRAG, NONE, NONE, NOT_FRAG, iprot, pl)
+#define LIBIE_RX_PT_L2		__LIBIE_RX_PT_L2(NONE, L2)
+#define LIBIE_RX_PT_TS		__LIBIE_RX_PT_L2(TIMESYNC, L2)
+#define LIBIE_RX_PT_L3		__LIBIE_RX_PT_L2(NONE, L3)
+
+#define LIBIE_RX_PT_IP_FRAG(oip)					   \
+	LIBIE_RX_PT(IPV##oip, FRAG, NONE, NONE, NOT_FRAG, NONE, L3)
+#define LIBIE_RX_PT_IP_L3(oip, tun, teprot, tefr)			   \
+	LIBIE_RX_PT(IPV##oip, NOT_FRAG, tun, teprot, tefr, NONE, L3)
+#define LIBIE_RX_PT_IP_L4(oip, tun, teprot, iprot)			   \
+	LIBIE_RX_PT(IPV##oip, NOT_FRAG, tun, teprot, NOT_FRAG, iprot, L4)
+
+#define LIBIE_RX_PT_IP_NOF(oip, tun, ver)				   \
+	LIBIE_RX_PT_IP_L3(oip, tun, ver, NOT_FRAG),			   \
+	LIBIE_RX_PT_IP_L4(oip, tun, ver, UDP),				   \
+	LIBIE_RX_PT_UNUSED,						   \
+	LIBIE_RX_PT_IP_L4(oip, tun, ver, TCP),				   \
+	LIBIE_RX_PT_IP_L4(oip, tun, ver, SCTP),				   \
+	LIBIE_RX_PT_IP_L4(oip, tun, ver, ICMP)
+
+/* IPv oip --> tun --> IPv ver */
+#define LIBIE_RX_PT_IP_TUN_VER(oip, tun, ver)				   \
+	LIBIE_RX_PT_IP_L3(oip, tun, ver, FRAG),				   \
+	LIBIE_RX_PT_IP_NOF(oip, tun, ver)
+
+/* Non Tunneled IPv oip */
+#define LIBIE_RX_PT_IP_RAW(oip)						   \
+	LIBIE_RX_PT_IP_FRAG(oip),					   \
+	LIBIE_RX_PT_IP_NOF(oip, NONE, NONE)
+
+/* IPv oip --> tun --> { IPv4, IPv6 } */
+#define LIBIE_RX_PT_IP_TUN(oip, tun)					   \
+	LIBIE_RX_PT_IP_TUN_VER(oip, tun, IPV4),				   \
+	LIBIE_RX_PT_IP_TUN_VER(oip, tun, IPV6)
+
+/* IPv oip --> GRE/NAT tun --> { x, IPv4, IPv6 } */
+#define LIBIE_RX_PT_IP_GRE(oip, tun)					   \
+	LIBIE_RX_PT_IP_L3(oip, tun, NONE, NOT_FRAG),			   \
+	LIBIE_RX_PT_IP_TUN(oip, tun)
+
+/* Non Tunneled IPv oip
+ * IPv oip --> { IPv4, IPv6 }
+ * IPv oip --> GRE/NAT --> { x, IPv4, IPv6 }
+ * IPv oip --> GRE/NAT --> MAC --> { x, IPv4, IPv6 }
+ * IPv oip --> GRE/NAT --> MAC/VLAN --> { x, IPv4, IPv6 }
+ */
+#define LIBIE_RX_PT_IP(oip)						   \
+	LIBIE_RX_PT_IP_RAW(oip),					   \
+	LIBIE_RX_PT_IP_TUN(oip, IP),					   \
+	LIBIE_RX_PT_IP_GRE(oip, GRENAT),				   \
+	LIBIE_RX_PT_IP_GRE(oip, GRENAT_MAC),				   \
+	LIBIE_RX_PT_IP_GRE(oip, GRENAT_MAC_VLAN)
+
+/* Lookup table mapping for O(1) parsing */
+const struct libeth_rx_pt libie_rx_pt_lut[LIBIE_RX_PT_NUM] = {
+	/* L2 packet types */
+	LIBIE_RX_PT_UNUSED,
+	LIBIE_RX_PT_L2,
+	LIBIE_RX_PT_TS,
+	LIBIE_RX_PT_L2,
+	LIBIE_RX_PT_UNUSED,
+	LIBIE_RX_PT_UNUSED,
+	LIBIE_RX_PT_L2,
+	LIBIE_RX_PT_L2,
+	LIBIE_RX_PT_UNUSED,
+	LIBIE_RX_PT_UNUSED,
+	LIBIE_RX_PT_L2,
+	LIBIE_RX_PT_UNUSED,
+
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+	LIBIE_RX_PT_L3,
+
+	LIBIE_RX_PT_IP(4),
+	LIBIE_RX_PT_IP(6),
+};
+EXPORT_SYMBOL_NS_GPL(libie_rx_pt_lut, LIBIE);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_DESCRIPTION("Intel(R) Ethernet common library");
+MODULE_IMPORT_NS(LIBETH);
+MODULE_LICENSE("GPL");
diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h
new file mode 100644
index 000000000000..37c5c8f26fb9
--- /dev/null
+++ b/include/linux/net/intel/libie/rx.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2024 Intel Corporation */
+
+#ifndef __LIBIE_RX_H
+#define __LIBIE_RX_H
+
+#include <net/libeth/rx.h>
+
+/* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed
+ * bitfield struct.
+ */
+
+#define LIBIE_RX_PT_NUM		154
+
+extern const struct libeth_rx_pt libie_rx_pt_lut[LIBIE_RX_PT_NUM];
+
+/**
+ * libie_rx_pt_parse - convert HW packet type to software bitfield structure
+ * @pt: 10-bit hardware packet type value from the descriptor
+ *
+ * ```libie_rx_pt_lut``` must be accessed only using this wrapper.
+ *
+ * Return: parsed bitfield struct corresponding to the provided ptype.
+ */
+static inline struct libeth_rx_pt libie_rx_pt_parse(u32 pt)
+{
+	if (unlikely(pt >= LIBIE_RX_PT_NUM))
+		pt = 0;
+
+	return libie_rx_pt_lut[pt];
+}
+
+#endif /* __LIBIE_RX_H */
diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
new file mode 100644
index 000000000000..0807e19f44b3
--- /dev/null
+++ b/include/net/libeth/rx.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (C) 2024 Intel Corporation */
+
+#ifndef __LIBETH_RX_H
+#define __LIBETH_RX_H
+
+#include <net/xdp.h>
+
+/* Converting abstract packet type numbers into a software structure with
+ * the packet parameters to do O(1) lookup on Rx.
+ */
+
+enum {
+	LIBETH_RX_PT_OUTER_L2			= 0U,
+	LIBETH_RX_PT_OUTER_IPV4,
+	LIBETH_RX_PT_OUTER_IPV6,
+};
+
+enum {
+	LIBETH_RX_PT_NOT_FRAG			= 0U,
+	LIBETH_RX_PT_FRAG,
+};
+
+enum {
+	LIBETH_RX_PT_TUNNEL_IP_NONE		= 0U,
+	LIBETH_RX_PT_TUNNEL_IP_IP,
+	LIBETH_RX_PT_TUNNEL_IP_GRENAT,
+	LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC,
+	LIBETH_RX_PT_TUNNEL_IP_GRENAT_MAC_VLAN,
+};
+
+enum {
+	LIBETH_RX_PT_TUNNEL_END_NONE		= 0U,
+	LIBETH_RX_PT_TUNNEL_END_IPV4,
+	LIBETH_RX_PT_TUNNEL_END_IPV6,
+};
+
+enum {
+	LIBETH_RX_PT_INNER_NONE			= 0U,
+	LIBETH_RX_PT_INNER_UDP,
+	LIBETH_RX_PT_INNER_TCP,
+	LIBETH_RX_PT_INNER_SCTP,
+	LIBETH_RX_PT_INNER_ICMP,
+	LIBETH_RX_PT_INNER_TIMESYNC,
+};
+
+#define LIBETH_RX_PT_PAYLOAD_NONE		PKT_HASH_TYPE_NONE
+#define LIBETH_RX_PT_PAYLOAD_L2			PKT_HASH_TYPE_L2
+#define LIBETH_RX_PT_PAYLOAD_L3			PKT_HASH_TYPE_L3
+#define LIBETH_RX_PT_PAYLOAD_L4			PKT_HASH_TYPE_L4
+
+struct libeth_rx_pt {
+	u32					outer_ip:2;
+	u32					outer_frag:1;
+	u32					tunnel_type:3;
+	u32					tunnel_end_prot:2;
+	u32					tunnel_end_frag:1;
+	u32					inner_prot:3;
+	enum pkt_hash_types			payload_layer:2;
+
+	u32					pad:2;
+	enum xdp_rss_hash_type			hash_type:16;
+};
+
+void libeth_rx_pt_gen_hash_type(struct libeth_rx_pt *pt);
+
+/**
+ * libeth_rx_pt_get_ip_ver - get IP version from a packet type structure
+ * @pt: packet type params
+ *
+ * Wrapper to compile out the IPv6 code from the drivers when not supported
+ * by the kernel.
+ *
+ * Return: @pt.outer_ip or stub for IPv6 when not compiled-in.
+ */
+static inline u32 libeth_rx_pt_get_ip_ver(struct libeth_rx_pt pt)
+{
+#if !IS_ENABLED(CONFIG_IPV6)
+	switch (pt.outer_ip) {
+	case LIBETH_RX_PT_OUTER_IPV4:
+		return LIBETH_RX_PT_OUTER_IPV4;
+	default:
+		return LIBETH_RX_PT_OUTER_L2;
+	}
+#else
+	return pt.outer_ip;
+#endif
+}
+
+/* libeth_has_*() can be used to quickly check whether the HW metadata is
+ * available to avoid further expensive processing such as descriptor reads.
+ * They already check for the corresponding netdev feature to be enabled,
+ * thus can be used as drop-in replacements.
+ */
+
+static inline bool libeth_rx_pt_has_checksum(const struct net_device *dev,
+					     struct libeth_rx_pt pt)
+{
+	/* Non-zero _INNER* is only possible when _OUTER_IPV* is set,
+	 * it is enough to check only for the L4 type.
+	 */
+	return likely(pt.inner_prot > LIBETH_RX_PT_INNER_NONE &&
+		      (dev->features & NETIF_F_RXCSUM));
+}
+
+static inline bool libeth_rx_pt_has_hash(const struct net_device *dev,
+					 struct libeth_rx_pt pt)
+{
+	return likely(pt.payload_layer > LIBETH_RX_PT_PAYLOAD_NONE &&
+		      (dev->features & NETIF_F_RXHASH));
+}
+
+/**
+ * libeth_rx_pt_set_hash - fill in skb hash value basing on the PT
+ * @skb: skb to fill the hash in
+ * @hash: 32-bit hash value from the descriptor
+ * @pt: packet type
+ */
+static inline void libeth_rx_pt_set_hash(struct sk_buff *skb, u32 hash,
+					 struct libeth_rx_pt pt)
+{
+	skb_set_hash(skb, hash, pt.payload_layer);
+}
+
+#endif /* __LIBETH_RX_H */
-- 
cgit v1.2.3


From a1d6063d9f2f4f4f4ed1733ed3f3f63244c4afb5 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 18 Apr 2024 13:36:10 +0200
Subject: slab: introduce kvmalloc_array_node() and kvcalloc_node()

Add NUMA-aware counterparts for kvmalloc_array() and kvcalloc() to be
able to flexibly allocate arrays for a particular node.
Rewrite kvmalloc_array() to kvmalloc_array_node(NUMA_NO_NODE) call.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/linux/slab.h | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index e53cbfa18325..d1d1fa5e7983 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -774,14 +774,27 @@ static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags)
 	return kvmalloc(size, flags | __GFP_ZERO);
 }
 
-static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *
+kvmalloc_array_node(size_t n, size_t size, gfp_t flags, int node)
 {
 	size_t bytes;
 
 	if (unlikely(check_mul_overflow(n, size, &bytes)))
 		return NULL;
 
-	return kvmalloc(bytes, flags);
+	return kvmalloc_node(bytes, flags, node);
+}
+
+static inline __alloc_size(1, 2) void *
+kvmalloc_array(size_t n, size_t size, gfp_t flags)
+{
+	return kvmalloc_array_node(n, size, flags, NUMA_NO_NODE);
+}
+
+static inline __alloc_size(1, 2) void *
+kvcalloc_node(size_t n, size_t size, gfp_t flags, int node)
+{
+	return kvmalloc_array_node(n, size, flags | __GFP_ZERO, node);
 }
 
 static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags)
-- 
cgit v1.2.3


From ef9226cd56b718c79184a3466d32984a51cb449c Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 18 Apr 2024 13:36:11 +0200
Subject: page_pool: constify some read-only function arguments

There are several functions taking pointers to data they don't modify.
This includes statistics fetching, page and page_pool parameters, etc.
Constify the pointers, so that call sites will be able to pass const
pointers as well.
No functional changes, no visible changes in functions sizes.

Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/page_pool/helpers.h | 10 +++++-----
 include/net/page_pool/types.h   |  4 ++--
 net/core/page_pool.c            | 10 +++++-----
 3 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index 1d397c1a0043..c7bb06750e85 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -58,7 +58,7 @@
 /* Deprecated driver-facing API, use netlink instead */
 int page_pool_ethtool_stats_get_count(void);
 u8 *page_pool_ethtool_stats_get_strings(u8 *data);
-u64 *page_pool_ethtool_stats_get(u64 *data, void *stats);
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats);
 
 bool page_pool_get_stats(const struct page_pool *pool,
 			 struct page_pool_stats *stats);
@@ -73,7 +73,7 @@ static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
 	return data;
 }
 
-static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+static inline u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
 {
 	return data;
 }
@@ -204,8 +204,8 @@ static inline void *page_pool_dev_alloc_va(struct page_pool *pool,
  * Get the stored dma direction. A driver might decide to store this locally
  * and avoid the extra cache line from page_pool to determine the direction.
  */
-static
-inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool)
+static inline enum dma_data_direction
+page_pool_get_dma_dir(const struct page_pool *pool)
 {
 	return pool->p.dma_dir;
 }
@@ -370,7 +370,7 @@ static inline void page_pool_free_va(struct page_pool *pool, void *va,
  * Fetch the DMA address of the page. The page pool to which the page belongs
  * must had been created with PP_FLAG_DMA_MAP.
  */
-static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
+static inline dma_addr_t page_pool_get_dma_addr(const struct page *page)
 {
 	dma_addr_t ret = page->dma_addr;
 
diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 5e43a08d3231..a6ebed002216 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -213,7 +213,7 @@ struct xdp_mem_info;
 #ifdef CONFIG_PAGE_POOL
 void page_pool_destroy(struct page_pool *pool);
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
-			   struct xdp_mem_info *mem);
+			   const struct xdp_mem_info *mem);
 void page_pool_put_page_bulk(struct page_pool *pool, void **data,
 			     int count);
 #else
@@ -223,7 +223,7 @@ static inline void page_pool_destroy(struct page_pool *pool)
 
 static inline void page_pool_use_xdp_mem(struct page_pool *pool,
 					 void (*disconnect)(void *),
-					 struct xdp_mem_info *mem)
+					 const struct xdp_mem_info *mem)
 {
 }
 
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 4c175091fc0a..273c24429bce 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -123,9 +123,9 @@ int page_pool_ethtool_stats_get_count(void)
 }
 EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
 
-u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
 {
-	struct page_pool_stats *pool_stats = stats;
+	const struct page_pool_stats *pool_stats = stats;
 
 	*data++ = pool_stats->alloc_stats.fast;
 	*data++ = pool_stats->alloc_stats.slow;
@@ -383,8 +383,8 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
 	return page;
 }
 
-static void page_pool_dma_sync_for_device(struct page_pool *pool,
-					  struct page *page,
+static void page_pool_dma_sync_for_device(const struct page_pool *pool,
+					  const struct page *page,
 					  unsigned int dma_sync_size)
 {
 	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
@@ -987,7 +987,7 @@ static void page_pool_release_retry(struct work_struct *wq)
 }
 
 void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
-			   struct xdp_mem_info *mem)
+			   const struct xdp_mem_info *mem)
 {
 	refcount_inc(&pool->user_cnt);
 	pool->disconnect = disconnect;
-- 
cgit v1.2.3


From ce230f4f8981e2a7f06b71c22cc742cfe91a525d Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 18 Apr 2024 13:36:12 +0200
Subject: page_pool: add DMA-sync-for-CPU inline helper

Each driver is responsible for syncing buffers written by HW for CPU
before accessing them. Almost each PP-enabled driver uses the same
pattern, which could be shorthanded into a static inline to make driver
code a little bit more compact.
Introduce a simple helper which performs DMA synchronization for the
size passed from the driver. It can be used even when the pool doesn't
manage DMA-syncs-for-device, just make sure the page has a correct DMA
address set via page_pool_set_dma_addr().

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 include/net/page_pool/helpers.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'include')

diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h
index c7bb06750e85..873631c79ab1 100644
--- a/include/net/page_pool/helpers.h
+++ b/include/net/page_pool/helpers.h
@@ -52,6 +52,8 @@
 #ifndef _NET_PAGE_POOL_HELPERS_H
 #define _NET_PAGE_POOL_HELPERS_H
 
+#include <linux/dma-mapping.h>
+
 #include <net/page_pool/types.h>
 
 #ifdef CONFIG_PAGE_POOL_STATS
@@ -395,6 +397,28 @@ static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 	return false;
 }
 
+/**
+ * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW
+ * @pool: &page_pool the @page belongs to
+ * @page: page to sync
+ * @offset: offset from page start to "hard" start if using PP frags
+ * @dma_sync_size: size of the data written to the page
+ *
+ * Can be used as a shorthand to sync Rx pages before accessing them in the
+ * driver. Caller must ensure the pool was created with ``PP_FLAG_DMA_MAP``.
+ * Note that this version performs DMA sync unconditionally, even if the
+ * associated PP doesn't perform sync-for-device.
+ */
+static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool,
+					      const struct page *page,
+					      u32 offset, u32 dma_sync_size)
+{
+	dma_sync_single_range_for_cpu(pool->p.dev,
+				      page_pool_get_dma_addr(page),
+				      offset + pool->p.offset, dma_sync_size,
+				      page_pool_get_dma_dir(pool));
+}
+
 static inline bool page_pool_put(struct page_pool *pool)
 {
 	return refcount_dec_and_test(&pool->user_cnt);
-- 
cgit v1.2.3


From e6c91556b97f855436fa45f75e69165d671012a7 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 18 Apr 2024 13:36:13 +0200
Subject: libeth: add Rx buffer management

Add a couple intuitive helpers to hide Rx buffer implementation details
in the library and not multiplicate it between drivers. The settings are
sorta optimized for 100G+ NICs, but nothing really HW-specific here.
Use the new page_pool_dev_alloc() to dynamically switch between
split-page and full-page modes depending on MTU, page size, required
headroom etc. For example, on x86_64 with the default driver settings
each page is shared between 2 buffers. Turning on XDP (not in this
series) -> increasing headroom requirement pushes truesize out of 2048
boundary, leading to that each buffer starts getting a full page.
The "ceiling" limit is %PAGE_SIZE, as only order-0 pages are used to
avoid compound overhead. For the above architecture, this means maximum
linear frame size of 3712 w/o XDP.
Not that &libeth_buf_queue is not a complete queue/ring structure for
now, rather a shim, but eventually the libeth-enabled drivers will move
to it, with iavf being the first one.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/libeth/Kconfig |   1 +
 drivers/net/ethernet/intel/libeth/rx.c    |  98 +++++++++++++++++++++++++
 include/net/libeth/rx.h                   | 117 ++++++++++++++++++++++++++++++
 3 files changed, 216 insertions(+)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig
index af970a63c227..480293b71dbc 100644
--- a/drivers/net/ethernet/intel/libeth/Kconfig
+++ b/drivers/net/ethernet/intel/libeth/Kconfig
@@ -3,6 +3,7 @@
 
 config LIBETH
 	tristate
+	select PAGE_POOL
 	help
 	  libeth is a common library containing routines shared between several
 	  drivers, but not yet promoted to the generic kernel API.
diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c
index 879c4dfd6a4e..6221b88c34ac 100644
--- a/drivers/net/ethernet/intel/libeth/rx.c
+++ b/drivers/net/ethernet/intel/libeth/rx.c
@@ -3,6 +3,104 @@
 
 #include <net/libeth/rx.h>
 
+/* Rx buffer management */
+
+/**
+ * libeth_rx_hw_len - get the actual buffer size to be passed to HW
+ * @pp: &page_pool_params of the netdev to calculate the size for
+ * @max_len: maximum buffer size for a single descriptor
+ *
+ * Return: HW-writeable length per one buffer to pass it to the HW accounting:
+ * MTU the @dev has, HW required alignment, minimum and maximum allowed values,
+ * and system's page size.
+ */
+static u32 libeth_rx_hw_len(const struct page_pool_params *pp, u32 max_len)
+{
+	u32 len;
+
+	len = READ_ONCE(pp->netdev->mtu) + LIBETH_RX_LL_LEN;
+	len = ALIGN(len, LIBETH_RX_BUF_STRIDE);
+	len = min3(len, ALIGN_DOWN(max_len ? : U32_MAX, LIBETH_RX_BUF_STRIDE),
+		   pp->max_len);
+
+	return len;
+}
+
+/**
+ * libeth_rx_fq_create - create a PP with the default libeth settings
+ * @fq: buffer queue struct to fill
+ * @napi: &napi_struct covering this PP (no usage outside its poll loops)
+ *
+ * Return: %0 on success, -%errno on failure.
+ */
+int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi)
+{
+	struct page_pool_params pp = {
+		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
+		.order		= LIBETH_RX_PAGE_ORDER,
+		.pool_size	= fq->count,
+		.nid		= fq->nid,
+		.dev		= napi->dev->dev.parent,
+		.netdev		= napi->dev,
+		.napi		= napi,
+		.dma_dir	= DMA_FROM_DEVICE,
+		.offset		= LIBETH_SKB_HEADROOM,
+	};
+	struct libeth_fqe *fqes;
+	struct page_pool *pool;
+
+	/* HW-writeable / syncable length per one page */
+	pp.max_len = LIBETH_RX_PAGE_LEN(pp.offset);
+
+	/* HW-writeable length per buffer */
+	fq->buf_len = libeth_rx_hw_len(&pp, fq->buf_len);
+	/* Buffer size to allocate */
+	fq->truesize = roundup_pow_of_two(SKB_HEAD_ALIGN(pp.offset +
+							 fq->buf_len));
+
+	pool = page_pool_create(&pp);
+	if (IS_ERR(pool))
+		return PTR_ERR(pool);
+
+	fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid);
+	if (!fqes)
+		goto err_buf;
+
+	fq->fqes = fqes;
+	fq->pp = pool;
+
+	return 0;
+
+err_buf:
+	page_pool_destroy(pool);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_create, LIBETH);
+
+/**
+ * libeth_rx_fq_destroy - destroy a &page_pool created by libeth
+ * @fq: buffer queue to process
+ */
+void libeth_rx_fq_destroy(struct libeth_fq *fq)
+{
+	kvfree(fq->fqes);
+	page_pool_destroy(fq->pp);
+}
+EXPORT_SYMBOL_NS_GPL(libeth_rx_fq_destroy, LIBETH);
+
+/**
+ * libeth_rx_recycle_slow - recycle a libeth page from the NAPI context
+ * @page: page to recycle
+ *
+ * To be used on exceptions or rare cases not requiring fast inline recycling.
+ */
+void libeth_rx_recycle_slow(struct page *page)
+{
+	page_pool_recycle_direct(page->pp, page);
+}
+EXPORT_SYMBOL_NS_GPL(libeth_rx_recycle_slow, LIBETH);
+
 /* Converting abstract packet type numbers into a software structure with
  * the packet parameters to do O(1) lookup on Rx.
  */
diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h
index 0807e19f44b3..f29ea3e34c6c 100644
--- a/include/net/libeth/rx.h
+++ b/include/net/libeth/rx.h
@@ -4,8 +4,125 @@
 #ifndef __LIBETH_RX_H
 #define __LIBETH_RX_H
 
+#include <linux/if_vlan.h>
+
+#include <net/page_pool/helpers.h>
 #include <net/xdp.h>
 
+/* Rx buffer management */
+
+/* Space reserved in front of each frame */
+#define LIBETH_SKB_HEADROOM	(NET_SKB_PAD + NET_IP_ALIGN)
+/* Maximum headroom for worst-case calculations */
+#define LIBETH_MAX_HEADROOM	LIBETH_SKB_HEADROOM
+/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */
+#define LIBETH_RX_LL_LEN	(ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN)
+
+/* Always use order-0 pages */
+#define LIBETH_RX_PAGE_ORDER	0
+/* Pick a sane buffer stride and align to a cacheline boundary */
+#define LIBETH_RX_BUF_STRIDE	SKB_DATA_ALIGN(128)
+/* HW-writeable space in one buffer: truesize - headroom/tailroom, aligned */
+#define LIBETH_RX_PAGE_LEN(hr)						  \
+	ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBETH_RX_PAGE_ORDER),		  \
+		   LIBETH_RX_BUF_STRIDE)
+
+/**
+ * struct libeth_fqe - structure representing an Rx buffer (fill queue element)
+ * @page: page holding the buffer
+ * @offset: offset from the page start (to the headroom)
+ * @truesize: total space occupied by the buffer (w/ headroom and tailroom)
+ *
+ * Depending on the MTU, API switches between one-page-per-frame and shared
+ * page model (to conserve memory on bigger-page platforms). In case of the
+ * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```.
+ */
+struct libeth_fqe {
+	struct page		*page;
+	u32			offset;
+	u32			truesize;
+} __aligned_largest;
+
+/**
+ * struct libeth_fq - structure representing a buffer (fill) queue
+ * @fp: hotpath part of the structure
+ * @pp: &page_pool for buffer management
+ * @fqes: array of Rx buffers
+ * @truesize: size to allocate per buffer, w/overhead
+ * @count: number of descriptors/buffers the queue has
+ * @buf_len: HW-writeable length per each buffer
+ * @nid: ID of the closest NUMA node with memory
+ */
+struct libeth_fq {
+	struct_group_tagged(libeth_fq_fp, fp,
+		struct page_pool	*pp;
+		struct libeth_fqe	*fqes;
+
+		u32			truesize;
+		u32			count;
+	);
+
+	/* Cold fields */
+	u32			buf_len;
+	int			nid;
+};
+
+int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi);
+void libeth_rx_fq_destroy(struct libeth_fq *fq);
+
+/**
+ * libeth_rx_alloc - allocate a new Rx buffer
+ * @fq: fill queue to allocate for
+ * @i: index of the buffer within the queue
+ *
+ * Return: DMA address to be passed to HW for Rx on successful allocation,
+ * ```DMA_MAPPING_ERROR``` otherwise.
+ */
+static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i)
+{
+	struct libeth_fqe *buf = &fq->fqes[i];
+
+	buf->truesize = fq->truesize;
+	buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize);
+	if (unlikely(!buf->page))
+		return DMA_MAPPING_ERROR;
+
+	return page_pool_get_dma_addr(buf->page) + buf->offset +
+	       fq->pp->p.offset;
+}
+
+void libeth_rx_recycle_slow(struct page *page);
+
+/**
+ * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA
+ * @fqe: buffer to process
+ * @len: frame length from the descriptor
+ *
+ * Process the buffer after it's written by HW. The regular path is to
+ * synchronize DMA for CPU, but in case of no data it will be immediately
+ * recycled back to its PP.
+ *
+ * Return: true when there's data to process, false otherwise.
+ */
+static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe,
+					  u32 len)
+{
+	struct page *page = fqe->page;
+
+	/* Very rare, but possible case. The most common reason:
+	 * the last fragment contained FCS only, which was then
+	 * stripped by the HW.
+	 */
+	if (unlikely(!len)) {
+		libeth_rx_recycle_slow(page);
+		return false;
+	}
+
+	page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len);
+
+	return true;
+}
+
 /* Converting abstract packet type numbers into a software structure with
  * the packet parameters to do O(1) lookup on Rx.
  */
-- 
cgit v1.2.3


From 5fa4caff59f251bf9f766fc48c9f0a774a9216a0 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 18 Apr 2024 13:36:15 +0200
Subject: iavf: switch to Page Pool

Now that the IAVF driver simply uses dev_alloc_page() + free_page() with
no custom recycling logics, it can easily be switched to using Page
Pool / libeth API instead.
This allows to removing the whole dancing around headroom, HW buffer
size, and page order. All DMA-for-device is now done in the PP core,
for-CPU -- in the libeth helper.
Use skb_mark_for_recycle() to bring back the recycling and restore the
performance. Speaking of performance: on par with the baseline and
faster with the PP optimization series applied. But the memory usage for
1500b MTU is now almost 2x lower (x86_64) thanks to allocating a page
every second descriptor.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c     |   7 +-
 drivers/net/ethernet/intel/iavf/iavf_txrx.c     | 257 +++++++-----------------
 drivers/net/ethernet/intel/iavf/iavf_txrx.h     |  34 +---
 drivers/net/ethernet/intel/iavf/iavf_virtchnl.c |  10 +-
 include/linux/net/intel/libie/rx.h              |  17 ++
 5 files changed, 111 insertions(+), 214 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index ffb71a62b105..7e0de0a9b883 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
+#include <linux/net/intel/libie/rx.h>
+
 #include "iavf.h"
 #include "iavf_prototype.h"
 /* All iavf tracepoints are defined by the include below, which must
@@ -45,6 +47,7 @@ MODULE_DEVICE_TABLE(pci, iavf_pci_tbl);
 MODULE_ALIAS("i40evf");
 MODULE_AUTHOR("Intel Corporation, <linux.nics@intel.com>");
 MODULE_DESCRIPTION("Intel(R) Ethernet Adaptive Virtual Function Network Driver");
+MODULE_IMPORT_NS(LIBETH);
 MODULE_IMPORT_NS(LIBIE);
 MODULE_LICENSE("GPL v2");
 
@@ -1586,7 +1589,6 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter)
 		rx_ring = &adapter->rx_rings[i];
 		rx_ring->queue_index = i;
 		rx_ring->netdev = adapter->netdev;
-		rx_ring->dev = &adapter->pdev->dev;
 		rx_ring->count = adapter->rx_desc_count;
 		rx_ring->itr_setting = IAVF_ITR_RX_DEF;
 	}
@@ -2613,9 +2615,8 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter)
 	iavf_set_ethtool_ops(netdev);
 	netdev->watchdog_timeo = 5 * HZ;
 
-	/* MTU range: 68 - 9710 */
 	netdev->min_mtu = ETH_MIN_MTU;
-	netdev->max_mtu = IAVF_MAX_RXBUFFER - IAVF_PACKET_HDR_PAD;
+	netdev->max_mtu = LIBIE_MAX_MTU;
 
 	if (!is_valid_ether_addr(adapter->hw.mac.addr)) {
 		dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n",
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
index 2ec68b51bebe..26b424fd6718 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c
@@ -690,11 +690,8 @@ err:
  **/
 static void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
 {
-	unsigned long bi_size;
-	u16 i;
-
 	/* ring already cleared, nothing to do */
-	if (!rx_ring->rx_bi)
+	if (!rx_ring->rx_fqes)
 		return;
 
 	if (rx_ring->skb) {
@@ -702,40 +699,16 @@ static void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
 		rx_ring->skb = NULL;
 	}
 
-	/* Free all the Rx ring sk_buffs */
-	for (i = 0; i < rx_ring->count; i++) {
-		struct iavf_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+	/* Free all the Rx ring buffers */
+	for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) {
+		const struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i];
 
-		if (!rx_bi->page)
-			continue;
+		page_pool_put_full_page(rx_ring->pp, rx_fqes->page, false);
 
-		/* Invalidate cache lines that may have been written to by
-		 * device so that we avoid corrupting memory.
-		 */
-		dma_sync_single_range_for_cpu(rx_ring->dev,
-					      rx_bi->dma,
-					      rx_bi->page_offset,
-					      IAVF_RXBUFFER_3072,
-					      DMA_FROM_DEVICE);
-
-		/* free resources associated with mapping */
-		dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma,
-				     iavf_rx_pg_size(rx_ring),
-				     DMA_FROM_DEVICE,
-				     IAVF_RX_DMA_ATTR);
-
-		__free_page(rx_bi->page);
-
-		rx_bi->page = NULL;
-		rx_bi->page_offset = 0;
+		if (unlikely(++i == rx_ring->count))
+			i = 0;
 	}
 
-	bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
-	memset(rx_ring->rx_bi, 0, bi_size);
-
-	/* Zero out the descriptor ring */
-	memset(rx_ring->desc, 0, rx_ring->size);
-
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 }
@@ -748,15 +721,22 @@ static void iavf_clean_rx_ring(struct iavf_ring *rx_ring)
  **/
 void iavf_free_rx_resources(struct iavf_ring *rx_ring)
 {
+	struct libeth_fq fq = {
+		.fqes	= rx_ring->rx_fqes,
+		.pp	= rx_ring->pp,
+	};
+
 	iavf_clean_rx_ring(rx_ring);
-	kfree(rx_ring->rx_bi);
-	rx_ring->rx_bi = NULL;
 
 	if (rx_ring->desc) {
-		dma_free_coherent(rx_ring->dev, rx_ring->size,
+		dma_free_coherent(rx_ring->pp->p.dev, rx_ring->size,
 				  rx_ring->desc, rx_ring->dma);
 		rx_ring->desc = NULL;
 	}
+
+	libeth_rx_fq_destroy(&fq);
+	rx_ring->rx_fqes = NULL;
+	rx_ring->pp = NULL;
 }
 
 /**
@@ -767,26 +747,32 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring)
  **/
 int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
 {
-	struct device *dev = rx_ring->dev;
-	int bi_size;
-
-	/* warn if we are about to overwrite the pointer */
-	WARN_ON(rx_ring->rx_bi);
-	bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count;
-	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
-	if (!rx_ring->rx_bi)
-		goto err;
+	struct libeth_fq fq = {
+		.count		= rx_ring->count,
+		.buf_len	= LIBIE_MAX_RX_BUF_LEN,
+		.nid		= NUMA_NO_NODE,
+	};
+	int ret;
+
+	ret = libeth_rx_fq_create(&fq, &rx_ring->q_vector->napi);
+	if (ret)
+		return ret;
+
+	rx_ring->pp = fq.pp;
+	rx_ring->rx_fqes = fq.fqes;
+	rx_ring->truesize = fq.truesize;
+	rx_ring->rx_buf_len = fq.buf_len;
 
 	u64_stats_init(&rx_ring->syncp);
 
 	/* Round up to nearest 4K */
 	rx_ring->size = rx_ring->count * sizeof(union iavf_32byte_rx_desc);
 	rx_ring->size = ALIGN(rx_ring->size, 4096);
-	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
+	rx_ring->desc = dma_alloc_coherent(fq.pp->p.dev, rx_ring->size,
 					   &rx_ring->dma, GFP_KERNEL);
 
 	if (!rx_ring->desc) {
-		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
+		dev_info(fq.pp->p.dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
 			 rx_ring->size);
 		goto err;
 	}
@@ -795,9 +781,12 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring)
 	rx_ring->next_to_use = 0;
 
 	return 0;
+
 err:
-	kfree(rx_ring->rx_bi);
-	rx_ring->rx_bi = NULL;
+	libeth_rx_fq_destroy(&fq);
+	rx_ring->rx_fqes = NULL;
+	rx_ring->pp = NULL;
+
 	return -ENOMEM;
 }
 
@@ -819,49 +808,6 @@ static void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val)
 	writel(val, rx_ring->tail);
 }
 
-/**
- * iavf_alloc_mapped_page - recycle or make a new page
- * @rx_ring: ring to use
- * @bi: rx_buffer struct to modify
- *
- * Returns true if the page was successfully allocated or
- * reused.
- **/
-static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring,
-				   struct iavf_rx_buffer *bi)
-{
-	struct page *page = bi->page;
-	dma_addr_t dma;
-
-	/* alloc new page for storage */
-	page = dev_alloc_pages(iavf_rx_pg_order(rx_ring));
-	if (unlikely(!page)) {
-		rx_ring->rx_stats.alloc_page_failed++;
-		return false;
-	}
-
-	/* map page for use */
-	dma = dma_map_page_attrs(rx_ring->dev, page, 0,
-				 iavf_rx_pg_size(rx_ring),
-				 DMA_FROM_DEVICE,
-				 IAVF_RX_DMA_ATTR);
-
-	/* if mapping failed free memory back to system since
-	 * there isn't much point in holding memory we can't use
-	 */
-	if (dma_mapping_error(rx_ring->dev, dma)) {
-		__free_pages(page, iavf_rx_pg_order(rx_ring));
-		rx_ring->rx_stats.alloc_page_failed++;
-		return false;
-	}
-
-	bi->dma = dma;
-	bi->page = page;
-	bi->page_offset = IAVF_SKB_PAD;
-
-	return true;
-}
-
 /**
  * iavf_receive_skb - Send a completed packet up the stack
  * @rx_ring:  rx ring in play
@@ -892,38 +838,37 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring,
  **/
 bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count)
 {
+	const struct libeth_fq_fp fq = {
+		.pp		= rx_ring->pp,
+		.fqes		= rx_ring->rx_fqes,
+		.truesize	= rx_ring->truesize,
+		.count		= rx_ring->count,
+	};
 	u16 ntu = rx_ring->next_to_use;
 	union iavf_rx_desc *rx_desc;
-	struct iavf_rx_buffer *bi;
 
 	/* do nothing if no valid netdev defined */
 	if (!rx_ring->netdev || !cleaned_count)
 		return false;
 
 	rx_desc = IAVF_RX_DESC(rx_ring, ntu);
-	bi = &rx_ring->rx_bi[ntu];
 
 	do {
-		if (!iavf_alloc_mapped_page(rx_ring, bi))
-			goto no_buffers;
+		dma_addr_t addr;
 
-		/* sync the buffer for use by the device */
-		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
-						 bi->page_offset,
-						 IAVF_RXBUFFER_3072,
-						 DMA_FROM_DEVICE);
+		addr = libeth_rx_alloc(&fq, ntu);
+		if (addr == DMA_MAPPING_ERROR)
+			goto no_buffers;
 
 		/* Refresh the desc even if buffer_addrs didn't change
 		 * because each write-back erases this info.
 		 */
-		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
+		rx_desc->read.pkt_addr = cpu_to_le64(addr);
 
 		rx_desc++;
-		bi++;
 		ntu++;
 		if (unlikely(ntu == rx_ring->count)) {
 			rx_desc = IAVF_RX_DESC(rx_ring, 0);
-			bi = rx_ring->rx_bi;
 			ntu = 0;
 		}
 
@@ -942,6 +887,8 @@ no_buffers:
 	if (rx_ring->next_to_use != ntu)
 		iavf_release_rx_desc(rx_ring, ntu);
 
+	rx_ring->rx_stats.alloc_page_failed++;
+
 	/* make sure to come back via polling to try again after
 	 * allocation failure
 	 */
@@ -1090,9 +1037,8 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb)
 
 /**
  * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff
- * @rx_ring: rx descriptor ring to transact packets on
- * @rx_buffer: buffer containing page to add
  * @skb: sk_buff to place the data into
+ * @rx_buffer: buffer containing page to add
  * @size: packet length from rx_desc
  *
  * This function will add the data contained in rx_buffer->page to the skb.
@@ -1100,105 +1046,49 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb)
  *
  * The function will then update the page offset.
  **/
-static void iavf_add_rx_frag(struct iavf_ring *rx_ring,
-			     struct iavf_rx_buffer *rx_buffer,
-			     struct sk_buff *skb,
+static void iavf_add_rx_frag(struct sk_buff *skb,
+			     const struct libeth_fqe *rx_buffer,
 			     unsigned int size)
 {
-	unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD);
-
-	if (!size)
-		return;
+	u32 hr = rx_buffer->page->pp->p.offset;
 
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page,
-			rx_buffer->page_offset, size, truesize);
-}
-
-/**
- * iavf_get_rx_buffer - Fetch Rx buffer and synchronize data for use
- * @rx_ring: rx descriptor ring to transact packets on
- * @size: size of buffer to add to skb
- *
- * This function will pull an Rx buffer from the ring and synchronize it
- * for use by the CPU.
- */
-static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring,
-						 const unsigned int size)
-{
-	struct iavf_rx_buffer *rx_buffer;
-
-	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
-	prefetchw(rx_buffer->page);
-	if (!size)
-		return rx_buffer;
-
-	/* we are reusing so sync this buffer for CPU use */
-	dma_sync_single_range_for_cpu(rx_ring->dev,
-				      rx_buffer->dma,
-				      rx_buffer->page_offset,
-				      size,
-				      DMA_FROM_DEVICE);
-
-	return rx_buffer;
+			rx_buffer->offset + hr, size, rx_buffer->truesize);
 }
 
 /**
  * iavf_build_skb - Build skb around an existing buffer
- * @rx_ring: Rx descriptor ring to transact packets on
  * @rx_buffer: Rx buffer to pull data from
  * @size: size of buffer to add to skb
  *
  * This function builds an skb around an existing Rx buffer, taking care
  * to set up the skb correctly and avoid any memcpy overhead.
  */
-static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring,
-				      struct iavf_rx_buffer *rx_buffer,
+static struct sk_buff *iavf_build_skb(const struct libeth_fqe *rx_buffer,
 				      unsigned int size)
 {
-	void *va;
-	unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) +
-				SKB_DATA_ALIGN(IAVF_SKB_PAD + size);
+	u32 hr = rx_buffer->page->pp->p.offset;
 	struct sk_buff *skb;
+	void *va;
 
-	if (!rx_buffer || !size)
-		return NULL;
 	/* prefetch first cache line of first page */
-	va = page_address(rx_buffer->page) + rx_buffer->page_offset;
-	net_prefetch(va);
+	va = page_address(rx_buffer->page) + rx_buffer->offset;
+	net_prefetch(va + hr);
 
 	/* build an skb around the page buffer */
-	skb = napi_build_skb(va - IAVF_SKB_PAD, truesize);
+	skb = napi_build_skb(va, rx_buffer->truesize);
 	if (unlikely(!skb))
 		return NULL;
 
+	skb_mark_for_recycle(skb);
+
 	/* update pointers within the skb to store the data */
-	skb_reserve(skb, IAVF_SKB_PAD);
+	skb_reserve(skb, hr);
 	__skb_put(skb, size);
 
 	return skb;
 }
 
-/**
- * iavf_put_rx_buffer - Unmap used buffer
- * @rx_ring: rx descriptor ring to transact packets on
- * @rx_buffer: rx buffer to pull data from
- *
- * This function will unmap the buffer after it's written by HW.
- */
-static void iavf_put_rx_buffer(struct iavf_ring *rx_ring,
-			       struct iavf_rx_buffer *rx_buffer)
-{
-	if (!rx_buffer)
-		return;
-
-	/* we are not reusing the buffer so unmap it */
-	dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE,
-			     DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR);
-
-	/* clear contents of buffer_info */
-	rx_buffer->page = NULL;
-}
-
 /**
  * iavf_is_non_eop - process handling of non-EOP buffers
  * @rx_ring: Rx ring being processed
@@ -1252,7 +1142,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
 	bool failure = false;
 
 	while (likely(total_rx_packets < (unsigned int)budget)) {
-		struct iavf_rx_buffer *rx_buffer;
+		struct libeth_fqe *rx_buffer;
 		union iavf_rx_desc *rx_desc;
 		unsigned int size;
 		u16 vlan_tag = 0;
@@ -1287,13 +1177,16 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
 		size = FIELD_GET(IAVF_RXD_QW1_LENGTH_PBUF_MASK, qword);
 
 		iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb);
-		rx_buffer = iavf_get_rx_buffer(rx_ring, size);
+
+		rx_buffer = &rx_ring->rx_fqes[rx_ring->next_to_clean];
+		if (!libeth_rx_sync_for_cpu(rx_buffer, size))
+			goto skip_data;
 
 		/* retrieve a buffer from the ring */
 		if (skb)
-			iavf_add_rx_frag(rx_ring, rx_buffer, skb, size);
+			iavf_add_rx_frag(skb, rx_buffer, size);
 		else
-			skb = iavf_build_skb(rx_ring, rx_buffer, size);
+			skb = iavf_build_skb(rx_buffer, size);
 
 		/* exit if we failed to retrieve a buffer */
 		if (!skb) {
@@ -1301,10 +1194,10 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
 			break;
 		}
 
-		iavf_put_rx_buffer(rx_ring, rx_buffer);
+skip_data:
 		cleaned_count++;
 
-		if (iavf_is_non_eop(rx_ring, rx_desc, skb))
+		if (iavf_is_non_eop(rx_ring, rx_desc, skb) || unlikely(!skb))
 			continue;
 
 		/* ERR_MASK will only have valid bits if EOP set, and
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
index ed559fa6f214..d7b5587aeb8e 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
@@ -80,18 +80,8 @@ enum iavf_dyn_idx_t {
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \
 	BIT_ULL(IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP))
 
-/* Supported Rx Buffer Sizes (a multiple of 128) */
-#define IAVF_RXBUFFER_3072  3072  /* Used for large frames w/ padding */
-#define IAVF_MAX_RXBUFFER   9728  /* largest size for single descriptor */
-
-#define IAVF_PACKET_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2))
 #define iavf_rx_desc iavf_32byte_rx_desc
 
-#define IAVF_RX_DMA_ATTR \
-	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
-
-#define IAVF_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN)
-
 /**
  * iavf_test_staterr - tests bits in Rx descriptor status and error fields
  * @rx_desc: pointer to receive descriptor (in le64 format)
@@ -210,12 +200,6 @@ struct iavf_tx_buffer {
 	u32 tx_flags;
 };
 
-struct iavf_rx_buffer {
-	dma_addr_t dma;
-	struct page *page;
-	__u32 page_offset;
-};
-
 struct iavf_queue_stats {
 	u64 packets;
 	u64 bytes;
@@ -251,13 +235,18 @@ struct iavf_rx_queue_stats {
 struct iavf_ring {
 	struct iavf_ring *next;		/* pointer to next ring in q_vector */
 	void *desc;			/* Descriptor ring memory */
-	struct device *dev;		/* Used for DMA mapping */
+	union {
+		struct page_pool *pp;	/* Used on Rx for buffer management */
+		struct device *dev;	/* Used on Tx for DMA mapping */
+	};
 	struct net_device *netdev;	/* netdev ring maps to */
 	union {
+		struct libeth_fqe *rx_fqes;
 		struct iavf_tx_buffer *tx_bi;
-		struct iavf_rx_buffer *rx_bi;
 	};
 	u8 __iomem *tail;
+	u32 truesize;
+
 	u16 queue_index;		/* Queue number of ring */
 
 	/* high bit set means dynamic, use accessors routines to read/write.
@@ -305,6 +294,8 @@ struct iavf_ring {
 					 * iavf_clean_rx_ring_irq() is called
 					 * for this ring.
 					 */
+
+	u32 rx_buf_len;
 } ____cacheline_internodealigned_in_smp;
 
 #define IAVF_ITR_ADAPTIVE_MIN_INC	0x0002
@@ -328,13 +319,6 @@ struct iavf_ring_container {
 #define iavf_for_each_ring(pos, head) \
 	for (pos = (head).ring; pos != NULL; pos = pos->next)
 
-static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring)
-{
-	return 0;
-}
-
-#define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring))
-
 bool iavf_alloc_rx_buffers(struct iavf_ring *rxr, u16 cleaned_count);
 netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
 int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index f8e9f859a4f1..1e543f6a7c30 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
+#include <linux/net/intel/libie/rx.h>
+
 #include "iavf.h"
 #include "iavf_prototype.h"
 
@@ -268,13 +270,13 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter)
 void iavf_configure_queues(struct iavf_adapter *adapter)
 {
 	struct virtchnl_vsi_queue_config_info *vqci;
-	int i, max_frame = adapter->vf_res->max_mtu;
 	int pairs = adapter->num_active_queues;
 	struct virtchnl_queue_pair_info *vqpi;
+	u32 i, max_frame;
 	size_t len;
 
-	if (max_frame > IAVF_MAX_RXBUFFER || !max_frame)
-		max_frame = IAVF_MAX_RXBUFFER;
+	max_frame = LIBIE_MAX_RX_FRM_LEN(adapter->rx_rings->pp->p.offset);
+	max_frame = min_not_zero(adapter->vf_res->max_mtu, max_frame);
 
 	if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) {
 		/* bail because we already have a command pending */
@@ -304,7 +306,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter)
 		vqpi->rxq.ring_len = adapter->rx_rings[i].count;
 		vqpi->rxq.dma_ring_addr = adapter->rx_rings[i].dma;
 		vqpi->rxq.max_pkt_size = max_frame;
-		vqpi->rxq.databuffer_size = IAVF_RXBUFFER_3072;
+		vqpi->rxq.databuffer_size = adapter->rx_rings[i].rx_buf_len;
 		if (CRC_OFFLOAD_ALLOWED(adapter))
 			vqpi->rxq.crc_disable = !!(adapter->netdev->features &
 						   NETIF_F_RXFCS);
diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h
index 37c5c8f26fb9..8e97775f1d66 100644
--- a/include/linux/net/intel/libie/rx.h
+++ b/include/linux/net/intel/libie/rx.h
@@ -6,6 +6,23 @@
 
 #include <net/libeth/rx.h>
 
+/* Rx buffer management */
+
+/* The largest size for a single descriptor as per HW */
+#define LIBIE_MAX_RX_BUF_LEN	9728U
+/* "True" HW-writeable space: minimum from SW and HW values */
+#define LIBIE_RX_BUF_LEN(hr)	min_t(u32, LIBETH_RX_PAGE_LEN(hr),	\
+				      LIBIE_MAX_RX_BUF_LEN)
+
+/* The maximum frame size as per HW (S/G) */
+#define __LIBIE_MAX_RX_FRM_LEN	16382U
+/* ATST, HW can chain up to 5 Rx descriptors */
+#define LIBIE_MAX_RX_FRM_LEN(hr)					\
+	min_t(u32, __LIBIE_MAX_RX_FRM_LEN, LIBIE_RX_BUF_LEN(hr) * 5)
+/* Maximum frame size minus LL overhead */
+#define LIBIE_MAX_MTU							\
+	(LIBIE_MAX_RX_FRM_LEN(LIBETH_MAX_HEADROOM) - LIBETH_RX_LL_LEN)
+
 /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed
  * bitfield struct.
  */
-- 
cgit v1.2.3


From cdc66553c4130735f0a2db943a5259e54ff1597a Mon Sep 17 00:00:00 2001
From: Dawei Li <dawei.li@shingroup.cn>
Date: Tue, 16 Apr 2024 16:54:48 +0800
Subject: cpumask: Introduce cpumask_first_and_and()

Introduce cpumask_first_and_and() to get intersection between 3 cpumasks,
free of any intermediate cpumask variable. Instead, cpumask_first_and_and()
works in-place with all inputs and produces desired output directly.

Signed-off-by: Dawei Li <dawei.li@shingroup.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Yury Norov <yury.norov@gmail.com>
Link: https://lore.kernel.org/r/20240416085454.3547175-2-dawei.li@shingroup.cn
---
 include/linux/cpumask.h | 17 +++++++++++++++++
 include/linux/find.h    | 27 +++++++++++++++++++++++++++
 lib/find_bit.c          | 12 ++++++++++++
 3 files changed, 56 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1c29947db848..c46f9e9e1d66 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -187,6 +187,23 @@ unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask
 	return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits);
 }
 
+/**
+ * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3
+ * @srcp1: the first input
+ * @srcp2: the second input
+ * @srcp3: the third input
+ *
+ * Return: >= nr_cpu_ids if no cpus set in all.
+ */
+static inline
+unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
+				   const struct cpumask *srcp2,
+				   const struct cpumask *srcp3)
+{
+	return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2),
+				      cpumask_bits(srcp3), small_cpumask_bits);
+}
+
 /**
  * cpumask_last - get the last CPU in a cpumask
  * @srcp:	- the cpumask pointer
diff --git a/include/linux/find.h b/include/linux/find.h
index c69598e383c1..28ec5a03393a 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -29,6 +29,8 @@ unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsign
 					unsigned long n);
 extern unsigned long _find_first_and_bit(const unsigned long *addr1,
 					 const unsigned long *addr2, unsigned long size);
+unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2,
+				      const unsigned long *addr3, unsigned long size);
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
 extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);
 
@@ -345,6 +347,31 @@ unsigned long find_first_and_bit(const unsigned long *addr1,
 }
 #endif
 
+/**
+ * find_first_and_and_bit - find the first set bit in 3 memory regions
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @addr3: The third address to base the search on
+ * @size: The bitmap size in bits
+ *
+ * Returns the bit number for the first set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_first_and_and_bit(const unsigned long *addr1,
+				     const unsigned long *addr2,
+				     const unsigned long *addr3,
+				     unsigned long size)
+{
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0);
+
+		return val ? __ffs(val) : size;
+	}
+
+	return _find_first_and_and_bit(addr1, addr2, addr3, size);
+}
+
 #ifndef find_first_zero_bit
 /**
  * find_first_zero_bit - find the first cleared bit in a memory region
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e..dacadd904250 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -116,6 +116,18 @@ unsigned long _find_first_and_bit(const unsigned long *addr1,
 EXPORT_SYMBOL(_find_first_and_bit);
 #endif
 
+/*
+ * Find the first set bit in three memory regions.
+ */
+unsigned long _find_first_and_and_bit(const unsigned long *addr1,
+				      const unsigned long *addr2,
+				      const unsigned long *addr3,
+				      unsigned long size)
+{
+	return FIND_FIRST_BIT(addr1[idx] & addr2[idx] & addr3[idx], /* nop */, size);
+}
+EXPORT_SYMBOL(_find_first_and_and_bit);
+
 #ifndef find_first_zero_bit
 /*
  * Find the first cleared bit in a memory region.
-- 
cgit v1.2.3


From 95c07d58250ca3ed01855c20be568cf04e15382f Mon Sep 17 00:00:00 2001
From: Haiyue Wang <haiyue.wang@intel.com>
Date: Wed, 24 Apr 2024 13:45:07 +0800
Subject: bpf: update the comment for BTF_FIELDS_MAX

The commit d56b63cf0c0f ("bpf: add support for bpf_wq user type")
changes the fields support number to 11, just sync the comment.

Signed-off-by: Haiyue Wang <haiyue.wang@intel.com>
Acked-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20240424054526.8031-1-haiyue.wang@intel.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 64bf0cf3ee95..978200f6d925 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -184,7 +184,7 @@ struct bpf_map_ops {
 };
 
 enum {
-	/* Support at most 10 fields in a BTF type */
+	/* Support at most 11 fields in a BTF type */
 	BTF_FIELDS_MAX	   = 11,
 };
 
-- 
cgit v1.2.3


From 3e1c6f35409f9e447bf37f64840f5b65576bfb78 Mon Sep 17 00:00:00 2001
From: Vadim Fedorenko <vadfed@meta.com>
Date: Mon, 22 Apr 2024 15:50:21 -0700
Subject: bpf: make common crypto API for TC/XDP programs

Add crypto API support to BPF to be able to decrypt or encrypt packets
in TC/XDP BPF programs. Special care should be taken for initialization
part of crypto algo because crypto alloc) doesn't work with preemtion
disabled, it can be run only in sleepable BPF program. Also async crypto
is not supported because of the very same issue - TC/XDP BPF programs
are not sleepable.

Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
Link: https://lore.kernel.org/r/20240422225024.2847039-2-vadfed@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/linux/bpf.h        |   1 +
 include/linux/bpf_crypto.h |  24 +++
 kernel/bpf/Makefile        |   3 +
 kernel/bpf/crypto.c        | 385 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/helpers.c       |   2 +-
 kernel/bpf/verifier.c      |   1 +
 6 files changed, 415 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/bpf_crypto.h
 create mode 100644 kernel/bpf/crypto.c

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 978200f6d925..364563b74db6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1275,6 +1275,7 @@ int bpf_dynptr_check_size(u32 size);
 u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr);
 const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len);
 void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len);
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr);
 
 #ifdef CONFIG_BPF_JIT
 int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr);
diff --git a/include/linux/bpf_crypto.h b/include/linux/bpf_crypto.h
new file mode 100644
index 000000000000..a41e71d4e2d9
--- /dev/null
+++ b/include/linux/bpf_crypto.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+#ifndef _BPF_CRYPTO_H
+#define _BPF_CRYPTO_H
+
+struct bpf_crypto_type {
+	void *(*alloc_tfm)(const char *algo);
+	void (*free_tfm)(void *tfm);
+	int (*has_algo)(const char *algo);
+	int (*setkey)(void *tfm, const u8 *key, unsigned int keylen);
+	int (*setauthsize)(void *tfm, unsigned int authsize);
+	int (*encrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv);
+	int (*decrypt)(void *tfm, const u8 *src, u8 *dst, unsigned int len, u8 *iv);
+	unsigned int (*ivsize)(void *tfm);
+	unsigned int (*statesize)(void *tfm);
+	u32 (*get_flags)(void *tfm);
+	struct module *owner;
+	char name[14];
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type);
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type);
+
+#endif /* _BPF_CRYPTO_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 368c5d86b5b7..736bd22e5ce0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -44,6 +44,9 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumask.o
 obj-${CONFIG_BPF_LSM} += bpf_lsm.o
 endif
+ifeq ($(CONFIG_CRYPTO),y)
+obj-$(CONFIG_BPF_SYSCALL) += crypto.o
+endif
 obj-$(CONFIG_BPF_PRELOAD) += preload/
 
 obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
new file mode 100644
index 000000000000..2bee4af91e38
--- /dev/null
+++ b/kernel/bpf/crypto.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Meta, Inc */
+#include <linux/bpf.h>
+#include <linux/bpf_crypto.h>
+#include <linux/bpf_mem_alloc.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+#include <crypto/skcipher.h>
+
+struct bpf_crypto_type_list {
+	const struct bpf_crypto_type *type;
+	struct list_head list;
+};
+
+/* BPF crypto initialization parameters struct */
+/**
+ * struct bpf_crypto_params - BPF crypto initialization parameters structure
+ * @type:	The string of crypto operation type.
+ * @reserved:	Reserved member, will be reused for more options in future
+ *		Values:
+ *		  0
+ * @algo:	The string of algorithm to initialize.
+ * @key:	The cipher key used to init crypto algorithm.
+ * @key_len:	The length of cipher key.
+ * @authsize:	The length of authentication tag used by algorithm.
+ */
+struct bpf_crypto_params {
+	char type[14];
+	u8 reserved[2];
+	char algo[128];
+	u8 key[256];
+	u32 key_len;
+	u32 authsize;
+};
+
+static LIST_HEAD(bpf_crypto_types);
+static DECLARE_RWSEM(bpf_crypto_types_sem);
+
+/**
+ * struct bpf_crypto_ctx - refcounted BPF crypto context structure
+ * @type:	The pointer to bpf crypto type
+ * @tfm:	The pointer to instance of crypto API struct.
+ * @siv_len:    Size of IV and state storage for cipher
+ * @rcu:	The RCU head used to free the crypto context with RCU safety.
+ * @usage:	Object reference counter. When the refcount goes to 0, the
+ *		memory is released back to the BPF allocator, which provides
+ *		RCU safety.
+ */
+struct bpf_crypto_ctx {
+	const struct bpf_crypto_type *type;
+	void *tfm;
+	u32 siv_len;
+	struct rcu_head rcu;
+	refcount_t usage;
+};
+
+int bpf_crypto_register_type(const struct bpf_crypto_type *type)
+{
+	struct bpf_crypto_type_list *node;
+	int err = -EEXIST;
+
+	down_write(&bpf_crypto_types_sem);
+	list_for_each_entry(node, &bpf_crypto_types, list) {
+		if (!strcmp(node->type->name, type->name))
+			goto unlock;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!node)
+		goto unlock;
+
+	node->type = type;
+	list_add(&node->list, &bpf_crypto_types);
+	err = 0;
+
+unlock:
+	up_write(&bpf_crypto_types_sem);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_register_type);
+
+int bpf_crypto_unregister_type(const struct bpf_crypto_type *type)
+{
+	struct bpf_crypto_type_list *node;
+	int err = -ENOENT;
+
+	down_write(&bpf_crypto_types_sem);
+	list_for_each_entry(node, &bpf_crypto_types, list) {
+		if (strcmp(node->type->name, type->name))
+			continue;
+
+		list_del(&node->list);
+		kfree(node);
+		err = 0;
+		break;
+	}
+	up_write(&bpf_crypto_types_sem);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(bpf_crypto_unregister_type);
+
+static const struct bpf_crypto_type *bpf_crypto_get_type(const char *name)
+{
+	const struct bpf_crypto_type *type = ERR_PTR(-ENOENT);
+	struct bpf_crypto_type_list *node;
+
+	down_read(&bpf_crypto_types_sem);
+	list_for_each_entry(node, &bpf_crypto_types, list) {
+		if (strcmp(node->type->name, name))
+			continue;
+
+		if (try_module_get(node->type->owner))
+			type = node->type;
+		break;
+	}
+	up_read(&bpf_crypto_types_sem);
+
+	return type;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_crypto_ctx_create() - Create a mutable BPF crypto context.
+ *
+ * Allocates a crypto context that can be used, acquired, and released by
+ * a BPF program. The crypto context returned by this function must either
+ * be embedded in a map as a kptr, or freed with bpf_crypto_ctx_release().
+ * As crypto API functions use GFP_KERNEL allocations, this function can
+ * only be used in sleepable BPF programs.
+ *
+ * bpf_crypto_ctx_create() allocates memory for crypto context.
+ * It may return NULL if no memory is available.
+ * @params:	pointer to struct bpf_crypto_params which contains all the
+ *		details needed to initialise crypto context.
+ * @params__sz:	size of steuct bpf_crypto_params usef by bpf program
+ * @err:	integer to store error code when NULL is returned.
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_create(const struct bpf_crypto_params *params, u32 params__sz,
+		      int *err)
+{
+	const struct bpf_crypto_type *type;
+	struct bpf_crypto_ctx *ctx;
+
+	if (!params || params->reserved[0] || params->reserved[1] ||
+	    params__sz != sizeof(struct bpf_crypto_params)) {
+		*err = -EINVAL;
+		return NULL;
+	}
+
+	type = bpf_crypto_get_type(params->type);
+	if (IS_ERR(type)) {
+		*err = PTR_ERR(type);
+		return NULL;
+	}
+
+	if (!type->has_algo(params->algo)) {
+		*err = -EOPNOTSUPP;
+		goto err_module_put;
+	}
+
+	if (!!params->authsize ^ !!type->setauthsize) {
+		*err = -EOPNOTSUPP;
+		goto err_module_put;
+	}
+
+	if (!params->key_len || params->key_len > sizeof(params->key)) {
+		*err = -EINVAL;
+		goto err_module_put;
+	}
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx) {
+		*err = -ENOMEM;
+		goto err_module_put;
+	}
+
+	ctx->type = type;
+	ctx->tfm = type->alloc_tfm(params->algo);
+	if (IS_ERR(ctx->tfm)) {
+		*err = PTR_ERR(ctx->tfm);
+		goto err_free_ctx;
+	}
+
+	if (params->authsize) {
+		*err = type->setauthsize(ctx->tfm, params->authsize);
+		if (*err)
+			goto err_free_tfm;
+	}
+
+	*err = type->setkey(ctx->tfm, params->key, params->key_len);
+	if (*err)
+		goto err_free_tfm;
+
+	if (type->get_flags(ctx->tfm) & CRYPTO_TFM_NEED_KEY) {
+		*err = -EINVAL;
+		goto err_free_tfm;
+	}
+
+	ctx->siv_len = type->ivsize(ctx->tfm) + type->statesize(ctx->tfm);
+
+	refcount_set(&ctx->usage, 1);
+
+	return ctx;
+
+err_free_tfm:
+	type->free_tfm(ctx->tfm);
+err_free_ctx:
+	kfree(ctx);
+err_module_put:
+	module_put(type->owner);
+
+	return NULL;
+}
+
+static void crypto_free_cb(struct rcu_head *head)
+{
+	struct bpf_crypto_ctx *ctx;
+
+	ctx = container_of(head, struct bpf_crypto_ctx, rcu);
+	ctx->type->free_tfm(ctx->tfm);
+	module_put(ctx->type->owner);
+	kfree(ctx);
+}
+
+/**
+ * bpf_crypto_ctx_acquire() - Acquire a reference to a BPF crypto context.
+ * @ctx: The BPF crypto context being acquired. The ctx must be a trusted
+ *	     pointer.
+ *
+ * Acquires a reference to a BPF crypto context. The context returned by this function
+ * must either be embedded in a map as a kptr, or freed with
+ * bpf_crypto_ctx_release().
+ */
+__bpf_kfunc struct bpf_crypto_ctx *
+bpf_crypto_ctx_acquire(struct bpf_crypto_ctx *ctx)
+{
+	if (!refcount_inc_not_zero(&ctx->usage))
+		return NULL;
+	return ctx;
+}
+
+/**
+ * bpf_crypto_ctx_release() - Release a previously acquired BPF crypto context.
+ * @ctx: The crypto context being released.
+ *
+ * Releases a previously acquired reference to a BPF crypto context. When the final
+ * reference of the BPF crypto context has been released, its memory
+ * will be released.
+ */
+__bpf_kfunc void bpf_crypto_ctx_release(struct bpf_crypto_ctx *ctx)
+{
+	if (refcount_dec_and_test(&ctx->usage))
+		call_rcu(&ctx->rcu, crypto_free_cb);
+}
+
+static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
+			    const struct bpf_dynptr_kern *src,
+			    const struct bpf_dynptr_kern *dst,
+			    const struct bpf_dynptr_kern *siv,
+			    bool decrypt)
+{
+	u32 src_len, dst_len, siv_len;
+	const u8 *psrc;
+	u8 *pdst, *piv;
+	int err;
+
+	if (__bpf_dynptr_is_rdonly(dst))
+		return -EINVAL;
+
+	siv_len = __bpf_dynptr_size(siv);
+	src_len = __bpf_dynptr_size(src);
+	dst_len = __bpf_dynptr_size(dst);
+	if (!src_len || !dst_len)
+		return -EINVAL;
+
+	if (siv_len != ctx->siv_len)
+		return -EINVAL;
+
+	psrc = __bpf_dynptr_data(src, src_len);
+	if (!psrc)
+		return -EINVAL;
+	pdst = __bpf_dynptr_data_rw(dst, dst_len);
+	if (!pdst)
+		return -EINVAL;
+
+	piv = siv_len ? __bpf_dynptr_data_rw(siv, siv_len) : NULL;
+	if (siv_len && !piv)
+		return -EINVAL;
+
+	err = decrypt ? ctx->type->decrypt(ctx->tfm, psrc, pdst, src_len, piv)
+		      : ctx->type->encrypt(ctx->tfm, psrc, pdst, src_len, piv);
+
+	return err;
+}
+
+/**
+ * bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
+ * @ctx:	The crypto context being used. The ctx must be a trusted pointer.
+ * @src:	bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst:	bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv:	bpf_dynptr to IV data and state data to be used by decryptor.
+ *
+ * Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
+				   const struct bpf_dynptr_kern *src,
+				   const struct bpf_dynptr_kern *dst,
+				   const struct bpf_dynptr_kern *siv)
+{
+	return bpf_crypto_crypt(ctx, src, dst, siv, true);
+}
+
+/**
+ * bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
+ * @ctx:	The crypto context being used. The ctx must be a trusted pointer.
+ * @src:	bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst:	bpf_dynptr to buffer where to store the result. Must be a trusted pointer.
+ * @siv:	bpf_dynptr to IV data and state data to be used by decryptor.
+ *
+ * Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
+ */
+__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
+				   const struct bpf_dynptr_kern *src,
+				   const struct bpf_dynptr_kern *dst,
+				   const struct bpf_dynptr_kern *siv)
+{
+	return bpf_crypto_crypt(ctx, src, dst, siv, false);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(crypt_init_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_create, KF_ACQUIRE | KF_RET_NULL | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_crypto_ctx_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_KFUNCS_END(crypt_init_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_init_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &crypt_init_kfunc_btf_ids,
+};
+
+BTF_KFUNCS_START(crypt_kfunc_btf_ids)
+BTF_ID_FLAGS(func, bpf_crypto_decrypt, KF_RCU)
+BTF_ID_FLAGS(func, bpf_crypto_encrypt, KF_RCU)
+BTF_KFUNCS_END(crypt_kfunc_btf_ids)
+
+static const struct btf_kfunc_id_set crypt_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &crypt_kfunc_btf_ids,
+};
+
+BTF_ID_LIST(bpf_crypto_dtor_ids)
+BTF_ID(struct, bpf_crypto_ctx)
+BTF_ID(func, bpf_crypto_ctx_release)
+
+static int __init crypto_kfunc_init(void)
+{
+	int ret;
+	const struct btf_id_dtor_kfunc bpf_crypto_dtors[] = {
+		{
+			.btf_id	      = bpf_crypto_dtor_ids[0],
+			.kfunc_btf_id = bpf_crypto_dtor_ids[1]
+		},
+	};
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &crypt_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &crypt_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &crypt_kfunc_set);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
+					       &crypt_init_kfunc_set);
+	return  ret ?: register_btf_id_dtor_kfuncs(bpf_crypto_dtors,
+						   ARRAY_SIZE(bpf_crypto_dtors),
+						   THIS_MODULE);
+}
+
+late_initcall(crypto_kfunc_init);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 14c401b78e12..2a69a9a36c0f 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1583,7 +1583,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
 #define DYNPTR_SIZE_MASK	0xFFFFFF
 #define DYNPTR_RDONLY_BIT	BIT(31)
 
-static bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
+bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
 {
 	return ptr->size & DYNPTR_RDONLY_BIT;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8312c7a0ee19..4e474ef44e9c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5310,6 +5310,7 @@ BTF_ID(struct, cgroup)
 BTF_ID(struct, bpf_cpumask)
 #endif
 BTF_ID(struct, task_struct)
+BTF_ID(struct, bpf_crypto_ctx)
 BTF_SET_END(rcu_protected_types)
 
 static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
-- 
cgit v1.2.3


From 2a7177a804571fe9d0abd7e049d7c129cd34c3f6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 24 Apr 2024 00:18:43 +0300
Subject: scsi: core: Don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use) principle.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240423211843.3996046-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/scsi.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h
index d90645f06a3a..96b350366670 100644
--- a/include/scsi/scsi.h
+++ b/include/scsi/scsi.h
@@ -7,8 +7,9 @@
 #define _SCSI_SCSI_H
 
 #include <linux/types.h>
-#include <linux/scatterlist.h>
-#include <linux/kernel.h>
+
+#include <asm/param.h>
+
 #include <scsi/scsi_common.h>
 #include <scsi/scsi_proto.h>
 #include <scsi/scsi_status.h>
-- 
cgit v1.2.3


From 9cef74a9bc26fbb68da4e90573d4d61c2e07af77 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 23 Apr 2024 22:00:38 -0700
Subject: scsi: libfc: Add some kernel-doc comments

Complete the kernel-doc notation for enum fc_lport_state. This fixes 7
kernel-doc warnings.

 - In struct fc_rport_priv, change 'event_callback' to 'lld_event_callback'
   to match the struct member name.

 - In struct fc_fcp_pkt, add a description for 'timer_delay' to eliminate
   one kernel-doc warning.

 - Add return value notation for 3 functions. This fixes 3 kernel-doc
   warnings.

There are still 12 warnings for struct members not described in struct
fc_rport_priv and struct fc_lport, e.g:

libfc.h:218: warning: Function parameter or struct member 'event' not described in 'fc_rport_priv'
libfc.h:760: warning: Function parameter or struct member 'vlan' not described in 'fc_lport'

Warnings that are fixed in this patch:

libfc.h:75: warning: Enum value 'LPORT_ST_RNN_ID' not described in enum 'fc_lport_state'
libfc.h:75: warning: Enum value 'LPORT_ST_RSNN_NN' not described in enum 'fc_lport_state'
libfc.h:75: warning: Enum value 'LPORT_ST_RSPN_ID' not described in enum 'fc_lport_state'
libfc.h:75: warning: Enum value 'LPORT_ST_RPA' not described in enum 'fc_lport_state'
libfc.h:75: warning: Enum value 'LPORT_ST_DHBA' not described in enum 'fc_lport_state'
libfc.h:75: warning: Enum value 'LPORT_ST_DPRT' not described in enum 'fc_lport_state'
libfc.h:75: warning: Excess enum value 'LPORT_ST_RPN_ID' description in 'fc_lport_state'

libfc.h:218: warning: Excess struct member 'event_callback' description in 'fc_rport_priv'

libfc.h:793: warning: No description found for return value of 'fc_lport_test_ready'
libfc.h:835: warning: No description found for return value of 'fc_lport_init_stats'
libfc.h:856: warning: No description found for return value of 'lport_priv'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240424050038.31403-1-rdunlap@infradead.org
Cc: Hannes Reinecke <hare@suse.de>
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/scsi/libfc.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h
index eca6fd42d7f7..4a9b4169e081 100644
--- a/include/scsi/libfc.h
+++ b/include/scsi/libfc.h
@@ -44,11 +44,16 @@
  * @LPORT_ST_DISABLED: Disabled
  * @LPORT_ST_FLOGI:    Fabric login (FLOGI) sent
  * @LPORT_ST_DNS:      Waiting for name server remote port to become ready
- * @LPORT_ST_RPN_ID:   Register port name by ID (RPN_ID) sent
+ * @LPORT_ST_RNN_ID:   Register port name by ID (RNN_ID) sent
+ * @LPORT_ST_RSNN_NN:  Waiting for host symbolic node name
+ * @LPORT_ST_RSPN_ID:  Waiting for host symbolic port name
  * @LPORT_ST_RFT_ID:   Register Fibre Channel types by ID (RFT_ID) sent
  * @LPORT_ST_RFF_ID:   Register FC-4 Features by ID (RFF_ID) sent
  * @LPORT_ST_FDMI:     Waiting for mgmt server rport to become ready
- * @LPORT_ST_RHBA:
+ * @LPORT_ST_RHBA:     Register HBA
+ * @LPORT_ST_RPA:      Register Port Attributes
+ * @LPORT_ST_DHBA:     Deregister HBA
+ * @LPORT_ST_DPRT:     Deregister Port
  * @LPORT_ST_SCR:      State Change Register (SCR) sent
  * @LPORT_ST_READY:    Ready for use
  * @LPORT_ST_LOGO:     Local port logout (LOGO) sent
@@ -183,7 +188,7 @@ struct fc_rport_libfc_priv {
  * @r_a_tov:        Resource allocation timeout value (in msec)
  * @rp_mutex:       The mutex that protects the remote port
  * @retry_work:     Handle for retries
- * @event_callback: Callback when READY, FAILED or LOGO states complete
+ * @lld_event_callback: Callback when READY, FAILED or LOGO states complete
  * @prli_count:     Count of open PRLI sessions in providers
  * @rcu:	    Structure used for freeing in an RCU-safe manner
  */
@@ -289,6 +294,7 @@ struct fc_seq_els_data {
  * @timer:           The command timer
  * @tm_done:         Completion indicator
  * @wait_for_comp:   Indicator to wait for completion of the I/O (in jiffies)
+ * @timer_delay:     FCP packet timer delay in jiffies
  * @data_len:        The length of the data
  * @cdb_cmd:         The CDB command
  * @xfer_len:        The transfer length
@@ -788,6 +794,8 @@ void fc_fc4_deregister_provider(enum fc_fh_type type, struct fc4_prov *);
 /**
  * fc_lport_test_ready() - Determine if a local port is in the READY state
  * @lport: The local port to test
+ *
+ * Returns: %true if local port is in the READY state, %false otherwise
  */
 static inline int fc_lport_test_ready(struct fc_lport *lport)
 {
@@ -830,6 +838,8 @@ static inline void fc_lport_state_enter(struct fc_lport *lport,
 /**
  * fc_lport_init_stats() - Allocate per-CPU statistics for a local port
  * @lport: The local port whose statistics are to be initialized
+ *
+ * Returns: %0 on success, %-ENOMEM on failure
  */
 static inline int fc_lport_init_stats(struct fc_lport *lport)
 {
@@ -851,6 +861,8 @@ static inline void fc_lport_free_stats(struct fc_lport *lport)
 /**
  * lport_priv() - Return the private data from a local port
  * @lport: The local port whose private data is to be retrieved
+ *
+ * Returns: the local port's private data pointer
  */
 static inline void *lport_priv(const struct fc_lport *lport)
 {
-- 
cgit v1.2.3


From de37677ef17d2a90fe99ea8f492e5c221d5bb627 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 23 Apr 2024 22:53:16 -0700
Subject: scsi: ufs: bsg: Fix all kernel-doc warnings

In struct utp_upiu_query_v4_0, add description for @osf3 and mark the
@reserved field as private so that no description is needed for it.

In struct utp_upiu_cmd, use the correct struct member name to eliminate a
kernel-doc warning.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240424055316.1384-1-rdunlap@infradead.org
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Alim Akhtar <alim.akhtar@samsung.com>
Cc: Avri Altman <avri.altman@wdc.com>
Cc: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/uapi/scsi/scsi_bsg_ufs.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/scsi/scsi_bsg_ufs.h b/include/uapi/scsi/scsi_bsg_ufs.h
index 03f2beadf201..8c29e498ef98 100644
--- a/include/uapi/scsi/scsi_bsg_ufs.h
+++ b/include/uapi/scsi/scsi_bsg_ufs.h
@@ -123,6 +123,7 @@ struct utp_upiu_query {
  * @idn: a value that indicates the particular type of data B-1
  * @index: Index to further identify data B-2
  * @selector: Index to further identify data B-3
+ * @osf3: spec field B-4
  * @osf4: spec field B-5
  * @osf5: spec field B 6,7
  * @osf6: spec field DW 8,9
@@ -138,12 +139,13 @@ struct utp_upiu_query_v4_0 {
 	__be16 osf5;
 	__be32 osf6;
 	__be32 osf7;
+	/* private: */
 	__be32 reserved;
 };
 
 /**
  * struct utp_upiu_cmd - Command UPIU structure
- * @data_transfer_len: Data Transfer Length DW-3
+ * @exp_data_transfer_len: Data Transfer Length DW-3
  * @cdb: Command Descriptor Block CDB DW-4 to DW-7
  */
 struct utp_upiu_cmd {
-- 
cgit v1.2.3


From aca061774bc412c1415242f0d2579143dd642b46 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Tue, 23 Apr 2024 22:53:22 -0700
Subject: scsi: mpi3mr: Fix some kernel-doc warnings in scsi_bsg_mpi3mr.h

Correct the name of a struct in kernel-doc to match the actual function
name.

Add kernel-doc comments for 2 reserved fields to match comments for other
reserved fields.

Correct the kernel-doc comments for a nested struct to eliminate kernel-doc
warnings for them.

Warnings fixed here are:

scsi_bsg_mpi3mr.h:419: warning: expecting prototype for struct mpi3mr_bsg_buf_entry_list. Prototype was for struct mpi3mr_buf_entry_list instead

scsi_bsg_mpi3mr.h:435: warning: Function parameter or struct member 'rsvd2' not described in 'mpi3mr_bsg_mptcmd'
scsi_bsg_mpi3mr.h:456: warning: Function parameter or struct member 'rsvd3' not described in 'mpi3mr_bsg_packet'

scsi_bsg_mpi3mr.h:456: warning: Excess struct member 'drvrcmd' description in 'mpi3mr_bsg_packet'
scsi_bsg_mpi3mr.h:456: warning: Excess struct member 'mptcmd' description in 'mpi3mr_bsg_packet'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://lore.kernel.org/r/20240424055322.1400-1-rdunlap@infradead.org
Cc: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Sathya Prakash Veerichetty <sathya.prakash@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Sreekanth Reddy <sreekanth.reddy@broadcom.com>
Cc: mpi3mr-linuxdrv.pdl@broadcom.com
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
---
 include/uapi/scsi/scsi_bsg_mpi3mr.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/uapi/scsi/scsi_bsg_mpi3mr.h b/include/uapi/scsi/scsi_bsg_mpi3mr.h
index c72ce387286a..22b2fe266293 100644
--- a/include/uapi/scsi/scsi_bsg_mpi3mr.h
+++ b/include/uapi/scsi/scsi_bsg_mpi3mr.h
@@ -401,7 +401,7 @@ struct mpi3mr_buf_entry {
 	__u32	buf_len;
 };
 /**
- * struct mpi3mr_bsg_buf_entry_list - list of user buffer
+ * struct mpi3mr_buf_entry_list - list of user buffer
  * descriptor for MPI Passthrough requests.
  *
  * @num_of_entries: Number of buffer descriptors
@@ -424,6 +424,7 @@ struct mpi3mr_buf_entry_list {
  * @mrioc_id: Controller ID
  * @rsvd1: Reserved
  * @timeout: MPI request timeout
+ * @rsvd2: Reserved
  * @buf_entry_list: Buffer descriptor list
  */
 struct mpi3mr_bsg_mptcmd {
@@ -441,8 +442,9 @@ struct mpi3mr_bsg_mptcmd {
  * @cmd_type: represents drvrcmd or mptcmd
  * @rsvd1: Reserved
  * @rsvd2: Reserved
- * @drvrcmd: driver request structure
- * @mptcmd: mpt request structure
+ * @rsvd3: Reserved
+ * @cmd.drvrcmd: driver request structure
+ * @cmd.mptcmd: mpt request structure
  */
 struct mpi3mr_bsg_packet {
 	__u8	cmd_type;
-- 
cgit v1.2.3


From 4bcc2e91b9e77644ee04b290bfd2c1410d7c402e Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Thu, 11 Apr 2024 07:44:30 +0300
Subject: bus: ti-sysc: Drop legacy idle quirk handling

There are no more users that need the legacy idle quirk so let's drop
the legacy idle quirk handling. This simplifies the PM code to just
sysc_pm_ops with unified handling for all the interconnect targets.

Reviewed-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Tony Lindgren <tony@atomide.com>
---
 drivers/bus/ti-sysc.c                 | 109 +---------------------------------
 include/linux/platform_data/ti-sysc.h |   1 -
 2 files changed, 2 insertions(+), 108 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c
index 467702da0e82..8767e04d6c89 100644
--- a/drivers/bus/ti-sysc.c
+++ b/drivers/bus/ti-sysc.c
@@ -1469,8 +1469,7 @@ static int __maybe_unused sysc_noirq_suspend(struct device *dev)
 
 	ddata = dev_get_drvdata(dev);
 
-	if (ddata->cfg.quirks &
-	    (SYSC_QUIRK_LEGACY_IDLE | SYSC_QUIRK_NO_IDLE))
+	if (ddata->cfg.quirks & SYSC_QUIRK_NO_IDLE)
 		return 0;
 
 	if (!ddata->enabled)
@@ -1488,8 +1487,7 @@ static int __maybe_unused sysc_noirq_resume(struct device *dev)
 
 	ddata = dev_get_drvdata(dev);
 
-	if (ddata->cfg.quirks &
-	    (SYSC_QUIRK_LEGACY_IDLE | SYSC_QUIRK_NO_IDLE))
+	if (ddata->cfg.quirks & SYSC_QUIRK_NO_IDLE)
 		return 0;
 
 	if (ddata->cfg.quirks & SYSC_QUIRK_REINIT_ON_RESUME) {
@@ -2457,89 +2455,6 @@ static int __maybe_unused sysc_child_runtime_resume(struct device *dev)
 	return pm_generic_runtime_resume(dev);
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int sysc_child_suspend_noirq(struct device *dev)
-{
-	struct sysc *ddata;
-	int error;
-
-	ddata = sysc_child_to_parent(dev);
-
-	dev_dbg(ddata->dev, "%s %s\n", __func__,
-		ddata->name ? ddata->name : "");
-
-	error = pm_generic_suspend_noirq(dev);
-	if (error) {
-		dev_err(dev, "%s error at %i: %i\n",
-			__func__, __LINE__, error);
-
-		return error;
-	}
-
-	if (!pm_runtime_status_suspended(dev)) {
-		error = pm_generic_runtime_suspend(dev);
-		if (error) {
-			dev_dbg(dev, "%s busy at %i: %i\n",
-				__func__, __LINE__, error);
-
-			return 0;
-		}
-
-		error = sysc_runtime_suspend(ddata->dev);
-		if (error) {
-			dev_err(dev, "%s error at %i: %i\n",
-				__func__, __LINE__, error);
-
-			return error;
-		}
-
-		ddata->child_needs_resume = true;
-	}
-
-	return 0;
-}
-
-static int sysc_child_resume_noirq(struct device *dev)
-{
-	struct sysc *ddata;
-	int error;
-
-	ddata = sysc_child_to_parent(dev);
-
-	dev_dbg(ddata->dev, "%s %s\n", __func__,
-		ddata->name ? ddata->name : "");
-
-	if (ddata->child_needs_resume) {
-		ddata->child_needs_resume = false;
-
-		error = sysc_runtime_resume(ddata->dev);
-		if (error)
-			dev_err(ddata->dev,
-				"%s runtime resume error: %i\n",
-				__func__, error);
-
-		error = pm_generic_runtime_resume(dev);
-		if (error)
-			dev_err(ddata->dev,
-				"%s generic runtime resume: %i\n",
-				__func__, error);
-	}
-
-	return pm_generic_resume_noirq(dev);
-}
-#endif
-
-static struct dev_pm_domain sysc_child_pm_domain = {
-	.ops = {
-		SET_RUNTIME_PM_OPS(sysc_child_runtime_suspend,
-				   sysc_child_runtime_resume,
-				   NULL)
-		USE_PLATFORM_PM_SLEEP_OPS
-		SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(sysc_child_suspend_noirq,
-					      sysc_child_resume_noirq)
-	}
-};
-
 /* Caller needs to take list_lock if ever used outside of cpu_pm */
 static void sysc_reinit_modules(struct sysc_soc_info *soc)
 {
@@ -2610,25 +2525,6 @@ out_unlock:
 	mutex_unlock(&sysc_soc->list_lock);
 }
 
-/**
- * sysc_legacy_idle_quirk - handle children in omap_device compatible way
- * @ddata: device driver data
- * @child: child device driver
- *
- * Allow idle for child devices as done with _od_runtime_suspend().
- * Otherwise many child devices will not idle because of the permanent
- * parent usecount set in pm_runtime_irq_safe().
- *
- * Note that the long term solution is to just modify the child device
- * drivers to not set pm_runtime_irq_safe() and then this can be just
- * dropped.
- */
-static void sysc_legacy_idle_quirk(struct sysc *ddata, struct device *child)
-{
-	if (ddata->cfg.quirks & SYSC_QUIRK_LEGACY_IDLE)
-		dev_pm_domain_set(child, &sysc_child_pm_domain);
-}
-
 static int sysc_notifier_call(struct notifier_block *nb,
 			      unsigned long event, void *device)
 {
@@ -2645,7 +2541,6 @@ static int sysc_notifier_call(struct notifier_block *nb,
 		error = sysc_child_add_clocks(ddata, dev);
 		if (error)
 			return error;
-		sysc_legacy_idle_quirk(ddata, dev);
 		break;
 	default:
 		break;
diff --git a/include/linux/platform_data/ti-sysc.h b/include/linux/platform_data/ti-sysc.h
index eb556f988d57..d8f15770a522 100644
--- a/include/linux/platform_data/ti-sysc.h
+++ b/include/linux/platform_data/ti-sysc.h
@@ -71,7 +71,6 @@ struct sysc_regbits {
 #define SYSC_QUIRK_SWSUP_SIDLE_ACT	BIT(12)
 #define SYSC_QUIRK_SWSUP_SIDLE		BIT(11)
 #define SYSC_QUIRK_EXT_OPT_CLOCK	BIT(10)
-#define SYSC_QUIRK_LEGACY_IDLE		BIT(9)
 #define SYSC_QUIRK_RESET_STATUS		BIT(8)
 #define SYSC_QUIRK_NO_IDLE		BIT(7)
 #define SYSC_QUIRK_NO_IDLE_ON_INIT	BIT(6)
-- 
cgit v1.2.3


From dbf76c0d3da8d18a46f75130bdfae7b3b54407c5 Mon Sep 17 00:00:00 2001
From: André Draszik <andre.draszik@linaro.org>
Date: Tue, 23 Apr 2024 15:31:03 +0100
Subject: dt-bindings: clock: google,gs101-clock: add HSI0 clock management
 unit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add dt-schema documentation and clock IDs for the high speed interface
0 HSI0 clock management unit. This is used (amongst others) for USB.

While the usual (sed) script has been used to derive the linux clock
IDs from the data sheet, one manual tweak was applied to fix a typo
which we don't want to carry:
    HSI0_USPDPDBG_USER -> HSI0_USBDPDBG_USER (note USB vs USP).

Signed-off-by: André Draszik <andre.draszik@linaro.org>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Link: https://lore.kernel.org/r/20240423-hsi0-gs101-v1-1-2c3ddb50c720@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../bindings/clock/google,gs101-clock.yaml         | 29 +++++++++++-
 include/dt-bindings/clock/google,gs101.h           | 54 ++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml b/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml
index 1d2bcea41c85..94dcc4f84c85 100644
--- a/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml
@@ -30,16 +30,17 @@ properties:
       - google,gs101-cmu-top
       - google,gs101-cmu-apm
       - google,gs101-cmu-misc
+      - google,gs101-cmu-hsi0
       - google,gs101-cmu-peric0
       - google,gs101-cmu-peric1
 
   clocks:
     minItems: 1
-    maxItems: 3
+    maxItems: 5
 
   clock-names:
     minItems: 1
-    maxItems: 3
+    maxItems: 5
 
   "#clock-cells":
     const: 1
@@ -72,6 +73,30 @@ allOf:
           items:
             - const: oscclk
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            const: google,gs101-cmu-hsi0
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (24.576 MHz)
+            - description: HSI0 bus clock (from CMU_TOP)
+            - description: DPGTC (from CMU_TOP)
+            - description: USB DRD controller clock (from CMU_TOP)
+            - description: USB Display Port debug clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: bus
+            - const: dpgtc
+            - const: usb31drd
+            - const: usbdpdbg
+
   - if:
       properties:
         compatible:
diff --git a/include/dt-bindings/clock/google,gs101.h b/include/dt-bindings/clock/google,gs101.h
index 3dac3577788a..7a2006f0edf1 100644
--- a/include/dt-bindings/clock/google,gs101.h
+++ b/include/dt-bindings/clock/google,gs101.h
@@ -313,6 +313,60 @@
 #define CLK_APM_PLL_DIV4_APM				70
 #define CLK_APM_PLL_DIV16_APM				71
 
+/* CMU_HSI0 */
+#define CLK_FOUT_USB_PLL					1
+#define CLK_MOUT_PLL_USB					2
+#define CLK_MOUT_HSI0_ALT_USER					3
+#define CLK_MOUT_HSI0_BUS_USER					4
+#define CLK_MOUT_HSI0_DPGTC_USER				5
+#define CLK_MOUT_HSI0_TCXO_USER					6
+#define CLK_MOUT_HSI0_USB20_USER				7
+#define CLK_MOUT_HSI0_USB31DRD_USER				8
+#define CLK_MOUT_HSI0_USBDPDBG_USER				9
+#define CLK_MOUT_HSI0_BUS					10
+#define CLK_MOUT_HSI0_USB20_REF					11
+#define CLK_MOUT_HSI0_USB31DRD					12
+#define CLK_DOUT_HSI0_USB31DRD					13
+#define CLK_GOUT_HSI0_PCLK					14
+#define CLK_GOUT_HSI0_USB31DRD_I_USB31DRD_SUSPEND_CLK_26	15
+#define CLK_GOUT_HSI0_CLK_HSI0_ALT				16
+#define CLK_GOUT_HSI0_DP_LINK_I_DP_GTC_CLK			17
+#define CLK_GOUT_HSI0_DP_LINK_I_PCLK				18
+#define CLK_GOUT_HSI0_D_TZPC_HSI0_PCLK				19
+#define CLK_GOUT_HSI0_ETR_MIU_I_ACLK				20
+#define CLK_GOUT_HSI0_ETR_MIU_I_PCLK				21
+#define CLK_GOUT_HSI0_GPC_HSI0_PCLK				22
+#define CLK_GOUT_HSI0_LHM_AXI_G_ETR_HSI0_I_CLK			23
+#define CLK_GOUT_HSI0_LHM_AXI_P_AOCHSI0_I_CLK			24
+#define CLK_GOUT_HSI0_LHM_AXI_P_HSI0_I_CLK			25
+#define CLK_GOUT_HSI0_LHS_ACEL_D_HSI0_I_CLK			26
+#define CLK_GOUT_HSI0_LHS_AXI_D_HSI0AOC_I_CLK			27
+#define CLK_GOUT_HSI0_PPMU_HSI0_AOC_ACLK			28
+#define CLK_GOUT_HSI0_PPMU_HSI0_AOC_PCLK			29
+#define CLK_GOUT_HSI0_PPMU_HSI0_BUS0_ACLK			30
+#define CLK_GOUT_HSI0_PPMU_HSI0_BUS0_PCLK			31
+#define CLK_GOUT_HSI0_CLK_HSI0_BUS_CLK				32
+#define CLK_GOUT_HSI0_SSMT_USB_ACLK				33
+#define CLK_GOUT_HSI0_SSMT_USB_PCLK				34
+#define CLK_GOUT_HSI0_SYSMMU_USB_CLK_S2				35
+#define CLK_GOUT_HSI0_SYSREG_HSI0_PCLK				36
+#define CLK_GOUT_HSI0_UASC_HSI0_CTRL_ACLK			37
+#define CLK_GOUT_HSI0_UASC_HSI0_CTRL_PCLK			38
+#define CLK_GOUT_HSI0_UASC_HSI0_LINK_ACLK			39
+#define CLK_GOUT_HSI0_UASC_HSI0_LINK_PCLK			40
+#define CLK_GOUT_HSI0_USB31DRD_ACLK_PHYCTRL			41
+#define CLK_GOUT_HSI0_USB31DRD_BUS_CLK_EARLY			42
+#define CLK_GOUT_HSI0_USB31DRD_I_USB20_PHY_REFCLK_26		43
+#define CLK_GOUT_HSI0_USB31DRD_I_USB31DRD_REF_CLK_40		44
+#define CLK_GOUT_HSI0_USB31DRD_I_USBDPPHY_REF_SOC_PLL		45
+#define CLK_GOUT_HSI0_USB31DRD_I_USBDPPHY_SCL_APB_PCLK		46
+#define CLK_GOUT_HSI0_USB31DRD_I_USBPCS_APB_CLK			47
+#define CLK_GOUT_HSI0_USB31DRD_USBDPPHY_I_ACLK			48
+#define CLK_GOUT_HSI0_USB31DRD_USBDPPHY_UDBG_I_APB_PCLK		49
+#define CLK_GOUT_HSI0_XIU_D0_HSI0_ACLK				50
+#define CLK_GOUT_HSI0_XIU_D1_HSI0_ACLK				51
+#define CLK_GOUT_HSI0_XIU_P_HSI0_ACLK				52
+
 /* CMU_MISC */
 #define CLK_MOUT_MISC_BUS_USER				1
 #define CLK_MOUT_MISC_SSS_USER				2
-- 
cgit v1.2.3


From 14b5fb2145caeb909a1cd57d9cd5e0c3cd005642 Mon Sep 17 00:00:00 2001
From: Philo Lu <lulie@linux.alibaba.com>
Date: Sun, 21 Apr 2024 12:20:08 +0800
Subject: tcp: move tcp_skb_cb->sacked flags to enum

Move the flag definitions for tcp_skb_cb->sacked into a new enum named
tcp_skb_cb_sacked_flags, then we can get access to them in bpf via
vmlinux.h, e.g., in tracepoints.

This patch does not change any existing functionality.

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index b935e1ae4caf..ffc9371fe9de 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -928,6 +928,19 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
 
 #define TCPHDR_SYN_ECN	(TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
 
+/* State flags for sacked in struct tcp_skb_cb */
+enum tcp_skb_cb_sacked_flags {
+	TCPCB_SACKED_ACKED	= (1 << 0),	/* SKB ACK'd by a SACK block	*/
+	TCPCB_SACKED_RETRANS	= (1 << 1),	/* SKB retransmitted		*/
+	TCPCB_LOST		= (1 << 2),	/* SKB is lost			*/
+	TCPCB_TAGBITS		= (TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS |
+				   TCPCB_LOST),	/* All tag bits			*/
+	TCPCB_REPAIRED		= (1 << 4),	/* SKB repaired (no skb_mstamp_ns)	*/
+	TCPCB_EVER_RETRANS	= (1 << 7),	/* Ever retransmitted frame	*/
+	TCPCB_RETRANS		= (TCPCB_SACKED_RETRANS | TCPCB_EVER_RETRANS |
+				   TCPCB_REPAIRED),
+};
+
 /* This is what the send packet queuing engine uses to pass
  * TCP per-packet control information to the transmission code.
  * We also store the host-order sequence numbers in here too.
@@ -950,15 +963,6 @@ struct tcp_skb_cb {
 	__u8		tcp_flags;	/* TCP header flags. (tcp[13])	*/
 
 	__u8		sacked;		/* State flags for SACK.	*/
-#define TCPCB_SACKED_ACKED	0x01	/* SKB ACK'd by a SACK block	*/
-#define TCPCB_SACKED_RETRANS	0x02	/* SKB retransmitted		*/
-#define TCPCB_LOST		0x04	/* SKB is lost			*/
-#define TCPCB_TAGBITS		0x07	/* All tag bits			*/
-#define TCPCB_REPAIRED		0x10	/* SKB repaired (no skb_mstamp_ns)	*/
-#define TCPCB_EVER_RETRANS	0x80	/* Ever retransmitted frame	*/
-#define TCPCB_RETRANS		(TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
-				TCPCB_REPAIRED)
-
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	__u8		txstamp_ack:1,	/* Record TX timestamp for ack? */
 			eor:1,		/* Is skb MSG_EOR marked? */
-- 
cgit v1.2.3


From d850b5bae0f5e435360edf0474ff446622f0d899 Mon Sep 17 00:00:00 2001
From: Robin Gong <yibin.gong@nxp.com>
Date: Fri, 19 Apr 2024 11:07:29 -0400
Subject: dmaengine: imx-sdma: Add i2c dma support

New sdma script (sdma-6q: v3.6, sdma-7d: v4.6) support i2c at imx8mp and
imx6ull. So add I2C dma support.

Signed-off-by: Robin Gong <yibin.gong@nxp.com>
Acked-by: Clark Wang <xiaoning.wang@nxp.com>
Reviewed-by: Joy Zou <joy.zou@nxp.com>
Reviewed-by: Daniel Baluta <daniel.baluta@nxp.com>
Signed-off-by: Frank Li <Frank.Li@nxp.com>
Link: https://lore.kernel.org/r/20240419150729.1071904-3-Frank.Li@nxp.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/dma/imx-sdma.c      | 7 +++++++
 include/linux/dma/imx-dma.h | 1 +
 2 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index 4a4d44ed03c8..003e1580b902 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -251,6 +251,8 @@ struct sdma_script_start_addrs {
 	s32 sai_2_mcu_addr;
 	s32 uart_2_mcu_rom_addr;
 	s32 uartsh_2_mcu_rom_addr;
+	s32 i2c_2_mcu_addr;
+	s32 mcu_2_i2c_addr;
 	/* End of v3 array */
 	union { s32 v3_end; s32 mcu_2_zqspi_addr; };
 	/* End of v4 array */
@@ -1082,6 +1084,11 @@ static int sdma_get_pc(struct sdma_channel *sdmac,
 		per_2_emi = sdma->script_addrs->sai_2_mcu_addr;
 		emi_2_per = sdma->script_addrs->mcu_2_sai_addr;
 		break;
+	case IMX_DMATYPE_I2C:
+		per_2_emi = sdma->script_addrs->i2c_2_mcu_addr;
+		emi_2_per = sdma->script_addrs->mcu_2_i2c_addr;
+		sdmac->is_ram_script = true;
+		break;
 	case IMX_DMATYPE_HDMI:
 		emi_2_per = sdma->script_addrs->hdmi_dma_addr;
 		sdmac->is_ram_script = true;
diff --git a/include/linux/dma/imx-dma.h b/include/linux/dma/imx-dma.h
index cfec5f946e23..76a8de9ae151 100644
--- a/include/linux/dma/imx-dma.h
+++ b/include/linux/dma/imx-dma.h
@@ -41,6 +41,7 @@ enum sdma_peripheral_type {
 	IMX_DMATYPE_SAI,	/* SAI */
 	IMX_DMATYPE_MULTI_SAI,	/* MULTI FIFOs For Audio */
 	IMX_DMATYPE_HDMI,       /* HDMI Audio */
+	IMX_DMATYPE_I2C,	/* I2C */
 };
 
 enum imx_dma_prio {
-- 
cgit v1.2.3


From 85d3ccc8b75bb5a443edb3c42fa22e97da2e60ec Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Mon, 22 Apr 2024 20:01:44 +0000
Subject: KVM: arm64: vgic-debug: Use an xarray mark for debug iterator

The vgic debug iterator is the final user of vgic_copy_lpi_list(), but
is a bit more complicated to transition to something else. Use a mark
in the LPI xarray to record the indices 'known' to the debug iterator.
Protect against the LPIs from being freed by associating an additional
reference with the xarray mark.

Rework iter_next() to let the xarray walk 'drive' the iteration after
visiting all of the SGIs, PPIs, and SPIs.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240422200158.2606761-6-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-debug.c | 82 +++++++++++++++++++++++++++++-----------
 arch/arm64/kvm/vgic/vgic-its.c   |  4 +-
 arch/arm64/kvm/vgic/vgic.h       |  1 +
 include/kvm/arm_vgic.h           |  2 +
 4 files changed, 64 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 389025ce7749..bcbc8c986b1d 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -28,27 +28,65 @@ struct vgic_state_iter {
 	int nr_lpis;
 	int dist_id;
 	int vcpu_id;
-	int intid;
+	unsigned long intid;
 	int lpi_idx;
-	u32 *lpi_array;
 };
 
-static void iter_next(struct vgic_state_iter *iter)
+static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
 {
+	struct vgic_dist *dist = &kvm->arch.vgic;
+
 	if (iter->dist_id == 0) {
 		iter->dist_id++;
 		return;
 	}
 
+	/*
+	 * Let the xarray drive the iterator after the last SPI, as the iterator
+	 * has exhausted the sequentially-allocated INTID space.
+	 */
+	if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1)) {
+		if (iter->lpi_idx < iter->nr_lpis)
+			xa_find_after(&dist->lpi_xa, &iter->intid,
+				      VGIC_LPI_MAX_INTID,
+				      LPI_XA_MARK_DEBUG_ITER);
+		iter->lpi_idx++;
+		return;
+	}
+
 	iter->intid++;
 	if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
 	    ++iter->vcpu_id < iter->nr_cpus)
 		iter->intid = 0;
+}
 
-	if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
-		if (iter->lpi_idx < iter->nr_lpis)
-			iter->intid = iter->lpi_array[iter->lpi_idx];
-		iter->lpi_idx++;
+static int iter_mark_lpis(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq;
+	unsigned long intid;
+	int nr_lpis = 0;
+
+	xa_for_each(&dist->lpi_xa, intid, irq) {
+		if (!vgic_try_get_irq_kref(irq))
+			continue;
+
+		xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+		nr_lpis++;
+	}
+
+	return nr_lpis;
+}
+
+static void iter_unmark_lpis(struct kvm *kvm)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct vgic_irq *irq;
+	unsigned long intid;
+
+	xa_for_each(&dist->lpi_xa, intid, irq) {
+		xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+		vgic_put_irq(kvm, irq);
 	}
 }
 
@@ -61,15 +99,12 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
 
 	iter->nr_cpus = nr_cpus;
 	iter->nr_spis = kvm->arch.vgic.nr_spis;
-	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
-		iter->nr_lpis = vgic_copy_lpi_list(kvm, NULL, &iter->lpi_array);
-		if (iter->nr_lpis < 0)
-			iter->nr_lpis = 0;
-	}
+	if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+		iter->nr_lpis = iter_mark_lpis(kvm);
 
 	/* Fast forward to the right position if needed */
 	while (pos--)
-		iter_next(iter);
+		iter_next(kvm, iter);
 }
 
 static bool end_of_vgic(struct vgic_state_iter *iter)
@@ -114,7 +149,7 @@ static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
 	struct vgic_state_iter *iter = kvm->arch.vgic.iter;
 
 	++*pos;
-	iter_next(iter);
+	iter_next(kvm, iter);
 	if (end_of_vgic(iter))
 		iter = NULL;
 	return iter;
@@ -134,13 +169,14 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
 
 	mutex_lock(&kvm->arch.config_lock);
 	iter = kvm->arch.vgic.iter;
-	kfree(iter->lpi_array);
+	iter_unmark_lpis(kvm);
 	kfree(iter);
 	kvm->arch.vgic.iter = NULL;
 	mutex_unlock(&kvm->arch.config_lock);
 }
 
-static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
+static void print_dist_state(struct seq_file *s, struct vgic_dist *dist,
+			     struct vgic_state_iter *iter)
 {
 	bool v3 = dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3;
 
@@ -149,7 +185,7 @@ static void print_dist_state(struct seq_file *s, struct vgic_dist *dist)
 	seq_printf(s, "vgic_model:\t%s\n", v3 ? "GICv3" : "GICv2");
 	seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
 	if (v3)
-		seq_printf(s, "nr_lpis:\t%d\n", atomic_read(&dist->lpi_count));
+		seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis);
 	seq_printf(s, "enabled:\t%d\n", dist->enabled);
 	seq_printf(s, "\n");
 
@@ -236,7 +272,7 @@ static int vgic_debug_show(struct seq_file *s, void *v)
 	unsigned long flags;
 
 	if (iter->dist_id == 0) {
-		print_dist_state(s, &kvm->arch.vgic);
+		print_dist_state(s, &kvm->arch.vgic, iter);
 		return 0;
 	}
 
@@ -246,11 +282,13 @@ static int vgic_debug_show(struct seq_file *s, void *v)
 	if (iter->vcpu_id < iter->nr_cpus)
 		vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
 
+	/*
+	 * Expect this to succeed, as iter_mark_lpis() takes a reference on
+	 * every LPI to be visited.
+	 */
 	irq = vgic_get_irq(kvm, vcpu, iter->intid);
-	if (!irq) {
-		seq_printf(s, "       LPI %4d freed\n", iter->intid);
-		return 0;
-	}
+	if (WARN_ON_ONCE(!irq))
+		return -EINVAL;
 
 	raw_spin_lock_irqsave(&irq->irq_lock, flags);
 	print_irq_state(s, irq, vcpu);
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 420a71597b78..5025ac968d27 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -316,8 +316,6 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
 	return 0;
 }
 
-#define GIC_LPI_MAX_INTID	((1 << INTERRUPT_ID_BITS_ITS) - 1)
-
 /*
  * Create a snapshot of the current LPIs targeting @vcpu, so that we can
  * enumerate those LPIs without holding any lock.
@@ -347,7 +345,7 @@ int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
 	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
 	rcu_read_lock();
 
-	xas_for_each(&xas, irq, GIC_LPI_MAX_INTID) {
+	xas_for_each(&xas, irq, VGIC_LPI_MAX_INTID) {
 		if (i == irq_count)
 			break;
 		/* We don't need to "get" the IRQ, as we hold the list lock. */
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 0c2b82de8fa3..e0c77e1bd9f6 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -16,6 +16,7 @@
 
 #define INTERRUPT_ID_BITS_SPIS	10
 #define INTERRUPT_ID_BITS_ITS	16
+#define VGIC_LPI_MAX_INTID	((1 << INTERRUPT_ID_BITS_ITS) - 1)
 #define VGIC_PRI_BITS		5
 
 #define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 47035946648e..8eb72721dac1 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -276,6 +276,8 @@ struct vgic_dist {
 
 	/* Protects the lpi_list. */
 	raw_spinlock_t		lpi_list_lock;
+
+#define LPI_XA_MARK_DEBUG_ITER	XA_MARK_0
 	struct xarray		lpi_xa;
 	atomic_t		lpi_count;
 
-- 
cgit v1.2.3


From 30a0ce9c4928640efd3112d01d432d0778878b7e Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Mon, 22 Apr 2024 20:01:45 +0000
Subject: KVM: arm64: vgic-its: Get rid of vgic_copy_lpi_list()

The last user has been transitioned to walking the LPI xarray directly.
Cut the wart off, and get rid of the now unneeded lpi_count while doing
so.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240422200158.2606761-7-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-its.c | 48 ------------------------------------------
 arch/arm64/kvm/vgic/vgic.c     |  1 -
 arch/arm64/kvm/vgic/vgic.h     |  1 -
 include/kvm/arm_vgic.h         |  1 -
 4 files changed, 51 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 5025ac968d27..441134ad674e 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -86,11 +86,8 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 	if (ret) {
 		xa_release(&dist->lpi_xa, intid);
 		kfree(irq);
-		goto out_unlock;
 	}
 
-	atomic_inc(&dist->lpi_count);
-
 out_unlock:
 	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 
@@ -316,51 +313,6 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
 	return 0;
 }
 
-/*
- * Create a snapshot of the current LPIs targeting @vcpu, so that we can
- * enumerate those LPIs without holding any lock.
- * Returns their number and puts the kmalloc'ed array into intid_ptr.
- */
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	XA_STATE(xas, &dist->lpi_xa, GIC_LPI_OFFSET);
-	struct vgic_irq *irq;
-	unsigned long flags;
-	u32 *intids;
-	int irq_count, i = 0;
-
-	/*
-	 * There is an obvious race between allocating the array and LPIs
-	 * being mapped/unmapped. If we ended up here as a result of a
-	 * command, we're safe (locks are held, preventing another
-	 * command). If coming from another path (such as enabling LPIs),
-	 * we must be careful not to overrun the array.
-	 */
-	irq_count = atomic_read(&dist->lpi_count);
-	intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL_ACCOUNT);
-	if (!intids)
-		return -ENOMEM;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-	rcu_read_lock();
-
-	xas_for_each(&xas, irq, VGIC_LPI_MAX_INTID) {
-		if (i == irq_count)
-			break;
-		/* We don't need to "get" the IRQ, as we hold the list lock. */
-		if (vcpu && irq->target_vcpu != vcpu)
-			continue;
-		intids[i++] = irq->intid;
-	}
-
-	rcu_read_unlock();
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
-	*intid_ptr = intids;
-	return i;
-}
-
 static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
 {
 	int ret = 0;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 4ec93587c8cd..e3ee1bc1214a 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -126,7 +126,6 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
 	__xa_erase(&dist->lpi_xa, irq->intid);
 	xa_unlock_irqrestore(&dist->lpi_xa, flags);
 
-	atomic_dec(&dist->lpi_count);
 	kfree_rcu(irq, rcu);
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index e0c77e1bd9f6..060dfd96b41f 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -331,7 +331,6 @@ static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
 }
 
 bool vgic_lpis_enabled(struct kvm_vcpu *vcpu);
-int vgic_copy_lpi_list(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 **intid_ptr);
 int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
 			 u32 devid, u32 eventid, struct vgic_irq **irq);
 struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 8eb72721dac1..ac7f15ec1586 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -279,7 +279,6 @@ struct vgic_dist {
 
 #define LPI_XA_MARK_DEBUG_ITER	XA_MARK_0
 	struct xarray		lpi_xa;
-	atomic_t		lpi_count;
 
 	/* LPI translation cache */
 	struct list_head	lpi_translation_cache;
-- 
cgit v1.2.3


From 8201d1028caa4fae88e222c4e8cf541fdf45b821 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Mon, 22 Apr 2024 20:01:47 +0000
Subject: KVM: arm64: vgic-its: Maintain a translation cache per ITS

Within the context of a single ITS, it is possible to use an xarray to
cache the device ID & event ID translation to a particular irq
descriptor. Take advantage of this to build a translation cache capable
of fitting all valid translations for a given ITS.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240422200158.2606761-9-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-its.c | 37 ++++++++++++++++++++++++++++++++++++-
 include/kvm/arm_vgic.h         |  6 ++++++
 2 files changed, 42 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 2caa30bf20c7..1cea0d78025b 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -511,6 +511,11 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
 	return 0;
 }
 
+static unsigned long vgic_its_cache_key(u32 devid, u32 eventid)
+{
+	return (((unsigned long)devid) << VITS_TYPER_IDBITS) | eventid;
+}
+
 static struct vgic_irq *__vgic_its_check_cache(struct vgic_dist *dist,
 					       phys_addr_t db,
 					       u32 devid, u32 eventid)
@@ -564,8 +569,10 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 				       u32 devid, u32 eventid,
 				       struct vgic_irq *irq)
 {
+	unsigned long cache_key = vgic_its_cache_key(devid, eventid);
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_translation_cache_entry *cte;
+	struct vgic_irq *old;
 	unsigned long flags;
 	phys_addr_t db;
 
@@ -604,6 +611,15 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 	 * its_lock, as the ITE (and the reference it holds) cannot be freed.
 	 */
 	lockdep_assert_held(&its->its_lock);
+
+	/*
+	 * Yes, two references are necessary at the moment:
+	 *  - One for the global LPI translation cache
+	 *  - Another for the translation cache belonging to @its
+	 *
+	 * This will soon disappear.
+	 */
+	vgic_get_irq_kref(irq);
 	vgic_get_irq_kref(irq);
 
 	cte->db		= db;
@@ -613,6 +629,16 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 
 	/* Move the new translation to the head of the list */
 	list_move(&cte->entry, &dist->lpi_translation_cache);
+	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+	/*
+	 * The per-ITS cache is a perfect cache, so it may already have an
+	 * identical translation even if it were missing from the global
+	 * cache. Ensure we don't leak a reference if that is the case.
+	 */
+	old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
+	if (old)
+		vgic_put_irq(kvm, old);
 
 out:
 	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
@@ -623,7 +649,8 @@ static void vgic_its_invalidate_cache(struct vgic_its *its)
 	struct kvm *kvm = its->dev->kvm;
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct vgic_translation_cache_entry *cte;
-	unsigned long flags;
+	unsigned long flags, idx;
+	struct vgic_irq *irq;
 
 	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
 
@@ -640,6 +667,11 @@ static void vgic_its_invalidate_cache(struct vgic_its *its)
 	}
 
 	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+
+	xa_for_each(&its->translation_cache, idx, irq) {
+		xa_erase(&its->translation_cache, idx);
+		vgic_put_irq(kvm, irq);
+	}
 }
 
 void vgic_its_invalidate_all_caches(struct kvm *kvm)
@@ -1962,6 +1994,7 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
 
 	INIT_LIST_HEAD(&its->device_list);
 	INIT_LIST_HEAD(&its->collection_list);
+	xa_init(&its->translation_cache);
 
 	dev->kvm->arch.vgic.msis_require_devid = true;
 	dev->kvm->arch.vgic.has_its = true;
@@ -1992,6 +2025,8 @@ static void vgic_its_destroy(struct kvm_device *kvm_dev)
 
 	vgic_its_free_device_list(kvm, its);
 	vgic_its_free_collection_list(kvm, its);
+	vgic_its_invalidate_cache(its);
+	xa_destroy(&its->translation_cache);
 
 	mutex_unlock(&its->its_lock);
 	kfree(its);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index ac7f15ec1586..c15e7fcccb86 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -210,6 +210,12 @@ struct vgic_its {
 	struct mutex		its_lock;
 	struct list_head	device_list;
 	struct list_head	collection_list;
+
+	/*
+	 * Caches the (device_id, event_id) -> vgic_irq translation for
+	 * LPIs that are mapped and enabled.
+	 */
+	struct xarray		translation_cache;
 };
 
 struct vgic_state_iter;
-- 
cgit v1.2.3


From ec39bbfd55d07de2e2d4111f35c7ad9523c89ec3 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Mon, 22 Apr 2024 20:01:50 +0000
Subject: KVM: arm64: vgic-its: Rip out the global translation cache

The MSI injection fast path has been transitioned away from the global
translation cache. Rip it out.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240422200158.2606761-12-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c |   7 ---
 arch/arm64/kvm/vgic/vgic-its.c  | 121 ++--------------------------------------
 arch/arm64/kvm/vgic/vgic.h      |   2 -
 include/kvm/arm_vgic.h          |   3 -
 4 files changed, 4 insertions(+), 129 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index f20941f83a07..6ee42f395253 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,6 @@ void kvm_vgic_early_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 
-	INIT_LIST_HEAD(&dist->lpi_translation_cache);
 	raw_spin_lock_init(&dist->lpi_list_lock);
 	xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
 }
@@ -305,9 +304,6 @@ int vgic_init(struct kvm *kvm)
 		}
 	}
 
-	if (vgic_has_its(kvm))
-		vgic_lpi_translation_cache_init(kvm);
-
 	/*
 	 * If we have GICv4.1 enabled, unconditionally request enable the
 	 * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
@@ -361,9 +357,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
 		dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
 	}
 
-	if (vgic_has_its(kvm))
-		vgic_lpi_translation_cache_destroy(kvm);
-
 	if (vgic_supports_direct_msis(kvm))
 		vgic_v4_teardown(kvm);
 
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index 9a517faa43ae..bb7f4fd35b2b 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -149,14 +149,6 @@ struct its_ite {
 	u32 event_id;
 };
 
-struct vgic_translation_cache_entry {
-	struct list_head	entry;
-	phys_addr_t		db;
-	u32			devid;
-	u32			eventid;
-	struct vgic_irq		*irq;
-};
-
 /**
  * struct vgic_its_abi - ITS abi ops and settings
  * @cte_esz: collection table entry size
@@ -568,96 +560,34 @@ static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
 				       struct vgic_irq *irq)
 {
 	unsigned long cache_key = vgic_its_cache_key(devid, eventid);
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_translation_cache_entry *cte;
 	struct vgic_irq *old;
-	unsigned long flags;
-	phys_addr_t db;
 
 	/* Do not cache a directly injected interrupt */
 	if (irq->hw)
 		return;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	if (unlikely(list_empty(&dist->lpi_translation_cache)))
-		goto out;
-
-	db = its->vgic_its_base + GITS_TRANSLATER;
-
-	/* Always reuse the last entry (LRU policy) */
-	cte = list_last_entry(&dist->lpi_translation_cache,
-			      typeof(*cte), entry);
-
-	/*
-	 * Caching the translation implies having an extra reference
-	 * to the interrupt, so drop the potential reference on what
-	 * was in the cache, and increment it on the new interrupt.
-	 */
-	if (cte->irq)
-		vgic_put_irq(kvm, cte->irq);
-
 	/*
 	 * The irq refcount is guaranteed to be nonzero while holding the
 	 * its_lock, as the ITE (and the reference it holds) cannot be freed.
 	 */
 	lockdep_assert_held(&its->its_lock);
-
-	/*
-	 * Yes, two references are necessary at the moment:
-	 *  - One for the global LPI translation cache
-	 *  - Another for the translation cache belonging to @its
-	 *
-	 * This will soon disappear.
-	 */
-	vgic_get_irq_kref(irq);
 	vgic_get_irq_kref(irq);
 
-	cte->db		= db;
-	cte->devid	= devid;
-	cte->eventid	= eventid;
-	cte->irq	= irq;
-
-	/* Move the new translation to the head of the list */
-	list_move(&cte->entry, &dist->lpi_translation_cache);
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
-
 	/*
-	 * The per-ITS cache is a perfect cache, so it may already have an
-	 * identical translation even if it were missing from the global
-	 * cache. Ensure we don't leak a reference if that is the case.
+	 * We could have raced with another CPU caching the same
+	 * translation behind our back, ensure we don't leak a
+	 * reference if that is the case.
 	 */
 	old = xa_store(&its->translation_cache, cache_key, irq, GFP_KERNEL_ACCOUNT);
 	if (old)
 		vgic_put_irq(kvm, old);
-
-out:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
 }
 
 static void vgic_its_invalidate_cache(struct vgic_its *its)
 {
 	struct kvm *kvm = its->dev->kvm;
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_translation_cache_entry *cte;
-	unsigned long flags, idx;
 	struct vgic_irq *irq;
-
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
-
-	list_for_each_entry(cte, &dist->lpi_translation_cache, entry) {
-		/*
-		 * If we hit a NULL entry, there is nothing after this
-		 * point.
-		 */
-		if (!cte->irq)
-			break;
-
-		vgic_put_irq(kvm, cte->irq);
-		cte->irq = NULL;
-	}
-
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+	unsigned long idx;
 
 	xa_for_each(&its->translation_cache, idx, irq) {
 		xa_erase(&its->translation_cache, idx);
@@ -1880,47 +1810,6 @@ out:
 	return ret;
 }
 
-/* Default is 16 cached LPIs per vcpu */
-#define LPI_DEFAULT_PCPU_CACHE_SIZE	16
-
-void vgic_lpi_translation_cache_init(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	unsigned int sz;
-	int i;
-
-	if (!list_empty(&dist->lpi_translation_cache))
-		return;
-
-	sz = atomic_read(&kvm->online_vcpus) * LPI_DEFAULT_PCPU_CACHE_SIZE;
-
-	for (i = 0; i < sz; i++) {
-		struct vgic_translation_cache_entry *cte;
-
-		/* An allocation failure is not fatal */
-		cte = kzalloc(sizeof(*cte), GFP_KERNEL_ACCOUNT);
-		if (WARN_ON(!cte))
-			break;
-
-		INIT_LIST_HEAD(&cte->entry);
-		list_add(&cte->entry, &dist->lpi_translation_cache);
-	}
-}
-
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm)
-{
-	struct vgic_dist *dist = &kvm->arch.vgic;
-	struct vgic_translation_cache_entry *cte, *tmp;
-
-	vgic_its_invalidate_all_caches(kvm);
-
-	list_for_each_entry_safe(cte, tmp,
-				 &dist->lpi_translation_cache, entry) {
-		list_del(&cte->entry);
-		kfree(cte);
-	}
-}
-
 #define INITIAL_BASER_VALUE						  \
 	(GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)		| \
 	 GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)		| \
@@ -1953,8 +1842,6 @@ static int vgic_its_create(struct kvm_device *dev, u32 type)
 			kfree(its);
 			return ret;
 		}
-
-		vgic_lpi_translation_cache_init(dev->kvm);
 	}
 
 	mutex_init(&its->its_lock);
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index e5cda1eb4bcf..407640c24049 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -335,8 +335,6 @@ int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
 			 u32 devid, u32 eventid, struct vgic_irq **irq);
 struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
 int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
-void vgic_lpi_translation_cache_init(struct kvm *kvm);
-void vgic_lpi_translation_cache_destroy(struct kvm *kvm);
 void vgic_its_invalidate_all_caches(struct kvm *kvm);
 
 /* GICv4.1 MMIO interface */
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index c15e7fcccb86..76ed097500c0 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -286,9 +286,6 @@ struct vgic_dist {
 #define LPI_XA_MARK_DEBUG_ITER	XA_MARK_0
 	struct xarray		lpi_xa;
 
-	/* LPI translation cache */
-	struct list_head	lpi_translation_cache;
-
 	/* used by vgic-debug */
 	struct vgic_state_iter *iter;
 
-- 
cgit v1.2.3


From 481c9ee846d27c72acc0c3bb23025c7fdad8c171 Mon Sep 17 00:00:00 2001
From: Oliver Upton <oliver.upton@linux.dev>
Date: Mon, 22 Apr 2024 20:01:51 +0000
Subject: KVM: arm64: vgic-its: Get rid of the lpi_list_lock

The last genuine use case for the lpi_list_lock was the global LPI
translation cache, which has been removed in favor of a per-ITS xarray.
Remove a layer from the locking puzzle by getting rid of it.

vgic_add_lpi() still has a critical section that needs to protect
against the insertion of other LPIs; change it to take the LPI xarray's
xa_lock to retain this property.

Signed-off-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240422200158.2606761-13-oliver.upton@linux.dev
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c | 1 -
 arch/arm64/kvm/vgic/vgic-its.c  | 6 +++---
 arch/arm64/kvm/vgic/vgic.c      | 5 ++---
 include/kvm/arm_vgic.h          | 3 ---
 4 files changed, 5 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 6ee42f395253..aee6083d0da6 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,6 @@ void kvm_vgic_early_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 
-	raw_spin_lock_init(&dist->lpi_list_lock);
 	xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index bb7f4fd35b2b..40bb43f20bf3 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -69,7 +69,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 	irq->target_vcpu = vcpu;
 	irq->group = 1;
 
-	raw_spin_lock_irqsave(&dist->lpi_list_lock, flags);
+	xa_lock_irqsave(&dist->lpi_xa, flags);
 
 	/*
 	 * There could be a race with another vgic_add_lpi(), so we need to
@@ -84,14 +84,14 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
 		goto out_unlock;
 	}
 
-	ret = xa_err(xa_store(&dist->lpi_xa, intid, irq, 0));
+	ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
 	if (ret) {
 		xa_release(&dist->lpi_xa, intid);
 		kfree(irq);
 	}
 
 out_unlock:
-	raw_spin_unlock_irqrestore(&dist->lpi_list_lock, flags);
+	xa_unlock_irqrestore(&dist->lpi_xa, flags);
 
 	if (ret)
 		return ERR_PTR(ret);
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index e3ee1bc1214a..d0c59b51a6b0 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -29,9 +29,8 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
  *       its->cmd_lock (mutex)
  *         its->its_lock (mutex)
  *           vgic_cpu->ap_list_lock		must be taken with IRQs disabled
- *             kvm->lpi_list_lock		must be taken with IRQs disabled
- *               vgic_dist->lpi_xa.xa_lock	must be taken with IRQs disabled
- *                 vgic_irq->irq_lock		must be taken with IRQs disabled
+ *             vgic_dist->lpi_xa.xa_lock	must be taken with IRQs disabled
+ *               vgic_irq->irq_lock		must be taken with IRQs disabled
  *
  * As the ap_list_lock might be taken from the timer interrupt handler,
  * we have to disable IRQs before taking this lock and everything lower
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 76ed097500c0..50b828b278fd 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -280,9 +280,6 @@ struct vgic_dist {
 	 */
 	u64			propbaser;
 
-	/* Protects the lpi_list. */
-	raw_spinlock_t		lpi_list_lock;
-
 #define LPI_XA_MARK_DEBUG_ITER	XA_MARK_0
 	struct xarray		lpi_xa;
 
-- 
cgit v1.2.3


From 33d5ae6cacf46a043578d711ae7239bab55b4455 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Mon, 22 Apr 2024 15:10:10 +0300
Subject: drm/print: drop include debugfs.h and include where needed

Surprisingly many places depend on debugfs.h to be included via
drm_print.h. Fix them.

v3: Also fix armada, ite-it6505, imagination, msm, sti, vc4, and xe

v2: Also fix ivpu and vmwgfx

Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>
Acked-by: Maxime Ripard <mripard@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240410141434.157908-1-jani.nikula@intel.com
Acked-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org> # drm/msm
Acked-by: Matt Coster <matt.coster@imgtec.com> # drm/imagination
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Lucas De Marchi <lucas.demarchi@intel.com>
Acked-by: Robert Foss <rfoss@kernel.org> #drm/bridge
Reviewed-by: Lyude Paul <lyude@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240422121011.4133236-1-jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 drivers/accel/ivpu/ivpu_debugfs.c           | 2 ++
 drivers/gpu/drm/armada/armada_debugfs.c     | 1 +
 drivers/gpu/drm/bridge/ite-it6505.c         | 1 +
 drivers/gpu/drm/bridge/panel.c              | 2 ++
 drivers/gpu/drm/drm_print.c                 | 6 +++---
 drivers/gpu/drm/i915/display/intel_dmc.c    | 1 +
 drivers/gpu/drm/imagination/pvr_fw_trace.c  | 1 +
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c | 2 ++
 drivers/gpu/drm/nouveau/dispnv50/crc.c      | 2 ++
 drivers/gpu/drm/radeon/r100.c               | 1 +
 drivers/gpu/drm/radeon/r300.c               | 1 +
 drivers/gpu/drm/radeon/r420.c               | 1 +
 drivers/gpu/drm/radeon/r600.c               | 3 ++-
 drivers/gpu/drm/radeon/radeon_fence.c       | 1 +
 drivers/gpu/drm/radeon/radeon_gem.c         | 1 +
 drivers/gpu/drm/radeon/radeon_ib.c          | 2 ++
 drivers/gpu/drm/radeon/radeon_pm.c          | 1 +
 drivers/gpu/drm/radeon/radeon_ring.c        | 2 ++
 drivers/gpu/drm/radeon/radeon_ttm.c         | 1 +
 drivers/gpu/drm/radeon/rs400.c              | 1 +
 drivers/gpu/drm/radeon/rv515.c              | 1 +
 drivers/gpu/drm/sti/sti_drv.c               | 1 +
 drivers/gpu/drm/ttm/ttm_device.c            | 1 +
 drivers/gpu/drm/ttm/ttm_resource.c          | 3 ++-
 drivers/gpu/drm/ttm/ttm_tt.c                | 5 +++--
 drivers/gpu/drm/vc4/vc4_drv.h               | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_gem.c         | 2 ++
 drivers/gpu/drm/xe/xe_debugfs.c             | 1 +
 drivers/gpu/drm/xe/xe_gt_debugfs.c          | 2 ++
 drivers/gpu/drm/xe/xe_uc_debugfs.c          | 2 ++
 include/drm/drm_print.h                     | 2 +-
 31 files changed, 46 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c
index d09d29775b3f..e07e447d08d1 100644
--- a/drivers/accel/ivpu/ivpu_debugfs.c
+++ b/drivers/accel/ivpu/ivpu_debugfs.c
@@ -3,6 +3,8 @@
  * Copyright (C) 2020-2023 Intel Corporation
  */
 
+#include <linux/debugfs.h>
+
 #include <drm/drm_debugfs.h>
 #include <drm/drm_file.h>
 #include <drm/drm_print.h>
diff --git a/drivers/gpu/drm/armada/armada_debugfs.c b/drivers/gpu/drm/armada/armada_debugfs.c
index 29f4b52e3c8d..a763349dd89f 100644
--- a/drivers/gpu/drm/armada/armada_debugfs.c
+++ b/drivers/gpu/drm/armada/armada_debugfs.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/ctype.h>
+#include <linux/debugfs.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
diff --git a/drivers/gpu/drm/bridge/ite-it6505.c b/drivers/gpu/drm/bridge/ite-it6505.c
index 27334173e911..3f68c82888c2 100644
--- a/drivers/gpu/drm/bridge/ite-it6505.c
+++ b/drivers/gpu/drm/bridge/ite-it6505.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2020, The Linux Foundation. All rights reserved.
  */
 #include <linux/bits.h>
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/device.h>
 #include <linux/err.h>
diff --git a/drivers/gpu/drm/bridge/panel.c b/drivers/gpu/drm/bridge/panel.c
index 7f41525f7a6e..32506524d9a2 100644
--- a/drivers/gpu/drm/bridge/panel.c
+++ b/drivers/gpu/drm/bridge/panel.c
@@ -4,6 +4,8 @@
  * Copyright (C) 2017 Broadcom
  */
 
+#include <linux/debugfs.h>
+
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_bridge.h>
 #include <drm/drm_connector.h>
diff --git a/drivers/gpu/drm/drm_print.c b/drivers/gpu/drm/drm_print.c
index 699b7dbffd7b..cf2efb44722c 100644
--- a/drivers/gpu/drm/drm_print.c
+++ b/drivers/gpu/drm/drm_print.c
@@ -23,13 +23,13 @@
  * Rob Clark <robdclark@gmail.com>
  */
 
-#include <linux/stdarg.h>
-
+#include <linux/debugfs.h>
+#include <linux/dynamic_debug.h>
 #include <linux/io.h>
 #include <linux/moduleparam.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
-#include <linux/dynamic_debug.h>
+#include <linux/stdarg.h>
 
 #include <drm/drm.h>
 #include <drm/drm_drv.h>
diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c b/drivers/gpu/drm/i915/display/intel_dmc.c
index a34ff3383fd3..370d61c7e342 100644
--- a/drivers/gpu/drm/i915/display/intel_dmc.c
+++ b/drivers/gpu/drm/i915/display/intel_dmc.c
@@ -22,6 +22,7 @@
  *
  */
 
+#include <linux/debugfs.h>
 #include <linux/firmware.h>
 
 #include "i915_drv.h"
diff --git a/drivers/gpu/drm/imagination/pvr_fw_trace.c b/drivers/gpu/drm/imagination/pvr_fw_trace.c
index 31199e45b72e..73707daa4e52 100644
--- a/drivers/gpu/drm/imagination/pvr_fw_trace.c
+++ b/drivers/gpu/drm/imagination/pvr_fw_trace.c
@@ -12,6 +12,7 @@
 
 #include <linux/build_bug.h>
 #include <linux/dcache.h>
+#include <linux/debugfs.h>
 #include <linux/sysfs.h>
 #include <linux/types.h>
 
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c
index 0bf8a83e8df3..8586f2761782 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_sspp.c
@@ -2,6 +2,8 @@
 /* Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
  */
 
+#include <linux/debugfs.h>
+
 #include "dpu_hwio.h"
 #include "dpu_hw_catalog.h"
 #include "dpu_hw_lm.h"
diff --git a/drivers/gpu/drm/nouveau/dispnv50/crc.c b/drivers/gpu/drm/nouveau/dispnv50/crc.c
index 9c942fbd836d..5936b6b3b15d 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/crc.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/crc.c
@@ -1,5 +1,7 @@
 // SPDX-License-Identifier: MIT
+#include <linux/debugfs.h>
 #include <linux/string.h>
+
 #include <drm/drm_crtc.h>
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_vblank.h>
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 86b8b770af19..0b1e19345f43 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -26,6 +26,7 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/pci.h>
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 25201b9a5aae..1620f534f55f 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -26,6 +26,7 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/pci.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
diff --git a/drivers/gpu/drm/radeon/r420.c b/drivers/gpu/drm/radeon/r420.c
index eae8a6389f5e..a979662eaa73 100644
--- a/drivers/gpu/drm/radeon/r420.c
+++ b/drivers/gpu/drm/radeon/r420.c
@@ -26,6 +26,7 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/pci.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index b5e97d95a19f..087d41e370fd 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -26,11 +26,12 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/firmware.h>
 #include <linux/module.h>
 #include <linux/pci.h>
-#include <linux/slab.h>
 #include <linux/seq_file.h>
+#include <linux/slab.h>
 
 #include <drm/drm_device.h>
 #include <drm/drm_vblank.h>
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 9ebe4a0b9a6c..4fb780d96f32 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -30,6 +30,7 @@
  */
 
 #include <linux/atomic.h>
+#include <linux/debugfs.h>
 #include <linux/firmware.h>
 #include <linux/kref.h>
 #include <linux/sched/signal.h>
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 3fec3acdaf28..2ef201a072f1 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -26,6 +26,7 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/iosys-map.h>
 #include <linux/pci.h>
 
diff --git a/drivers/gpu/drm/radeon/radeon_ib.c b/drivers/gpu/drm/radeon/radeon_ib.c
index fb9ecf5dbe2b..63d914f3414d 100644
--- a/drivers/gpu/drm/radeon/radeon_ib.c
+++ b/drivers/gpu/drm/radeon/radeon_ib.c
@@ -27,6 +27,8 @@
  *          Christian König
  */
 
+#include <linux/debugfs.h>
+
 #include <drm/drm_file.h>
 
 #include "radeon.h"
diff --git a/drivers/gpu/drm/radeon/radeon_pm.c b/drivers/gpu/drm/radeon/radeon_pm.c
index 4482c8c5f5ce..2d9d9f46f243 100644
--- a/drivers/gpu/drm/radeon/radeon_pm.c
+++ b/drivers/gpu/drm/radeon/radeon_pm.c
@@ -21,6 +21,7 @@
  *          Alex Deucher <alexdeucher@gmail.com>
  */
 
+#include <linux/debugfs.h>
 #include <linux/hwmon-sysfs.h>
 #include <linux/hwmon.h>
 #include <linux/pci.h>
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 38048593bb4a..8d1d458286a8 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -27,6 +27,8 @@
  *          Christian König
  */
 
+#include <linux/debugfs.h>
+
 #include <drm/drm_device.h>
 #include <drm/drm_file.h>
 
diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c
index 2078b0000e22..5c65b6dfb99a 100644
--- a/drivers/gpu/drm/radeon/radeon_ttm.c
+++ b/drivers/gpu/drm/radeon/radeon_ttm.c
@@ -30,6 +30,7 @@
  *    Dave Airlie
  */
 
+#include <linux/debugfs.h>
 #include <linux/dma-mapping.h>
 #include <linux/pagemap.h>
 #include <linux/pci.h>
diff --git a/drivers/gpu/drm/radeon/rs400.c b/drivers/gpu/drm/radeon/rs400.c
index d7f552d441ab..d4d1501e6576 100644
--- a/drivers/gpu/drm/radeon/rs400.c
+++ b/drivers/gpu/drm/radeon/rs400.c
@@ -26,6 +26,7 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index 79709d26d983..bbc6ccabf788 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -26,6 +26,7 @@
  *          Jerome Glisse
  */
 
+#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 
diff --git a/drivers/gpu/drm/sti/sti_drv.c b/drivers/gpu/drm/sti/sti_drv.c
index 4bab93c4fefd..1799c12babf5 100644
--- a/drivers/gpu/drm/sti/sti_drv.c
+++ b/drivers/gpu/drm/sti/sti_drv.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/component.h>
+#include <linux/debugfs.h>
 #include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 76027960054f..434cf0258000 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -27,6 +27,7 @@
 
 #define pr_fmt(fmt) "[TTM DEVICE] " fmt
 
+#include <linux/debugfs.h>
 #include <linux/mm.h>
 
 #include <drm/ttm/ttm_bo.h>
diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c
index be8d286513f9..4a66b851b67d 100644
--- a/drivers/gpu/drm/ttm/ttm_resource.c
+++ b/drivers/gpu/drm/ttm/ttm_resource.c
@@ -22,8 +22,9 @@
  * Authors: Christian König
  */
 
-#include <linux/iosys-map.h>
+#include <linux/debugfs.h>
 #include <linux/io-mapping.h>
+#include <linux/iosys-map.h>
 #include <linux/scatterlist.h>
 
 #include <drm/ttm/ttm_bo.h>
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index 578a7c37f00b..474fe7aad2a0 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -32,10 +32,11 @@
 #define pr_fmt(fmt) "[TTM] " fmt
 
 #include <linux/cc_platform.h>
-#include <linux/sched.h>
-#include <linux/shmem_fs.h>
+#include <linux/debugfs.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/shmem_fs.h>
 #include <drm/drm_cache.h>
 #include <drm/drm_device.h>
 #include <drm/drm_util.h>
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index ab61e96e7e14..08e29fa82563 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -5,6 +5,7 @@
 #ifndef _VC4_DRV_H_
 #define _VC4_DRV_H_
 
+#include <linux/debugfs.h>
 #include <linux/delay.h>
 #include <linux/of.h>
 #include <linux/refcount.h>
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c
index 2132a8ad8c0c..07185c108218 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_gem.c
@@ -30,6 +30,8 @@
 #include "drm/drm_prime.h"
 #include "drm/drm_gem_ttm_helper.h"
 
+#include <linux/debugfs.h>
+
 static void vmw_gem_object_free(struct drm_gem_object *gobj)
 {
 	struct ttm_buffer_object *bo = drm_gem_ttm_of_gem(gobj);
diff --git a/drivers/gpu/drm/xe/xe_debugfs.c b/drivers/gpu/drm/xe/xe_debugfs.c
index 01db5b27bec5..bfe7da19f92b 100644
--- a/drivers/gpu/drm/xe/xe_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_debugfs.c
@@ -5,6 +5,7 @@
 
 #include "xe_debugfs.h"
 
+#include <linux/debugfs.h>
 #include <linux/string_helpers.h>
 
 #include <drm/drm_debugfs.h>
diff --git a/drivers/gpu/drm/xe/xe_gt_debugfs.c b/drivers/gpu/drm/xe/xe_gt_debugfs.c
index c4b67cf09f8f..7da029b39703 100644
--- a/drivers/gpu/drm/xe/xe_gt_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_gt_debugfs.c
@@ -5,6 +5,8 @@
 
 #include "xe_gt_debugfs.h"
 
+#include <linux/debugfs.h>
+
 #include <drm/drm_debugfs.h>
 #include <drm/drm_managed.h>
 
diff --git a/drivers/gpu/drm/xe/xe_uc_debugfs.c b/drivers/gpu/drm/xe/xe_uc_debugfs.c
index 0a39ec5a6e99..78eb8db73791 100644
--- a/drivers/gpu/drm/xe/xe_uc_debugfs.c
+++ b/drivers/gpu/drm/xe/xe_uc_debugfs.c
@@ -3,6 +3,8 @@
  * Copyright © 2022 Intel Corporation
  */
 
+#include <linux/debugfs.h>
+
 #include <drm/drm_debugfs.h>
 
 #include "xe_gt.h"
diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index 9cc473e5d353..561c3b96b6fd 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -30,11 +30,11 @@
 #include <linux/printk.h>
 #include <linux/seq_file.h>
 #include <linux/device.h>
-#include <linux/debugfs.h>
 #include <linux/dynamic_debug.h>
 
 #include <drm/drm.h>
 
+struct debugfs_regset32;
 struct drm_device;
 
 /* Do *not* use outside of drm_print.[ch]! */
-- 
cgit v1.2.3


From 9e2b84fb6cd7ee913aa61d461db65c1d6a08dcf2 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Mon, 22 Apr 2024 15:10:11 +0300
Subject: drm/print: drop include seq_file.h

Never include where a forward declaration will suffice.

Reviewed-by: Andrzej Hajda <andrzej.hajda@intel.com>
Acked-by: Maxime Ripard <mripard@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20240410141434.157908-2-jani.nikula@intel.com
Link: https://patchwork.freedesktop.org/patch/msgid/20240422121011.4133236-2-jani.nikula@intel.com
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
---
 include/drm/drm_print.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index 561c3b96b6fd..089950ad8681 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -28,7 +28,6 @@
 
 #include <linux/compiler.h>
 #include <linux/printk.h>
-#include <linux/seq_file.h>
 #include <linux/device.h>
 #include <linux/dynamic_debug.h>
 
@@ -36,6 +35,7 @@
 
 struct debugfs_regset32;
 struct drm_device;
+struct seq_file;
 
 /* Do *not* use outside of drm_print.[ch]! */
 extern unsigned long __drm_debug;
-- 
cgit v1.2.3


From 17553ba8e19dee8770b3dcc597d49dcc3418f3b0 Mon Sep 17 00:00:00 2001
From: Qiang Yu <quic_qianyu@quicinc.com>
Date: Wed, 24 Apr 2024 11:21:53 +0800
Subject: bus: mhi: host: Add sysfs entry to force device to enter EDL

Add sysfs entry to allow users of MHI bus to force device to enter EDL
(Emergency Download) mode to download the device firmware. Since there is
no guarantee that all the devices will support EDL mode, the sysfs entry
is kept as an optional one and will appear only for the supported devices.

Controllers supporting the EDL mode are expected to provide edl_trigger()
callback that puts the device into EDL mode.

Signed-off-by: Qiang Yu <quic_qianyu@quicinc.com>
Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Link: https://lore.kernel.org/r/1713928915-18229-2-git-send-email-quic_qianyu@quicinc.com
[mani: fixed the kernel version and reworded the commit message]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 Documentation/ABI/stable/sysfs-bus-mhi | 13 +++++++++++++
 drivers/bus/mhi/host/init.c            | 33 +++++++++++++++++++++++++++++++++
 include/linux/mhi.h                    |  2 ++
 3 files changed, 48 insertions(+)

(limited to 'include')

diff --git a/Documentation/ABI/stable/sysfs-bus-mhi b/Documentation/ABI/stable/sysfs-bus-mhi
index 1a47f9e0cc84..8b9698fa0beb 100644
--- a/Documentation/ABI/stable/sysfs-bus-mhi
+++ b/Documentation/ABI/stable/sysfs-bus-mhi
@@ -29,3 +29,16 @@ Description:	Initiates a SoC reset on the MHI controller.  A SoC reset is
                 This can be useful as a method of recovery if the device is
                 non-responsive, or as a means of loading new firmware as a
                 system administration task.
+
+What:           /sys/bus/mhi/devices/.../trigger_edl
+Date:           April 2024
+KernelVersion:  6.10
+Contact:        mhi@lists.linux.dev
+Description:    Writing a non-zero value to this file will force devices to
+                enter EDL (Emergency Download) mode. This entry only exists for
+                devices capable of entering the EDL mode using the standard EDL
+                triggering mechanism defined in the MHI spec v1.2. Once in EDL
+                mode, the flash programmer image can be downloaded to the
+                device to enter the flash programmer execution environment.
+                This can be useful if user wants to use QDL (Qualcomm Download,
+                which is used to download firmware over EDL) to update firmware.
diff --git a/drivers/bus/mhi/host/init.c b/drivers/bus/mhi/host/init.c
index 44f934981de8..7104c1856987 100644
--- a/drivers/bus/mhi/host/init.c
+++ b/drivers/bus/mhi/host/init.c
@@ -127,6 +127,30 @@ static ssize_t soc_reset_store(struct device *dev,
 }
 static DEVICE_ATTR_WO(soc_reset);
 
+static ssize_t trigger_edl_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
+{
+	struct mhi_device *mhi_dev = to_mhi_device(dev);
+	struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl;
+	unsigned long val;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	if (!val)
+		return -EINVAL;
+
+	ret = mhi_cntrl->edl_trigger(mhi_cntrl);
+	if (ret)
+		return ret;
+
+	return count;
+}
+static DEVICE_ATTR_WO(trigger_edl);
+
 static struct attribute *mhi_dev_attrs[] = {
 	&dev_attr_serial_number.attr,
 	&dev_attr_oem_pk_hash.attr,
@@ -1018,6 +1042,12 @@ int mhi_register_controller(struct mhi_controller *mhi_cntrl,
 	if (ret)
 		goto err_release_dev;
 
+	if (mhi_cntrl->edl_trigger) {
+		ret = sysfs_create_file(&mhi_dev->dev.kobj, &dev_attr_trigger_edl.attr);
+		if (ret)
+			goto err_release_dev;
+	}
+
 	mhi_cntrl->mhi_dev = mhi_dev;
 
 	mhi_create_debugfs(mhi_cntrl);
@@ -1051,6 +1081,9 @@ void mhi_unregister_controller(struct mhi_controller *mhi_cntrl)
 	mhi_deinit_free_irq(mhi_cntrl);
 	mhi_destroy_debugfs(mhi_cntrl);
 
+	if (mhi_cntrl->edl_trigger)
+		sysfs_remove_file(&mhi_dev->dev.kobj, &dev_attr_trigger_edl.attr);
+
 	destroy_workqueue(mhi_cntrl->hiprio_wq);
 	kfree(mhi_cntrl->mhi_cmd);
 	kfree(mhi_cntrl->mhi_event);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index cde01e133a1b..d968e1ab44dc 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -353,6 +353,7 @@ struct mhi_controller_config {
  * @read_reg: Read a MHI register via the physical link (required)
  * @write_reg: Write a MHI register via the physical link (required)
  * @reset: Controller specific reset function (optional)
+ * @edl_trigger: CB function to trigger EDL mode (optional)
  * @buffer_len: Bounce buffer length
  * @index: Index of the MHI controller instance
  * @bounce_buf: Use of bounce buffer
@@ -435,6 +436,7 @@ struct mhi_controller {
 	void (*write_reg)(struct mhi_controller *mhi_cntrl, void __iomem *addr,
 			  u32 val);
 	void (*reset)(struct mhi_controller *mhi_cntrl);
+	int (*edl_trigger)(struct mhi_controller *mhi_cntrl);
 
 	size_t buffer_len;
 	int index;
-- 
cgit v1.2.3


From 553f94fc7667259e47a9318e9e5702c9a814d637 Mon Sep 17 00:00:00 2001
From: Qiang Yu <quic_qianyu@quicinc.com>
Date: Wed, 24 Apr 2024 11:21:54 +0800
Subject: bus: mhi: host: Add a new API for getting channel doorbell offset

Some controllers may want to access a specific doorbell register. Hence add
a new API that reads the CHDBOFF register and returns the offset of the
doorbell registers from MMIO base, so that the controller can calculate the
address of the specific doorbell register by adding the register offset
with doorbell offset and MMIO base address.

Signed-off-by: Qiang Yu <quic_qianyu@quicinc.com>
Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
Link: https://lore.kernel.org/r/1713928915-18229-3-git-send-email-quic_qianyu@quicinc.com
[mani: reworded commit message and Kdoc]
Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/bus/mhi/host/init.c |  8 +++-----
 drivers/bus/mhi/host/main.c | 16 ++++++++++++++++
 include/linux/mhi.h         |  9 +++++++++
 3 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/bus/mhi/host/init.c b/drivers/bus/mhi/host/init.c
index 7104c1856987..173f79918741 100644
--- a/drivers/bus/mhi/host/init.c
+++ b/drivers/bus/mhi/host/init.c
@@ -541,11 +541,9 @@ int mhi_init_mmio(struct mhi_controller *mhi_cntrl)
 	dev_dbg(dev, "Initializing MHI registers\n");
 
 	/* Read channel db offset */
-	ret = mhi_read_reg(mhi_cntrl, base, CHDBOFF, &val);
-	if (ret) {
-		dev_err(dev, "Unable to read CHDBOFF register\n");
-		return -EIO;
-	}
+	ret = mhi_get_channel_doorbell_offset(mhi_cntrl, &val);
+	if (ret)
+		return ret;
 
 	if (val >= mhi_cntrl->reg_len - (8 * MHI_DEV_WAKE_DB)) {
 		dev_err(dev, "CHDB offset: 0x%x is out of range: 0x%zx\n",
diff --git a/drivers/bus/mhi/host/main.c b/drivers/bus/mhi/host/main.c
index 15d657af9b5b..4de75674f193 100644
--- a/drivers/bus/mhi/host/main.c
+++ b/drivers/bus/mhi/host/main.c
@@ -1691,3 +1691,19 @@ void mhi_unprepare_from_transfer(struct mhi_device *mhi_dev)
 	}
 }
 EXPORT_SYMBOL_GPL(mhi_unprepare_from_transfer);
+
+int mhi_get_channel_doorbell_offset(struct mhi_controller *mhi_cntrl, u32 *chdb_offset)
+{
+	struct device *dev = &mhi_cntrl->mhi_dev->dev;
+	void __iomem *base = mhi_cntrl->regs;
+	int ret;
+
+	ret = mhi_read_reg(mhi_cntrl, base, CHDBOFF, chdb_offset);
+	if (ret) {
+		dev_err(dev, "Unable to read CHDBOFF register\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mhi_get_channel_doorbell_offset);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index d968e1ab44dc..b573f15762f8 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -816,4 +816,13 @@ int mhi_queue_skb(struct mhi_device *mhi_dev, enum dma_data_direction dir,
  */
 bool mhi_queue_is_full(struct mhi_device *mhi_dev, enum dma_data_direction dir);
 
+/**
+ * mhi_get_channel_doorbell_offset - Get the channel doorbell offset
+ * @mhi_cntrl: MHI controller
+ * @chdb_offset: Read channel doorbell offset
+ *
+ * Return: 0 if the read succeeds, a negative error code otherwise
+ */
+int mhi_get_channel_doorbell_offset(struct mhi_controller *mhi_cntrl, u32 *chdb_offset);
+
 #endif /* _MHI_H_ */
-- 
cgit v1.2.3


From b6cc692ac67a75b97a2d524d8e2fd2d59b20248a Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Mon, 22 Apr 2024 13:53:48 +0300
Subject: dt-bindings: clock: r9a07g043-cpg: Add power domain IDs

Add power domain IDs for the RZ/G2UL (R9A07G043) SoC.

Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240422105355.1622177-2-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r9a07g043-cpg.h | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a07g043-cpg.h b/include/dt-bindings/clock/r9a07g043-cpg.h
index a64139fec815..131993343777 100644
--- a/include/dt-bindings/clock/r9a07g043-cpg.h
+++ b/include/dt-bindings/clock/r9a07g043-cpg.h
@@ -200,5 +200,57 @@
 #define R9A07G043_AX45MP_CORE0_RESETN	78	/* RZ/Five Only */
 #define R9A07G043_IAX45_RESETN		79	/* RZ/Five Only */
 
+/* Power domain IDs. */
+#define R9A07G043_PD_ALWAYS_ON		0
+#define R9A07G043_PD_GIC		1	/* RZ/G2UL Only */
+#define R9A07G043_PD_IA55		2	/* RZ/G2UL Only */
+#define R9A07G043_PD_MHU		3	/* RZ/G2UL Only */
+#define R9A07G043_PD_CORESIGHT		4	/* RZ/G2UL Only */
+#define R9A07G043_PD_SYC		5	/* RZ/G2UL Only */
+#define R9A07G043_PD_DMAC		6
+#define R9A07G043_PD_GTM0		7
+#define R9A07G043_PD_GTM1		8
+#define R9A07G043_PD_GTM2		9
+#define R9A07G043_PD_MTU		10
+#define R9A07G043_PD_POE3		11
+#define R9A07G043_PD_WDT0		12
+#define R9A07G043_PD_SPI		13
+#define R9A07G043_PD_SDHI0		14
+#define R9A07G043_PD_SDHI1		15
+#define R9A07G043_PD_ISU		16	/* RZ/G2UL Only */
+#define R9A07G043_PD_CRU		17	/* RZ/G2UL Only */
+#define R9A07G043_PD_LCDC		18	/* RZ/G2UL Only */
+#define R9A07G043_PD_SSI0		19
+#define R9A07G043_PD_SSI1		20
+#define R9A07G043_PD_SSI2		21
+#define R9A07G043_PD_SSI3		22
+#define R9A07G043_PD_SRC		23
+#define R9A07G043_PD_USB0		24
+#define R9A07G043_PD_USB1		25
+#define R9A07G043_PD_USB_PHY		26
+#define R9A07G043_PD_ETHER0		27
+#define R9A07G043_PD_ETHER1		28
+#define R9A07G043_PD_I2C0		29
+#define R9A07G043_PD_I2C1		30
+#define R9A07G043_PD_I2C2		31
+#define R9A07G043_PD_I2C3		32
+#define R9A07G043_PD_SCIF0		33
+#define R9A07G043_PD_SCIF1		34
+#define R9A07G043_PD_SCIF2		35
+#define R9A07G043_PD_SCIF3		36
+#define R9A07G043_PD_SCIF4		37
+#define R9A07G043_PD_SCI0		38
+#define R9A07G043_PD_SCI1		39
+#define R9A07G043_PD_IRDA		40
+#define R9A07G043_PD_RSPI0		41
+#define R9A07G043_PD_RSPI1		42
+#define R9A07G043_PD_RSPI2		43
+#define R9A07G043_PD_CANFD		44
+#define R9A07G043_PD_ADC		45
+#define R9A07G043_PD_TSU		46
+#define R9A07G043_PD_PLIC		47	/* RZ/Five Only */
+#define R9A07G043_PD_IAX45		48	/* RZ/Five Only */
+#define R9A07G043_PD_NCEPLDM		49	/* RZ/Five Only */
+#define R9A07G043_PD_NCEPLMT		50	/* RZ/Five Only */
 
 #endif /* __DT_BINDINGS_CLOCK_R9A07G043_CPG_H__ */
-- 
cgit v1.2.3


From d744e4567419e26025f42e8b1f13c0b021d62819 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Mon, 22 Apr 2024 13:53:49 +0300
Subject: dt-bindings: clock: r9a07g044-cpg: Add power domain IDs

Add power domain IDs for the RZ/G2L (R9A07G044) SoC.

Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240422105355.1622177-3-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r9a07g044-cpg.h | 58 +++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a07g044-cpg.h b/include/dt-bindings/clock/r9a07g044-cpg.h
index 0bb17ff1a01a..e209f96f92b7 100644
--- a/include/dt-bindings/clock/r9a07g044-cpg.h
+++ b/include/dt-bindings/clock/r9a07g044-cpg.h
@@ -217,4 +217,62 @@
 #define R9A07G044_ADC_ADRST_N		82
 #define R9A07G044_TSU_PRESETN		83
 
+/* Power domain IDs. */
+#define R9A07G044_PD_ALWAYS_ON		0
+#define R9A07G044_PD_GIC		1
+#define R9A07G044_PD_IA55		2
+#define R9A07G044_PD_MHU		3
+#define R9A07G044_PD_CORESIGHT		4
+#define R9A07G044_PD_SYC		5
+#define R9A07G044_PD_DMAC		6
+#define R9A07G044_PD_GTM0		7
+#define R9A07G044_PD_GTM1		8
+#define R9A07G044_PD_GTM2		9
+#define R9A07G044_PD_MTU		10
+#define R9A07G044_PD_POE3		11
+#define R9A07G044_PD_GPT		12
+#define R9A07G044_PD_POEGA		13
+#define R9A07G044_PD_POEGB		14
+#define R9A07G044_PD_POEGC		15
+#define R9A07G044_PD_POEGD		16
+#define R9A07G044_PD_WDT0		17
+#define R9A07G044_PD_WDT1		18
+#define R9A07G044_PD_SPI		19
+#define R9A07G044_PD_SDHI0		20
+#define R9A07G044_PD_SDHI1		21
+#define R9A07G044_PD_3DGE		22
+#define R9A07G044_PD_ISU		23
+#define R9A07G044_PD_VCPL4		24
+#define R9A07G044_PD_CRU		25
+#define R9A07G044_PD_MIPI_DSI		26
+#define R9A07G044_PD_LCDC		27
+#define R9A07G044_PD_SSI0		28
+#define R9A07G044_PD_SSI1		29
+#define R9A07G044_PD_SSI2		30
+#define R9A07G044_PD_SSI3		31
+#define R9A07G044_PD_SRC		32
+#define R9A07G044_PD_USB0		33
+#define R9A07G044_PD_USB1		34
+#define R9A07G044_PD_USB_PHY		35
+#define R9A07G044_PD_ETHER0		36
+#define R9A07G044_PD_ETHER1		37
+#define R9A07G044_PD_I2C0		38
+#define R9A07G044_PD_I2C1		39
+#define R9A07G044_PD_I2C2		40
+#define R9A07G044_PD_I2C3		41
+#define R9A07G044_PD_SCIF0		42
+#define R9A07G044_PD_SCIF1		43
+#define R9A07G044_PD_SCIF2		44
+#define R9A07G044_PD_SCIF3		45
+#define R9A07G044_PD_SCIF4		46
+#define R9A07G044_PD_SCI0		47
+#define R9A07G044_PD_SCI1		48
+#define R9A07G044_PD_IRDA		49
+#define R9A07G044_PD_RSPI0		50
+#define R9A07G044_PD_RSPI1		51
+#define R9A07G044_PD_RSPI2		52
+#define R9A07G044_PD_CANFD		53
+#define R9A07G044_PD_ADC		54
+#define R9A07G044_PD_TSU		55
+
 #endif /* __DT_BINDINGS_CLOCK_R9A07G044_CPG_H__ */
-- 
cgit v1.2.3


From 5b9979fda3058c786706d86b6e0ecb8e71be5a84 Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Mon, 22 Apr 2024 13:53:50 +0300
Subject: dt-bindings: clock: r9a07g054-cpg: Add power domain IDs

Add power domain IDs for the RZ/V2L (R9A07G054) SoC.

Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240422105355.1622177-4-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r9a07g054-cpg.h | 58 +++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a07g054-cpg.h b/include/dt-bindings/clock/r9a07g054-cpg.h
index 43f4dbda872c..2c99f89397c4 100644
--- a/include/dt-bindings/clock/r9a07g054-cpg.h
+++ b/include/dt-bindings/clock/r9a07g054-cpg.h
@@ -226,4 +226,62 @@
 #define R9A07G054_TSU_PRESETN		83
 #define R9A07G054_STPAI_ARESETN		84
 
+/* Power domain IDs. */
+#define R9A07G054_PD_ALWAYS_ON		0
+#define R9A07G054_PD_GIC		1
+#define R9A07G054_PD_IA55		2
+#define R9A07G054_PD_MHU		3
+#define R9A07G054_PD_CORESIGHT		4
+#define R9A07G054_PD_SYC		5
+#define R9A07G054_PD_DMAC		6
+#define R9A07G054_PD_GTM0		7
+#define R9A07G054_PD_GTM1		8
+#define R9A07G054_PD_GTM2		9
+#define R9A07G054_PD_MTU		10
+#define R9A07G054_PD_POE3		11
+#define R9A07G054_PD_GPT		12
+#define R9A07G054_PD_POEGA		13
+#define R9A07G054_PD_POEGB		14
+#define R9A07G054_PD_POEGC		15
+#define R9A07G054_PD_POEGD		16
+#define R9A07G054_PD_WDT0		17
+#define R9A07G054_PD_WDT1		18
+#define R9A07G054_PD_SPI		19
+#define R9A07G054_PD_SDHI0		20
+#define R9A07G054_PD_SDHI1		21
+#define R9A07G054_PD_3DGE		22
+#define R9A07G054_PD_ISU		23
+#define R9A07G054_PD_VCPL4		24
+#define R9A07G054_PD_CRU		25
+#define R9A07G054_PD_MIPI_DSI		26
+#define R9A07G054_PD_LCDC		27
+#define R9A07G054_PD_SSI0		28
+#define R9A07G054_PD_SSI1		29
+#define R9A07G054_PD_SSI2		30
+#define R9A07G054_PD_SSI3		31
+#define R9A07G054_PD_SRC		32
+#define R9A07G054_PD_USB0		33
+#define R9A07G054_PD_USB1		34
+#define R9A07G054_PD_USB_PHY		35
+#define R9A07G054_PD_ETHER0		36
+#define R9A07G054_PD_ETHER1		37
+#define R9A07G054_PD_I2C0		38
+#define R9A07G054_PD_I2C1		39
+#define R9A07G054_PD_I2C2		40
+#define R9A07G054_PD_I2C3		41
+#define R9A07G054_PD_SCIF0		42
+#define R9A07G054_PD_SCIF1		43
+#define R9A07G054_PD_SCIF2		44
+#define R9A07G054_PD_SCIF3		45
+#define R9A07G054_PD_SCIF4		46
+#define R9A07G054_PD_SCI0		47
+#define R9A07G054_PD_SCI1		48
+#define R9A07G054_PD_IRDA		49
+#define R9A07G054_PD_RSPI0		50
+#define R9A07G054_PD_RSPI1		51
+#define R9A07G054_PD_RSPI2		52
+#define R9A07G054_PD_CANFD		53
+#define R9A07G054_PD_ADC		54
+#define R9A07G054_PD_TSU		55
+
 #endif /* __DT_BINDINGS_CLOCK_R9A07G054_CPG_H__ */
-- 
cgit v1.2.3


From 2d03ce9cd7bdd1e29772f8ecf7769535361c7eaa Mon Sep 17 00:00:00 2001
From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Date: Mon, 22 Apr 2024 13:53:51 +0300
Subject: dt-bindings: clock: r9a08g045-cpg: Add power domain IDs

Add power domain IDs for the RZ/G3S (R9A08G045) SoC.

Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Link: https://lore.kernel.org/r/20240422105355.1622177-5-claudiu.beznea.uj@bp.renesas.com
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 include/dt-bindings/clock/r9a08g045-cpg.h | 70 +++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

(limited to 'include')

diff --git a/include/dt-bindings/clock/r9a08g045-cpg.h b/include/dt-bindings/clock/r9a08g045-cpg.h
index 410725b778a8..8281e9caf3a9 100644
--- a/include/dt-bindings/clock/r9a08g045-cpg.h
+++ b/include/dt-bindings/clock/r9a08g045-cpg.h
@@ -239,4 +239,74 @@
 #define R9A08G045_I3C_PRESETN		92
 #define R9A08G045_VBAT_BRESETN		93
 
+/* Power domain IDs. */
+#define R9A08G045_PD_ALWAYS_ON		0
+#define R9A08G045_PD_GIC		1
+#define R9A08G045_PD_IA55		2
+#define R9A08G045_PD_MHU		3
+#define R9A08G045_PD_CORESIGHT		4
+#define R9A08G045_PD_SYC		5
+#define R9A08G045_PD_DMAC		6
+#define R9A08G045_PD_GTM0		7
+#define R9A08G045_PD_GTM1		8
+#define R9A08G045_PD_GTM2		9
+#define R9A08G045_PD_GTM3		10
+#define R9A08G045_PD_GTM4		11
+#define R9A08G045_PD_GTM5		12
+#define R9A08G045_PD_GTM6		13
+#define R9A08G045_PD_GTM7		14
+#define R9A08G045_PD_MTU		15
+#define R9A08G045_PD_POE3		16
+#define R9A08G045_PD_GPT		17
+#define R9A08G045_PD_POEGA		18
+#define R9A08G045_PD_POEGB		19
+#define R9A08G045_PD_POEGC		20
+#define R9A08G045_PD_POEGD		21
+#define R9A08G045_PD_WDT0		22
+#define R9A08G045_PD_XSPI		23
+#define R9A08G045_PD_SDHI0		24
+#define R9A08G045_PD_SDHI1		25
+#define R9A08G045_PD_SDHI2		26
+#define R9A08G045_PD_SSI0		27
+#define R9A08G045_PD_SSI1		28
+#define R9A08G045_PD_SSI2		29
+#define R9A08G045_PD_SSI3		30
+#define R9A08G045_PD_SRC		31
+#define R9A08G045_PD_USB0		32
+#define R9A08G045_PD_USB1		33
+#define R9A08G045_PD_USB_PHY		34
+#define R9A08G045_PD_ETHER0		35
+#define R9A08G045_PD_ETHER1		36
+#define R9A08G045_PD_I2C0		37
+#define R9A08G045_PD_I2C1		38
+#define R9A08G045_PD_I2C2		39
+#define R9A08G045_PD_I2C3		40
+#define R9A08G045_PD_SCIF0		41
+#define R9A08G045_PD_SCIF1		42
+#define R9A08G045_PD_SCIF2		43
+#define R9A08G045_PD_SCIF3		44
+#define R9A08G045_PD_SCIF4		45
+#define R9A08G045_PD_SCIF5		46
+#define R9A08G045_PD_SCI0		47
+#define R9A08G045_PD_SCI1		48
+#define R9A08G045_PD_IRDA		49
+#define R9A08G045_PD_RSPI0		50
+#define R9A08G045_PD_RSPI1		51
+#define R9A08G045_PD_RSPI2		52
+#define R9A08G045_PD_RSPI3		53
+#define R9A08G045_PD_RSPI4		54
+#define R9A08G045_PD_CANFD		55
+#define R9A08G045_PD_ADC		56
+#define R9A08G045_PD_TSU		57
+#define R9A08G045_PD_OCTA		58
+#define R9A08G045_PD_PDM		59
+#define R9A08G045_PD_PCI		60
+#define R9A08G045_PD_SPDIF		61
+#define R9A08G045_PD_I3C		62
+#define R9A08G045_PD_VBAT		63
+
+#define R9A08G045_PD_DDR		64
+#define R9A08G045_PD_TZCDDR		65
+#define R9A08G045_PD_OTFDE_DDR		66
+
 #endif /* __DT_BINDINGS_CLOCK_R9A08G045_CPG_H__ */
-- 
cgit v1.2.3


From dbc8fc9d6de17b618591d4d2a6227dee27fe0f51 Mon Sep 17 00:00:00 2001
From: Shivaprasad G Bhat <sbhat@linux.ibm.com>
Date: Sat, 27 Jan 2024 12:57:00 -0600
Subject: powerpc/papr_scm: Move duplicate definitions to common header files

papr_scm and ndtest share common PDSM payload structs like
nd_papr_pdsm_health. Presently these structs are duplicated across
papr_pdsm.h and ndtest.h header files. Since 'ndtest' is essentially
arch independent and can run on platforms other than PPC64, a way
needs to be deviced to avoid redundancy and duplication of PDSM
structs in future.

So the patch proposes moving the PDSM header from arch/powerpc/include-
-/uapi/ to the generic include/uapi/linux directory. Also, there
are some #defines common between papr_scm and ndtest which are not
exported to the user space. So, move them to a header file which
can be shared across ndtest and papr_scm via newly introduced
include/linux/papr_scm.h.

Signed-off-by: Shivaprasad G Bhat <sbhat@linux.ibm.com>
Signed-off-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Link: https://lore.kernel.org/r/170638176942.112443.2937254675538057083.stgit@ltcd48-lp2.aus.stglab.ibm.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 MAINTAINERS                               |   2 +
 arch/powerpc/include/uapi/asm/papr_pdsm.h | 165 ------------------------------
 arch/powerpc/platforms/pseries/papr_scm.c |  43 +-------
 include/linux/papr_scm.h                  |  49 +++++++++
 include/uapi/linux/papr_pdsm.h            | 165 ++++++++++++++++++++++++++++++
 tools/testing/nvdimm/test/ndtest.c        |   2 +
 tools/testing/nvdimm/test/ndtest.h        |  31 ------
 7 files changed, 220 insertions(+), 237 deletions(-)
 delete mode 100644 arch/powerpc/include/uapi/asm/papr_pdsm.h
 create mode 100644 include/linux/papr_scm.h
 create mode 100644 include/uapi/linux/papr_pdsm.h

(limited to 'include')

diff --git a/MAINTAINERS b/MAINTAINERS
index ebf03f5f0619..0960c519f949 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12497,6 +12497,8 @@ F:	drivers/rtc/rtc-opal.c
 F:	drivers/scsi/ibmvscsi/
 F:	drivers/tty/hvc/hvc_opal.c
 F:	drivers/watchdog/wdrtas.c
+F:	include/linux/papr_scm.h
+F:	include/uapi/linux/papr_pdsm.h
 F:	tools/testing/selftests/powerpc
 N:	/pmac
 N:	powermac
diff --git a/arch/powerpc/include/uapi/asm/papr_pdsm.h b/arch/powerpc/include/uapi/asm/papr_pdsm.h
deleted file mode 100644
index 17439925045c..000000000000
--- a/arch/powerpc/include/uapi/asm/papr_pdsm.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * PAPR nvDimm Specific Methods (PDSM) and structs for libndctl
- *
- * (C) Copyright IBM 2020
- *
- * Author: Vaibhav Jain <vaibhav at linux.ibm.com>
- */
-
-#ifndef _UAPI_ASM_POWERPC_PAPR_PDSM_H_
-#define _UAPI_ASM_POWERPC_PAPR_PDSM_H_
-
-#include <linux/types.h>
-#include <linux/ndctl.h>
-
-/*
- * PDSM Envelope:
- *
- * The ioctl ND_CMD_CALL exchange data between user-space and kernel via
- * envelope which consists of 2 headers sections and payload sections as
- * illustrated below:
- *  +-----------------+---------------+---------------------------+
- *  |   64-Bytes      |   8-Bytes     |       Max 184-Bytes       |
- *  +-----------------+---------------+---------------------------+
- *  | ND-HEADER       |  PDSM-HEADER  |      PDSM-PAYLOAD         |
- *  +-----------------+---------------+---------------------------+
- *  | nd_family       |               |                           |
- *  | nd_size_out     | cmd_status    |                           |
- *  | nd_size_in      | reserved      |     nd_pdsm_payload       |
- *  | nd_command      | payload   --> |                           |
- *  | nd_fw_size      |               |                           |
- *  | nd_payload ---> |               |                           |
- *  +---------------+-----------------+---------------------------+
- *
- * ND Header:
- * This is the generic libnvdimm header described as 'struct nd_cmd_pkg'
- * which is interpreted by libnvdimm before passed on to papr_scm. Important
- * member fields used are:
- * 'nd_family'		: (In) NVDIMM_FAMILY_PAPR_SCM
- * 'nd_size_in'		: (In) PDSM-HEADER + PDSM-IN-PAYLOAD (usually 0)
- * 'nd_size_out'        : (In) PDSM-HEADER + PDSM-RETURN-PAYLOAD
- * 'nd_command'         : (In) One of PAPR_PDSM_XXX
- * 'nd_fw_size'         : (Out) PDSM-HEADER + size of actual payload returned
- *
- * PDSM Header:
- * This is papr-scm specific header that precedes the payload. This is defined
- * as nd_cmd_pdsm_pkg.  Following fields aare available in this header:
- *
- * 'cmd_status'		: (Out) Errors if any encountered while servicing PDSM.
- * 'reserved'		: Not used, reserved for future and should be set to 0.
- * 'payload'            : A union of all the possible payload structs
- *
- * PDSM Payload:
- *
- * The layout of the PDSM Payload is defined by various structs shared between
- * papr_scm and libndctl so that contents of payload can be interpreted. As such
- * its defined as a union of all possible payload structs as
- * 'union nd_pdsm_payload'. Based on the value of 'nd_cmd_pkg.nd_command'
- * appropriate member of the union is accessed.
- */
-
-/* Max payload size that we can handle */
-#define ND_PDSM_PAYLOAD_MAX_SIZE 184
-
-/* Max payload size that we can handle */
-#define ND_PDSM_HDR_SIZE \
-	(sizeof(struct nd_pkg_pdsm) - ND_PDSM_PAYLOAD_MAX_SIZE)
-
-/* Various nvdimm health indicators */
-#define PAPR_PDSM_DIMM_HEALTHY       0
-#define PAPR_PDSM_DIMM_UNHEALTHY     1
-#define PAPR_PDSM_DIMM_CRITICAL      2
-#define PAPR_PDSM_DIMM_FATAL         3
-
-/* struct nd_papr_pdsm_health.extension_flags field flags */
-
-/* Indicate that the 'dimm_fuel_gauge' field is valid */
-#define PDSM_DIMM_HEALTH_RUN_GAUGE_VALID 1
-
-/* Indicate that the 'dimm_dsc' field is valid */
-#define PDSM_DIMM_DSC_VALID 2
-
-/*
- * Struct exchanged between kernel & ndctl in for PAPR_PDSM_HEALTH
- * Various flags indicate the health status of the dimm.
- *
- * extension_flags	: Any extension fields present in the struct.
- * dimm_unarmed		: Dimm not armed. So contents wont persist.
- * dimm_bad_shutdown	: Previous shutdown did not persist contents.
- * dimm_bad_restore	: Contents from previous shutdown werent restored.
- * dimm_scrubbed	: Contents of the dimm have been scrubbed.
- * dimm_locked		: Contents of the dimm cant be modified until CEC reboot
- * dimm_encrypted	: Contents of dimm are encrypted.
- * dimm_health		: Dimm health indicator. One of PAPR_PDSM_DIMM_XXXX
- * dimm_fuel_gauge	: Life remaining of DIMM as a percentage from 0-100
- */
-struct nd_papr_pdsm_health {
-	union {
-		struct {
-			__u32 extension_flags;
-			__u8 dimm_unarmed;
-			__u8 dimm_bad_shutdown;
-			__u8 dimm_bad_restore;
-			__u8 dimm_scrubbed;
-			__u8 dimm_locked;
-			__u8 dimm_encrypted;
-			__u16 dimm_health;
-
-			/* Extension flag PDSM_DIMM_HEALTH_RUN_GAUGE_VALID */
-			__u16 dimm_fuel_gauge;
-
-			/* Extension flag PDSM_DIMM_DSC_VALID */
-			__u64 dimm_dsc;
-		};
-		__u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
-	};
-};
-
-/* Flags for injecting specific smart errors */
-#define PDSM_SMART_INJECT_HEALTH_FATAL		(1 << 0)
-#define PDSM_SMART_INJECT_BAD_SHUTDOWN		(1 << 1)
-
-struct nd_papr_pdsm_smart_inject {
-	union {
-		struct {
-			/* One or more of PDSM_SMART_INJECT_ */
-			__u32 flags;
-			__u8 fatal_enable;
-			__u8 unsafe_shutdown_enable;
-		};
-		__u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
-	};
-};
-
-/*
- * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel
- * via 'nd_cmd_pkg.nd_command' member of the ioctl struct
- */
-enum papr_pdsm {
-	PAPR_PDSM_MIN = 0x0,
-	PAPR_PDSM_HEALTH,
-	PAPR_PDSM_SMART_INJECT,
-	PAPR_PDSM_MAX,
-};
-
-/* Maximal union that can hold all possible payload types */
-union nd_pdsm_payload {
-	struct nd_papr_pdsm_health health;
-	struct nd_papr_pdsm_smart_inject smart_inject;
-	__u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
-} __packed;
-
-/*
- * PDSM-header + payload expected with ND_CMD_CALL ioctl from libnvdimm
- * Valid member of union 'payload' is identified via 'nd_cmd_pkg.nd_command'
- * that should always precede this struct when sent to papr_scm via CMD_CALL
- * interface.
- */
-struct nd_pkg_pdsm {
-	__s32 cmd_status;	/* Out: Sub-cmd status returned back */
-	__u16 reserved[2];	/* Ignored and to be set as '0' */
-	union nd_pdsm_payload payload;
-} __packed;
-
-#endif /* _UAPI_ASM_POWERPC_PAPR_PDSM_H_ */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index c233f9db039b..9b6420eb3567 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -16,7 +16,8 @@
 #include <linux/nd.h>
 
 #include <asm/plpar_wrappers.h>
-#include <asm/papr_pdsm.h>
+#include <uapi/linux/papr_pdsm.h>
+#include <linux/papr_scm.h>
 #include <asm/mce.h>
 #include <asm/unaligned.h>
 #include <linux/perf_event.h>
@@ -29,46 +30,6 @@
 	 (1ul << ND_CMD_SET_CONFIG_DATA) | \
 	 (1ul << ND_CMD_CALL))
 
-/* DIMM health bitmap indicators */
-/* SCM device is unable to persist memory contents */
-#define PAPR_PMEM_UNARMED                   (1ULL << (63 - 0))
-/* SCM device failed to persist memory contents */
-#define PAPR_PMEM_SHUTDOWN_DIRTY            (1ULL << (63 - 1))
-/* SCM device contents are persisted from previous IPL */
-#define PAPR_PMEM_SHUTDOWN_CLEAN            (1ULL << (63 - 2))
-/* SCM device contents are not persisted from previous IPL */
-#define PAPR_PMEM_EMPTY                     (1ULL << (63 - 3))
-/* SCM device memory life remaining is critically low */
-#define PAPR_PMEM_HEALTH_CRITICAL           (1ULL << (63 - 4))
-/* SCM device will be garded off next IPL due to failure */
-#define PAPR_PMEM_HEALTH_FATAL              (1ULL << (63 - 5))
-/* SCM contents cannot persist due to current platform health status */
-#define PAPR_PMEM_HEALTH_UNHEALTHY          (1ULL << (63 - 6))
-/* SCM device is unable to persist memory contents in certain conditions */
-#define PAPR_PMEM_HEALTH_NON_CRITICAL       (1ULL << (63 - 7))
-/* SCM device is encrypted */
-#define PAPR_PMEM_ENCRYPTED                 (1ULL << (63 - 8))
-/* SCM device has been scrubbed and locked */
-#define PAPR_PMEM_SCRUBBED_AND_LOCKED       (1ULL << (63 - 9))
-
-/* Bits status indicators for health bitmap indicating unarmed dimm */
-#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED |		\
-				PAPR_PMEM_HEALTH_UNHEALTHY)
-
-/* Bits status indicators for health bitmap indicating unflushed dimm */
-#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY)
-
-/* Bits status indicators for health bitmap indicating unrestored dimm */
-#define PAPR_PMEM_BAD_RESTORE_MASK  (PAPR_PMEM_EMPTY)
-
-/* Bit status indicators for smart event notification */
-#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \
-				    PAPR_PMEM_HEALTH_FATAL |	\
-				    PAPR_PMEM_HEALTH_UNHEALTHY)
-
-#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
-#define PAPR_SCM_PERF_STATS_VERSION 0x1
-
 /* Struct holding a single performance metric */
 struct papr_scm_perf_stat {
 	u8 stat_id[8];
diff --git a/include/linux/papr_scm.h b/include/linux/papr_scm.h
new file mode 100644
index 000000000000..eb36453813db
--- /dev/null
+++ b/include/linux/papr_scm.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __LINUX_PAPR_SCM_H
+#define __LINUX_PAPR_SCM_H
+
+/* DIMM health bitmap indicators */
+/* SCM device is unable to persist memory contents */
+#define PAPR_PMEM_UNARMED                   (1ULL << (63 - 0))
+/* SCM device failed to persist memory contents */
+#define PAPR_PMEM_SHUTDOWN_DIRTY            (1ULL << (63 - 1))
+/* SCM device contents are persisted from previous IPL */
+#define PAPR_PMEM_SHUTDOWN_CLEAN            (1ULL << (63 - 2))
+/* SCM device contents are not persisted from previous IPL */
+#define PAPR_PMEM_EMPTY                     (1ULL << (63 - 3))
+/* SCM device memory life remaining is critically low */
+#define PAPR_PMEM_HEALTH_CRITICAL           (1ULL << (63 - 4))
+/* SCM device will be garded off next IPL due to failure */
+#define PAPR_PMEM_HEALTH_FATAL              (1ULL << (63 - 5))
+/* SCM contents cannot persist due to current platform health status */
+#define PAPR_PMEM_HEALTH_UNHEALTHY          (1ULL << (63 - 6))
+/* SCM device is unable to persist memory contents in certain conditions */
+#define PAPR_PMEM_HEALTH_NON_CRITICAL       (1ULL << (63 - 7))
+/* SCM device is encrypted */
+#define PAPR_PMEM_ENCRYPTED                 (1ULL << (63 - 8))
+/* SCM device has been scrubbed and locked */
+#define PAPR_PMEM_SCRUBBED_AND_LOCKED       (1ULL << (63 - 9))
+
+#define PAPR_PMEM_SAVE_FAILED               (1ULL << (63 - 10))
+
+/* Bits status indicators for health bitmap indicating unarmed dimm */
+#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED |            \
+				PAPR_PMEM_HEALTH_UNHEALTHY)
+
+/* Bits status indicators for health bitmap indicating unflushed dimm */
+#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY)
+
+/* Bits status indicators for health bitmap indicating unrestored dimm */
+#define PAPR_PMEM_BAD_RESTORE_MASK  (PAPR_PMEM_EMPTY)
+
+/* Bit status indicators for smart event notification */
+#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \
+					PAPR_PMEM_HEALTH_FATAL | \
+					PAPR_PMEM_HEALTH_UNHEALTHY)
+
+#define PAPR_PMEM_SAVE_MASK                (PAPR_PMEM_SAVE_FAILED)
+
+#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
+#define PAPR_SCM_PERF_STATS_VERSION 0x1
+
+#endif /* __LINUX_PAPR_SCM_H */
diff --git a/include/uapi/linux/papr_pdsm.h b/include/uapi/linux/papr_pdsm.h
new file mode 100644
index 000000000000..17439925045c
--- /dev/null
+++ b/include/uapi/linux/papr_pdsm.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * PAPR nvDimm Specific Methods (PDSM) and structs for libndctl
+ *
+ * (C) Copyright IBM 2020
+ *
+ * Author: Vaibhav Jain <vaibhav at linux.ibm.com>
+ */
+
+#ifndef _UAPI_ASM_POWERPC_PAPR_PDSM_H_
+#define _UAPI_ASM_POWERPC_PAPR_PDSM_H_
+
+#include <linux/types.h>
+#include <linux/ndctl.h>
+
+/*
+ * PDSM Envelope:
+ *
+ * The ioctl ND_CMD_CALL exchange data between user-space and kernel via
+ * envelope which consists of 2 headers sections and payload sections as
+ * illustrated below:
+ *  +-----------------+---------------+---------------------------+
+ *  |   64-Bytes      |   8-Bytes     |       Max 184-Bytes       |
+ *  +-----------------+---------------+---------------------------+
+ *  | ND-HEADER       |  PDSM-HEADER  |      PDSM-PAYLOAD         |
+ *  +-----------------+---------------+---------------------------+
+ *  | nd_family       |               |                           |
+ *  | nd_size_out     | cmd_status    |                           |
+ *  | nd_size_in      | reserved      |     nd_pdsm_payload       |
+ *  | nd_command      | payload   --> |                           |
+ *  | nd_fw_size      |               |                           |
+ *  | nd_payload ---> |               |                           |
+ *  +---------------+-----------------+---------------------------+
+ *
+ * ND Header:
+ * This is the generic libnvdimm header described as 'struct nd_cmd_pkg'
+ * which is interpreted by libnvdimm before passed on to papr_scm. Important
+ * member fields used are:
+ * 'nd_family'		: (In) NVDIMM_FAMILY_PAPR_SCM
+ * 'nd_size_in'		: (In) PDSM-HEADER + PDSM-IN-PAYLOAD (usually 0)
+ * 'nd_size_out'        : (In) PDSM-HEADER + PDSM-RETURN-PAYLOAD
+ * 'nd_command'         : (In) One of PAPR_PDSM_XXX
+ * 'nd_fw_size'         : (Out) PDSM-HEADER + size of actual payload returned
+ *
+ * PDSM Header:
+ * This is papr-scm specific header that precedes the payload. This is defined
+ * as nd_cmd_pdsm_pkg.  Following fields aare available in this header:
+ *
+ * 'cmd_status'		: (Out) Errors if any encountered while servicing PDSM.
+ * 'reserved'		: Not used, reserved for future and should be set to 0.
+ * 'payload'            : A union of all the possible payload structs
+ *
+ * PDSM Payload:
+ *
+ * The layout of the PDSM Payload is defined by various structs shared between
+ * papr_scm and libndctl so that contents of payload can be interpreted. As such
+ * its defined as a union of all possible payload structs as
+ * 'union nd_pdsm_payload'. Based on the value of 'nd_cmd_pkg.nd_command'
+ * appropriate member of the union is accessed.
+ */
+
+/* Max payload size that we can handle */
+#define ND_PDSM_PAYLOAD_MAX_SIZE 184
+
+/* Max payload size that we can handle */
+#define ND_PDSM_HDR_SIZE \
+	(sizeof(struct nd_pkg_pdsm) - ND_PDSM_PAYLOAD_MAX_SIZE)
+
+/* Various nvdimm health indicators */
+#define PAPR_PDSM_DIMM_HEALTHY       0
+#define PAPR_PDSM_DIMM_UNHEALTHY     1
+#define PAPR_PDSM_DIMM_CRITICAL      2
+#define PAPR_PDSM_DIMM_FATAL         3
+
+/* struct nd_papr_pdsm_health.extension_flags field flags */
+
+/* Indicate that the 'dimm_fuel_gauge' field is valid */
+#define PDSM_DIMM_HEALTH_RUN_GAUGE_VALID 1
+
+/* Indicate that the 'dimm_dsc' field is valid */
+#define PDSM_DIMM_DSC_VALID 2
+
+/*
+ * Struct exchanged between kernel & ndctl in for PAPR_PDSM_HEALTH
+ * Various flags indicate the health status of the dimm.
+ *
+ * extension_flags	: Any extension fields present in the struct.
+ * dimm_unarmed		: Dimm not armed. So contents wont persist.
+ * dimm_bad_shutdown	: Previous shutdown did not persist contents.
+ * dimm_bad_restore	: Contents from previous shutdown werent restored.
+ * dimm_scrubbed	: Contents of the dimm have been scrubbed.
+ * dimm_locked		: Contents of the dimm cant be modified until CEC reboot
+ * dimm_encrypted	: Contents of dimm are encrypted.
+ * dimm_health		: Dimm health indicator. One of PAPR_PDSM_DIMM_XXXX
+ * dimm_fuel_gauge	: Life remaining of DIMM as a percentage from 0-100
+ */
+struct nd_papr_pdsm_health {
+	union {
+		struct {
+			__u32 extension_flags;
+			__u8 dimm_unarmed;
+			__u8 dimm_bad_shutdown;
+			__u8 dimm_bad_restore;
+			__u8 dimm_scrubbed;
+			__u8 dimm_locked;
+			__u8 dimm_encrypted;
+			__u16 dimm_health;
+
+			/* Extension flag PDSM_DIMM_HEALTH_RUN_GAUGE_VALID */
+			__u16 dimm_fuel_gauge;
+
+			/* Extension flag PDSM_DIMM_DSC_VALID */
+			__u64 dimm_dsc;
+		};
+		__u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
+	};
+};
+
+/* Flags for injecting specific smart errors */
+#define PDSM_SMART_INJECT_HEALTH_FATAL		(1 << 0)
+#define PDSM_SMART_INJECT_BAD_SHUTDOWN		(1 << 1)
+
+struct nd_papr_pdsm_smart_inject {
+	union {
+		struct {
+			/* One or more of PDSM_SMART_INJECT_ */
+			__u32 flags;
+			__u8 fatal_enable;
+			__u8 unsafe_shutdown_enable;
+		};
+		__u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
+	};
+};
+
+/*
+ * Methods to be embedded in ND_CMD_CALL request. These are sent to the kernel
+ * via 'nd_cmd_pkg.nd_command' member of the ioctl struct
+ */
+enum papr_pdsm {
+	PAPR_PDSM_MIN = 0x0,
+	PAPR_PDSM_HEALTH,
+	PAPR_PDSM_SMART_INJECT,
+	PAPR_PDSM_MAX,
+};
+
+/* Maximal union that can hold all possible payload types */
+union nd_pdsm_payload {
+	struct nd_papr_pdsm_health health;
+	struct nd_papr_pdsm_smart_inject smart_inject;
+	__u8 buf[ND_PDSM_PAYLOAD_MAX_SIZE];
+} __packed;
+
+/*
+ * PDSM-header + payload expected with ND_CMD_CALL ioctl from libnvdimm
+ * Valid member of union 'payload' is identified via 'nd_cmd_pkg.nd_command'
+ * that should always precede this struct when sent to papr_scm via CMD_CALL
+ * interface.
+ */
+struct nd_pkg_pdsm {
+	__s32 cmd_status;	/* Out: Sub-cmd status returned back */
+	__u16 reserved[2];	/* Ignored and to be set as '0' */
+	union nd_pdsm_payload payload;
+} __packed;
+
+#endif /* _UAPI_ASM_POWERPC_PAPR_PDSM_H_ */
diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
index b8419f460368..fa9d1327fc71 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -13,6 +13,8 @@
 #include <nd-core.h>
 #include <linux/printk.h>
 #include <linux/seq_buf.h>
+#include <linux/papr_scm.h>
+#include <uapi/linux/papr_pdsm.h>
 
 #include "../watermark.h"
 #include "nfit_test.h"
diff --git a/tools/testing/nvdimm/test/ndtest.h b/tools/testing/nvdimm/test/ndtest.h
index 2c54c9cbb90c..8f27ad6f7319 100644
--- a/tools/testing/nvdimm/test/ndtest.h
+++ b/tools/testing/nvdimm/test/ndtest.h
@@ -5,37 +5,6 @@
 #include <linux/platform_device.h>
 #include <linux/libnvdimm.h>
 
-/* SCM device is unable to persist memory contents */
-#define PAPR_PMEM_UNARMED                   (1ULL << (63 - 0))
-/* SCM device failed to persist memory contents */
-#define PAPR_PMEM_SHUTDOWN_DIRTY            (1ULL << (63 - 1))
-/* SCM device contents are not persisted from previous IPL */
-#define PAPR_PMEM_EMPTY                     (1ULL << (63 - 3))
-#define PAPR_PMEM_HEALTH_CRITICAL           (1ULL << (63 - 4))
-/* SCM device will be garded off next IPL due to failure */
-#define PAPR_PMEM_HEALTH_FATAL              (1ULL << (63 - 5))
-/* SCM contents cannot persist due to current platform health status */
-#define PAPR_PMEM_HEALTH_UNHEALTHY          (1ULL << (63 - 6))
-
-/* Bits status indicators for health bitmap indicating unarmed dimm */
-#define PAPR_PMEM_UNARMED_MASK (PAPR_PMEM_UNARMED |		\
-				PAPR_PMEM_HEALTH_UNHEALTHY)
-
-#define PAPR_PMEM_SAVE_FAILED                (1ULL << (63 - 10))
-
-/* Bits status indicators for health bitmap indicating unflushed dimm */
-#define PAPR_PMEM_BAD_SHUTDOWN_MASK (PAPR_PMEM_SHUTDOWN_DIRTY)
-
-/* Bits status indicators for health bitmap indicating unrestored dimm */
-#define PAPR_PMEM_BAD_RESTORE_MASK  (PAPR_PMEM_EMPTY)
-
-/* Bit status indicators for smart event notification */
-#define PAPR_PMEM_SMART_EVENT_MASK (PAPR_PMEM_HEALTH_CRITICAL | \
-				    PAPR_PMEM_HEALTH_FATAL |	\
-				    PAPR_PMEM_HEALTH_UNHEALTHY)
-
-#define PAPR_PMEM_SAVE_MASK                (PAPR_PMEM_SAVE_FAILED)
-
 struct ndtest_config;
 
 struct ndtest_priv {
-- 
cgit v1.2.3


From 48e2cd3e3dcfe04f212df4fb189fa04c2a87b980 Mon Sep 17 00:00:00 2001
From: Philo Lu <lulie@linux.alibaba.com>
Date: Fri, 26 Apr 2024 00:17:23 +0800
Subject: bpf: add mrtt and srtt as BPF_SOCK_OPS_RTT_CB args

Two important arguments in RTT estimation, mrtt and srtt, are passed to
tcp_bpf_rtt(), so that bpf programs get more information about RTT
computation in BPF_SOCK_OPS_RTT_CB.

The difference between bpf_sock_ops->srtt_us and the srtt here is: the
former is an old rtt before update, while srtt passed by tcp_bpf_rtt()
is that after update.

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240425161724.73707-2-lulie@linux.alibaba.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/tcp.h              | 4 ++--
 include/uapi/linux/bpf.h       | 2 ++
 net/ipv4/tcp_input.c           | 4 ++--
 tools/include/uapi/linux/bpf.h | 2 ++
 4 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6ae35199d3b3..0f75d03287c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2706,10 +2706,10 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
 	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
 }
 
-static inline void tcp_bpf_rtt(struct sock *sk)
+static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
 {
 	if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG))
-		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
+		tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_RTT_CB, mrtt, srtt);
 }
 
 #if IS_ENABLED(CONFIG_SMC)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e4ae83550fb3..d94a72593ead 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6947,6 +6947,8 @@ enum {
 					 * socket transition to LISTEN state.
 					 */
 	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
+					 * Arg1: measured RTT input (mrtt)
+					 * Arg2: updated srtt
 					 */
 	BPF_SOCK_OPS_PARSE_HDR_OPT_CB,	/* Parse the header option.
 					 * It will be called to handle
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5d874817a78d..d1115d7c3936 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -911,7 +911,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 			tp->rtt_seq = tp->snd_nxt;
 			tp->mdev_max_us = tcp_rto_min_us(sk);
 
-			tcp_bpf_rtt(sk);
+			tcp_bpf_rtt(sk, mrtt_us, srtt);
 		}
 	} else {
 		/* no previous measure. */
@@ -921,7 +921,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
 		tp->mdev_max_us = tp->rttvar_us;
 		tp->rtt_seq = tp->snd_nxt;
 
-		tcp_bpf_rtt(sk);
+		tcp_bpf_rtt(sk, mrtt_us, srtt);
 	}
 	tp->srtt_us = max(1U, srtt);
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e4ae83550fb3..d94a72593ead 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6947,6 +6947,8 @@ enum {
 					 * socket transition to LISTEN state.
 					 */
 	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
+					 * Arg1: measured RTT input (mrtt)
+					 * Arg2: updated srtt
 					 */
 	BPF_SOCK_OPS_PARSE_HDR_OPT_CB,	/* Parse the header option.
 					 * It will be called to handle
-- 
cgit v1.2.3


From c4e86b4363ac8ecf836def8e46973707688fd98f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 23 Apr 2024 20:54:08 +0000
Subject: net: add two more call_rcu_hurry()

I had failures with pmtu.sh selftests lately,
with netns dismantles firing ref_tracking alerts [1].

After much debugging, I found that some queued
rcu callbacks were delayed by minutes, because
of CONFIG_RCU_LAZY=y option.

Joel Fernandes had a similar issue in the past,
fixed with commit 483c26ff63f4 ("net: Use call_rcu_hurry()
for dst_release()")

In this commit, I make sure nexthop_free_rcu()
and free_fib_info_rcu() are not delayed too much
because they both can release device references.

tools/testing/selftests/net/pmtu.sh no longer fails.

Traces were:

[  968.179860] ref_tracker: veth_A-R1@00000000d0ff3fe2 has 3/5 users at
                    dst_alloc+0x76/0x160
                    ip6_dst_alloc+0x25/0x80
                    ip6_pol_route+0x2a8/0x450
                    ip6_pol_route_output+0x1f/0x30
                    fib6_rule_lookup+0x163/0x270
                    ip6_route_output_flags+0xda/0x190
                    ip6_dst_lookup_tail.constprop.0+0x1d0/0x260
                    ip6_dst_lookup_flow+0x47/0xa0
                    udp_tunnel6_dst_lookup+0x158/0x210
                    vxlan_xmit_one+0x4c2/0x1550 [vxlan]
                    vxlan_xmit+0x52d/0x14f0 [vxlan]
                    dev_hard_start_xmit+0x7b/0x1e0
                    __dev_queue_xmit+0x20b/0xe40
                    ip6_finish_output2+0x2ea/0x6e0
                    ip6_finish_output+0x143/0x320
                    ip6_output+0x74/0x140

[  968.179860] ref_tracker: veth_A-R1@00000000d0ff3fe2 has 1/5 users at
                    netdev_get_by_index+0xc0/0xe0
                    fib6_nh_init+0x1a9/0xa90
                    rtm_new_nexthop+0x6fa/0x1580
                    rtnetlink_rcv_msg+0x155/0x3e0
                    netlink_rcv_skb+0x61/0x110
                    rtnetlink_rcv+0x19/0x20
                    netlink_unicast+0x23f/0x380
                    netlink_sendmsg+0x1fc/0x430
                    ____sys_sendmsg+0x2ef/0x320
                    ___sys_sendmsg+0x86/0xd0
                    __sys_sendmsg+0x67/0xc0
                    __x64_sys_sendmsg+0x21/0x30
                    x64_sys_call+0x252/0x2030
                    do_syscall_64+0x6c/0x190
                    entry_SYSCALL_64_after_hwframe+0x76/0x7e

[  968.179860] ref_tracker: veth_A-R1@00000000d0ff3fe2 has 1/5 users at
                    ipv6_add_dev+0x136/0x530
                    addrconf_notify+0x19d/0x770
                    notifier_call_chain+0x65/0xd0
                    raw_notifier_call_chain+0x1a/0x20
                    call_netdevice_notifiers_info+0x54/0x90
                    register_netdevice+0x61e/0x790
                    veth_newlink+0x230/0x440
                    __rtnl_newlink+0x7d2/0xaa0
                    rtnl_newlink+0x4c/0x70
                    rtnetlink_rcv_msg+0x155/0x3e0
                    netlink_rcv_skb+0x61/0x110
                    rtnetlink_rcv+0x19/0x20
                    netlink_unicast+0x23f/0x380
                    netlink_sendmsg+0x1fc/0x430
                    ____sys_sendmsg+0x2ef/0x320
                    ___sys_sendmsg+0x86/0xd0
....
[ 1079.316024]  ? show_regs+0x68/0x80
[ 1079.316087]  ? __warn+0x8c/0x140
[ 1079.316103]  ? ref_tracker_free+0x1a0/0x270
[ 1079.316117]  ? report_bug+0x196/0x1c0
[ 1079.316135]  ? handle_bug+0x42/0x80
[ 1079.316149]  ? exc_invalid_op+0x1c/0x70
[ 1079.316162]  ? asm_exc_invalid_op+0x1f/0x30
[ 1079.316193]  ? ref_tracker_free+0x1a0/0x270
[ 1079.316208]  ? _raw_spin_unlock+0x1a/0x40
[ 1079.316222]  ? free_unref_page+0x126/0x1a0
[ 1079.316239]  ? destroy_large_folio+0x69/0x90
[ 1079.316251]  ? __folio_put+0x99/0xd0
[ 1079.316276]  dst_dev_put+0x69/0xd0
[ 1079.316308]  fib6_nh_release_dsts.part.0+0x3d/0x80
[ 1079.316327]  fib6_nh_release+0x45/0x70
[ 1079.316340]  nexthop_free_rcu+0x131/0x170
[ 1079.316356]  rcu_do_batch+0x1ee/0x820
[ 1079.316370]  ? rcu_do_batch+0x179/0x820
[ 1079.316388]  rcu_core+0x1aa/0x4d0
[ 1079.316405]  rcu_core_si+0x12/0x20
[ 1079.316417]  __do_softirq+0x13a/0x3dc
[ 1079.316435]  __irq_exit_rcu+0xa3/0x110
[ 1079.316449]  irq_exit_rcu+0x12/0x30
[ 1079.316462]  sysvec_apic_timer_interrupt+0x5b/0xe0
[ 1079.316474]  asm_sysvec_apic_timer_interrupt+0x1f/0x30
[ 1079.316569] RIP: 0033:0x7f06b65c63f0

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240423205408.39632-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/nexthop.h    | 2 +-
 net/ipv4/fib_semantics.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 7ca315ad500e..68463aebcc05 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -267,7 +267,7 @@ static inline bool nexthop_get(struct nexthop *nh)
 static inline void nexthop_put(struct nexthop *nh)
 {
 	if (refcount_dec_and_test(&nh->refcnt))
-		call_rcu(&nh->rcu, nexthop_free_rcu);
+		call_rcu_hurry(&nh->rcu, nexthop_free_rcu);
 }
 
 static inline bool nexthop_cmp(const struct nexthop *nh1,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 5eb1b8d302bb..f669da98d11d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -254,7 +254,7 @@ void free_fib_info(struct fib_info *fi)
 		return;
 	}
 
-	call_rcu(&fi->rcu, free_fib_info_rcu);
+	call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
 }
 EXPORT_SYMBOL_GPL(free_fib_info);
 
-- 
cgit v1.2.3


From 5b0a67008b0d608bb7585fe4c3c0c09d5ceaf1c9 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Sat, 16 Mar 2024 12:52:41 +0100
Subject: mm: remove guard around pgd_offset_k() macro

The last architecture redefining pgd_offset_k() was IA64 and it was
removed by commit cf8e8658100d ("arch: Remove Itanium (IA-64)
architecture")

There is no need anymore to guard generic version of pgd_offset_k()
with #ifndef pgd_offset_k

Link: https://lkml.kernel.org/r/59d3f47d5615d18cca1986f269be2fcb3df34556.1710589838.git.christophe.leroy@csgroup.eu
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 85fc7554cd52..bd9c7180718c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -149,9 +149,7 @@ static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
  * a shortcut which implies the use of the kernel's pgd, instead
  * of a process's
  */
-#ifndef pgd_offset_k
 #define pgd_offset_k(address)		pgd_offset(&init_mm, (address))
-#endif
 
 /*
  * In many cases it is known that a virtual address is mapped at PMD or PTE
-- 
cgit v1.2.3


From 91b71e78b8e478e1a9af36b15ff1d38810572866 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed <yosryahmed@google.com>
Date: Sat, 16 Mar 2024 01:58:03 +0000
Subject: mm: memcg: add NULL check to obj_cgroup_put()

9 out of 16 callers perform a NULL check before calling obj_cgroup_put().
Move the NULL check in the function, similar to mem_cgroup_put().  The
unlikely() NULL check in current_objcg_update() was left alone to avoid
dropping the unlikey() annotation as this a fast path.

Link: https://lkml.kernel.org/r/20240316015803.2777252-1-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  3 ++-
 kernel/bpf/memalloc.c      |  6 ++----
 mm/memcontrol.c            | 18 ++++++------------
 mm/zswap.c                 |  3 +--
 4 files changed, 11 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 394fd0a887ae..b6264796815d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -818,7 +818,8 @@ static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
 
 static inline void obj_cgroup_put(struct obj_cgroup *objcg)
 {
-	percpu_ref_put(&objcg->refcnt);
+	if (objcg)
+		percpu_ref_put(&objcg->refcnt);
 }
 
 static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 550f02e2cb13..a546aba46d5d 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -759,8 +759,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
 			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 		}
-		if (ma->objcg)
-			obj_cgroup_put(ma->objcg);
+		obj_cgroup_put(ma->objcg);
 		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 	if (ma->caches) {
@@ -776,8 +775,7 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
 				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
 			}
 		}
-		if (ma->objcg)
-			obj_cgroup_put(ma->objcg);
+		obj_cgroup_put(ma->objcg);
 		destroy_mem_alloc(ma, rcu_in_progress);
 	}
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fabce2b50c69..786c0fae6e30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2369,8 +2369,7 @@ static void drain_local_stock(struct work_struct *dummy)
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-	if (old)
-		obj_cgroup_put(old);
+	obj_cgroup_put(old);
 }
 
 /*
@@ -3145,8 +3144,7 @@ static struct obj_cgroup *current_objcg_update(void)
 		if (old) {
 			old = (struct obj_cgroup *)
 				((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG);
-			if (old)
-				obj_cgroup_put(old);
+			obj_cgroup_put(old);
 
 			old = NULL;
 		}
@@ -3418,8 +3416,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 		mod_objcg_mlstate(objcg, pgdat, idx, nr);
 
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-	if (old)
-		obj_cgroup_put(old);
+	obj_cgroup_put(old);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3546,8 +3543,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
 	}
 
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
-	if (old)
-		obj_cgroup_put(old);
+	obj_cgroup_put(old);
 
 	if (nr_pages)
 		obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -5468,8 +5464,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 
-	if (memcg->orig_objcg)
-		obj_cgroup_put(memcg->orig_objcg);
+	obj_cgroup_put(memcg->orig_objcg);
 
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
@@ -6620,8 +6615,7 @@ static void mem_cgroup_exit(struct task_struct *task)
 
 	objcg = (struct obj_cgroup *)
 		((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG);
-	if (objcg)
-		obj_cgroup_put(objcg);
+	obj_cgroup_put(objcg);
 
 	/*
 	 * Some kernel allocations can happen after this point,
diff --git a/mm/zswap.c b/mm/zswap.c
index 6f8850c44b61..2817e2791c13 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1618,8 +1618,7 @@ put_pool:
 freepage:
 	zswap_entry_cache_free(entry);
 reject:
-	if (objcg)
-		obj_cgroup_put(objcg);
+	obj_cgroup_put(objcg);
 check_old:
 	/*
 	 * If the zswap store fails or zswap is disabled, we must invalidate the
-- 
cgit v1.2.3


From 9636f055dae19ffae9d8bf21a68261b58c769987 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 18 Mar 2024 16:04:02 -0400
Subject: mm/treewide: remove pXd_huge()

This API is not used anymore, drop it for the whole tree.

Link: https://lkml.kernel.org/r/20240318200404.448346-13-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bjorn Andersson <andersson@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Fabio Estevam <festevam@denx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Konrad Dybcio <konrad.dybcio@linaro.org>
Cc: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Cc: Lucas Stach <l.stach@pengutronix.de>
Cc: Mark Salter <msalter@redhat.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/mm/Makefile                             |  1 -
 arch/arm/mm/hugetlbpage.c                        | 29 ------------------------
 arch/arm64/mm/hugetlbpage.c                      | 10 --------
 arch/loongarch/mm/hugetlbpage.c                  | 10 --------
 arch/mips/include/asm/pgtable-32.h               |  2 +-
 arch/mips/include/asm/pgtable-64.h               |  2 +-
 arch/mips/mm/hugetlbpage.c                       | 10 --------
 arch/parisc/mm/hugetlbpage.c                     | 11 ---------
 arch/powerpc/include/asm/book3s/64/pgtable-4k.h  | 10 --------
 arch/powerpc/include/asm/book3s/64/pgtable-64k.h | 25 --------------------
 arch/powerpc/include/asm/nohash/pgtable.h        | 10 --------
 arch/riscv/mm/hugetlbpage.c                      | 10 --------
 arch/s390/mm/hugetlbpage.c                       | 10 --------
 arch/sh/mm/hugetlbpage.c                         | 10 --------
 arch/sparc/mm/hugetlbpage.c                      | 10 --------
 arch/x86/mm/hugetlbpage.c                        | 16 -------------
 include/linux/hugetlb.h                          | 24 --------------------
 17 files changed, 2 insertions(+), 198 deletions(-)
 delete mode 100644 arch/arm/mm/hugetlbpage.c

(limited to 'include')

diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 71b858c9b10c..1779e12db085 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -21,7 +21,6 @@ KASAN_SANITIZE_physaddr.o	:= n
 obj-$(CONFIG_DEBUG_VIRTUAL)	+= physaddr.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)	+= alignment.o
-obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_ARM_PV_FIXUP)	+= pv-fixup-asm.o
 
 obj-$(CONFIG_CPU_ABRT_NOMMU)	+= abort-nommu.o
diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c
deleted file mode 100644
index c2fa643f6bb5..000000000000
--- a/arch/arm/mm/hugetlbpage.c
+++ /dev/null
@@ -1,29 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * arch/arm/mm/hugetlbpage.c
- *
- * Copyright (C) 2012 ARM Ltd.
- *
- * Based on arch/x86/include/asm/hugetlb.h and Bill Carson's patches
- */
-
-#include <linux/init.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
-#include <asm/tlb.h>
-#include <asm/tlbflush.h>
-
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
-
-int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index f494fc31201f..ca58210d6c07 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -79,16 +79,6 @@ bool arch_hugetlb_migration_supported(struct hstate *h)
 }
 #endif
 
-int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
-
-int pud_huge(pud_t pud)
-{
-	return pud_leaf(pud);
-}
-
 static int find_num_contig(struct mm_struct *mm, unsigned long addr,
 			   pte_t *ptep, size_t *pgsize)
 {
diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c
index a4e78e74aa21..12222c56cb59 100644
--- a/arch/loongarch/mm/hugetlbpage.c
+++ b/arch/loongarch/mm/hugetlbpage.c
@@ -50,16 +50,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 	return (pte_t *) pmd;
 }
 
-int pmd_huge(pmd_t pmd)
-{
-	return (pmd_val(pmd) & _PAGE_HUGE) != 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return (pud_val(pud) & _PAGE_HUGE) != 0;
-}
-
 uint64_t pmd_to_entrylo(unsigned long pmd_val)
 {
 	uint64_t val;
diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h
index 0e196650f4f4..92b7591aac2a 100644
--- a/arch/mips/include/asm/pgtable-32.h
+++ b/arch/mips/include/asm/pgtable-32.h
@@ -129,7 +129,7 @@ static inline int pmd_none(pmd_t pmd)
 static inline int pmd_bad(pmd_t pmd)
 {
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
-	/* pmd_huge(pmd) but inline */
+	/* pmd_leaf(pmd) but inline */
 	if (unlikely(pmd_val(pmd) & _PAGE_HUGE))
 		return 0;
 #endif
diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h
index 20ca48c1b606..7c28510b3768 100644
--- a/arch/mips/include/asm/pgtable-64.h
+++ b/arch/mips/include/asm/pgtable-64.h
@@ -245,7 +245,7 @@ static inline int pmd_none(pmd_t pmd)
 static inline int pmd_bad(pmd_t pmd)
 {
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
-	/* pmd_huge(pmd) but inline */
+	/* pmd_leaf(pmd) but inline */
 	if (unlikely(pmd_val(pmd) & _PAGE_HUGE))
 		return 0;
 #endif
diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c
index 7eaff5b07873..0b9e15555b59 100644
--- a/arch/mips/mm/hugetlbpage.c
+++ b/arch/mips/mm/hugetlbpage.c
@@ -57,13 +57,3 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 	}
 	return (pte_t *) pmd;
 }
-
-int pmd_huge(pmd_t pmd)
-{
-	return (pmd_val(pmd) & _PAGE_HUGE) != 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return (pud_val(pud) & _PAGE_HUGE) != 0;
-}
diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c
index a9f7e21f6656..0356199bd9e7 100644
--- a/arch/parisc/mm/hugetlbpage.c
+++ b/arch/parisc/mm/hugetlbpage.c
@@ -180,14 +180,3 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
 	}
 	return changed;
 }
-
-
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
index 92545981bb49..baf934578c3a 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h
@@ -6,16 +6,6 @@
  */
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_HUGETLB_PAGE
-static inline int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
-
-static inline int pud_huge(pud_t pud)
-{
-	return pud_leaf(pud);
-}
-
 /*
  * With radix , we have hugepage ptes in the pud and pmd entries. We don't
  * need to setup hugepage directory for them. Our pte and page directory format
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
index ced7ee8b42fc..6ac73da7b80e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h
@@ -4,31 +4,6 @@
 
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_HUGETLB_PAGE
-/*
- * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
- * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
- *
- * Defined in such a way that we can optimize away code block at build time
- * if CONFIG_HUGETLB_PAGE=n.
- *
- * returns true for pmd migration entries, THP, devmap, hugetlb
- * But compile time dependent on CONFIG_HUGETLB_PAGE
- */
-static inline int pmd_huge(pmd_t pmd)
-{
-	/*
-	 * leaf pte for huge page
-	 */
-	return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE));
-}
-
-static inline int pud_huge(pud_t pud)
-{
-	/*
-	 * leaf pte for huge page
-	 */
-	return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE));
-}
 
 /*
  * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 427db14292c9..f5f39d4f03c8 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -351,16 +351,6 @@ static inline int hugepd_ok(hugepd_t hpd)
 #endif
 }
 
-static inline int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-static inline int pud_huge(pud_t pud)
-{
-	return 0;
-}
-
 #define is_hugepd(hpd)		(hugepd_ok(hpd))
 #endif
 
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158..0ebd968b33c9 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -399,16 +399,6 @@ static bool is_napot_size(unsigned long size)
 
 #endif /*CONFIG_RISCV_ISA_SVNAPOT*/
 
-int pud_huge(pud_t pud)
-{
-	return pud_leaf(pud);
-}
-
-int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
-
 static bool __hugetlb_valid_size(unsigned long size)
 {
 	if (size == HPAGE_SIZE)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index c2e8242bd15d..ca43b6fce71c 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -233,16 +233,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 	return (pte_t *) pmdp;
 }
 
-int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
-
-int pud_huge(pud_t pud)
-{
-	return pud_leaf(pud);
-}
-
 bool __init arch_hugetlb_valid_size(unsigned long size)
 {
 	if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6cb0ad73dbb9..ff209b55285a 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -70,13 +70,3 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
 
 	return pte;
 }
-
-int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-int pud_huge(pud_t pud)
-{
-	return 0;
-}
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 532143f1bf63..8ed5bdf95d25 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -407,16 +407,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return entry;
 }
 
-int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
-
-int pud_huge(pud_t pud)
-{
-	return pud_leaf(pud);
-}
-
 static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			   unsigned long addr)
 {
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8362953a24ce..dab6db288e5b 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,22 +19,6 @@
 #include <asm/tlbflush.h>
 #include <asm/elf.h>
 
-/*
- * pmd_huge() returns 1 if @pmd is hugetlb related entry.
- */
-int pmd_huge(pmd_t pmd)
-{
-	return pmd_leaf(pmd);
-}
-
-/*
- * pud_huge() returns 1 if @pud is hugetlb related entry.
- */
-int pud_huge(pud_t pud)
-{
-	return pud_leaf(pud);
-}
-
 #ifdef CONFIG_HUGETLB_PAGE
 static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
 		unsigned long addr, unsigned long len,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 77b30a8c6076..300de33c6fde 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -272,13 +272,9 @@ void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
 int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
 void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
 void hugetlb_vma_lock_release(struct kref *kref);
-
-int pmd_huge(pmd_t pmd);
-int pud_huge(pud_t pud);
 long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot,
 		unsigned long cp_flags);
-
 bool is_hugetlb_entry_migration(pte_t pte);
 bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
@@ -399,16 +395,6 @@ static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
 {
 }
 
-static inline int pmd_huge(pmd_t pmd)
-{
-	return 0;
-}
-
-static inline int pud_huge(pud_t pud)
-{
-	return 0;
-}
-
 static inline int is_hugepage_only_range(struct mm_struct *mm,
 					unsigned long addr, unsigned long len)
 {
@@ -493,16 +479,6 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
 #endif /* !CONFIG_HUGETLB_PAGE */
-/*
- * hugepages at page global directory. If arch support
- * hugepages at pgd level, they need to define this.
- */
-#ifndef pgd_huge
-#define pgd_huge(x)	0
-#endif
-#ifndef p4d_huge
-#define p4d_huge(x)	0
-#endif
 
 #ifndef pgd_write
 static inline int pgd_write(pgd_t pgd)
-- 
cgit v1.2.3


From 64078b3d57dd9aaa4dde4a4f5851a53a79ac4d42 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Mon, 18 Mar 2024 16:04:04 -0400
Subject: mm: document pXd_leaf() API

There's one small section already, but since we're going to remove
pXd_huge(), that comment may start to obsolete.

Rewrite that section with more information, hopefully with that the API is
crystal clear on what it implies.

Link: https://lkml.kernel.org/r/20240318200404.448346-15-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Bjorn Andersson <andersson@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Fabio Estevam <festevam@denx.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Konrad Dybcio <konrad.dybcio@linaro.org>
Cc: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Cc: Lucas Stach <l.stach@pengutronix.de>
Cc: Mark Salter <msalter@redhat.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: "Naveen N. Rao" <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Shawn Guo <shawnguo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index bd9c7180718c..2a1c044ae467 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1768,11 +1768,25 @@ typedef unsigned int pgtbl_mod_mask;
 #endif
 
 /*
- * p?d_leaf() - true if this entry is a final mapping to a physical address.
- * This differs from p?d_huge() by the fact that they are always available (if
- * the architecture supports large pages at the appropriate level) even
- * if CONFIG_HUGETLB_PAGE is not defined.
- * Only meaningful when called on a valid entry.
+ * pXd_leaf() is the API to check whether a pgtable entry is a huge page
+ * mapping.  It should work globally across all archs, without any
+ * dependency on CONFIG_* options.  For architectures that do not support
+ * huge mappings on specific levels, below fallbacks will be used.
+ *
+ * A leaf pgtable entry should always imply the following:
+ *
+ * - It is a "present" entry.  IOW, before using this API, please check it
+ *   with pXd_present() first. NOTE: it may not always mean the "present
+ *   bit" is set.  For example, PROT_NONE entries are always "present".
+ *
+ * - It should _never_ be a swap entry of any type.  Above "present" check
+ *   should have guarded this, but let's be crystal clear on this.
+ *
+ * - It should contain a huge PFN, which points to a huge page larger than
+ *   PAGE_SIZE of the platform.  The PFN format isn't important here.
+ *
+ * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(),
+ *   pXd_devmap(), or hugetlb mappings).
  */
 #ifndef pgd_leaf
 #define pgd_leaf(x)	false
-- 
cgit v1.2.3


From 91cdcd8d624bfdf05e8db9d572759516f3c786b8 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 12 Mar 2024 11:34:11 -0400
Subject: mm: zswap: optimize zswap pool size tracking

Profiling the munmap() of a zswapped memory region shows 60% of the total
cycles currently going into updating the zswap_pool_total_size.

There are three consumers of this counter:
- store, to enforce the globally configured pool limit
- meminfo & debugfs, to report the size to the user
- shrink, to determine the batch size for each cycle

Instead of aggregating everytime an entry enters or exits the zswap
pool, aggregate the value from the zpools on-demand:

- Stores aggregate the counter anyway upon success. Aggregating to
  check the limit instead is the same amount of work.

- Meminfo & debugfs might benefit somewhat from a pre-aggregated
  counter, but aren't exactly hotpaths.

- Shrinking can aggregate once for every cycle instead of doing it for
  every freed entry. As the shrinker might work on tens or hundreds of
  objects per scan cycle, this is a large reduction in aggregations.

The paths that benefit dramatically are swapin, swapoff, and unmaps.
There could be millions of pages being processed until somebody asks for
the pool size again.  This eliminates the pool size updates from those
paths entirely.

Top profile entries for a 24G range munmap(), before:

    38.54%  zswap-unmap  [kernel.kallsyms]  [k] zs_zpool_total_size
    12.51%  zswap-unmap  [kernel.kallsyms]  [k] zpool_get_total_size
     9.10%  zswap-unmap  [kernel.kallsyms]  [k] zswap_update_total_size
     2.95%  zswap-unmap  [kernel.kallsyms]  [k] obj_cgroup_uncharge_zswap
     2.88%  zswap-unmap  [kernel.kallsyms]  [k] __slab_free
     2.86%  zswap-unmap  [kernel.kallsyms]  [k] xas_store

and after:

     7.70%  zswap-unmap  [kernel.kallsyms]  [k] __slab_free
     7.16%  zswap-unmap  [kernel.kallsyms]  [k] obj_cgroup_uncharge_zswap
     6.74%  zswap-unmap  [kernel.kallsyms]  [k] xas_store

It was also briefly considered to move to a single atomic in zswap
that is updated by the backends, since zswap only cares about the sum
of all pools anyway. However, zram directly needs per-pool information
out of zsmalloc. To keep the backend from having to update two atomics
every time, I opted for the lazy aggregation instead for now.

Link: https://lkml.kernel.org/r/20240312153901.3441-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/meminfo.c     |   3 +-
 include/linux/zswap.h |   2 +-
 mm/zswap.c            | 101 +++++++++++++++++++++++++-------------------------
 3 files changed, 52 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 45af9a989d40..245171d9164b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "SwapTotal:      ", i.totalswap);
 	show_val_kb(m, "SwapFree:       ", i.freeswap);
 #ifdef CONFIG_ZSWAP
-	seq_printf(m,  "Zswap:          %8lu kB\n",
-		   (unsigned long)(zswap_pool_total_size >> 10));
+	show_val_kb(m, "Zswap:          ", zswap_total_pages());
 	seq_printf(m,  "Zswapped:       %8lu kB\n",
 		   (unsigned long)atomic_read(&zswap_stored_pages) <<
 		   (PAGE_SHIFT - 10));
diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 341aea490070..2a85b941db97 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -7,7 +7,6 @@
 
 struct lruvec;
 
-extern u64 zswap_pool_total_size;
 extern atomic_t zswap_stored_pages;
 
 #ifdef CONFIG_ZSWAP
@@ -27,6 +26,7 @@ struct zswap_lruvec_state {
 	atomic_long_t nr_zswap_protected;
 };
 
+unsigned long zswap_total_pages(void);
 bool zswap_store(struct folio *folio);
 bool zswap_load(struct folio *folio);
 void zswap_invalidate(swp_entry_t swp);
diff --git a/mm/zswap.c b/mm/zswap.c
index 2817e2791c13..a56b76852664 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,8 +43,6 @@
 /*********************************
 * statistics
 **********************************/
-/* Total bytes used by the compressed storage */
-u64 zswap_pool_total_size;
 /* The number of compressed pages currently stored in zswap */
 atomic_t zswap_stored_pages = ATOMIC_INIT(0);
 /* The number of same-value filled pages currently stored in zswap */
@@ -265,45 +263,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
 		 zpool_get_type((p)->zpools[0]))
 
-static bool zswap_is_full(void)
-{
-	return totalram_pages() * zswap_max_pool_percent / 100 <
-			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static bool zswap_can_accept(void)
-{
-	return totalram_pages() * zswap_accept_thr_percent / 100 *
-				zswap_max_pool_percent / 100 >
-			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
-}
-
-static u64 get_zswap_pool_size(struct zswap_pool *pool)
-{
-	u64 pool_size = 0;
-	int i;
-
-	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
-		pool_size += zpool_get_total_size(pool->zpools[i]);
-
-	return pool_size;
-}
-
-static void zswap_update_total_size(void)
-{
-	struct zswap_pool *pool;
-	u64 total = 0;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		total += get_zswap_pool_size(pool);
-
-	rcu_read_unlock();
-
-	zswap_pool_total_size = total;
-}
-
 /*********************************
 * pool functions
 **********************************/
@@ -541,6 +500,33 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 	return NULL;
 }
 
+static unsigned long zswap_max_pages(void)
+{
+	return totalram_pages() * zswap_max_pool_percent / 100;
+}
+
+static unsigned long zswap_accept_thr_pages(void)
+{
+	return zswap_max_pages() * zswap_accept_thr_percent / 100;
+}
+
+unsigned long zswap_total_pages(void)
+{
+	struct zswap_pool *pool;
+	u64 total = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pool, &zswap_pools, list) {
+		int i;
+
+		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+			total += zpool_get_total_size(pool->zpools[i]);
+	}
+	rcu_read_unlock();
+
+	return total >> PAGE_SHIFT;
+}
+
 /*********************************
 * param callbacks
 **********************************/
@@ -913,7 +899,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
 	}
 	zswap_entry_cache_free(entry);
 	atomic_dec(&zswap_stored_pages);
-	zswap_update_total_size();
 }
 
 /*
@@ -1344,7 +1329,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 		nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
 		nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
 	} else {
-		nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
+		nr_backing = zswap_total_pages();
 		nr_stored = atomic_read(&zswap_nr_stored);
 	}
 
@@ -1412,6 +1397,10 @@ static void shrink_worker(struct work_struct *w)
 {
 	struct mem_cgroup *memcg;
 	int ret, failures = 0;
+	unsigned long thr;
+
+	/* Reclaim down to the accept threshold */
+	thr = zswap_accept_thr_pages();
 
 	/* global reclaim will select cgroup in a round-robin fashion. */
 	do {
@@ -1459,10 +1448,9 @@ static void shrink_worker(struct work_struct *w)
 			break;
 		if (ret && ++failures == MAX_RECLAIM_RETRIES)
 			break;
-
 resched:
 		cond_resched();
-	} while (!zswap_can_accept());
+	} while (zswap_total_pages() > thr);
 }
 
 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
@@ -1503,6 +1491,7 @@ bool zswap_store(struct folio *folio)
 	struct zswap_entry *entry, *dupentry;
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
+	unsigned long max_pages, cur_pages;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1514,6 +1503,7 @@ bool zswap_store(struct folio *folio)
 	if (!zswap_enabled)
 		goto check_old;
 
+	/* Check cgroup limits */
 	objcg = get_obj_cgroup_from_folio(folio);
 	if (objcg && !obj_cgroup_may_zswap(objcg)) {
 		memcg = get_mem_cgroup_from_objcg(objcg);
@@ -1524,15 +1514,18 @@ bool zswap_store(struct folio *folio)
 		mem_cgroup_put(memcg);
 	}
 
-	/* reclaim space if needed */
-	if (zswap_is_full()) {
+	/* Check global limits */
+	cur_pages = zswap_total_pages();
+	max_pages = zswap_max_pages();
+
+	if (cur_pages >= max_pages) {
 		zswap_pool_limit_hit++;
 		zswap_pool_reached_full = true;
 		goto shrink;
 	}
 
 	if (zswap_pool_reached_full) {
-	       if (!zswap_can_accept())
+		if (cur_pages > zswap_accept_thr_pages())
 			goto shrink;
 		else
 			zswap_pool_reached_full = false;
@@ -1608,7 +1601,6 @@ insert_entry:
 
 	/* update stats */
 	atomic_inc(&zswap_stored_pages);
-	zswap_update_total_size();
 	count_vm_event(ZSWPOUT);
 
 	return true;
@@ -1752,6 +1744,13 @@ void zswap_swapoff(int type)
 
 static struct dentry *zswap_debugfs_root;
 
+static int debugfs_get_total_size(void *data, u64 *val)
+{
+	*val = zswap_total_pages() * PAGE_SIZE;
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n");
+
 static int zswap_debugfs_init(void)
 {
 	if (!debugfs_initialized())
@@ -1773,8 +1772,8 @@ static int zswap_debugfs_init(void)
 			   zswap_debugfs_root, &zswap_reject_compress_poor);
 	debugfs_create_u64("written_back_pages", 0444,
 			   zswap_debugfs_root, &zswap_written_back_pages);
-	debugfs_create_u64("pool_total_size", 0444,
-			   zswap_debugfs_root, &zswap_pool_total_size);
+	debugfs_create_file("pool_total_size", 0444,
+			    zswap_debugfs_root, NULL, &total_size_fops);
 	debugfs_create_atomic_t("stored_pages", 0444,
 				zswap_debugfs_root, &zswap_stored_pages);
 	debugfs_create_atomic_t("same_filled_pages", 0444,
-- 
cgit v1.2.3


From 4196b48ddd382419e51658bf94a25af195ba9450 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 12 Mar 2024 11:34:12 -0400
Subject: mm: zpool: return pool size in pages

All zswap backends track their pool sizes in pages.  Currently they
multiply by PAGE_SIZE for zswap, only for zswap to divide again in order
to do limit math.  Report pages directly.

Link: https://lkml.kernel.org/r/20240312153901.3441-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: Chengming Zhou <chengming.zhou@linux.dev>
Reviewed-by: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/zpool.h |  4 ++--
 mm/z3fold.c           | 10 +++++-----
 mm/zbud.c             | 10 +++++-----
 mm/zpool.c            | 10 +++++-----
 mm/zsmalloc.c         |  6 +++---
 mm/zswap.c            |  6 +++---
 6 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 3296438eec06..a67d62b79698 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -53,7 +53,7 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle,
 
 void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
 
-u64 zpool_get_total_size(struct zpool *pool);
+u64 zpool_get_total_pages(struct zpool *pool);
 
 
 /**
@@ -91,7 +91,7 @@ struct zpool_driver {
 				enum zpool_mapmode mm);
 	void (*unmap)(void *pool, unsigned long handle);
 
-	u64 (*total_size)(void *pool);
+	u64 (*total_pages)(void *pool);
 };
 
 void zpool_register_driver(struct zpool_driver *driver);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 7ab05621052d..2ebfed32871b 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1237,12 +1237,12 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
 }
 
 /**
- * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * z3fold_get_pool_pages() - gets the z3fold pool size in pages
  * @pool:	pool whose size is being queried
  *
  * Returns: size in pages of the given pool.
  */
-static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+static u64 z3fold_get_pool_pages(struct z3fold_pool *pool)
 {
 	return atomic64_read(&pool->pages_nr);
 }
@@ -1402,9 +1402,9 @@ static void z3fold_zpool_unmap(void *pool, unsigned long handle)
 	z3fold_unmap(pool, handle);
 }
 
-static u64 z3fold_zpool_total_size(void *pool)
+static u64 z3fold_zpool_total_pages(void *pool)
 {
-	return z3fold_get_pool_size(pool) * PAGE_SIZE;
+	return z3fold_get_pool_pages(pool);
 }
 
 static struct zpool_driver z3fold_zpool_driver = {
@@ -1417,7 +1417,7 @@ static struct zpool_driver z3fold_zpool_driver = {
 	.free =		z3fold_zpool_free,
 	.map =		z3fold_zpool_map,
 	.unmap =	z3fold_zpool_unmap,
-	.total_size =	z3fold_zpool_total_size,
+	.total_pages =	z3fold_zpool_total_pages,
 };
 
 MODULE_ALIAS("zpool-z3fold");
diff --git a/mm/zbud.c b/mm/zbud.c
index 2190cc1f37b3..e9836fff9438 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -365,13 +365,13 @@ static void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
 }
 
 /**
- * zbud_get_pool_size() - gets the zbud pool size in pages
+ * zbud_get_pool_pages() - gets the zbud pool size in pages
  * @pool:	pool whose size is being queried
  *
  * Returns: size in pages of the given pool.  The pool lock need not be
  * taken to access pages_nr.
  */
-static u64 zbud_get_pool_size(struct zbud_pool *pool)
+static u64 zbud_get_pool_pages(struct zbud_pool *pool)
 {
 	return pool->pages_nr;
 }
@@ -410,9 +410,9 @@ static void zbud_zpool_unmap(void *pool, unsigned long handle)
 	zbud_unmap(pool, handle);
 }
 
-static u64 zbud_zpool_total_size(void *pool)
+static u64 zbud_zpool_total_pages(void *pool)
 {
-	return zbud_get_pool_size(pool) * PAGE_SIZE;
+	return zbud_get_pool_pages(pool);
 }
 
 static struct zpool_driver zbud_zpool_driver = {
@@ -425,7 +425,7 @@ static struct zpool_driver zbud_zpool_driver = {
 	.free =		zbud_zpool_free,
 	.map =		zbud_zpool_map,
 	.unmap =	zbud_zpool_unmap,
-	.total_size =	zbud_zpool_total_size,
+	.total_pages =	zbud_zpool_total_pages,
 };
 
 MODULE_ALIAS("zpool-zbud");
diff --git a/mm/zpool.c b/mm/zpool.c
index 846410479c2f..b9fda1fa857d 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -321,16 +321,16 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
 }
 
 /**
- * zpool_get_total_size() - The total size of the pool
+ * zpool_get_total_pages() - The total size of the pool
  * @zpool:	The zpool to check
  *
- * This returns the total size in bytes of the pool.
+ * This returns the total size in pages of the pool.
  *
- * Returns: Total size of the zpool in bytes.
+ * Returns: Total size of the zpool in pages.
  */
-u64 zpool_get_total_size(struct zpool *zpool)
+u64 zpool_get_total_pages(struct zpool *zpool)
 {
-	return zpool->driver->total_size(zpool->pool);
+	return zpool->driver->total_pages(zpool->pool);
 }
 
 /**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7d7cb3eaabe0..b42d3545ca85 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -399,9 +399,9 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
 	zs_unmap_object(pool, handle);
 }
 
-static u64 zs_zpool_total_size(void *pool)
+static u64 zs_zpool_total_pages(void *pool)
 {
-	return zs_get_total_pages(pool) << PAGE_SHIFT;
+	return zs_get_total_pages(pool);
 }
 
 static struct zpool_driver zs_zpool_driver = {
@@ -414,7 +414,7 @@ static struct zpool_driver zs_zpool_driver = {
 	.free =			  zs_zpool_free,
 	.map =			  zs_zpool_map,
 	.unmap =		  zs_zpool_unmap,
-	.total_size =		  zs_zpool_total_size,
+	.total_pages =		  zs_zpool_total_pages,
 };
 
 MODULE_ALIAS("zpool-zsmalloc");
diff --git a/mm/zswap.c b/mm/zswap.c
index a56b76852664..8721980a2274 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -513,18 +513,18 @@ static unsigned long zswap_accept_thr_pages(void)
 unsigned long zswap_total_pages(void)
 {
 	struct zswap_pool *pool;
-	u64 total = 0;
+	unsigned long total = 0;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
 		int i;
 
 		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
-			total += zpool_get_total_size(pool->zpools[i]);
+			total += zpool_get_total_pages(pool->zpools[i]);
 	}
 	rcu_read_unlock();
 
-	return total >> PAGE_SHIFT;
+	return total;
 }
 
 /*********************************
-- 
cgit v1.2.3


From f8fd525ba3a298d5791536bd8ca39be7a7103296 Mon Sep 17 00:00:00 2001
From: Donet Tom <donettom@linux.ibm.com>
Date: Fri, 8 Mar 2024 09:15:37 -0600
Subject: mm/mempolicy: use numa_node_id() instead of cpu_to_node()

Patch series "Allow migrate on protnone reference with MPOL_PREFERRED_MANY
policy:, v4.

This patchset is to optimize the cross-socket memory access with
MPOL_PREFERRED_MANY policy.

To test this patch we ran the following test on a 3 node system.
 Node 0 - 2GB   - Tier 1
 Node 1 - 11GB  - Tier 1
 Node 6 - 10GB  - Tier 2

Below changes are made to memcached to set the memory policy,
It select Node0 and Node1 as preferred nodes.

   #include <numaif.h>
   #include <numa.h>

    unsigned long nodemask;
    int ret;

    nodemask = 0x03;
    ret = set_mempolicy(MPOL_PREFERRED_MANY | MPOL_F_NUMA_BALANCING,
                                               &nodemask, 10);
    /* If MPOL_F_NUMA_BALANCING isn't supported,
     * fall back to MPOL_PREFERRED_MANY */
    if (ret < 0 && errno == EINVAL){
       printf("set mem policy normal\n");
        ret = set_mempolicy(MPOL_PREFERRED_MANY, &nodemask, 10);
    }
    if (ret < 0) {
       perror("Failed to call set_mempolicy");
       exit(-1);
    }

Test Procedure:
===============
1. Make sure memory tiring and demotion are enabled.
2. Start memcached.

   # ./memcached -b 100000 -m 204800 -u root -c 1000000 -t 7
       -d -s "/tmp/memcached.sock"

3. Run memtier_benchmark to store 3200000 keys.

  #./memtier_benchmark -S "/tmp/memcached.sock" --protocol=memcache_binary
    --threads=1 --pipeline=1 --ratio=1:0 --key-pattern=S:S --key-minimum=1
    --key-maximum=3200000 -n allkeys -c 1 -R -x 1 -d 1024

4. Start a memory eater on node 0 and 1. This will demote all memcached
   pages to node 6.
5. Make sure all the memcached pages got demoted to lower tier by reading
   /proc/<memcaced PID>/numa_maps.

    # cat /proc/2771/numa_maps
     ---
    default anon=1009 dirty=1009 active=0 N6=1009 kernelpagesize_kB=64
    default anon=1009 dirty=1009 active=0 N6=1009 kernelpagesize_kB=64
     ---

6. Kill memory eater.
7. Read the pgpromote_success counter.
8. Start reading the keys by running memtier_benchmark.

  #./memtier_benchmark -S "/tmp/memcached.sock" --protocol=memcache_binary
   --pipeline=1 --distinct-client-seed --ratio=0:3 --key-pattern=R:R
   --key-minimum=1 --key-maximum=3200000 -n allkeys
   --threads=64 -c 1 -R -x 6

9. Read the pgpromote_success counter.

Test Results:
=============
Without Patch
------------------
1. pgpromote_success  before test
Node 0:  pgpromote_success 11
Node 1:  pgpromote_success 140974

pgpromote_success  after test
Node 0:  pgpromote_success 11
Node 1:  pgpromote_success 140974

2. Memtier-benchmark result.
AGGREGATED AVERAGE RESULTS (6 runs)
==================================================================
Type    Ops/sec   Hits/sec   Misses/sec  Avg. Latency  p50 Latency
------------------------------------------------------------------
Sets     0.00       ---         ---        ---          ---
Gets    305792.03  305791.93   0.10       0.18949       0.16700
Waits    0.00       ---         ---        ---          ---
Totals  305792.03  305791.93   0.10       0.18949       0.16700

======================================
p99 Latency  p99.9 Latency  KB/sec
-------------------------------------
---          ---            0.00
0.44700     1.71100        11542.69
---           ---            ---
0.44700     1.71100        11542.69

With Patch
---------------
1. pgpromote_success  before test
Node 0:  pgpromote_success 5
Node 1:  pgpromote_success 89386

pgpromote_success  after test
Node 0:  pgpromote_success 57895
Node 1:  pgpromote_success 141463

2. Memtier-benchmark result.
AGGREGATED AVERAGE RESULTS (6 runs)
====================================================================
Type    Ops/sec    Hits/sec  Misses/sec  Avg. Latency  p50 Latency
--------------------------------------------------------------------
Sets     0.00        ---       ---        ---           ---
Gets    521942.24  521942.07  0.17       0.11459        0.10300
Waits    0.00        ---       ---         ---          ---
Totals  521942.24  521942.07  0.17       0.11459        0.10300

=======================================
p99 Latency  p99.9 Latency  KB/sec
---------------------------------------
 ---          ---            0.00
0.23100      0.31900        19701.68
---          ---             ---
0.23100      0.31900        19701.68


Test Result Analysis:
=====================
1. With patch we could observe pages are getting promoted.
2. Memtier-benchmark results shows that, with the patch,
   performance has increased more than 50%.

 Ops/sec without fix -  305792.03
 Ops/sec with fix    -  521942.24


This patch (of 2):

Instead of using 'cpu_to_node()', we use 'numa_node_id()', which is
quicker.  smp_processor_id is guaranteed to be stable in the
'mpol_misplaced()' function because it is called with ptl held.
lockdep_assert_held was added to ensure that.

No functional change in this patch.

[donettom@linux.ibm.com: add "* @vmf: structure describing the fault" comment]
  Link: https://lkml.kernel.org/r/d8b993ea9dccfac0bc3ed61d3a81f4ac5f376e46.1711002865.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/cover.1711373653.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/6059f034f436734b472d066db69676fb3a459864.1711373653.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/cover.1709909210.git.donettom@linux.ibm.com
Link: https://lkml.kernel.org/r/744646531af02cc687cde8ae788fb1779e99d02c.1709909210.git.donettom@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Signed-off-by: Donet Tom <donettom@linux.ibm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Feng Tang <feng.tang@intel.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempolicy.h |  5 +++--
 mm/huge_memory.c          |  2 +-
 mm/internal.h             |  2 +-
 mm/memory.c               |  8 +++++---
 mm/mempolicy.c            | 14 ++++++++++----
 5 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 931b118336f4..1add16f21612 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -167,7 +167,8 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 /* Check if a vma is migratable */
 extern bool vma_migratable(struct vm_area_struct *vma);
 
-int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long);
+int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
+					unsigned long addr);
 extern void mpol_put_task_policy(struct task_struct *);
 
 static inline bool mpol_is_preferred_many(struct mempolicy *pol)
@@ -282,7 +283,7 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
 #endif
 
 static inline int mpol_misplaced(struct folio *folio,
-				 struct vm_area_struct *vma,
+				 struct vm_fault *vmf,
 				 unsigned long address)
 {
 	return -1; /* no node preference */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 89f58c7603b2..dd88bb3ea196 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1754,7 +1754,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	 */
 	if (node_is_toptier(nid))
 		last_cpupid = folio_last_cpupid(folio);
-	target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags);
+	target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
 	if (target_nid == NUMA_NO_NODE) {
 		folio_put(folio);
 		goto out_map;
diff --git a/mm/internal.h b/mm/internal.h
index 07ad2675a88b..85c3db43454d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1087,7 +1087,7 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
 
 void __vunmap_range_noflush(unsigned long start, unsigned long end);
 
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
 		      unsigned long addr, int page_nid, int *flags);
 
 void free_zone_device_page(struct page *page);
diff --git a/mm/memory.c b/mm/memory.c
index e0576f2a8d23..c859a09b4f72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5035,9 +5035,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 	return ret;
 }
 
-int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
 		      unsigned long addr, int page_nid, int *flags)
 {
+	struct vm_area_struct *vma = vmf->vma;
+
 	folio_get(folio);
 
 	/* Record the current PID acceesing VMA */
@@ -5049,7 +5051,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
 		*flags |= TNF_FAULT_LOCAL;
 	}
 
-	return mpol_misplaced(folio, vma, addr);
+	return mpol_misplaced(folio, vmf, addr);
 }
 
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -5123,7 +5125,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 		last_cpupid = (-1 & LAST_CPUPID_MASK);
 	else
 		last_cpupid = folio_last_cpupid(folio);
-	target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
+	target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
 	if (target_nid == NUMA_NO_NODE) {
 		folio_put(folio);
 		goto out_map;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0fe77738d971..aa48376e2d34 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2718,7 +2718,7 @@ static void sp_free(struct sp_node *n)
  * mpol_misplaced - check whether current folio node is valid in policy
  *
  * @folio: folio to be checked
- * @vma: vm area where folio mapped
+ * @vmf: structure describing the fault
  * @addr: virtual address in @vma for shared policy lookup and interleave policy
  *
  * Lookup current policy node id for vma,addr and "compare to" folio's
@@ -2728,18 +2728,24 @@ static void sp_free(struct sp_node *n)
  * Return: NUMA_NO_NODE if the page is in a node that is valid for this
  * policy, or a suitable node ID to allocate a replacement folio from.
  */
-int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
+int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
 		   unsigned long addr)
 {
 	struct mempolicy *pol;
 	pgoff_t ilx;
 	struct zoneref *z;
 	int curnid = folio_nid(folio);
+	struct vm_area_struct *vma = vmf->vma;
 	int thiscpu = raw_smp_processor_id();
-	int thisnid = cpu_to_node(thiscpu);
+	int thisnid = numa_node_id();
 	int polnid = NUMA_NO_NODE;
 	int ret = NUMA_NO_NODE;
 
+	/*
+	 * Make sure ptl is held so that we don't preempt and we
+	 * have a stable smp processor id
+	 */
+	lockdep_assert_held(vmf->ptl);
 	pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
 	if (!(pol->flags & MPOL_F_MOF))
 		goto out;
@@ -2781,7 +2787,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma,
 		if (node_isset(curnid, pol->nodes))
 			goto out;
 		z = first_zones_zonelist(
-				node_zonelist(numa_node_id(), GFP_HIGHUSER),
+				node_zonelist(thisnid, GFP_HIGHUSER),
 				gfp_zone(GFP_HIGHUSER),
 				&pol->nodes);
 		polnid = zone_to_nid(z->zone);
-- 
cgit v1.2.3


From 0069455bcbf9ea73ffe4553ed6d2b4e4cad703de Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 21 Mar 2024 09:36:23 -0700
Subject: fix missing vmalloc.h includes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Memory allocation profiling", v6.

Overview:
Low overhead [1] per-callsite memory allocation profiling. Not just for
debug kernels, overhead low enough to be deployed in production.

Example output:
  root@moria-kvm:~# sort -rn /proc/allocinfo
   127664128    31168 mm/page_ext.c:270 func:alloc_page_ext
    56373248     4737 mm/slub.c:2259 func:alloc_slab_page
    14880768     3633 mm/readahead.c:247 func:page_cache_ra_unbounded
    14417920     3520 mm/mm_init.c:2530 func:alloc_large_system_hash
    13377536      234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs
    11718656     2861 mm/filemap.c:1919 func:__filemap_get_folio
     9192960     2800 kernel/fork.c:307 func:alloc_thread_stack_node
     4206592        4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable
     4136960     1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start
     3940352      962 mm/memory.c:4214 func:alloc_anon_folio
     2894464    22613 fs/kernfs/dir.c:615 func:__kernfs_new_node
     ...

Usage:
kconfig options:
 - CONFIG_MEM_ALLOC_PROFILING
 - CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
 - CONFIG_MEM_ALLOC_PROFILING_DEBUG
   adds warnings for allocations that weren't accounted because of a
   missing annotation

sysctl:
  /proc/sys/vm/mem_profiling

Runtime info:
  /proc/allocinfo

Notes:

[1]: Overhead
To measure the overhead we are comparing the following configurations:
(1) Baseline with CONFIG_MEMCG_KMEM=n
(2) Disabled by default (CONFIG_MEM_ALLOC_PROFILING=y &&
    CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n)
(3) Enabled by default (CONFIG_MEM_ALLOC_PROFILING=y &&
    CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=y)
(4) Enabled at runtime (CONFIG_MEM_ALLOC_PROFILING=y &&
    CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n && /proc/sys/vm/mem_profiling=1)
(5) Baseline with CONFIG_MEMCG_KMEM=y && allocating with __GFP_ACCOUNT
(6) Disabled by default (CONFIG_MEM_ALLOC_PROFILING=y &&
    CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=n)  && CONFIG_MEMCG_KMEM=y
(7) Enabled by default (CONFIG_MEM_ALLOC_PROFILING=y &&
    CONFIG_MEM_ALLOC_PROFILING_BY_DEFAULT=y) && CONFIG_MEMCG_KMEM=y

Performance overhead:
To evaluate performance we implemented an in-kernel test executing
multiple get_free_page/free_page and kmalloc/kfree calls with allocation
sizes growing from 8 to 240 bytes with CPU frequency set to max and CPU
affinity set to a specific CPU to minimize the noise. Below are results
from running the test on Ubuntu 22.04.2 LTS with 6.8.0-rc1 kernel on
56 core Intel Xeon:

                        kmalloc                 pgalloc
(1 baseline)            6.764s                  16.902s
(2 default disabled)    6.793s  (+0.43%)        17.007s (+0.62%)
(3 default enabled)     7.197s  (+6.40%)        23.666s (+40.02%)
(4 runtime enabled)     7.405s  (+9.48%)        23.901s (+41.41%)
(5 memcg)               13.388s (+97.94%)       48.460s (+186.71%)
(6 def disabled+memcg)  13.332s (+97.10%)       48.105s (+184.61%)
(7 def enabled+memcg)   13.446s (+98.78%)       54.963s (+225.18%)

Memory overhead:
Kernel size:

   text           data        bss         dec         diff
(1) 26515311	      18890222    17018880    62424413
(2) 26524728	      19423818    16740352    62688898    264485
(3) 26524724	      19423818    16740352    62688894    264481
(4) 26524728	      19423818    16740352    62688898    264485
(5) 26541782	      18964374    16957440    62463596    39183

Memory consumption on a 56 core Intel CPU with 125GB of memory:
Code tags:           192 kB
PageExts:         262144 kB (256MB)
SlabExts:           9876 kB (9.6MB)
PcpuExts:            512 kB (0.5MB)

Total overhead is 0.2% of total memory.

Benchmarks:

Hackbench tests run 100 times:
hackbench -s 512 -l 200 -g 15 -f 25 -P
      baseline       disabled profiling           enabled profiling
avg   0.3543         0.3559 (+0.0016)             0.3566 (+0.0023)
stdev 0.0137         0.0188                       0.0077


hackbench -l 10000
      baseline       disabled profiling           enabled profiling
avg   6.4218         6.4306 (+0.0088)             6.5077 (+0.0859)
stdev 0.0933         0.0286                       0.0489

stress-ng tests:
stress-ng --class memory --seq 4 -t 60
stress-ng --class cpu --seq 4 -t 60
Results posted at: https://evilpiepirate.org/~kent/memalloc_prof_v4_stress-ng/

[2] https://lore.kernel.org/all/20240306182440.2003814-1-surenb@google.com/


This patch (of 37):

The next patch drops vmalloc.h from a system header in order to fix a
circular dependency; this adds it to all the files that were pulling it in
implicitly.

[kent.overstreet@linux.dev: fix arch/alpha/lib/memcpy.c]
  Link: https://lkml.kernel.org/r/20240327002152.3339937-1-kent.overstreet@linux.dev
[surenb@google.com: fix arch/x86/mm/numa_32.c]
  Link: https://lkml.kernel.org/r/20240402180933.1663992-1-surenb@google.com
[kent.overstreet@linux.dev: a few places were depending on sizes.h]
  Link: https://lkml.kernel.org/r/20240404034744.1664840-1-kent.overstreet@linux.dev
[arnd@arndb.de: fix mm/kasan/hw_tags.c]
  Link: https://lkml.kernel.org/r/20240404124435.3121534-1-arnd@kernel.org
[surenb@google.com: fix arc build]
  Link: https://lkml.kernel.org/r/20240405225115.431056-1-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-1-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-2-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/lib/checksum.c                                 | 1 +
 arch/alpha/lib/fpreg.c                                    | 1 +
 arch/arc/include/asm/mmu-arcv2.h                          | 2 ++
 arch/arm/kernel/irq.c                                     | 1 +
 arch/arm/kernel/traps.c                                   | 1 +
 arch/arm64/kernel/efi.c                                   | 1 +
 arch/loongarch/include/asm/kfence.h                       | 1 +
 arch/powerpc/kernel/iommu.c                               | 1 +
 arch/powerpc/mm/mem.c                                     | 1 +
 arch/riscv/kernel/elf_kexec.c                             | 1 +
 arch/riscv/kernel/probes/kprobes.c                        | 1 +
 arch/s390/kernel/cert_store.c                             | 1 +
 arch/s390/kernel/ipl.c                                    | 1 +
 arch/x86/include/asm/io.h                                 | 1 +
 arch/x86/kernel/cpu/sgx/main.c                            | 1 +
 arch/x86/kernel/irq_64.c                                  | 1 +
 arch/x86/mm/fault.c                                       | 1 +
 arch/x86/mm/numa_32.c                                     | 1 +
 drivers/accel/ivpu/ivpu_mmu_context.c                     | 1 +
 drivers/gpu/drm/gma500/mmu.c                              | 1 +
 drivers/gpu/drm/i915/gem/i915_gem_pages.c                 | 1 +
 drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c          | 1 +
 drivers/gpu/drm/i915/gt/shmem_utils.c                     | 1 +
 drivers/gpu/drm/i915/gvt/firmware.c                       | 1 +
 drivers/gpu/drm/i915/gvt/gtt.c                            | 1 +
 drivers/gpu/drm/i915/gvt/handlers.c                       | 1 +
 drivers/gpu/drm/i915/gvt/mmio.c                           | 1 +
 drivers/gpu/drm/i915/gvt/vgpu.c                           | 1 +
 drivers/gpu/drm/i915/intel_gvt.c                          | 1 +
 drivers/gpu/drm/imagination/pvr_vm_mips.c                 | 1 +
 drivers/gpu/drm/mediatek/mtk_drm_gem.c                    | 1 +
 drivers/gpu/drm/omapdrm/omap_gem.c                        | 1 +
 drivers/gpu/drm/v3d/v3d_bo.c                              | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_binding.c                   | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c                       | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c                   | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c                       | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c                   | 1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c                     | 1 +
 drivers/gpu/drm/xen/xen_drm_front_gem.c                   | 1 +
 drivers/hwtracing/coresight/coresight-trbe.c              | 1 +
 drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c  | 1 +
 drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c | 1 +
 drivers/net/ethernet/microsoft/mana/hw_channel.c          | 1 +
 drivers/platform/x86/uv_sysfs.c                           | 1 +
 drivers/scsi/mpi3mr/mpi3mr_transport.c                    | 2 ++
 drivers/vfio/pci/pds/dirty.c                              | 1 +
 drivers/virt/acrn/mm.c                                    | 1 +
 drivers/virtio/virtio_mem.c                               | 1 +
 include/asm-generic/io.h                                  | 1 +
 include/linux/io.h                                        | 1 +
 include/linux/pds/pds_common.h                            | 2 ++
 include/rdma/rdmavt_qp.h                                  | 1 +
 mm/debug_vm_pgtable.c                                     | 1 +
 mm/kasan/hw_tags.c                                        | 1 +
 sound/pci/hda/cs35l41_hda.c                               | 1 +
 56 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/arch/alpha/lib/checksum.c b/arch/alpha/lib/checksum.c
index 3f35c3ed6948..c29b98ef9c82 100644
--- a/arch/alpha/lib/checksum.c
+++ b/arch/alpha/lib/checksum.c
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 
 #include <asm/byteorder.h>
+#include <asm/checksum.h>
 
 static inline unsigned short from64to16(unsigned long x)
 {
diff --git a/arch/alpha/lib/fpreg.c b/arch/alpha/lib/fpreg.c
index 7c08b225261c..3d32165043f8 100644
--- a/arch/alpha/lib/fpreg.c
+++ b/arch/alpha/lib/fpreg.c
@@ -8,6 +8,7 @@
 #include <linux/compiler.h>
 #include <linux/export.h>
 #include <linux/preempt.h>
+#include <asm/fpu.h>
 #include <asm/thread_info.h>
 
 #if defined(CONFIG_ALPHA_EV6) || defined(CONFIG_ALPHA_EV67)
diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h
index ed9036d4ede3..d85dc0721907 100644
--- a/arch/arc/include/asm/mmu-arcv2.h
+++ b/arch/arc/include/asm/mmu-arcv2.h
@@ -9,6 +9,8 @@
 #ifndef _ASM_ARC_MMU_ARCV2_H
 #define _ASM_ARC_MMU_ARCV2_H
 
+#include <soc/arc/aux.h>
+
 /*
  * TLB Management regs
  */
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index fe28fc1f759d..dab42d066d06 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -32,6 +32,7 @@
 #include <linux/kallsyms.h>
 #include <linux/proc_fs.h>
 #include <linux/export.h>
+#include <linux/vmalloc.h>
 
 #include <asm/hardware/cache-l2x0.h>
 #include <asm/hardware/cache-uniphier.h>
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 72c82a4d63ac..480e307501bb 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -26,6 +26,7 @@
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/irq.h>
+#include <linux/vmalloc.h>
 
 #include <linux/atomic.h>
 #include <asm/cacheflush.h>
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 9afcc690fe73..4a92096db34e 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -10,6 +10,7 @@
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/screen_info.h>
+#include <linux/vmalloc.h>
 
 #include <asm/efi.h>
 #include <asm/stacktrace.h>
diff --git a/arch/loongarch/include/asm/kfence.h b/arch/loongarch/include/asm/kfence.h
index a6a5760da3a3..92636e82957c 100644
--- a/arch/loongarch/include/asm/kfence.h
+++ b/arch/loongarch/include/asm/kfence.h
@@ -10,6 +10,7 @@
 #define _ASM_LOONGARCH_KFENCE_H
 
 #include <linux/kfence.h>
+#include <linux/vmalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
 
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 1185efebf032..65468d0829e1 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -26,6 +26,7 @@
 #include <linux/iommu.h>
 #include <linux/sched.h>
 #include <linux/debugfs.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 #include <asm/iommu.h>
 #include <asm/pci-bridge.h>
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3a440004b97d..a197d4c2244b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -16,6 +16,7 @@
 #include <linux/highmem.h>
 #include <linux/suspend.h>
 #include <linux/dma-direct.h>
+#include <linux/vmalloc.h>
 
 #include <asm/swiotlb.h>
 #include <asm/machdep.h>
diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
index 54260c16f991..11c0d2e0becf 100644
--- a/arch/riscv/kernel/elf_kexec.c
+++ b/arch/riscv/kernel/elf_kexec.c
@@ -19,6 +19,7 @@
 #include <linux/libfdt.h>
 #include <linux/types.h>
 #include <linux/memblock.h>
+#include <linux/vmalloc.h>
 #include <asm/setup.h>
 
 int arch_kimage_file_post_load_cleanup(struct kimage *image)
diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c
index 2f08c14a933d..71a8b8945b26 100644
--- a/arch/riscv/kernel/probes/kprobes.c
+++ b/arch/riscv/kernel/probes/kprobes.c
@@ -6,6 +6,7 @@
 #include <linux/extable.h>
 #include <linux/slab.h>
 #include <linux/stop_machine.h>
+#include <linux/vmalloc.h>
 #include <asm/ptrace.h>
 #include <linux/uaccess.h>
 #include <asm/sections.h>
diff --git a/arch/s390/kernel/cert_store.c b/arch/s390/kernel/cert_store.c
index 554447768bdd..bf983513dd33 100644
--- a/arch/s390/kernel/cert_store.c
+++ b/arch/s390/kernel/cert_store.c
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
+#include <linux/vmalloc.h>
 #include <crypto/sha2.h>
 #include <keys/user-type.h>
 #include <asm/debug.h>
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 1486350a4177..5d6d381aa0be 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -20,6 +20,7 @@
 #include <linux/gfp.h>
 #include <linux/crash_dump.h>
 #include <linux/debug_locks.h>
+#include <linux/vmalloc.h>
 #include <asm/asm-extable.h>
 #include <asm/diag.h>
 #include <asm/ipl.h>
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 294cd2a40818..7452fc193b4f 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -42,6 +42,7 @@
 #include <asm/early_ioremap.h>
 #include <asm/pgtable_types.h>
 #include <asm/shared/io.h>
+#include <asm/special_insns.h>
 
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 166692f2d501..27892e57c4ef 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -13,6 +13,7 @@
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
+#include <linux/vmalloc.h>
 #include <asm/sgx.h>
 #include "driver.h"
 #include "encl.h"
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index fe0c859873d1..ade0043ce56e 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/smp.h>
 #include <linux/sched/task_stack.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cpu_entry_area.h>
 #include <asm/softirq_stack.h>
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 622d12ec7f08..a4cc20d0036d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
 #include <linux/efi.h>			/* efi_crash_gracefully_on_page_fault()*/
 #include <linux/mm_types.h>
 #include <linux/mm.h>			/* find_and_lock_vma() */
+#include <linux/vmalloc.h>
 
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 025fd7ea5d69..65fda406e6f2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -24,6 +24,7 @@
 
 #include <linux/memblock.h>
 #include <linux/init.h>
+#include <linux/vmalloc.h>
 #include <asm/pgtable_areas.h>
 
 #include "numa_internal.h"
diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c
index fe6161299236..128aef8e5a19 100644
--- a/drivers/accel/ivpu/ivpu_mmu_context.c
+++ b/drivers/accel/ivpu/ivpu_mmu_context.c
@@ -6,6 +6,7 @@
 #include <linux/bitfield.h>
 #include <linux/highmem.h>
 #include <linux/set_memory.h>
+#include <linux/vmalloc.h>
 
 #include <drm/drm_cache.h>
 
diff --git a/drivers/gpu/drm/gma500/mmu.c b/drivers/gpu/drm/gma500/mmu.c
index a70b01ccdf70..4d78b33eaa82 100644
--- a/drivers/gpu/drm/gma500/mmu.c
+++ b/drivers/gpu/drm/gma500/mmu.c
@@ -5,6 +5,7 @@
  **************************************************************************/
 
 #include <linux/highmem.h>
+#include <linux/vmalloc.h>
 
 #include "mmu.h"
 #include "psb_drv.h"
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 0ba955611dfb..8780aa243105 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -5,6 +5,7 @@
  */
 
 #include <drm/drm_cache.h>
+#include <linux/vmalloc.h>
 
 #include "gt/intel_gt.h"
 #include "gt/intel_tlb.h"
diff --git a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
index b2a5882b8f81..075657018739 100644
--- a/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/selftests/mock_dmabuf.c
@@ -4,6 +4,7 @@
  * Copyright © 2016 Intel Corporation
  */
 
+#include <linux/vmalloc.h>
 #include "mock_dmabuf.h"
 
 static struct sg_table *mock_map_dma_buf(struct dma_buf_attachment *attachment,
diff --git a/drivers/gpu/drm/i915/gt/shmem_utils.c b/drivers/gpu/drm/i915/gt/shmem_utils.c
index bccc3a1200bc..1fb6ff77fd89 100644
--- a/drivers/gpu/drm/i915/gt/shmem_utils.c
+++ b/drivers/gpu/drm/i915/gt/shmem_utils.c
@@ -7,6 +7,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
 
 #include "i915_drv.h"
 #include "gem/i915_gem_object.h"
diff --git a/drivers/gpu/drm/i915/gvt/firmware.c b/drivers/gpu/drm/i915/gvt/firmware.c
index 4dd52ac2043e..d800d267f0e9 100644
--- a/drivers/gpu/drm/i915/gvt/firmware.c
+++ b/drivers/gpu/drm/i915/gvt/firmware.c
@@ -30,6 +30,7 @@
 
 #include <linux/firmware.h>
 #include <linux/crc32.h>
+#include <linux/vmalloc.h>
 
 #include "i915_drv.h"
 #include "gvt.h"
diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 094fca9b0e73..58cca4906f41 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -39,6 +39,7 @@
 #include "trace.h"
 
 #include "gt/intel_gt_regs.h"
+#include <linux/vmalloc.h>
 
 #if defined(VERBOSE_DEBUG)
 #define gvt_vdbg_mm(fmt, args...) gvt_dbg_mm(fmt, ##args)
diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c
index efcb00472be2..ea9c30092767 100644
--- a/drivers/gpu/drm/i915/gvt/handlers.c
+++ b/drivers/gpu/drm/i915/gvt/handlers.c
@@ -52,6 +52,7 @@
 #include "display/skl_watermark_regs.h"
 #include "display/vlv_dsi_pll_regs.h"
 #include "gt/intel_gt_regs.h"
+#include <linux/vmalloc.h>
 
 /* XXX FIXME i915 has changed PP_XXX definition */
 #define PCH_PP_STATUS  _MMIO(0xc7200)
diff --git a/drivers/gpu/drm/i915/gvt/mmio.c b/drivers/gpu/drm/i915/gvt/mmio.c
index 5b5def6ddef7..780762f28aa4 100644
--- a/drivers/gpu/drm/i915/gvt/mmio.c
+++ b/drivers/gpu/drm/i915/gvt/mmio.c
@@ -33,6 +33,7 @@
  *
  */
 
+#include <linux/vmalloc.h>
 #include "i915_drv.h"
 #include "i915_reg.h"
 #include "gvt.h"
diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c
index 08ad1bd651f1..63c751ca4119 100644
--- a/drivers/gpu/drm/i915/gvt/vgpu.c
+++ b/drivers/gpu/drm/i915/gvt/vgpu.c
@@ -34,6 +34,7 @@
 #include "i915_drv.h"
 #include "gvt.h"
 #include "i915_pvinfo.h"
+#include <linux/vmalloc.h>
 
 void populate_pvinfo_page(struct intel_vgpu *vgpu)
 {
diff --git a/drivers/gpu/drm/i915/intel_gvt.c b/drivers/gpu/drm/i915/intel_gvt.c
index 9b6d87c8b583..5a01d60e5186 100644
--- a/drivers/gpu/drm/i915/intel_gvt.c
+++ b/drivers/gpu/drm/i915/intel_gvt.c
@@ -28,6 +28,7 @@
 #include "gt/intel_context.h"
 #include "gt/intel_ring.h"
 #include "gt/shmem_utils.h"
+#include <linux/vmalloc.h>
 
 /**
  * DOC: Intel GVT-g host support
diff --git a/drivers/gpu/drm/imagination/pvr_vm_mips.c b/drivers/gpu/drm/imagination/pvr_vm_mips.c
index b7fef3c797e6..6563dcde109c 100644
--- a/drivers/gpu/drm/imagination/pvr_vm_mips.c
+++ b/drivers/gpu/drm/imagination/pvr_vm_mips.c
@@ -14,6 +14,7 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/types.h>
+#include <linux/vmalloc.h>
 
 /**
  * pvr_vm_mips_init() - Initialise MIPS FW pagetable
diff --git a/drivers/gpu/drm/mediatek/mtk_drm_gem.c b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
index 4f2e3feabc0f..3e519869b632 100644
--- a/drivers/gpu/drm/mediatek/mtk_drm_gem.c
+++ b/drivers/gpu/drm/mediatek/mtk_drm_gem.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/dma-buf.h>
+#include <linux/vmalloc.h>
 
 #include <drm/drm.h>
 #include <drm/drm_device.h>
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index 3421e8389222..9ea0c64c26b5 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -9,6 +9,7 @@
 #include <linux/shmem_fs.h>
 #include <linux/spinlock.h>
 #include <linux/pfn_t.h>
+#include <linux/vmalloc.h>
 
 #include <drm/drm_prime.h>
 #include <drm/drm_vma_manager.h>
diff --git a/drivers/gpu/drm/v3d/v3d_bo.c b/drivers/gpu/drm/v3d/v3d_bo.c
index a07ede668cc1..a165cbcdd27b 100644
--- a/drivers/gpu/drm/v3d/v3d_bo.c
+++ b/drivers/gpu/drm/v3d/v3d_bo.c
@@ -21,6 +21,7 @@
 
 #include <linux/dma-buf.h>
 #include <linux/pfn_t.h>
+#include <linux/vmalloc.h>
 
 #include "v3d_drv.h"
 #include "uapi/drm/v3d_drm.h"
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c
index ae2de914eb89..2731f6ded1c2 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_binding.c
@@ -54,6 +54,7 @@
 #include "vmwgfx_drv.h"
 #include "vmwgfx_binding.h"
 #include "device_include/svga3d_reg.h"
+#include <linux/vmalloc.h>
 
 #define VMW_BINDING_RT_BIT     0
 #define VMW_BINDING_PS_BIT     1
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
index 195ff8792e5a..dd4ca6a9c690 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_cmd.c
@@ -31,6 +31,7 @@
 #include <drm/ttm/ttm_placement.h>
 
 #include <linux/sched/signal.h>
+#include <linux/vmalloc.h>
 
 bool vmw_supports_3d(struct vmw_private *dev_priv)
 {
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c b/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c
index 829df395c2ed..6e6beff9e262 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_devcaps.c
@@ -25,6 +25,7 @@
  *
  **************************************************************************/
 
+#include <linux/vmalloc.h>
 #include "vmwgfx_devcaps.h"
 
 #include "vmwgfx_drv.h"
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 0a304706e013..f8504a2e049e 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -53,6 +53,7 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/version.h>
+#include <linux/vmalloc.h>
 
 #define VMWGFX_DRIVER_DESC "Linux drm driver for VMware graphics devices"
 
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
index cc3086e649eb..2e52d73eba48 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_execbuf.c
@@ -35,6 +35,7 @@
 
 #include <linux/sync_file.h>
 #include <linux/hashtable.h>
+#include <linux/vmalloc.h>
 
 /*
  * Helper macro to get dx_ctx_node if available otherwise print an error
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
index a1da5678c731..835d1eed8dd9 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_ioctl.c
@@ -31,6 +31,7 @@
 
 #include <drm/vmwgfx_drm.h>
 #include <linux/pci.h>
+#include <linux/vmalloc.h>
 
 int vmw_getparam_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv)
diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c
index 3ad2b4cfd1f0..63112ed975c4 100644
--- a/drivers/gpu/drm/xen/xen_drm_front_gem.c
+++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c
@@ -11,6 +11,7 @@
 #include <linux/dma-buf.h>
 #include <linux/scatterlist.h>
 #include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
 
 #include <drm/drm_gem.h>
 #include <drm/drm_prime.h>
diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c
index 6136776482e6..96a32b213669 100644
--- a/drivers/hwtracing/coresight/coresight-trbe.c
+++ b/drivers/hwtracing/coresight/coresight-trbe.c
@@ -17,6 +17,7 @@
 
 #include <asm/barrier.h>
 #include <asm/cpufeature.h>
+#include <linux/vmalloc.h>
 
 #include "coresight-self-hosted-trace.h"
 #include "coresight-trbe.h"
diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
index 2e2c3be8a0b4..e6eb98d70f3c 100644
--- a/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
+++ b/drivers/net/ethernet/marvell/octeon_ep/octep_pfvf_mbox.c
@@ -15,6 +15,7 @@
 #include <linux/io.h>
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
+#include <linux/vmalloc.h>
 
 #include "octep_config.h"
 #include "octep_main.h"
diff --git a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c
index 2eab21e43048..445b626efe11 100644
--- a/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c
+++ b/drivers/net/ethernet/marvell/octeon_ep_vf/octep_vf_mbox.c
@@ -7,6 +7,7 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/netdevice.h>
+#include <linux/vmalloc.h>
 #include "octep_vf_config.h"
 #include "octep_vf_main.h"
 
diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c
index 2729a2c5acf9..11021c34e47e 100644
--- a/drivers/net/ethernet/microsoft/mana/hw_channel.c
+++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c
@@ -3,6 +3,7 @@
 
 #include <net/mana/gdma.h>
 #include <net/mana/hw_channel.h>
+#include <linux/vmalloc.h>
 
 static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id)
 {
diff --git a/drivers/platform/x86/uv_sysfs.c b/drivers/platform/x86/uv_sysfs.c
index 38d1b692d3c0..40e010877189 100644
--- a/drivers/platform/x86/uv_sysfs.c
+++ b/drivers/platform/x86/uv_sysfs.c
@@ -11,6 +11,7 @@
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/kobject.h>
+#include <linux/vmalloc.h>
 #include <asm/uv/bios.h>
 #include <asm/uv/uv.h>
 #include <asm/uv/uv_hub.h>
diff --git a/drivers/scsi/mpi3mr/mpi3mr_transport.c b/drivers/scsi/mpi3mr/mpi3mr_transport.c
index d32ad46318cb..dabb91f0f75d 100644
--- a/drivers/scsi/mpi3mr/mpi3mr_transport.c
+++ b/drivers/scsi/mpi3mr/mpi3mr_transport.c
@@ -7,6 +7,8 @@
  *
  */
 
+#include <linux/vmalloc.h>
+
 #include "mpi3mr.h"
 
 /**
diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c
index 68e8f006dfdb..c51f5e4c3dd6 100644
--- a/drivers/vfio/pci/pds/dirty.c
+++ b/drivers/vfio/pci/pds/dirty.c
@@ -3,6 +3,7 @@
 
 #include <linux/interval_tree.h>
 #include <linux/vfio.h>
+#include <linux/vmalloc.h>
 
 #include <linux/pds/pds_common.h>
 #include <linux/pds/pds_core_if.h>
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
index fa5d9ca6be57..c088ee1f1180 100644
--- a/drivers/virt/acrn/mm.c
+++ b/drivers/virt/acrn/mm.c
@@ -12,6 +12,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include "acrn_drv.h"
 
diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 8e3223294442..e8355f55a8f7 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -21,6 +21,7 @@
 #include <linux/bitmap.h>
 #include <linux/lockdep.h>
 #include <linux/log2.h>
+#include <linux/vmalloc.h>
 
 #include <acpi/acpi_numa.h>
 
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index bac63e874c7b..d4e1f042c9eb 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -9,6 +9,7 @@
 
 #include <asm/page.h> /* I/O is all done through memory accesses */
 #include <linux/string.h> /* for memset() and memcpy() */
+#include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/instruction_pointer.h>
 
diff --git a/include/linux/io.h b/include/linux/io.h
index 235ba7d80a8f..e8ee40ee1843 100644
--- a/include/linux/io.h
+++ b/include/linux/io.h
@@ -6,6 +6,7 @@
 #ifndef _LINUX_IO_H
 #define _LINUX_IO_H
 
+#include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/bug.h>
diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h
index 30581e2e04cc..5802e1deef24 100644
--- a/include/linux/pds/pds_common.h
+++ b/include/linux/pds/pds_common.h
@@ -4,6 +4,8 @@
 #ifndef _PDS_COMMON_H_
 #define _PDS_COMMON_H_
 
+#include <linux/notifier.h>
+
 #define PDS_CORE_DRV_NAME			"pds_core"
 
 /* the device's internal addressing uses up to 52 bits */
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index 2e58d5e6ac0e..d67892944193 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -11,6 +11,7 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/rdmavt_cq.h>
 #include <rdma/rvt-abi.h>
+#include <linux/vmalloc.h>
 /*
  * Atomic bit definitions for r_aflags.
  */
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 65c19025da3d..f1c9a2c5abc0 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -30,6 +30,7 @@
 #include <linux/start_kernel.h>
 #include <linux/sched/mm.h>
 #include <linux/io.h>
+#include <linux/vmalloc.h>
 
 #include <asm/cacheflush.h>
 #include <asm/pgalloc.h>
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 2b994092a2d4..9958ebc15d38 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -16,6 +16,7 @@
 #include <linux/static_key.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <linux/vmalloc.h>
 
 #include "kasan.h"
 
diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c
index d3fa6e136744..990b5bd717a1 100644
--- a/sound/pci/hda/cs35l41_hda.c
+++ b/sound/pci/hda/cs35l41_hda.c
@@ -13,6 +13,7 @@
 #include <sound/soc.h>
 #include <linux/pm_runtime.h>
 #include <linux/spi/spi.h>
+#include <linux/vmalloc.h>
 #include "hda_local.h"
 #include "hda_auto_parser.h"
 #include "hda_jack.h"
-- 
cgit v1.2.3


From 690da22dbfa85aa9f3ce06f40656c0b67e53cc80 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 21 Mar 2024 09:36:24 -0700
Subject: asm-generic/io.h: kill vmalloc.h dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Needed to avoid a new circular dependency with the memory allocation
profiling series.

Naturally, a whole bunch of files needed to include vmalloc.h that were
previously getting it implicitly.

Link: https://lkml.kernel.org/r/20240321163705.3067592-3-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/asm-generic/io.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index d4e1f042c9eb..80de699bf6af 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -992,7 +992,6 @@ static inline void iowrite64_rep(volatile void __iomem *addr,
 
 #ifdef __KERNEL__
 
-#include <linux/vmalloc.h>
 #define __io_virt(x) ((void __force *)(x))
 
 /*
-- 
cgit v1.2.3


From a5674119f0faa46f6ebdfcfa92342535b7f54e67 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 21 Mar 2024 09:36:27 -0700
Subject: fs: convert alloc_inode_sb() to a macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We're introducing alloc tagging, which tracks memory allocations by
callsite.  Converting alloc_inode_sb() to a macro means allocations will
be tracked by its caller, which is a bit more useful.

Link: https://lkml.kernel.org/r/20240321163705.3067592-6-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fs.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8dfd53b52744..4f2e2e5b6ab1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3085,11 +3085,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap,
  * This must be used for allocating filesystems specific inodes to set
  * up the inode reclaim context correctly.
  */
-static inline void *
-alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp)
-{
-	return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp);
-}
+#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp)
 
 extern void __insert_inode_hash(struct inode *, unsigned long hashval);
 static inline void insert_inode_hash(struct inode *inode)
-- 
cgit v1.2.3


From 21c690a349baab895dc68ab70d291e1598d7109d Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:28 -0700
Subject: mm: introduce slabobj_ext to support slab object extensions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently slab pages can store only vectors of obj_cgroup pointers in
page->memcg_data.  Introduce slabobj_ext structure to allow more data to
be stored for each slab object.  Wrap obj_cgroup into slabobj_ext to
support current functionality while allowing to extend slabobj_ext in the
future.

Link: https://lkml.kernel.org/r/20240321163705.3067592-7-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  20 ++++++---
 include/linux/mm_types.h   |   4 +-
 init/Kconfig               |   4 ++
 mm/kfence/core.c           |  14 +++----
 mm/kfence/kfence.h         |   4 +-
 mm/memcontrol.c            |  56 ++++---------------------
 mm/page_owner.c            |   2 +-
 mm/slab.h                  |  52 ++++++++++++-----------
 mm/slub.c                  | 101 ++++++++++++++++++++++++++++++++++++---------
 9 files changed, 145 insertions(+), 112 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b6264796815d..99f423742324 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -349,8 +349,8 @@ struct mem_cgroup {
 extern struct mem_cgroup *root_mem_cgroup;
 
 enum page_memcg_data_flags {
-	/* page->memcg_data is a pointer to an objcgs vector */
-	MEMCG_DATA_OBJCGS = (1UL << 0),
+	/* page->memcg_data is a pointer to an slabobj_ext vector */
+	MEMCG_DATA_OBJEXTS = (1UL << 0),
 	/* page has been accounted as a non-slab kernel page */
 	MEMCG_DATA_KMEM = (1UL << 1),
 	/* the next bit after the last actual flag */
@@ -388,7 +388,7 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
 	unsigned long memcg_data = folio->memcg_data;
 
 	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
 	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
 
 	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
@@ -409,7 +409,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
 	unsigned long memcg_data = folio->memcg_data;
 
 	VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
 	VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
 
 	return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
@@ -506,7 +506,7 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
 	 */
 	unsigned long memcg_data = READ_ONCE(folio->memcg_data);
 
-	if (memcg_data & MEMCG_DATA_OBJCGS)
+	if (memcg_data & MEMCG_DATA_OBJEXTS)
 		return NULL;
 
 	if (memcg_data & MEMCG_DATA_KMEM) {
@@ -552,7 +552,7 @@ retry:
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
-	VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
+	VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);
 	return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
@@ -1633,6 +1633,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 }
 #endif /* CONFIG_MEMCG */
 
+/*
+ * Extended information for slab objects stored as an array in page->memcg_data
+ * if MEMCG_DATA_OBJEXTS is set.
+ */
+struct slabobj_ext {
+	struct obj_cgroup *objcg;
+} __aligned(8);
+
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
 {
 	__mod_lruvec_kmem_state(p, idx, 1);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5240bd7bca33..4ae4684d1add 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -169,7 +169,7 @@ struct page {
 	/* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
 	atomic_t _refcount;
 
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_SLAB_OBJ_EXT
 	unsigned long memcg_data;
 #endif
 
@@ -331,7 +331,7 @@ struct folio {
 			};
 			atomic_t _mapcount;
 			atomic_t _refcount;
-#ifdef CONFIG_MEMCG
+#ifdef CONFIG_SLAB_OBJ_EXT
 			unsigned long memcg_data;
 #endif
 #if defined(WANT_PAGE_VIRTUAL)
diff --git a/init/Kconfig b/init/Kconfig
index aa02aec6aa7d..10d4a638d9ae 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -929,6 +929,9 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 	  If set, automatic NUMA balancing will be enabled if running on a NUMA
 	  machine.
 
+config SLAB_OBJ_EXT
+	bool
+
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
@@ -962,6 +965,7 @@ config MEMCG
 	bool "Memory controller"
 	select PAGE_COUNTER
 	select EVENTFD
+	select SLAB_OBJ_EXT
 	help
 	  Provides control over the memory footprint of tasks in a cgroup.
 
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 8350f5c06f2e..964b8482275b 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -595,9 +595,9 @@ static unsigned long kfence_init_pool(void)
 			continue;
 
 		__folio_set_slab(slab_folio(slab));
-#ifdef CONFIG_MEMCG
-		slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg |
-				   MEMCG_DATA_OBJCGS;
+#ifdef CONFIG_MEMCG_KMEM
+		slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts |
+				 MEMCG_DATA_OBJEXTS;
 #endif
 	}
 
@@ -645,8 +645,8 @@ reset_slab:
 
 		if (!i || (i % 2))
 			continue;
-#ifdef CONFIG_MEMCG
-		slab->memcg_data = 0;
+#ifdef CONFIG_MEMCG_KMEM
+		slab->obj_exts = 0;
 #endif
 		__folio_clear_slab(slab_folio(slab));
 	}
@@ -1139,8 +1139,8 @@ void __kfence_free(void *addr)
 {
 	struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
 
-#ifdef CONFIG_MEMCG
-	KFENCE_WARN_ON(meta->objcg);
+#ifdef CONFIG_MEMCG_KMEM
+	KFENCE_WARN_ON(meta->obj_exts.objcg);
 #endif
 	/*
 	 * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index f46fbb03062b..084f5f36e8e7 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -97,8 +97,8 @@ struct kfence_metadata {
 	struct kfence_track free_track;
 	/* For updating alloc_covered on frees. */
 	u32 alloc_stack_hash;
-#ifdef CONFIG_MEMCG
-	struct obj_cgroup *objcg;
+#ifdef CONFIG_MEMCG_KMEM
+	struct slabobj_ext obj_exts;
 #endif
 };
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 786c0fae6e30..896b4bf05b9c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2977,13 +2977,6 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-/*
- * The allocated objcg pointers array is not accounted directly.
- * Moreover, it should not come from DMA buffer and is not readily
- * reclaimable. So those GFP bits should be masked off.
- */
-#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
-				 __GFP_ACCOUNT | __GFP_NOFAIL)
 
 /*
  * mod_objcg_mlstate() may be called with irq enabled, so
@@ -3003,62 +2996,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
 	rcu_read_unlock();
 }
 
-int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
-				 gfp_t gfp, bool new_slab)
-{
-	unsigned int objects = objs_per_slab(s, slab);
-	unsigned long memcg_data;
-	void *vec;
-
-	gfp &= ~OBJCGS_CLEAR_MASK;
-	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
-			   slab_nid(slab));
-	if (!vec)
-		return -ENOMEM;
-
-	memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
-	if (new_slab) {
-		/*
-		 * If the slab is brand new and nobody can yet access its
-		 * memcg_data, no synchronization is required and memcg_data can
-		 * be simply assigned.
-		 */
-		slab->memcg_data = memcg_data;
-	} else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
-		/*
-		 * If the slab is already in use, somebody can allocate and
-		 * assign obj_cgroups in parallel. In this case the existing
-		 * objcg vector should be reused.
-		 */
-		kfree(vec);
-		return 0;
-	}
-
-	kmemleak_not_leak(vec);
-	return 0;
-}
-
 static __always_inline
 struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
 {
 	/*
 	 * Slab objects are accounted individually, not per-page.
 	 * Memcg membership data for each individual object is saved in
-	 * slab->memcg_data.
+	 * slab->obj_exts.
 	 */
 	if (folio_test_slab(folio)) {
-		struct obj_cgroup **objcgs;
+		struct slabobj_ext *obj_exts;
 		struct slab *slab;
 		unsigned int off;
 
 		slab = folio_slab(folio);
-		objcgs = slab_objcgs(slab);
-		if (!objcgs)
+		obj_exts = slab_obj_exts(slab);
+		if (!obj_exts)
 			return NULL;
 
 		off = obj_to_index(slab->slab_cache, slab, p);
-		if (objcgs[off])
-			return obj_cgroup_memcg(objcgs[off]);
+		if (obj_exts[off].objcg)
+			return obj_cgroup_memcg(obj_exts[off].objcg);
 
 		return NULL;
 	}
@@ -3066,7 +3024,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
 	/*
 	 * folio_memcg_check() is used here, because in theory we can encounter
 	 * a folio where the slab flag has been cleared already, but
-	 * slab->memcg_data has not been freed yet
+	 * slab->obj_exts has not been freed yet
 	 * folio_memcg_check() will guarantee that a proper memory
 	 * cgroup pointer or NULL will be returned.
 	 */
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 742f432e5bf0..75c23302868a 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -515,7 +515,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret,
 	if (!memcg_data)
 		goto out_unlock;
 
-	if (memcg_data & MEMCG_DATA_OBJCGS)
+	if (memcg_data & MEMCG_DATA_OBJEXTS)
 		ret += scnprintf(kbuf + ret, count - ret,
 				"Slab cache page\n");
 
diff --git a/mm/slab.h b/mm/slab.h
index d2bc9b191222..1c16dc8344fa 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -87,8 +87,8 @@ struct slab {
 	unsigned int __unused;
 
 	atomic_t __page_refcount;
-#ifdef CONFIG_MEMCG
-	unsigned long memcg_data;
+#ifdef CONFIG_SLAB_OBJ_EXT
+	unsigned long obj_exts;
 #endif
 };
 
@@ -97,8 +97,8 @@ struct slab {
 SLAB_MATCH(flags, __page_flags);
 SLAB_MATCH(compound_head, slab_cache);	/* Ensure bit 0 is clear */
 SLAB_MATCH(_refcount, __page_refcount);
-#ifdef CONFIG_MEMCG
-SLAB_MATCH(memcg_data, memcg_data);
+#ifdef CONFIG_SLAB_OBJ_EXT
+SLAB_MATCH(memcg_data, obj_exts);
 #endif
 #undef SLAB_MATCH
 static_assert(sizeof(struct slab) <= sizeof(struct page));
@@ -536,42 +536,44 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
 	return false;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_SLAB_OBJ_EXT
+
 /*
- * slab_objcgs - get the object cgroups vector associated with a slab
+ * slab_obj_exts - get the pointer to the slab object extension vector
+ * associated with a slab.
  * @slab: a pointer to the slab struct
  *
- * Returns a pointer to the object cgroups vector associated with the slab,
+ * Returns a pointer to the object extension vector associated with the slab,
  * or NULL if no such vector has been associated yet.
  */
-static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
 {
-	unsigned long memcg_data = READ_ONCE(slab->memcg_data);
+	unsigned long obj_exts = READ_ONCE(slab->obj_exts);
 
-	VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
+#ifdef CONFIG_MEMCG
+	VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
 							slab_page(slab));
-	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
+	VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
 
-	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+	return (struct slabobj_ext *)(obj_exts & ~MEMCG_DATA_FLAGS_MASK);
+#else
+	return (struct slabobj_ext *)obj_exts;
+#endif
 }
 
-int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
-				 gfp_t gfp, bool new_slab);
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
-		     enum node_stat_item idx, int nr);
-#else /* CONFIG_MEMCG_KMEM */
-static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+#else /* CONFIG_SLAB_OBJ_EXT */
+
+static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
 {
 	return NULL;
 }
 
-static inline int memcg_alloc_slab_cgroups(struct slab *slab,
-					       struct kmem_cache *s, gfp_t gfp,
-					       bool new_slab)
-{
-	return 0;
-}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_SLAB_OBJ_EXT */
+
+#ifdef CONFIG_MEMCG_KMEM
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+		     enum node_stat_item idx, int nr);
+#endif
 
 size_t __ksize(const void *objp);
 
diff --git a/mm/slub.c b/mm/slub.c
index bc9f40889834..5c896c76812d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1871,13 +1871,78 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
 		NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_free_slab_cgroups(struct slab *slab)
+#ifdef CONFIG_SLAB_OBJ_EXT
+
+/*
+ * The allocated objcg pointers array is not accounted directly.
+ * Moreover, it should not come from DMA buffer and is not readily
+ * reclaimable. So those GFP bits should be masked off.
+ */
+#define OBJCGS_CLEAR_MASK	(__GFP_DMA | __GFP_RECLAIMABLE | \
+				__GFP_ACCOUNT | __GFP_NOFAIL)
+
+static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
+			       gfp_t gfp, bool new_slab)
 {
-	kfree(slab_objcgs(slab));
-	slab->memcg_data = 0;
+	unsigned int objects = objs_per_slab(s, slab);
+	unsigned long obj_exts;
+	void *vec;
+
+	gfp &= ~OBJCGS_CLEAR_MASK;
+	vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
+			   slab_nid(slab));
+	if (!vec)
+		return -ENOMEM;
+
+	obj_exts = (unsigned long)vec;
+#ifdef CONFIG_MEMCG
+	obj_exts |= MEMCG_DATA_OBJEXTS;
+#endif
+	if (new_slab) {
+		/*
+		 * If the slab is brand new and nobody can yet access its
+		 * obj_exts, no synchronization is required and obj_exts can
+		 * be simply assigned.
+		 */
+		slab->obj_exts = obj_exts;
+	} else if (cmpxchg(&slab->obj_exts, 0, obj_exts)) {
+		/*
+		 * If the slab is already in use, somebody can allocate and
+		 * assign slabobj_exts in parallel. In this case the existing
+		 * objcg vector should be reused.
+		 */
+		kfree(vec);
+		return 0;
+	}
+
+	kmemleak_not_leak(vec);
+	return 0;
 }
 
+static inline void free_slab_obj_exts(struct slab *slab)
+{
+	struct slabobj_ext *obj_exts;
+
+	obj_exts = slab_obj_exts(slab);
+	if (!obj_exts)
+		return;
+
+	kfree(obj_exts);
+	slab->obj_exts = 0;
+}
+#else /* CONFIG_SLAB_OBJ_EXT */
+static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
+			       gfp_t gfp, bool new_slab)
+{
+	return 0;
+}
+
+static inline void free_slab_obj_exts(struct slab *slab)
+{
+}
+#endif /* CONFIG_SLAB_OBJ_EXT */
+
+#ifdef CONFIG_MEMCG_KMEM
 static inline size_t obj_full_size(struct kmem_cache *s)
 {
 	/*
@@ -1956,15 +2021,15 @@ static void __memcg_slab_post_alloc_hook(struct kmem_cache *s,
 		if (likely(p[i])) {
 			slab = virt_to_slab(p[i]);
 
-			if (!slab_objcgs(slab) &&
-			    memcg_alloc_slab_cgroups(slab, s, flags, false)) {
+			if (!slab_obj_exts(slab) &&
+			    alloc_slab_obj_exts(slab, s, flags, false)) {
 				obj_cgroup_uncharge(objcg, obj_full_size(s));
 				continue;
 			}
 
 			off = obj_to_index(s, slab, p[i]);
 			obj_cgroup_get(objcg);
-			slab_objcgs(slab)[off] = objcg;
+			slab_obj_exts(slab)[off].objcg = objcg;
 			mod_objcg_state(objcg, slab_pgdat(slab),
 					cache_vmstat_idx(s), obj_full_size(s));
 		} else {
@@ -1985,18 +2050,18 @@ void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
 
 static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
 				   void **p, int objects,
-				   struct obj_cgroup **objcgs)
+				   struct slabobj_ext *obj_exts)
 {
 	for (int i = 0; i < objects; i++) {
 		struct obj_cgroup *objcg;
 		unsigned int off;
 
 		off = obj_to_index(s, slab, p[i]);
-		objcg = objcgs[off];
+		objcg = obj_exts[off].objcg;
 		if (!objcg)
 			continue;
 
-		objcgs[off] = NULL;
+		obj_exts[off].objcg = NULL;
 		obj_cgroup_uncharge(objcg, obj_full_size(s));
 		mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
 				-obj_full_size(s));
@@ -2008,16 +2073,16 @@ static __fastpath_inline
 void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p,
 			  int objects)
 {
-	struct obj_cgroup **objcgs;
+	struct slabobj_ext *obj_exts;
 
 	if (!memcg_kmem_online())
 		return;
 
-	objcgs = slab_objcgs(slab);
-	if (likely(!objcgs))
+	obj_exts = slab_obj_exts(slab);
+	if (likely(!obj_exts))
 		return;
 
-	__memcg_slab_free_hook(s, slab, p, objects, objcgs);
+	__memcg_slab_free_hook(s, slab, p, objects, obj_exts);
 }
 
 static inline
@@ -2028,10 +2093,6 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects,
 		obj_cgroup_uncharge(objcg, objects * obj_full_size(s));
 }
 #else /* CONFIG_MEMCG_KMEM */
-static inline void memcg_free_slab_cgroups(struct slab *slab)
-{
-}
-
 static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
 					     struct list_lru *lru,
 					     struct obj_cgroup **objcgp,
@@ -2298,7 +2359,7 @@ static __always_inline void account_slab(struct slab *slab, int order,
 					 struct kmem_cache *s, gfp_t gfp)
 {
 	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
-		memcg_alloc_slab_cgroups(slab, s, gfp, true);
+		alloc_slab_obj_exts(slab, s, gfp, true);
 
 	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
 			    PAGE_SIZE << order);
@@ -2308,7 +2369,7 @@ static __always_inline void unaccount_slab(struct slab *slab, int order,
 					   struct kmem_cache *s)
 {
 	if (memcg_kmem_online())
-		memcg_free_slab_cgroups(slab);
+		free_slab_obj_exts(slab);
 
 	mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
 			    -(PAGE_SIZE << order));
-- 
cgit v1.2.3


From 768c33be1b3166791bd554abd105154719eba43d Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:29 -0700
Subject: mm: introduce __GFP_NO_OBJ_EXT flag to selectively prevent
 slabobj_ext creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce __GFP_NO_OBJ_EXT flag in order to prevent recursive allocations
when allocating slabobj_ext on a slab.

Link: https://lkml.kernel.org/r/20240321163705.3067592-8-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp_types.h | 11 +++++++++++
 mm/slub.c                 |  2 ++
 2 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 13becafe41df..313be4ad79fd 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -54,6 +54,9 @@ enum {
 #endif
 #ifdef CONFIG_LOCKDEP
 	___GFP_NOLOCKDEP_BIT,
+#endif
+#ifdef CONFIG_SLAB_OBJ_EXT
+	___GFP_NO_OBJ_EXT_BIT,
 #endif
 	___GFP_LAST_BIT
 };
@@ -95,6 +98,11 @@ enum {
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
+#ifdef CONFIG_SLAB_OBJ_EXT
+#define ___GFP_NO_OBJ_EXT       BIT(___GFP_NO_OBJ_EXT_BIT)
+#else
+#define ___GFP_NO_OBJ_EXT       0
+#endif
 
 /*
  * Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -135,12 +143,15 @@ enum {
  * node with no fallbacks or placement policy enforcements.
  *
  * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ *
+ * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension.
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
 #define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
+#define __GFP_NO_OBJ_EXT   ((__force gfp_t)___GFP_NO_OBJ_EXT)
 
 /**
  * DOC: Watermark modifiers
diff --git a/mm/slub.c b/mm/slub.c
index 5c896c76812d..2cb53642a091 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1889,6 +1889,8 @@ static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 	void *vec;
 
 	gfp &= ~OBJCGS_CLEAR_MASK;
+	/* Prevent recursive extension vector allocation */
+	gfp |= __GFP_NO_OBJ_EXT;
 	vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
 			   slab_nid(slab));
 	if (!vec)
-- 
cgit v1.2.3


From 45012241ec5d5870d986e66c8ce20a0f6212e6f8 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:30 -0700
Subject: mm/slab: introduce SLAB_NO_OBJ_EXT to avoid obj_ext creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Slab extension objects can't be allocated before slab infrastructure is
initialized.  Some caches, like kmem_cache and kmem_cache_node, are
created before slab infrastructure is initialized.  Objects from these
caches can't have extension objects.  Introduce SLAB_NO_OBJ_EXT slab flag
to mark these caches and avoid creating extensions for objects allocated
from these slabs.

Link: https://lkml.kernel.org/r/20240321163705.3067592-9-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/slab.h | 10 ++++++++++
 mm/slub.c            |  5 +++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index e53cbfa18325..68ff754b85a4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -56,6 +56,9 @@ enum _slab_flag_bits {
 #endif
 	_SLAB_OBJECT_POISON,
 	_SLAB_CMPXCHG_DOUBLE,
+#ifdef CONFIG_SLAB_OBJ_EXT
+	_SLAB_NO_OBJ_EXT,
+#endif
 	_SLAB_FLAGS_LAST_BIT
 };
 
@@ -202,6 +205,13 @@ enum _slab_flag_bits {
 #endif
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
 
+/* Slab created using create_boot_cache */
+#ifdef CONFIG_SLAB_OBJ_EXT
+#define SLAB_NO_OBJ_EXT		__SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
+#else
+#define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
+#endif
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
diff --git a/mm/slub.c b/mm/slub.c
index 2cb53642a091..666dcc3b8a26 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5693,7 +5693,8 @@ void __init kmem_cache_init(void)
 		node_set(node, slab_nodes);
 
 	create_boot_cache(kmem_cache_node, "kmem_cache_node",
-		sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
+			sizeof(struct kmem_cache_node),
+			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
 
 	hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
 
@@ -5703,7 +5704,7 @@ void __init kmem_cache_init(void)
 	create_boot_cache(kmem_cache, "kmem_cache",
 			offsetof(struct kmem_cache, node) +
 				nr_node_ids * sizeof(struct kmem_cache_node *),
-		       SLAB_HWCACHE_ALIGN, 0, 0);
+			SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
 
 	kmem_cache = bootstrap(&boot_kmem_cache);
 	kmem_cache_node = bootstrap(&boot_kmem_cache_node);
-- 
cgit v1.2.3


From 53ce720359b8f4b531c5b8bd7fa06d74bca45f0a Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:31 -0700
Subject: slab: objext: introduce objext_flags as extension to
 page_memcg_data_flags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce objext_flags to store additional objext flags unrelated to memcg.

Link: https://lkml.kernel.org/r/20240321163705.3067592-10-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Pasha Tatashin <pasha.tatashin@soleen.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 29 ++++++++++++++++++++++-------
 mm/slab.h                  |  5 +----
 2 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 99f423742324..12afc2647cf0 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -357,7 +357,22 @@ enum page_memcg_data_flags {
 	__NR_MEMCG_DATA_FLAGS  = (1UL << 2),
 };
 
-#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1)
+#define __FIRST_OBJEXT_FLAG	__NR_MEMCG_DATA_FLAGS
+
+#else /* CONFIG_MEMCG */
+
+#define __FIRST_OBJEXT_FLAG	(1UL << 0)
+
+#endif /* CONFIG_MEMCG */
+
+enum objext_flags {
+	/* the next bit after the last actual flag */
+	__NR_OBJEXTS_FLAGS  = __FIRST_OBJEXT_FLAG,
+};
+
+#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
+
+#ifdef CONFIG_MEMCG
 
 static inline bool folio_memcg_kmem(struct folio *folio);
 
@@ -391,7 +406,7 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
 	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
 	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
 
-	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+	return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 }
 
 /*
@@ -412,7 +427,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
 	VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
 	VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
 
-	return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+	return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 }
 
 /*
@@ -469,11 +484,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
 	if (memcg_data & MEMCG_DATA_KMEM) {
 		struct obj_cgroup *objcg;
 
-		objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+		objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 		return obj_cgroup_memcg(objcg);
 	}
 
-	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+	return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 }
 
 /*
@@ -512,11 +527,11 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
 	if (memcg_data & MEMCG_DATA_KMEM) {
 		struct obj_cgroup *objcg;
 
-		objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+		objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 		return obj_cgroup_memcg(objcg);
 	}
 
-	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+	return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
 }
 
 static inline struct mem_cgroup *page_memcg_check(struct page *page)
diff --git a/mm/slab.h b/mm/slab.h
index 1c16dc8344fa..65db525e93af 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -554,11 +554,8 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
 	VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS),
 							slab_page(slab));
 	VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
-
-	return (struct slabobj_ext *)(obj_exts & ~MEMCG_DATA_FLAGS_MASK);
-#else
-	return (struct slabobj_ext *)obj_exts;
 #endif
+	return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
 }
 
 #else /* CONFIG_SLAB_OBJ_EXT */
-- 
cgit v1.2.3


From 916cc5167cc6f81651f4460d9a35c939ad18ecff Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:32 -0700
Subject: lib: code tagging framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add basic infrastructure to support code tagging which stores tag common
information consisting of the module name, function, file name and line
number.  Provide functions to register a new code tag type and navigate
between code tags.

Link: https://lkml.kernel.org/r/20240321163705.3067592-11-surenb@google.com
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/codetag.h |  68 +++++++++++++++
 lib/Kconfig.debug       |   4 +
 lib/Makefile            |   1 +
 lib/codetag.c           | 219 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+)
 create mode 100644 include/linux/codetag.h
 create mode 100644 lib/codetag.c

(limited to 'include')

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
new file mode 100644
index 000000000000..7734269cdb63
--- /dev/null
+++ b/include/linux/codetag.h
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * code tagging framework
+ */
+#ifndef _LINUX_CODETAG_H
+#define _LINUX_CODETAG_H
+
+#include <linux/types.h>
+
+struct codetag_iterator;
+struct codetag_type;
+struct codetag_module;
+struct seq_buf;
+struct module;
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * code location being tagged.  At runtime, the special section is treated as
+ * an array of these.
+ */
+struct codetag {
+	unsigned int flags; /* used in later patches */
+	unsigned int lineno;
+	const char *modname;
+	const char *function;
+	const char *filename;
+} __aligned(8);
+
+union codetag_ref {
+	struct codetag *ct;
+};
+
+struct codetag_type_desc {
+	const char *section;
+	size_t tag_size;
+};
+
+struct codetag_iterator {
+	struct codetag_type *cttype;
+	struct codetag_module *cmod;
+	unsigned long mod_id;
+	struct codetag *ct;
+};
+
+#ifdef MODULE
+#define CT_MODULE_NAME KBUILD_MODNAME
+#else
+#define CT_MODULE_NAME NULL
+#endif
+
+#define CODE_TAG_INIT {					\
+	.modname	= CT_MODULE_NAME,		\
+	.function	= __func__,			\
+	.filename	= __FILE__,			\
+	.lineno		= __LINE__,			\
+	.flags		= 0,				\
+}
+
+void codetag_lock_module_list(struct codetag_type *cttype, bool lock);
+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
+struct codetag *codetag_next_ct(struct codetag_iterator *iter);
+
+void codetag_to_text(struct seq_buf *out, struct codetag *ct);
+
+struct codetag_type *
+codetag_register_type(const struct codetag_type_desc *desc);
+
+#endif /* _LINUX_CODETAG_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c63a5fbf1f1c..015fc6ee9849 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -968,6 +968,10 @@ config DEBUG_STACKOVERFLOW
 
 	  If in doubt, say "N".
 
+config CODE_TAGGING
+	bool
+	select KALLSYMS
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 source "lib/Kconfig.kmsan"
diff --git a/lib/Makefile b/lib/Makefile
index ffc6b2341b45..910335da8f13 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -233,6 +233,7 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
 	of-reconfig-notifier-error-inject.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
+obj-$(CONFIG_CODE_TAGGING) += codetag.o
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
diff --git a/lib/codetag.c b/lib/codetag.c
new file mode 100644
index 000000000000..8b5b89ad508d
--- /dev/null
+++ b/lib/codetag.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/codetag.h>
+#include <linux/idr.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+#include <linux/seq_buf.h>
+#include <linux/slab.h>
+
+struct codetag_type {
+	struct list_head link;
+	unsigned int count;
+	struct idr mod_idr;
+	struct rw_semaphore mod_lock; /* protects mod_idr */
+	struct codetag_type_desc desc;
+};
+
+struct codetag_range {
+	struct codetag *start;
+	struct codetag *stop;
+};
+
+struct codetag_module {
+	struct module *mod;
+	struct codetag_range range;
+};
+
+static DEFINE_MUTEX(codetag_lock);
+static LIST_HEAD(codetag_types);
+
+void codetag_lock_module_list(struct codetag_type *cttype, bool lock)
+{
+	if (lock)
+		down_read(&cttype->mod_lock);
+	else
+		up_read(&cttype->mod_lock);
+}
+
+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
+{
+	struct codetag_iterator iter = {
+		.cttype = cttype,
+		.cmod = NULL,
+		.mod_id = 0,
+		.ct = NULL,
+	};
+
+	return iter;
+}
+
+static inline struct codetag *get_first_module_ct(struct codetag_module *cmod)
+{
+	return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL;
+}
+
+static inline
+struct codetag *get_next_module_ct(struct codetag_iterator *iter)
+{
+	struct codetag *res = (struct codetag *)
+			((char *)iter->ct + iter->cttype->desc.tag_size);
+
+	return res < iter->cmod->range.stop ? res : NULL;
+}
+
+struct codetag *codetag_next_ct(struct codetag_iterator *iter)
+{
+	struct codetag_type *cttype = iter->cttype;
+	struct codetag_module *cmod;
+	struct codetag *ct;
+
+	lockdep_assert_held(&cttype->mod_lock);
+
+	if (unlikely(idr_is_empty(&cttype->mod_idr)))
+		return NULL;
+
+	ct = NULL;
+	while (true) {
+		cmod = idr_find(&cttype->mod_idr, iter->mod_id);
+
+		/* If module was removed move to the next one */
+		if (!cmod)
+			cmod = idr_get_next_ul(&cttype->mod_idr,
+					       &iter->mod_id);
+
+		/* Exit if no more modules */
+		if (!cmod)
+			break;
+
+		if (cmod != iter->cmod) {
+			iter->cmod = cmod;
+			ct = get_first_module_ct(cmod);
+		} else
+			ct = get_next_module_ct(iter);
+
+		if (ct)
+			break;
+
+		iter->mod_id++;
+	}
+
+	iter->ct = ct;
+	return ct;
+}
+
+void codetag_to_text(struct seq_buf *out, struct codetag *ct)
+{
+	if (ct->modname)
+		seq_buf_printf(out, "%s:%u [%s] func:%s",
+			       ct->filename, ct->lineno,
+			       ct->modname, ct->function);
+	else
+		seq_buf_printf(out, "%s:%u func:%s",
+			       ct->filename, ct->lineno, ct->function);
+}
+
+static inline size_t range_size(const struct codetag_type *cttype,
+				const struct codetag_range *range)
+{
+	return ((char *)range->stop - (char *)range->start) /
+			cttype->desc.tag_size;
+}
+
+#ifdef CONFIG_MODULES
+static void *get_symbol(struct module *mod, const char *prefix, const char *name)
+{
+	DECLARE_SEQ_BUF(sb, KSYM_NAME_LEN);
+	const char *buf;
+
+	seq_buf_printf(&sb, "%s%s", prefix, name);
+	if (seq_buf_has_overflowed(&sb))
+		return NULL;
+
+	buf = seq_buf_str(&sb);
+	return mod ?
+		(void *)find_kallsyms_symbol_value(mod, buf) :
+		(void *)kallsyms_lookup_name(buf);
+}
+
+static struct codetag_range get_section_range(struct module *mod,
+					      const char *section)
+{
+	return (struct codetag_range) {
+		get_symbol(mod, "__start_", section),
+		get_symbol(mod, "__stop_", section),
+	};
+}
+
+static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
+{
+	struct codetag_range range;
+	struct codetag_module *cmod;
+	int err;
+
+	range = get_section_range(mod, cttype->desc.section);
+	if (!range.start || !range.stop) {
+		pr_warn("Failed to load code tags of type %s from the module %s\n",
+			cttype->desc.section,
+			mod ? mod->name : "(built-in)");
+		return -EINVAL;
+	}
+
+	/* Ignore empty ranges */
+	if (range.start == range.stop)
+		return 0;
+
+	BUG_ON(range.start > range.stop);
+
+	cmod = kmalloc(sizeof(*cmod), GFP_KERNEL);
+	if (unlikely(!cmod))
+		return -ENOMEM;
+
+	cmod->mod = mod;
+	cmod->range = range;
+
+	down_write(&cttype->mod_lock);
+	err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
+	if (err >= 0)
+		cttype->count += range_size(cttype, &range);
+	up_write(&cttype->mod_lock);
+
+	if (err < 0) {
+		kfree(cmod);
+		return err;
+	}
+
+	return 0;
+}
+
+#else /* CONFIG_MODULES */
+static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { return 0; }
+#endif /* CONFIG_MODULES */
+
+struct codetag_type *
+codetag_register_type(const struct codetag_type_desc *desc)
+{
+	struct codetag_type *cttype;
+	int err;
+
+	BUG_ON(desc->tag_size <= 0);
+
+	cttype = kzalloc(sizeof(*cttype), GFP_KERNEL);
+	if (unlikely(!cttype))
+		return ERR_PTR(-ENOMEM);
+
+	cttype->desc = *desc;
+	idr_init(&cttype->mod_idr);
+	init_rwsem(&cttype->mod_lock);
+
+	err = codetag_module_init(cttype, NULL);
+	if (unlikely(err)) {
+		kfree(cttype);
+		return ERR_PTR(err);
+	}
+
+	mutex_lock(&codetag_lock);
+	list_add_tail(&cttype->link, &codetag_types);
+	mutex_unlock(&codetag_lock);
+
+	return cttype;
+}
-- 
cgit v1.2.3


From a473573964e51dcb6efc182f773cd3924be4a184 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:33 -0700
Subject: lib: code tagging module support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add support for code tagging from dynamically loaded modules.

Link: https://lkml.kernel.org/r/20240321163705.3067592-12-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/codetag.h | 12 ++++++++++
 kernel/module/main.c    |  4 ++++
 lib/codetag.c           | 58 +++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 72 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 7734269cdb63..c44f5b83f24d 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -33,6 +33,10 @@ union codetag_ref {
 struct codetag_type_desc {
 	const char *section;
 	size_t tag_size;
+	void (*module_load)(struct codetag_type *cttype,
+			    struct codetag_module *cmod);
+	void (*module_unload)(struct codetag_type *cttype,
+			      struct codetag_module *cmod);
 };
 
 struct codetag_iterator {
@@ -65,4 +69,12 @@ void codetag_to_text(struct seq_buf *out, struct codetag *ct);
 struct codetag_type *
 codetag_register_type(const struct codetag_type_desc *desc);
 
+#if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES)
+void codetag_load_module(struct module *mod);
+void codetag_unload_module(struct module *mod);
+#else
+static inline void codetag_load_module(struct module *mod) {}
+static inline void codetag_unload_module(struct module *mod) {}
+#endif
+
 #endif /* _LINUX_CODETAG_H */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index e1e8a7a9d6c1..ffa6b3e9cb43 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -56,6 +56,7 @@
 #include <linux/dynamic_debug.h>
 #include <linux/audit.h>
 #include <linux/cfi.h>
+#include <linux/codetag.h>
 #include <linux/debugfs.h>
 #include <uapi/linux/module.h>
 #include "internal.h"
@@ -1242,6 +1243,7 @@ static void free_module(struct module *mod)
 {
 	trace_module_free(mod);
 
+	codetag_unload_module(mod);
 	mod_sysfs_teardown(mod);
 
 	/*
@@ -2995,6 +2997,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	/* Get rid of temporary copy. */
 	free_copy(info, flags);
 
+	codetag_load_module(mod);
+
 	/* Done! */
 	trace_module_load(mod);
 
diff --git a/lib/codetag.c b/lib/codetag.c
index 8b5b89ad508d..54d2828eba25 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -124,15 +124,20 @@ static void *get_symbol(struct module *mod, const char *prefix, const char *name
 {
 	DECLARE_SEQ_BUF(sb, KSYM_NAME_LEN);
 	const char *buf;
+	void *ret;
 
 	seq_buf_printf(&sb, "%s%s", prefix, name);
 	if (seq_buf_has_overflowed(&sb))
 		return NULL;
 
 	buf = seq_buf_str(&sb);
-	return mod ?
+	preempt_disable();
+	ret = mod ?
 		(void *)find_kallsyms_symbol_value(mod, buf) :
 		(void *)kallsyms_lookup_name(buf);
+	preempt_enable();
+
+	return ret;
 }
 
 static struct codetag_range get_section_range(struct module *mod,
@@ -173,8 +178,11 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 
 	down_write(&cttype->mod_lock);
 	err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
-	if (err >= 0)
+	if (err >= 0) {
 		cttype->count += range_size(cttype, &range);
+		if (cttype->desc.module_load)
+			cttype->desc.module_load(cttype, cmod);
+	}
 	up_write(&cttype->mod_lock);
 
 	if (err < 0) {
@@ -185,6 +193,52 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 	return 0;
 }
 
+void codetag_load_module(struct module *mod)
+{
+	struct codetag_type *cttype;
+
+	if (!mod)
+		return;
+
+	mutex_lock(&codetag_lock);
+	list_for_each_entry(cttype, &codetag_types, link)
+		codetag_module_init(cttype, mod);
+	mutex_unlock(&codetag_lock);
+}
+
+void codetag_unload_module(struct module *mod)
+{
+	struct codetag_type *cttype;
+
+	if (!mod)
+		return;
+
+	mutex_lock(&codetag_lock);
+	list_for_each_entry(cttype, &codetag_types, link) {
+		struct codetag_module *found = NULL;
+		struct codetag_module *cmod;
+		unsigned long mod_id, tmp;
+
+		down_write(&cttype->mod_lock);
+		idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) {
+			if (cmod->mod && cmod->mod == mod) {
+				found = cmod;
+				break;
+			}
+		}
+		if (found) {
+			if (cttype->desc.module_unload)
+				cttype->desc.module_unload(cttype, cmod);
+
+			cttype->count -= range_size(cttype, &cmod->range);
+			idr_remove(&cttype->mod_idr, mod_id);
+			kfree(cmod);
+		}
+		up_write(&cttype->mod_lock);
+	}
+	mutex_unlock(&codetag_lock);
+}
+
 #else /* CONFIG_MODULES */
 static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { return 0; }
 #endif /* CONFIG_MODULES */
-- 
cgit v1.2.3


From 47a92dfbe01f41bcbf359250ccb3caa589763abf Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:34 -0700
Subject: lib: prevent module unloading if memory is not freed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skip freeing module's data section if there are non-zero allocation tags
because otherwise, once these allocations are freed, the access to their
code tag would cause UAF.

Link: https://lkml.kernel.org/r/20240321163705.3067592-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/codetag.h |  6 +++---
 kernel/module/main.c    | 27 +++++++++++++++++++--------
 lib/codetag.c           | 11 ++++++++---
 3 files changed, 30 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index c44f5b83f24d..bfd0ba5c4185 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -35,7 +35,7 @@ struct codetag_type_desc {
 	size_t tag_size;
 	void (*module_load)(struct codetag_type *cttype,
 			    struct codetag_module *cmod);
-	void (*module_unload)(struct codetag_type *cttype,
+	bool (*module_unload)(struct codetag_type *cttype,
 			      struct codetag_module *cmod);
 };
 
@@ -71,10 +71,10 @@ codetag_register_type(const struct codetag_type_desc *desc);
 
 #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES)
 void codetag_load_module(struct module *mod);
-void codetag_unload_module(struct module *mod);
+bool codetag_unload_module(struct module *mod);
 #else
 static inline void codetag_load_module(struct module *mod) {}
-static inline void codetag_unload_module(struct module *mod) {}
+static inline bool codetag_unload_module(struct module *mod) { return true; }
 #endif
 
 #endif /* _LINUX_CODETAG_H */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index ffa6b3e9cb43..2d25eebc549d 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1211,15 +1211,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
 	return module_alloc(size);
 }
 
-static void module_memory_free(void *ptr, enum mod_mem_type type)
+static void module_memory_free(void *ptr, enum mod_mem_type type,
+			       bool unload_codetags)
 {
+	if (!unload_codetags && mod_mem_type_is_core_data(type))
+		return;
+
 	if (mod_mem_use_vmalloc(type))
 		vfree(ptr);
 	else
 		module_memfree(ptr);
 }
 
-static void free_mod_mem(struct module *mod)
+static void free_mod_mem(struct module *mod, bool unload_codetags)
 {
 	for_each_mod_mem_type(type) {
 		struct module_memory *mod_mem = &mod->mem[type];
@@ -1230,20 +1234,27 @@ static void free_mod_mem(struct module *mod)
 		/* Free lock-classes; relies on the preceding sync_rcu(). */
 		lockdep_free_key_range(mod_mem->base, mod_mem->size);
 		if (mod_mem->size)
-			module_memory_free(mod_mem->base, type);
+			module_memory_free(mod_mem->base, type,
+					   unload_codetags);
 	}
 
 	/* MOD_DATA hosts mod, so free it at last */
 	lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
-	module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+	module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags);
 }
 
 /* Free a module, remove from lists, etc. */
 static void free_module(struct module *mod)
 {
+	bool unload_codetags;
+
 	trace_module_free(mod);
 
-	codetag_unload_module(mod);
+	unload_codetags = codetag_unload_module(mod);
+	if (!unload_codetags)
+		pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n",
+			mod->name);
+
 	mod_sysfs_teardown(mod);
 
 	/*
@@ -1285,7 +1296,7 @@ static void free_module(struct module *mod)
 	kfree(mod->args);
 	percpu_modfree(mod);
 
-	free_mod_mem(mod);
+	free_mod_mem(mod, unload_codetags);
 }
 
 void *__symbol_get(const char *symbol)
@@ -2298,7 +2309,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	return 0;
 out_enomem:
 	for (t--; t >= 0; t--)
-		module_memory_free(mod->mem[t].base, t);
+		module_memory_free(mod->mem[t].base, t, true);
 	return ret;
 }
 
@@ -2428,7 +2439,7 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
 
-	free_mod_mem(mod);
+	free_mod_mem(mod, true);
 }
 
 int __weak module_finalize(const Elf_Ehdr *hdr,
diff --git a/lib/codetag.c b/lib/codetag.c
index 54d2828eba25..408062f722ce 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -5,6 +5,7 @@
 #include <linux/module.h>
 #include <linux/seq_buf.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 struct codetag_type {
 	struct list_head link;
@@ -206,12 +207,13 @@ void codetag_load_module(struct module *mod)
 	mutex_unlock(&codetag_lock);
 }
 
-void codetag_unload_module(struct module *mod)
+bool codetag_unload_module(struct module *mod)
 {
 	struct codetag_type *cttype;
+	bool unload_ok = true;
 
 	if (!mod)
-		return;
+		return true;
 
 	mutex_lock(&codetag_lock);
 	list_for_each_entry(cttype, &codetag_types, link) {
@@ -228,7 +230,8 @@ void codetag_unload_module(struct module *mod)
 		}
 		if (found) {
 			if (cttype->desc.module_unload)
-				cttype->desc.module_unload(cttype, cmod);
+				if (!cttype->desc.module_unload(cttype, cmod))
+					unload_ok = false;
 
 			cttype->count -= range_size(cttype, &cmod->range);
 			idr_remove(&cttype->mod_idr, mod_id);
@@ -237,6 +240,8 @@ void codetag_unload_module(struct module *mod)
 		up_write(&cttype->mod_lock);
 	}
 	mutex_unlock(&codetag_lock);
+
+	return unload_ok;
 }
 
 #else /* CONFIG_MODULES */
-- 
cgit v1.2.3


From 22d407b164ff79de42d21f37d99f9ee7abdd51c8 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:35 -0700
Subject: lib: add allocation tagging support for memory allocation profiling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
instrument memory allocators.  It registers an "alloc_tags" codetag type
with /proc/allocinfo interface to output allocation tag information when
the feature is enabled.

CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
allocation profiling instrumentation.

Memory allocation profiling can be enabled or disabled at runtime using
/proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
profiling by default.

[surenb@google.com: Documentation/filesystems/proc.rst: fix allocinfo title]
  Link: https://lkml.kernel.org/r/20240326073813.727090-1-surenb@google.com
[surenb@google.com: do limited memory accounting for modules with ARCH_NEEDS_WEAK_PER_CPU]
  Link: https://lkml.kernel.org/r/20240402180933.1663992-2-surenb@google.com
[klarasmodin@gmail.com: explicitly include irqflags.h in alloc_tag.h]
  Link: https://lkml.kernel.org/r/20240407133252.173636-1-klarasmodin@gmail.com
[surenb@google.com: fix alloc_tag_init() to prevent passing NULL to PTR_ERR()]
  Link: https://lkml.kernel.org/r/20240417003349.2520094-1-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-14-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Klara Modin <klarasmodin@gmail.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/sysctl/vm.rst |  16 ++++
 Documentation/filesystems/proc.rst      |  29 ++++++
 include/asm-generic/codetag.lds.h       |  14 +++
 include/asm-generic/vmlinux.lds.h       |   3 +
 include/linux/alloc_tag.h               | 156 ++++++++++++++++++++++++++++++++
 include/linux/sched.h                   |  24 +++++
 lib/Kconfig.debug                       |  25 +++++
 lib/Makefile                            |   2 +
 lib/alloc_tag.c                         | 152 +++++++++++++++++++++++++++++++
 scripts/module.lds.S                    |   7 ++
 10 files changed, 428 insertions(+)
 create mode 100644 include/asm-generic/codetag.lds.h
 create mode 100644 include/linux/alloc_tag.h
 create mode 100644 lib/alloc_tag.c

(limited to 'include')

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index c59889de122b..e86c968a7a0e 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
 - memory_failure_early_kill
 - memory_failure_recovery
 - min_free_kbytes
@@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
 The default value is 65530.
 
 
+mem_profiling
+==============
+
+Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
+
+1: Enable memory profiling.
+
+0: Disable memory profiling.
+
+Enabling memory profiling introduces a small performance overhead for all
+memory allocations.
+
+The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
+
+
 memory_failure_early_kill:
 ==========================
 
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index c6a6b9df2104..245269dd6e02 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -688,6 +688,7 @@ files are there, and which are missing.
  ============ ===============================================================
  File         Content
  ============ ===============================================================
+ allocinfo    Memory allocations profiling information
  apm          Advanced power management info
  bootconfig   Kernel command line obtained from boot config,
  	      and, if there were kernel parameters from the
@@ -953,6 +954,34 @@ also be allocatable although a lot of filesystem metadata may have to be
 reclaimed to achieve this.
 
 
+allocinfo
+~~~~~~~~~
+
+Provides information about memory allocations at all locations in the code
+base. Each allocation in the code is identified by its source file, line
+number, module (if originates from a loadable module) and the function calling
+the allocation. The number of bytes allocated and number of calls at each
+location are reported.
+
+Example output.
+
+::
+
+    > sort -rn /proc/allocinfo
+   127664128    31168 mm/page_ext.c:270 func:alloc_page_ext
+    56373248     4737 mm/slub.c:2259 func:alloc_slab_page
+    14880768     3633 mm/readahead.c:247 func:page_cache_ra_unbounded
+    14417920     3520 mm/mm_init.c:2530 func:alloc_large_system_hash
+    13377536      234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs
+    11718656     2861 mm/filemap.c:1919 func:__filemap_get_folio
+     9192960     2800 kernel/fork.c:307 func:alloc_thread_stack_node
+     4206592        4 net/netfilter/nf_conntrack_core.c:2567 func:nf_ct_alloc_hashtable
+     4136960     1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod] func:ctagmod_start
+     3940352      962 mm/memory.c:4214 func:alloc_anon_folio
+     2894464    22613 fs/kernfs/dir.c:615 func:__kernfs_new_node
+     ...
+
+
 meminfo
 ~~~~~~~
 
diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
new file mode 100644
index 000000000000..64f536b80380
--- /dev/null
+++ b/include/asm-generic/codetag.lds.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_CODETAG_LDS_H
+#define __ASM_GENERIC_CODETAG_LDS_H
+
+#define SECTION_WITH_BOUNDARIES(_name)	\
+	. = ALIGN(8);			\
+	__start_##_name = .;		\
+	KEEP(*(_name))			\
+	__stop_##_name = .;
+
+#define CODETAG_SECTIONS()		\
+	SECTION_WITH_BOUNDARIES(alloc_tags)
+
+#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f7749d0f2562..3e4497b5135a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -50,6 +50,8 @@
  *               [__nosave_begin, __nosave_end] for the nosave data
  */
 
+#include <asm-generic/codetag.lds.h>
+
 #ifndef LOAD_OFFSET
 #define LOAD_OFFSET 0
 #endif
@@ -366,6 +368,7 @@
 	. = ALIGN(8);							\
 	BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)		\
 	BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)				\
+	CODETAG_SECTIONS()						\
 	LIKELY_PROFILE()		       				\
 	BRANCH_PROFILE()						\
 	TRACE_PRINTKS()							\
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
new file mode 100644
index 000000000000..13561223fdab
--- /dev/null
+++ b/include/linux/alloc_tag.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * allocation tagging
+ */
+#ifndef _LINUX_ALLOC_TAG_H
+#define _LINUX_ALLOC_TAG_H
+
+#include <linux/bug.h>
+#include <linux/codetag.h>
+#include <linux/container_of.h>
+#include <linux/preempt.h>
+#include <asm/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/static_key.h>
+#include <linux/irqflags.h>
+
+struct alloc_tag_counters {
+	u64 bytes;
+	u64 calls;
+};
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * allocation callsite. At runtime, the special section is treated as
+ * an array of these. Embedded codetag utilizes codetag framework.
+ */
+struct alloc_tag {
+	struct codetag			ct;
+	struct alloc_tag_counters __percpu	*counters;
+} __aligned(8);
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
+{
+	return container_of(ct, struct alloc_tag, ct);
+}
+
+#ifdef ARCH_NEEDS_WEAK_PER_CPU
+/*
+ * When percpu variables are required to be defined as weak, static percpu
+ * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
+ * Instead we will accound all module allocations to a single counter.
+ */
+DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)						\
+	static struct alloc_tag _alloc_tag __used __aligned(8)			\
+	__section("alloc_tags") = {						\
+		.ct = CODE_TAG_INIT,						\
+		.counters = &_shared_alloc_tag };
+
+#else /* ARCH_NEEDS_WEAK_PER_CPU */
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)						\
+	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);	\
+	static struct alloc_tag _alloc_tag __used __aligned(8)			\
+	__section("alloc_tags") = {						\
+		.ct = CODE_TAG_INIT,						\
+		.counters = &_alloc_tag_cntr };
+
+#endif /* ARCH_NEEDS_WEAK_PER_CPU */
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+			mem_alloc_profiling_key);
+
+static inline bool mem_alloc_profiling_enabled(void)
+{
+	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+				   &mem_alloc_profiling_key);
+}
+
+static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
+{
+	struct alloc_tag_counters v = { 0, 0 };
+	struct alloc_tag_counters *counter;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		counter = per_cpu_ptr(tag->counters, cpu);
+		v.bytes += counter->bytes;
+		v.calls += counter->calls;
+	}
+
+	return v;
+}
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag)
+{
+	WARN_ONCE(ref && ref->ct,
+		  "alloc_tag was not cleared (got tag for %s:%u)\n",
+		  ref->ct->filename, ref->ct->lineno);
+
+	WARN_ONCE(!tag, "current->alloc_tag not set");
+}
+
+static inline void alloc_tag_sub_check(union codetag_ref *ref)
+{
+	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
+}
+#else
+static inline void alloc_tag_add_check(union codetag_ref *ref, struct alloc_tag *tag) {}
+static inline void alloc_tag_sub_check(union codetag_ref *ref) {}
+#endif
+
+/* Caller should verify both ref and tag to be valid */
+static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+	ref->ct = &tag->ct;
+	/*
+	 * We need in increment the call counter every time we have a new
+	 * allocation or when we split a large allocation into smaller ones.
+	 * Each new reference for every sub-allocation needs to increment call
+	 * counter because when we free each part the counter will be decremented.
+	 */
+	this_cpu_inc(tag->counters->calls);
+}
+
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
+{
+	alloc_tag_add_check(ref, tag);
+	if (!ref || !tag)
+		return;
+
+	__alloc_tag_ref_set(ref, tag);
+	this_cpu_add(tag->counters->bytes, bytes);
+}
+
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+	struct alloc_tag *tag;
+
+	alloc_tag_sub_check(ref);
+	if (!ref || !ref->ct)
+		return;
+
+	tag = ct_to_alloc_tag(ref->ct);
+
+	this_cpu_sub(tag->counters->bytes, bytes);
+	this_cpu_dec(tag->counters->calls);
+
+	ref->ct = NULL;
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)
+static inline bool mem_alloc_profiling_enabled(void) { return false; }
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
+				 size_t bytes) {}
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
+#endif /* _LINUX_ALLOC_TAG_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c2abbc587b4..4118b3f959c3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -770,6 +770,10 @@ struct task_struct {
 	unsigned int			flags;
 	unsigned int			ptrace;
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	struct alloc_tag		*alloc_tag;
+#endif
+
 #ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
@@ -810,6 +814,7 @@ struct task_struct {
 	struct task_group		*sched_task_group;
 #endif
 
+
 #ifdef CONFIG_UCLAMP_TASK
 	/*
 	 * Clamp values requested for a scheduling entity.
@@ -2187,4 +2192,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
+{
+	swap(current->alloc_tag, tag);
+	return tag;
+}
+
+static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
+{
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
+#endif
+	current->alloc_tag = old;
+}
+#else
+#define alloc_tag_save(_tag)			NULL
+#define alloc_tag_restore(_tag, _old)		do {} while (0)
+#endif
+
 #endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 015fc6ee9849..fa7aa32ba11a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -972,6 +972,31 @@ config CODE_TAGGING
 	bool
 	select KALLSYMS
 
+config MEM_ALLOC_PROFILING
+	bool "Enable memory allocation profiling"
+	default n
+	depends on PROC_FS
+	depends on !DEBUG_FORCE_WEAK_PER_CPU
+	select CODE_TAGGING
+	help
+	  Track allocation source code and record total allocation size
+	  initiated at that code location. The mechanism can be used to track
+	  memory leaks with a low performance and memory impact.
+
+config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+	bool "Enable memory allocation profiling by default"
+	default y
+	depends on MEM_ALLOC_PROFILING
+
+config MEM_ALLOC_PROFILING_DEBUG
+	bool "Memory allocation profiler debugging"
+	default n
+	depends on MEM_ALLOC_PROFILING
+	select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+	help
+	  Adds warnings with helpful error messages for memory allocation
+	  profiling.
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 source "lib/Kconfig.kmsan"
diff --git a/lib/Makefile b/lib/Makefile
index 910335da8f13..2f4e17bfb299 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -234,6 +234,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_CODE_TAGGING) += codetag.o
+obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
+
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
new file mode 100644
index 000000000000..331dd17650f3
--- /dev/null
+++ b/lib/alloc_tag.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/alloc_tag.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+
+static struct codetag_type *alloc_tag_cttype;
+
+DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag);
+EXPORT_SYMBOL(_shared_alloc_tag);
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+			mem_alloc_profiling_key);
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+	struct codetag_iterator *iter;
+	struct codetag *ct;
+	loff_t node = *pos;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	m->private = iter;
+	if (!iter)
+		return NULL;
+
+	codetag_lock_module_list(alloc_tag_cttype, true);
+	*iter = codetag_get_ct_iter(alloc_tag_cttype);
+	while ((ct = codetag_next_ct(iter)) != NULL && node)
+		node--;
+
+	return ct ? iter : NULL;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+	struct codetag *ct = codetag_next_ct(iter);
+
+	(*pos)++;
+	if (!ct)
+		return NULL;
+
+	return iter;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
+
+	if (iter) {
+		codetag_lock_module_list(alloc_tag_cttype, false);
+		kfree(iter);
+	}
+}
+
+static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
+{
+	struct alloc_tag *tag = ct_to_alloc_tag(ct);
+	struct alloc_tag_counters counter = alloc_tag_read(tag);
+	s64 bytes = counter.bytes;
+
+	seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
+	codetag_to_text(out, ct);
+	seq_buf_putc(out, ' ');
+	seq_buf_putc(out, '\n');
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+	char *bufp;
+	size_t n = seq_get_buf(m, &bufp);
+	struct seq_buf buf;
+
+	seq_buf_init(&buf, bufp, n);
+	alloc_tag_to_text(&buf, iter->ct);
+	seq_commit(m, seq_buf_used(&buf));
+	return 0;
+}
+
+static const struct seq_operations allocinfo_seq_op = {
+	.start	= allocinfo_start,
+	.next	= allocinfo_next,
+	.stop	= allocinfo_stop,
+	.show	= allocinfo_show,
+};
+
+static void __init procfs_init(void)
+{
+	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
+}
+
+static bool alloc_tag_module_unload(struct codetag_type *cttype,
+				    struct codetag_module *cmod)
+{
+	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
+	struct alloc_tag_counters counter;
+	bool module_unused = true;
+	struct alloc_tag *tag;
+	struct codetag *ct;
+
+	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
+		if (iter.cmod != cmod)
+			continue;
+
+		tag = ct_to_alloc_tag(ct);
+		counter = alloc_tag_read(tag);
+
+		if (WARN(counter.bytes,
+			 "%s:%u module %s func:%s has %llu allocated at module unload",
+			 ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
+			module_unused = false;
+	}
+
+	return module_unused;
+}
+
+static struct ctl_table memory_allocation_profiling_sysctls[] = {
+	{
+		.procname	= "mem_profiling",
+		.data		= &mem_alloc_profiling_key,
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+		.mode		= 0444,
+#else
+		.mode		= 0644,
+#endif
+		.proc_handler	= proc_do_static_key,
+	},
+	{ }
+};
+
+static int __init alloc_tag_init(void)
+{
+	const struct codetag_type_desc desc = {
+		.section	= "alloc_tags",
+		.tag_size	= sizeof(struct alloc_tag),
+		.module_unload	= alloc_tag_module_unload,
+	};
+
+	alloc_tag_cttype = codetag_register_type(&desc);
+	if (IS_ERR(alloc_tag_cttype))
+		return PTR_ERR(alloc_tag_cttype);
+
+	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
+	procfs_init();
+
+	return 0;
+}
+module_init(alloc_tag_init);
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index bf5bcf2836d8..45c67a0994f3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -9,6 +9,8 @@
 #define DISCARD_EH_FRAME	*(.eh_frame)
 #endif
 
+#include <asm-generic/codetag.lds.h>
+
 SECTIONS {
 	/DISCARD/ : {
 		*(.discard)
@@ -47,12 +49,17 @@ SECTIONS {
 	.data : {
 		*(.data .data.[0-9a-zA-Z_]*)
 		*(.data..L*)
+		CODETAG_SECTIONS()
 	}
 
 	.rodata : {
 		*(.rodata .rodata.[0-9a-zA-Z_]*)
 		*(.rodata..L*)
 	}
+#else
+	.data : {
+		CODETAG_SECTIONS()
+	}
 #endif
 }
 
-- 
cgit v1.2.3


From dcfe378c81f72f146890ce1dcfdcc742d3b66924 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:36 -0700
Subject: lib: introduce support for page allocation tagging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce helper functions to easily instrument page allocators by storing
a pointer to the allocation tag associated with the code that allocated
the page in a page_ext field.

Link: https://lkml.kernel.org/r/20240321163705.3067592-15-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_ext.h    |  1 -
 include/linux/pgalloc_tag.h | 78 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug           |  1 +
 lib/alloc_tag.c             | 17 ++++++++++
 mm/mm_init.c                |  1 +
 mm/page_alloc.c             |  4 +++
 mm/page_ext.c               |  4 +++
 7 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/pgalloc_tag.h

(limited to 'include')

diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index be98564191e6..07e0656898f9 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -4,7 +4,6 @@
 
 #include <linux/types.h>
 #include <linux/stacktrace.h>
-#include <linux/stackdepot.h>
 
 struct pglist_data;
 
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
new file mode 100644
index 000000000000..66bd021eb46e
--- /dev/null
+++ b/include/linux/pgalloc_tag.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * page allocation tagging
+ */
+#ifndef _LINUX_PGALLOC_TAG_H
+#define _LINUX_PGALLOC_TAG_H
+
+#include <linux/alloc_tag.h>
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+#include <linux/page_ext.h>
+
+extern struct page_ext_operations page_alloc_tagging_ops;
+extern struct page_ext *page_ext_get(struct page *page);
+extern void page_ext_put(struct page_ext *page_ext);
+
+static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext)
+{
+	return (void *)page_ext + page_alloc_tagging_ops.offset;
+}
+
+static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref)
+{
+	return (void *)ref - page_alloc_tagging_ops.offset;
+}
+
+/* Should be called only if mem_alloc_profiling_enabled() */
+static inline union codetag_ref *get_page_tag_ref(struct page *page)
+{
+	if (page) {
+		struct page_ext *page_ext = page_ext_get(page);
+
+		if (page_ext)
+			return codetag_ref_from_page_ext(page_ext);
+	}
+	return NULL;
+}
+
+static inline void put_page_tag_ref(union codetag_ref *ref)
+{
+	page_ext_put(page_ext_from_codetag_ref(ref));
+}
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+				   unsigned int nr)
+{
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		if (ref) {
+			alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr);
+			put_page_tag_ref(ref);
+		}
+	}
+}
+
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
+{
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		if (ref) {
+			alloc_tag_sub(ref, PAGE_SIZE * nr);
+			put_page_tag_ref(ref);
+		}
+	}
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING */
+
+static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
+				   unsigned int nr) {}
+static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING */
+
+#endif /* _LINUX_PGALLOC_TAG_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index fa7aa32ba11a..a8f77de5e8ea 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -978,6 +978,7 @@ config MEM_ALLOC_PROFILING
 	depends on PROC_FS
 	depends on !DEBUG_FORCE_WEAK_PER_CPU
 	select CODE_TAGGING
+	select PAGE_EXTENSION
 	help
 	  Track allocation source code and record total allocation size
 	  initiated at that code location. The mechanism can be used to track
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 331dd17650f3..59bb71036837 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
+#include <linux/page_ext.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_buf.h>
 #include <linux/seq_file.h>
@@ -118,6 +119,22 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype,
 	return module_unused;
 }
 
+static __init bool need_page_alloc_tagging(void)
+{
+	return true;
+}
+
+static __init void init_page_alloc_tagging(void)
+{
+}
+
+struct page_ext_operations page_alloc_tagging_ops = {
+	.size = sizeof(union codetag_ref),
+	.need = need_page_alloc_tagging,
+	.init = init_page_alloc_tagging,
+};
+EXPORT_SYMBOL(page_alloc_tagging_ops);
+
 static struct ctl_table memory_allocation_profiling_sysctls[] = {
 	{
 		.procname	= "mem_profiling",
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 549e76af8f82..2fd9bf044a79 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -24,6 +24,7 @@
 #include <linux/page_ext.h>
 #include <linux/pti.h>
 #include <linux/pgtable.h>
+#include <linux/stackdepot.h>
 #include <linux/swap.h>
 #include <linux/cma.h>
 #include <linux/crash_dump.h>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5083ac034d26..3fd273c22749 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -54,6 +54,7 @@
 #include <linux/khugepaged.h>
 #include <linux/delayacct.h>
 #include <linux/cacheinfo.h>
+#include <linux/pgalloc_tag.h>
 #include <asm/div64.h>
 #include "internal.h"
 #include "shuffle.h"
@@ -1101,6 +1102,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 		/* Do not let hwpoison pages hit pcplists/buddy */
 		reset_page_owner(page, order);
 		page_table_check_free(page, order);
+		pgalloc_tag_sub(page, 1 << order);
 		return false;
 	}
 
@@ -1140,6 +1142,7 @@ __always_inline bool free_pages_prepare(struct page *page,
 	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	reset_page_owner(page, order);
 	page_table_check_free(page, order);
+	pgalloc_tag_sub(page, 1 << order);
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),
@@ -1533,6 +1536,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
 
 	set_page_owner(page, order, gfp_flags);
 	page_table_check_alloc(page, order);
+	pgalloc_tag_add(page, current, 1 << order);
 }
 
 static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 4548fcc66d74..3c58fe8a24df 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -10,6 +10,7 @@
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
 #include <linux/rcupdate.h>
+#include <linux/pgalloc_tag.h>
 
 /*
  * struct page extension
@@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 	&page_idle_ops,
 #endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	&page_alloc_tagging_ops,
+#endif
 #ifdef CONFIG_PAGE_TABLE_CHECK
 	&page_table_check_ops,
 #endif
-- 
cgit v1.2.3


From ccdabb1d7f7a09810495ac57c0154a9913791dd2 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:38 -0700
Subject: mm: percpu: increase PERCPU_MODULE_RESERVE to accommodate allocation
 tags
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As each allocation tag generates a per-cpu variable, more space is
required to store them.  Increase PERCPU_MODULE_RESERVE to provide enough
area.  A better long-term solution would be to allocate this memory
dynamically.

[surenb@google.com: increase PERCPU_MODULE_RESERVE to accommodate allocation tags]
  Link: https://lkml.kernel.org/r/20240406214044.1114406-1-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-17-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 8c677f185901..9dfa7b526e70 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -14,7 +14,11 @@
 
 /* enough to cover all DEFINE_PER_CPUs in modules */
 #ifdef CONFIG_MODULES
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+#define PERCPU_MODULE_RESERVE		(8 << 13)
+#else
 #define PERCPU_MODULE_RESERVE		(8 << 10)
+#endif
 #else
 #define PERCPU_MODULE_RESERVE		0
 #endif
-- 
cgit v1.2.3


From 8a2f11878771da65b8ac135c73b47dae13afbd62 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:39 -0700
Subject: change alloc_pages name in dma_map_ops to avoid name conflicts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After redefining alloc_pages, all uses of that name are being replaced.
Change the conflicting names to prevent preprocessor from replacing them
when it's not intended.

Link: https://lkml.kernel.org/r/20240321163705.3067592-18-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/kernel/pci_iommu.c           | 2 +-
 arch/mips/jazz/jazzdma.c                | 2 +-
 arch/powerpc/kernel/dma-iommu.c         | 2 +-
 arch/powerpc/platforms/ps3/system-bus.c | 4 ++--
 arch/powerpc/platforms/pseries/vio.c    | 2 +-
 arch/x86/kernel/amd_gart_64.c           | 2 +-
 drivers/iommu/dma-iommu.c               | 2 +-
 drivers/parisc/ccio-dma.c               | 2 +-
 drivers/parisc/sba_iommu.c              | 2 +-
 drivers/xen/grant-dma-ops.c             | 2 +-
 drivers/xen/swiotlb-xen.c               | 2 +-
 include/linux/dma-map-ops.h             | 2 +-
 kernel/dma/mapping.c                    | 4 ++--
 13 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index c81183935e97..7fcf3e9b7103 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -929,7 +929,7 @@ const struct dma_map_ops alpha_pci_ops = {
 	.dma_supported		= alpha_pci_supported,
 	.mmap			= dma_common_mmap,
 	.get_sgtable		= dma_common_get_sgtable,
-	.alloc_pages		= dma_common_alloc_pages,
+	.alloc_pages_op		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
 };
 EXPORT_SYMBOL(alpha_pci_ops);
diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c
index eabddb89d221..c97b089b9902 100644
--- a/arch/mips/jazz/jazzdma.c
+++ b/arch/mips/jazz/jazzdma.c
@@ -617,7 +617,7 @@ const struct dma_map_ops jazz_dma_ops = {
 	.sync_sg_for_device	= jazz_dma_sync_sg_for_device,
 	.mmap			= dma_common_mmap,
 	.get_sgtable		= dma_common_get_sgtable,
-	.alloc_pages		= dma_common_alloc_pages,
+	.alloc_pages_op		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
 };
 EXPORT_SYMBOL(jazz_dma_ops);
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 8920862ffd79..f0ae39e77e37 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -216,6 +216,6 @@ const struct dma_map_ops dma_iommu_ops = {
 	.get_required_mask	= dma_iommu_get_required_mask,
 	.mmap			= dma_common_mmap,
 	.get_sgtable		= dma_common_get_sgtable,
-	.alloc_pages		= dma_common_alloc_pages,
+	.alloc_pages_op		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
 };
diff --git a/arch/powerpc/platforms/ps3/system-bus.c b/arch/powerpc/platforms/ps3/system-bus.c
index d6b5f5ecd515..56dc6b29a3e7 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -695,7 +695,7 @@ static const struct dma_map_ops ps3_sb_dma_ops = {
 	.unmap_page = ps3_unmap_page,
 	.mmap = dma_common_mmap,
 	.get_sgtable = dma_common_get_sgtable,
-	.alloc_pages = dma_common_alloc_pages,
+	.alloc_pages_op = dma_common_alloc_pages,
 	.free_pages = dma_common_free_pages,
 };
 
@@ -709,7 +709,7 @@ static const struct dma_map_ops ps3_ioc0_dma_ops = {
 	.unmap_page = ps3_unmap_page,
 	.mmap = dma_common_mmap,
 	.get_sgtable = dma_common_get_sgtable,
-	.alloc_pages = dma_common_alloc_pages,
+	.alloc_pages_op = dma_common_alloc_pages,
 	.free_pages = dma_common_free_pages,
 };
 
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 90ff85c879bf..477c1d5e1737 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -611,7 +611,7 @@ static const struct dma_map_ops vio_dma_mapping_ops = {
 	.get_required_mask = dma_iommu_get_required_mask,
 	.mmap		   = dma_common_mmap,
 	.get_sgtable	   = dma_common_get_sgtable,
-	.alloc_pages	   = dma_common_alloc_pages,
+	.alloc_pages_op	   = dma_common_alloc_pages,
 	.free_pages	   = dma_common_free_pages,
 };
 
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 2ae98f754e59..c884deca839b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = {
 	.get_sgtable			= dma_common_get_sgtable,
 	.dma_supported			= dma_direct_supported,
 	.get_required_mask		= dma_direct_get_required_mask,
-	.alloc_pages			= dma_direct_alloc_pages,
+	.alloc_pages_op			= dma_direct_alloc_pages,
 	.free_pages			= dma_direct_free_pages,
 };
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e4cb26f6a943..07d087eecc17 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1723,7 +1723,7 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.flags			= DMA_F_PCI_P2PDMA_SUPPORTED,
 	.alloc			= iommu_dma_alloc,
 	.free			= iommu_dma_free,
-	.alloc_pages		= dma_common_alloc_pages,
+	.alloc_pages_op		= dma_common_alloc_pages,
 	.free_pages		= dma_common_free_pages,
 	.alloc_noncontiguous	= iommu_dma_alloc_noncontiguous,
 	.free_noncontiguous	= iommu_dma_free_noncontiguous,
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 9ce0d20a6c58..feef537257d0 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -1022,7 +1022,7 @@ static const struct dma_map_ops ccio_ops = {
 	.map_sg =		ccio_map_sg,
 	.unmap_sg =		ccio_unmap_sg,
 	.get_sgtable =		dma_common_get_sgtable,
-	.alloc_pages =		dma_common_alloc_pages,
+	.alloc_pages_op =	dma_common_alloc_pages,
 	.free_pages =		dma_common_free_pages,
 };
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index 784037837f65..fc3863c09f83 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -1090,7 +1090,7 @@ static const struct dma_map_ops sba_ops = {
 	.map_sg =		sba_map_sg,
 	.unmap_sg =		sba_unmap_sg,
 	.get_sgtable =		dma_common_get_sgtable,
-	.alloc_pages =		dma_common_alloc_pages,
+	.alloc_pages_op =	dma_common_alloc_pages,
 	.free_pages =		dma_common_free_pages,
 };
 
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index 76f6f26265a3..29257d2639db 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask)
 static const struct dma_map_ops xen_grant_dma_ops = {
 	.alloc = xen_grant_dma_alloc,
 	.free = xen_grant_dma_free,
-	.alloc_pages = xen_grant_dma_alloc_pages,
+	.alloc_pages_op = xen_grant_dma_alloc_pages,
 	.free_pages = xen_grant_dma_free_pages,
 	.mmap = dma_common_mmap,
 	.get_sgtable = dma_common_get_sgtable,
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 0e6c6c25d154..1c4ef5111651 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -403,7 +403,7 @@ const struct dma_map_ops xen_swiotlb_dma_ops = {
 	.dma_supported = xen_swiotlb_dma_supported,
 	.mmap = dma_common_mmap,
 	.get_sgtable = dma_common_get_sgtable,
-	.alloc_pages = dma_common_alloc_pages,
+	.alloc_pages_op = dma_common_alloc_pages,
 	.free_pages = dma_common_free_pages,
 	.max_mapping_size = swiotlb_max_mapping_size,
 };
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f04209..9ee319851b5f 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -29,7 +29,7 @@ struct dma_map_ops {
 			unsigned long attrs);
 	void (*free)(struct device *dev, size_t size, void *vaddr,
 			dma_addr_t dma_handle, unsigned long attrs);
-	struct page *(*alloc_pages)(struct device *dev, size_t size,
+	struct page *(*alloc_pages_op)(struct device *dev, size_t size,
 			dma_addr_t *dma_handle, enum dma_data_direction dir,
 			gfp_t gfp);
 	void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471..5e2d51e1cdf6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
 	size = PAGE_ALIGN(size);
 	if (dma_alloc_direct(dev, ops))
 		return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
-	if (!ops->alloc_pages)
+	if (!ops->alloc_pages_op)
 		return NULL;
-	return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+	return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp);
 }
 
 struct page *dma_alloc_pages(struct device *dev, size_t size,
-- 
cgit v1.2.3


From b951aaff503502a7fe066eeed2744ba8a6413c89 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:40 -0700
Subject: mm: enable page allocation tagging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Redefine page allocators to record allocation tags upon their invocation.
Instrument post_alloc_hook and free_pages_prepare to modify current
allocation tag.

[surenb@google.com: undo _noprof additions in the documentation]
  Link: https://lkml.kernel.org/r/20240326231453.1206227-3-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-19-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h |  14 ++++++
 include/linux/gfp.h       | 126 +++++++++++++++++++++++++++++-----------------
 include/linux/pagemap.h   |   9 ++--
 mm/compaction.c           |   7 ++-
 mm/filemap.c              |   6 +--
 mm/mempolicy.c            |  46 ++++++++---------
 mm/page_alloc.c           |  52 +++++++++----------
 7 files changed, 157 insertions(+), 103 deletions(-)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 13561223fdab..8932187124da 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -153,4 +153,18 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
+#define alloc_hooks_tag(_tag, _do_alloc)				\
+({									\
+	struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag);	\
+	typeof(_do_alloc) _res = _do_alloc;				\
+	alloc_tag_restore(_tag, _old);					\
+	_res;								\
+})
+
+#define alloc_hooks(_do_alloc)						\
+({									\
+	DEFINE_ALLOC_TAG(_alloc_tag);					\
+	alloc_hooks_tag(&_alloc_tag, _do_alloc);			\
+})
+
 #endif /* _LINUX_ALLOC_TAG_H */
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c775ea3c6015..450c2cbcf04b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -6,6 +6,8 @@
 
 #include <linux/mmzone.h>
 #include <linux/topology.h>
+#include <linux/alloc_tag.h>
+#include <linux/sched.h>
 
 struct vm_area_struct;
 struct mempolicy;
@@ -175,42 +177,46 @@ static inline void arch_free_page(struct page *page, int order) { }
 static inline void arch_alloc_page(struct page *page, int order) { }
 #endif
 
-struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask);
-struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+#define __alloc_pages(...)			alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))
+
+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask);
+#define __folio_alloc(...)			alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))
 
-unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
 				nodemask_t *nodemask, int nr_pages,
 				struct list_head *page_list,
 				struct page **page_array);
+#define __alloc_pages_bulk(...)			alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))
 
-unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
 				unsigned long nr_pages,
 				struct page **page_array);
+#define  alloc_pages_bulk_array_mempolicy(...)				\
+	alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__))
 
 /* Bulk allocate order-0 pages */
-static inline unsigned long
-alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list)
-{
-	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL);
-}
+#define alloc_pages_bulk_list(_gfp, _nr_pages, _list)			\
+	__alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL)
 
-static inline unsigned long
-alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array)
-{
-	return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array);
-}
+#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array)		\
+	__alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array)
 
 static inline unsigned long
-alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array)
+alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
+				   struct page **page_array)
 {
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
 
-	return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array);
+	return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array);
 }
 
+#define alloc_pages_bulk_array_node(...)				\
+	alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__))
+
 static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
 {
 	gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN);
@@ -230,82 +236,104 @@ static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
  * online. For more general interface, see alloc_pages_node().
  */
 static inline struct page *
-__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
+__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order)
 {
 	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
 	warn_if_node_offline(nid, gfp_mask);
 
-	return __alloc_pages(gfp_mask, order, nid, NULL);
+	return __alloc_pages_noprof(gfp_mask, order, nid, NULL);
 }
 
+#define  __alloc_pages_node(...)		alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__))
+
 static inline
-struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid)
+struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
 {
 	VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
 	warn_if_node_offline(nid, gfp);
 
-	return __folio_alloc(gfp, order, nid, NULL);
+	return __folio_alloc_noprof(gfp, order, nid, NULL);
 }
 
+#define  __folio_alloc_node(...)		alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__))
+
 /*
  * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
  * prefer the current CPU's closest node. Otherwise node must be valid and
  * online.
  */
-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
-						unsigned int order)
+static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
+						   unsigned int order)
 {
 	if (nid == NUMA_NO_NODE)
 		nid = numa_mem_id();
 
-	return __alloc_pages_node(nid, gfp_mask, order);
+	return __alloc_pages_node_noprof(nid, gfp_mask, order);
 }
 
+#define  alloc_pages_node(...)			alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__))
+
 #ifdef CONFIG_NUMA
-struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
+struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *mpol, pgoff_t ilx, int nid);
-struct folio *folio_alloc(gfp_t gfp, unsigned int order);
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage);
 #else
-static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
+static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
 {
-	return alloc_pages_node(numa_node_id(), gfp_mask, order);
+	return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order);
 }
-static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *mpol, pgoff_t ilx, int nid)
 {
-	return alloc_pages(gfp, order);
+	return alloc_pages_noprof(gfp, order);
 }
-static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
 }
-#define vma_alloc_folio(gfp, order, vma, addr, hugepage)		\
-	folio_alloc(gfp, order)
+#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage)		\
+	folio_alloc_noprof(gfp, order)
 #endif
+
+#define alloc_pages(...)			alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
+#define alloc_pages_mpol(...)			alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__))
+#define folio_alloc(...)			alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
+#define vma_alloc_folio(...)			alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))
+
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
-static inline struct page *alloc_page_vma(gfp_t gfp,
+
+static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false);
+	struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false);
 
 	return &folio->page;
 }
+#define alloc_page_vma(...)			alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
+
+extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
+#define __get_free_pages(...)			alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
 
-extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
-extern unsigned long get_zeroed_page(gfp_t gfp_mask);
+extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask);
+#define get_zeroed_page(...)			alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))
+
+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1);
+#define alloc_pages_exact(...)			alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__))
 
-void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1);
 void free_pages_exact(void *virt, size_t size);
-__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
 
-#define __get_free_page(gfp_mask) \
-		__get_free_pages((gfp_mask), 0)
+__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
+#define alloc_pages_exact_nid(...)					\
+	alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__))
+
+#define __get_free_page(gfp_mask)					\
+	__get_free_pages((gfp_mask), 0)
 
-#define __get_dma_pages(gfp_mask, order) \
-		__get_free_pages((gfp_mask) | GFP_DMA, (order))
+#define __get_dma_pages(gfp_mask, order)				\
+	__get_free_pages((gfp_mask) | GFP_DMA, (order))
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
@@ -374,10 +402,14 @@ extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
 
 #ifdef CONFIG_CONTIG_ALLOC
 /* The below functions must be run on a range from a single zone. */
-extern int alloc_contig_range(unsigned long start, unsigned long end,
+extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 			      unsigned migratetype, gfp_t gfp_mask);
-extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
-				       int nid, nodemask_t *nodemask);
+#define alloc_contig_range(...)			alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))
+
+extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
+					      int nid, nodemask_t *nodemask);
+#define alloc_contig_pages(...)			alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))
+
 #endif
 void free_contig_range(unsigned long pfn, unsigned long nr_pages);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2df35e65557d..35636e67e2e1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -542,14 +542,17 @@ static inline void *detach_page_private(struct page *page)
 #endif
 
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
 #else
-static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
+static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
 {
-	return folio_alloc(gfp, order);
+	return folio_alloc_noprof(gfp, order);
 }
 #endif
 
+#define filemap_alloc_folio(...)				\
+	alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))
+
 static inline struct page *__page_cache_alloc(gfp_t gfp)
 {
 	return &filemap_alloc_folio(gfp, 0)->page;
diff --git a/mm/compaction.c b/mm/compaction.c
index 807b58e6eb68..70b01190d2f3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1851,7 +1851,7 @@ static void isolate_freepages(struct compact_control *cc)
  * This is a migrate-callback that "allocates" freepages by taking pages
  * from the isolated freelists in the block we are migrating to.
  */
-static struct folio *compaction_alloc(struct folio *src, unsigned long data)
+static struct folio *compaction_alloc_noprof(struct folio *src, unsigned long data)
 {
 	struct compact_control *cc = (struct compact_control *)data;
 	struct folio *dst;
@@ -1898,6 +1898,11 @@ again:
 	return page_rmappable_folio(&dst->page);
 }
 
+static struct folio *compaction_alloc(struct folio *src, unsigned long data)
+{
+	return alloc_hooks(compaction_alloc_noprof(src, data));
+}
+
 /*
  * This is a migrate-callback that "frees" freepages back to the isolated
  * freelist.  All pages on the freelist are from the same zone, so there is no
diff --git a/mm/filemap.c b/mm/filemap.c
index 30de18c4fd28..60890758d660 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -966,7 +966,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 EXPORT_SYMBOL_GPL(filemap_add_folio);
 
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
 {
 	int n;
 	struct folio *folio;
@@ -981,9 +981,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
 
 		return folio;
 	}
-	return folio_alloc(gfp, order);
+	return folio_alloc_noprof(gfp, order);
 }
-EXPORT_SYMBOL(filemap_alloc_folio);
+EXPORT_SYMBOL(filemap_alloc_folio_noprof);
 #endif
 
 /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 13100a290918..0b3def99174a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2201,9 +2201,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
 	 */
 	preferred_gfp = gfp | __GFP_NOWARN;
 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
-	page = __alloc_pages(preferred_gfp, order, nid, nodemask);
+	page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
 	if (!page)
-		page = __alloc_pages(gfp, order, nid, NULL);
+		page = __alloc_pages_noprof(gfp, order, nid, NULL);
 
 	return page;
 }
@@ -2218,7 +2218,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
  *
  * Return: The page on success or NULL if allocation fails.
  */
-struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *pol, pgoff_t ilx, int nid)
 {
 	nodemask_t *nodemask;
@@ -2249,7 +2249,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
 			 * First, try to allocate THP only on local node, but
 			 * don't reclaim unnecessarily, just compact.
 			 */
-			page = __alloc_pages_node(nid,
+			page = __alloc_pages_node_noprof(nid,
 				gfp | __GFP_THISNODE | __GFP_NORETRY, order);
 			if (page || !(gfp & __GFP_DIRECT_RECLAIM))
 				return page;
@@ -2262,7 +2262,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
 		}
 	}
 
-	page = __alloc_pages(gfp, order, nid, nodemask);
+	page = __alloc_pages_noprof(gfp, order, nid, nodemask);
 
 	if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
 		/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
@@ -2293,7 +2293,7 @@ struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
  *
  * Return: The folio on success or NULL if allocation fails.
  */
-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
+struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage)
 {
 	struct mempolicy *pol;
@@ -2301,12 +2301,12 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 	struct page *page;
 
 	pol = get_vma_policy(vma, addr, order, &ilx);
-	page = alloc_pages_mpol(gfp | __GFP_COMP, order,
-				pol, ilx, numa_node_id());
+	page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
+				       pol, ilx, numa_node_id());
 	mpol_cond_put(pol);
 	return page_rmappable_folio(page);
 }
-EXPORT_SYMBOL(vma_alloc_folio);
+EXPORT_SYMBOL(vma_alloc_folio_noprof);
 
 /**
  * alloc_pages - Allocate pages.
@@ -2322,7 +2322,7 @@ EXPORT_SYMBOL(vma_alloc_folio);
  * flags are used.
  * Return: The page on success or NULL if allocation fails.
  */
-struct page *alloc_pages(gfp_t gfp, unsigned int order)
+struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
 {
 	struct mempolicy *pol = &default_policy;
 
@@ -2333,16 +2333,16 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order)
 	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
 		pol = get_task_policy(current);
 
-	return alloc_pages_mpol(gfp, order,
-				pol, NO_INTERLEAVE_INDEX, numa_node_id());
+	return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
+				       numa_node_id());
 }
-EXPORT_SYMBOL(alloc_pages);
+EXPORT_SYMBOL(alloc_pages_noprof);
 
-struct folio *folio_alloc(gfp_t gfp, unsigned int order)
+struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
 {
-	return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
+	return page_rmappable_folio(alloc_pages_noprof(gfp | __GFP_COMP, order));
 }
-EXPORT_SYMBOL(folio_alloc);
+EXPORT_SYMBOL(folio_alloc_noprof);
 
 static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 		struct mempolicy *pol, unsigned long nr_pages,
@@ -2361,13 +2361,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
 
 	for (i = 0; i < nodes; i++) {
 		if (delta) {
-			nr_allocated = __alloc_pages_bulk(gfp,
+			nr_allocated = alloc_pages_bulk_noprof(gfp,
 					interleave_nodes(pol), NULL,
 					nr_pages_per_node + 1, NULL,
 					page_array);
 			delta--;
 		} else {
-			nr_allocated = __alloc_pages_bulk(gfp,
+			nr_allocated = alloc_pages_bulk_noprof(gfp,
 					interleave_nodes(pol), NULL,
 					nr_pages_per_node, NULL, page_array);
 		}
@@ -2504,11 +2504,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
 	preferred_gfp = gfp | __GFP_NOWARN;
 	preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
 
-	nr_allocated  = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
+	nr_allocated  = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
 					   nr_pages, NULL, page_array);
 
 	if (nr_allocated < nr_pages)
-		nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
+		nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
 				nr_pages - nr_allocated, NULL,
 				page_array + nr_allocated);
 	return nr_allocated;
@@ -2520,7 +2520,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
  * It can accelerate memory allocation especially interleaving
  * allocate memory.
  */
-unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
 		unsigned long nr_pages, struct page **page_array)
 {
 	struct mempolicy *pol = &default_policy;
@@ -2544,8 +2544,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
 
 	nid = numa_node_id();
 	nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
-	return __alloc_pages_bulk(gfp, nid, nodemask,
-				  nr_pages, NULL, page_array);
+	return alloc_pages_bulk_noprof(gfp, nid, nodemask,
+				       nr_pages, NULL, page_array);
 }
 
 int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3fd273c22749..d3afaf8f983a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4391,7 +4391,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
  *
  * Returns the number of pages on the list or array.
  */
-unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
+unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
 			nodemask_t *nodemask, int nr_pages,
 			struct list_head *page_list,
 			struct page **page_array)
@@ -4527,7 +4527,7 @@ failed_irq:
 	pcp_trylock_finish(UP_flags);
 
 failed:
-	page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
+	page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
 	if (page) {
 		if (page_list)
 			list_add(&page->lru, page_list);
@@ -4538,13 +4538,13 @@ failed:
 
 	goto out;
 }
-EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
+EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
 
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
-							nodemask_t *nodemask)
+struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
+				      int preferred_nid, nodemask_t *nodemask)
 {
 	struct page *page;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4606,38 +4606,38 @@ out:
 
 	return page;
 }
-EXPORT_SYMBOL(__alloc_pages);
+EXPORT_SYMBOL(__alloc_pages_noprof);
 
-struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
 		nodemask_t *nodemask)
 {
-	struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
+	struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order,
 					preferred_nid, nodemask);
 	return page_rmappable_folio(page);
 }
-EXPORT_SYMBOL(__folio_alloc);
+EXPORT_SYMBOL(__folio_alloc_noprof);
 
 /*
  * Common helper functions. Never use with __GFP_HIGHMEM because the returned
  * address cannot represent highmem pages. Use alloc_pages and then kmap if
  * you need to access high mem.
  */
-unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
+unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order)
 {
 	struct page *page;
 
-	page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order);
+	page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order);
 	if (!page)
 		return 0;
 	return (unsigned long) page_address(page);
 }
-EXPORT_SYMBOL(__get_free_pages);
+EXPORT_SYMBOL(get_free_pages_noprof);
 
-unsigned long get_zeroed_page(gfp_t gfp_mask)
+unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
 {
-	return __get_free_page(gfp_mask | __GFP_ZERO);
+	return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0);
 }
-EXPORT_SYMBOL(get_zeroed_page);
+EXPORT_SYMBOL(get_zeroed_page_noprof);
 
 /**
  * __free_pages - Free pages allocated with alloc_pages().
@@ -4853,7 +4853,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
  *
  * Return: pointer to the allocated area or %NULL in case of error.
  */
-void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
+void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask)
 {
 	unsigned int order = get_order(size);
 	unsigned long addr;
@@ -4861,10 +4861,10 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
 	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
 		gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
 
-	addr = __get_free_pages(gfp_mask, order);
+	addr = get_free_pages_noprof(gfp_mask, order);
 	return make_alloc_exact(addr, order, size);
 }
-EXPORT_SYMBOL(alloc_pages_exact);
+EXPORT_SYMBOL(alloc_pages_exact_noprof);
 
 /**
  * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
@@ -4878,7 +4878,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
  *
  * Return: pointer to the allocated area or %NULL in case of error.
  */
-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask)
 {
 	unsigned int order = get_order(size);
 	struct page *p;
@@ -4886,7 +4886,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 	if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
 		gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
 
-	p = alloc_pages_node(nid, gfp_mask, order);
+	p = alloc_pages_node_noprof(nid, gfp_mask, order);
 	if (!p)
 		return NULL;
 	return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -6343,7 +6343,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
  * pages which PFN is in [start, end) are allocated for the caller and
  * need to be freed with free_contig_range().
  */
-int alloc_contig_range(unsigned long start, unsigned long end,
+int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
@@ -6467,15 +6467,15 @@ done:
 	undo_isolate_page_range(start, end, migratetype);
 	return ret;
 }
-EXPORT_SYMBOL(alloc_contig_range);
+EXPORT_SYMBOL(alloc_contig_range_noprof);
 
 static int __alloc_contig_pages(unsigned long start_pfn,
 				unsigned long nr_pages, gfp_t gfp_mask)
 {
 	unsigned long end_pfn = start_pfn + nr_pages;
 
-	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
-				  gfp_mask);
+	return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE,
+				   gfp_mask);
 }
 
 static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn,
@@ -6530,8 +6530,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
  *
  * Return: pointer to contiguous pages on success, or NULL if not successful.
  */
-struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask,
-				int nid, nodemask_t *nodemask)
+struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
+				 int nid, nodemask_t *nodemask)
 {
 	unsigned long ret, pfn, flags;
 	struct zonelist *zonelist;
-- 
cgit v1.2.3


From be25d1d4e822a703b19043baff916aec6ac4362d Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:41 -0700
Subject: mm: create new codetag references during page splitting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a high-order page is split into smaller ones, each newly split page
should get its codetag.  After the split each split page will be
referencing the original codetag.  The codetag's "bytes" counter remains
the same because the amount of allocated memory has not changed, however
the "calls" counter gets increased to keep the counter correct when these
individual pages get freed.

Link: https://lkml.kernel.org/r/20240321163705.3067592-20-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h   |  9 +++++++++
 include/linux/pgalloc_tag.h | 30 ++++++++++++++++++++++++++++++
 mm/huge_memory.c            |  2 ++
 mm/page_alloc.c             |  2 ++
 4 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 8932187124da..2aea7afa1fab 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -117,6 +117,15 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
 	this_cpu_inc(tag->counters->calls);
 }
 
+static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
+{
+	alloc_tag_add_check(ref, tag);
+	if (!ref || !tag)
+		return;
+
+	__alloc_tag_ref_set(ref, tag);
+}
+
 static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
 {
 	alloc_tag_add_check(ref, tag);
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 66bd021eb46e..093edf98c3d7 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -67,11 +67,41 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr)
 	}
 }
 
+static inline void pgalloc_tag_split(struct page *page, unsigned int nr)
+{
+	int i;
+	struct page_ext *page_ext;
+	union codetag_ref *ref;
+	struct alloc_tag *tag;
+
+	if (!mem_alloc_profiling_enabled())
+		return;
+
+	page_ext = page_ext_get(page);
+	if (unlikely(!page_ext))
+		return;
+
+	ref = codetag_ref_from_page_ext(page_ext);
+	if (!ref->ct)
+		goto out;
+
+	tag = ct_to_alloc_tag(ref->ct);
+	page_ext = page_ext_next(page_ext);
+	for (i = 1; i < nr; i++) {
+		/* Set new reference to point to the original tag */
+		alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag);
+		page_ext = page_ext_next(page_ext);
+	}
+out:
+	page_ext_put(page_ext);
+}
+
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 				   unsigned int nr) {}
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
+static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index dd88bb3ea196..25191ab70631 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -38,6 +38,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/memory-tiers.h>
 #include <linux/compat.h>
+#include <linux/pgalloc_tag.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -2946,6 +2947,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	/* Caller disabled irqs, so they are still disabled here */
 
 	split_page_owner(head, order, new_order);
+	pgalloc_tag_split(head, 1 << order);
 
 	/* See comment in __split_huge_page_tail() */
 	if (folio_test_anon(folio)) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d3afaf8f983a..e453ee22d489 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2630,6 +2630,7 @@ void split_page(struct page *page, unsigned int order)
 	for (i = 1; i < (1 << order); i++)
 		set_page_refcounted(page + i);
 	split_page_owner(page, order, 0);
+	pgalloc_tag_split(page, 1 << order);
 	split_page_memcg(page, order, 0);
 }
 EXPORT_SYMBOL_GPL(split_page);
@@ -4827,6 +4828,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
 		struct page *last = page + nr;
 
 		split_page_owner(page, order, 0);
+		pgalloc_tag_split(page, 1 << order);
 		split_page_memcg(page, order, 0);
 		while (page < --last)
 			set_page_refcounted(last);
-- 
cgit v1.2.3


From cc92eba1c88b1f74e0f044df2738f4e4b22f1e4e Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:42 -0700
Subject: mm: fix non-compound multi-order memory accounting in __free_pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a non-compound multi-order page is freed, it is possible that a
speculative reference keeps the page pinned.  In this case we free all
pages except for the first page, which will be freed later by the last
put_page().  However the page passed to put_page() is indistinguishable
from an order-0 page, so it cannot do the accounting, just as it cannot
free the subsequent pages.  Do the accounting here, where we free the
pages.

Link: https://lkml.kernel.org/r/20240321163705.3067592-21-surenb@google.com
Reported-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgalloc_tag.h | 24 ++++++++++++++++++++++++
 mm/page_alloc.c             |  5 ++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 093edf98c3d7..50d212330bbb 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -96,12 +96,36 @@ out:
 	page_ext_put(page_ext);
 }
 
+static inline struct alloc_tag *pgalloc_tag_get(struct page *page)
+{
+	struct alloc_tag *tag = NULL;
+
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		alloc_tag_sub_check(ref);
+		if (ref && ref->ct)
+			tag = ct_to_alloc_tag(ref->ct);
+		put_page_tag_ref(ref);
+	}
+
+	return tag;
+}
+
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
+{
+	if (mem_alloc_profiling_enabled() && tag)
+		this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr);
+}
+
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 				   unsigned int nr) {}
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
 static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {}
+static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; }
+static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e453ee22d489..e1241ecef271 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4664,12 +4664,15 @@ void __free_pages(struct page *page, unsigned int order)
 {
 	/* get PageHead before we drop reference */
 	int head = PageHead(page);
+	struct alloc_tag *tag = pgalloc_tag_get(page);
 
 	if (put_page_testzero(page))
 		free_the_page(page, order);
-	else if (!head)
+	else if (!head) {
+		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
 		while (order-- > 0)
 			free_the_page(page + (1 << order), order);
+	}
 }
 EXPORT_SYMBOL(__free_pages);
 
-- 
cgit v1.2.3


From c789b5fe38f386bbdec8c9c41e9af52d3775e16c Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:44 -0700
Subject: lib: add codetag reference into slabobj_ext
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To store code tag for every slab object, a codetag reference is embedded
into slabobj_ext when CONFIG_MEM_ALLOC_PROFILING=y.

Link: https://lkml.kernel.org/r/20240321163705.3067592-23-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 5 +++++
 lib/Kconfig.debug          | 1 +
 2 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 12afc2647cf0..24a6df30be49 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1653,7 +1653,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
  * if MEMCG_DATA_OBJEXTS is set.
  */
 struct slabobj_ext {
+#ifdef CONFIG_MEMCG_KMEM
 	struct obj_cgroup *objcg;
+#endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	union codetag_ref ref;
+#endif
 } __aligned(8);
 
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a8f77de5e8ea..9d2685ec4592 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -979,6 +979,7 @@ config MEM_ALLOC_PROFILING
 	depends on !DEBUG_FORCE_WEAK_PER_CPU
 	select CODE_TAGGING
 	select PAGE_EXTENSION
+	select SLAB_OBJ_EXT
 	help
 	  Track allocation source code and record total allocation size
 	  initiated at that code location. The mechanism can be used to track
-- 
cgit v1.2.3


From 7bd230a26648ac68ab3731ebbc449090f0ac6a37 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:47 -0700
Subject: mm/slab: enable slab allocation tagging for kmalloc and friends
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Redefine kmalloc, krealloc, kzalloc, kcalloc, etc. to record allocations
and deallocations done by these functions.

[surenb@google.com: undo _noprof additions in the documentation]
  Link: https://lkml.kernel.org/r/20240326231453.1206227-7-surenb@google.com
[rdunlap@infradead.org: fix kcalloc() kernel-doc warnings]
  Link: https://lkml.kernel.org/r/20240327044649.9199-1-rdunlap@infradead.org
Link: https://lkml.kernel.org/r/20240321163705.3067592-26-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fortify-string.h |   5 +-
 include/linux/slab.h           | 169 +++++++++++++++++++++--------------------
 include/linux/string.h         |   4 +-
 mm/slab_common.c               |   6 +-
 mm/slub.c                      |  50 ++++++------
 mm/util.c                      |  16 ++--
 6 files changed, 127 insertions(+), 123 deletions(-)

(limited to 'include')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 6aeebe0a6777..b24d62bad0b3 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -725,9 +725,9 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
 	return __real_memchr_inv(p, c, size);
 }
 
-extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup)
+extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof)
 								    __realloc_size(2);
-__FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp)
+__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp)
 {
 	const size_t p_size = __struct_size(p);
 
@@ -737,6 +737,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp
 		fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, NULL);
 	return __real_kmemdup(p, size, gfp);
 }
+#define kmemdup(...)	alloc_hooks(kmemdup_noprof(__VA_ARGS__))
 
 /**
  * strcpy - Copy a string into another string buffer
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 68ff754b85a4..c0be1cd03cf6 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -271,7 +271,10 @@ int kmem_cache_shrink(struct kmem_cache *s);
 /*
  * Common kmalloc functions provided by all allocators
  */
-void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2);
+void * __must_check krealloc_noprof(const void *objp, size_t new_size,
+				    gfp_t flags) __realloc_size(2);
+#define krealloc(...)				alloc_hooks(krealloc_noprof(__VA_ARGS__))
+
 void kfree(const void *objp);
 void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);
@@ -523,7 +526,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 static_assert(PAGE_SHIFT <= 20);
 #define kmalloc_index(s) __kmalloc_index(s, true)
 
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
+#include <linux/alloc_tag.h>
+
+void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
+#define __kmalloc(...)				alloc_hooks(__kmalloc_noprof(__VA_ARGS__))
 
 /**
  * kmem_cache_alloc - Allocate an object
@@ -535,9 +541,14 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_siz
  *
  * Return: pointer to the new object or %NULL in case of error
  */
-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc;
-void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
-			   gfp_t gfpflags) __assume_slab_alignment __malloc;
+void *kmem_cache_alloc_noprof(struct kmem_cache *cachep,
+			      gfp_t flags) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc(...)			alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__))
+
+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
+			    gfp_t gfpflags) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_lru(...)	alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))
+
 void kmem_cache_free(struct kmem_cache *s, void *objp);
 
 /*
@@ -548,29 +559,40 @@ void kmem_cache_free(struct kmem_cache *s, void *objp);
  * Note that interrupts must be enabled when calling these functions.
  */
 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
+
+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
+#define kmem_cache_alloc_bulk(...)	alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__))
 
 static __always_inline void kfree_bulk(size_t size, void **p)
 {
 	kmem_cache_free_bulk(NULL, size, p);
 }
 
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment
 							 __alloc_size(1);
-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment
-									 __malloc;
+#define __kmalloc_node(...)			alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__))
+
+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
+				   int node) __assume_slab_alignment __malloc;
+#define kmem_cache_alloc_node(...)	alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))
 
-void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size)
+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size)
 		    __assume_kmalloc_alignment __alloc_size(3);
 
-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
-			 int node, size_t size) __assume_kmalloc_alignment
+void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags,
+		int node, size_t size) __assume_kmalloc_alignment
 						__alloc_size(4);
-void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment
+#define kmalloc_trace(...)			alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__))
+
+#define kmalloc_node_trace(...)			alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__))
+
+void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment
 					      __alloc_size(1);
+#define kmalloc_large(...)			alloc_hooks(kmalloc_large_noprof(__VA_ARGS__))
 
-void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment
+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment
 							     __alloc_size(1);
+#define kmalloc_large_node(...)			alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__))
 
 /**
  * kmalloc - allocate kernel memory
@@ -626,37 +648,39 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align
  *	Try really hard to succeed the allocation but fail
  *	eventually.
  */
-static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size) && size) {
 		unsigned int index;
 
 		if (size > KMALLOC_MAX_CACHE_SIZE)
-			return kmalloc_large(size, flags);
+			return kmalloc_large_noprof(size, flags);
 
 		index = kmalloc_index(size);
-		return kmalloc_trace(
+		return kmalloc_trace_noprof(
 				kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index],
 				flags, size);
 	}
-	return __kmalloc(size, flags);
+	return __kmalloc_noprof(size, flags);
 }
+#define kmalloc(...)				alloc_hooks(kmalloc_noprof(__VA_ARGS__))
 
-static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node)
 {
 	if (__builtin_constant_p(size) && size) {
 		unsigned int index;
 
 		if (size > KMALLOC_MAX_CACHE_SIZE)
-			return kmalloc_large_node(size, flags, node);
+			return kmalloc_large_node_noprof(size, flags, node);
 
 		index = kmalloc_index(size);
-		return kmalloc_node_trace(
+		return kmalloc_node_trace_noprof(
 				kmalloc_caches[kmalloc_type(flags, _RET_IP_)][index],
 				flags, node, size);
 	}
-	return __kmalloc_node(size, flags, node);
+	return __kmalloc_node_noprof(size, flags, node);
 }
+#define kmalloc_node(...)			alloc_hooks(kmalloc_node_noprof(__VA_ARGS__))
 
 /**
  * kmalloc_array - allocate memory for an array.
@@ -664,16 +688,17 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
 {
 	size_t bytes;
 
 	if (unlikely(check_mul_overflow(n, size, &bytes)))
 		return NULL;
 	if (__builtin_constant_p(n) && __builtin_constant_p(size))
-		return kmalloc(bytes, flags);
-	return __kmalloc(bytes, flags);
+		return kmalloc_noprof(bytes, flags);
+	return kmalloc_noprof(bytes, flags);
 }
+#define kmalloc_array(...)			alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))
 
 /**
  * krealloc_array - reallocate memory for an array.
@@ -682,18 +707,19 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_
  * @new_size: new size of a single member of the array
  * @flags: the type of memory to allocate (see kmalloc)
  */
-static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p,
-								      size_t new_n,
-								      size_t new_size,
-								      gfp_t flags)
+static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
+								       size_t new_n,
+								       size_t new_size,
+								       gfp_t flags)
 {
 	size_t bytes;
 
 	if (unlikely(check_mul_overflow(new_n, new_size, &bytes)))
 		return NULL;
 
-	return krealloc(p, bytes, flags);
+	return krealloc_noprof(p, bytes, flags);
 }
+#define krealloc_array(...)			alloc_hooks(krealloc_array_noprof(__VA_ARGS__))
 
 /**
  * kcalloc - allocate memory for an array. The memory is set to zero.
@@ -701,16 +727,12 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p,
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags)
-{
-	return kmalloc_array(n, size, flags | __GFP_ZERO);
-}
+#define kcalloc(n, size, flags)		kmalloc_array(n, size, (flags) | __GFP_ZERO)
 
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node,
 				  unsigned long caller) __alloc_size(1);
-#define kmalloc_node_track_caller(size, flags, node) \
-	__kmalloc_node_track_caller(size, flags, node, \
-				    _RET_IP_)
+#define kmalloc_node_track_caller(...)		\
+	alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_))
 
 /*
  * kmalloc_track_caller is a special version of kmalloc that records the
@@ -720,11 +742,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node,
  * allocator where we care about the real place the memory allocation
  * request comes from.
  */
-#define kmalloc_track_caller(size, flags) \
-	__kmalloc_node_track_caller(size, flags, \
-				    NUMA_NO_NODE, _RET_IP_)
+#define kmalloc_track_caller(...)		kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)
 
-static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags,
+static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
 							  int node)
 {
 	size_t bytes;
@@ -732,75 +752,56 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size,
 	if (unlikely(check_mul_overflow(n, size, &bytes)))
 		return NULL;
 	if (__builtin_constant_p(n) && __builtin_constant_p(size))
-		return kmalloc_node(bytes, flags, node);
-	return __kmalloc_node(bytes, flags, node);
+		return kmalloc_node_noprof(bytes, flags, node);
+	return __kmalloc_node_noprof(bytes, flags, node);
 }
+#define kmalloc_array_node(...)			alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__))
 
-static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node)
-{
-	return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
-}
+#define kcalloc_node(_n, _size, _flags, _node)	\
+	kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node)
 
 /*
  * Shortcuts
  */
-static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
-{
-	return kmem_cache_alloc(k, flags | __GFP_ZERO);
-}
+#define kmem_cache_zalloc(_k, _flags)		kmem_cache_alloc(_k, (_flags)|__GFP_ZERO)
 
 /**
  * kzalloc - allocate memory. The memory is set to zero.
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags)
+static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
 {
-	return kmalloc(size, flags | __GFP_ZERO);
+	return kmalloc_noprof(size, flags | __GFP_ZERO);
 }
+#define kzalloc(...)				alloc_hooks(kzalloc_noprof(__VA_ARGS__))
+#define kzalloc_node(_size, _flags, _node)	kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
 
-/**
- * kzalloc_node - allocate zeroed memory from a particular memory node.
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kmalloc).
- * @node: memory node from which to allocate
- */
-static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node)
-{
-	return kmalloc_node(size, flags | __GFP_ZERO, node);
-}
+extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1);
+#define kvmalloc_node(...)			alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))
 
-extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1);
-static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags)
-{
-	return kvmalloc_node(size, flags, NUMA_NO_NODE);
-}
-static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node)
-{
-	return kvmalloc_node(size, flags | __GFP_ZERO, node);
-}
-static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags)
-{
-	return kvmalloc(size, flags | __GFP_ZERO);
-}
+#define kvmalloc(_size, _flags)			kvmalloc_node(_size, _flags, NUMA_NO_NODE)
+#define kvzalloc(_size, _flags)			kvmalloc(_size, _flags|__GFP_ZERO)
+
+#define kvzalloc_node(_size, _flags, _node)	kvmalloc_node(_size, _flags|__GFP_ZERO, _node)
 
-static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags)
+static inline __alloc_size(1, 2) void *kvmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
 {
 	size_t bytes;
 
 	if (unlikely(check_mul_overflow(n, size, &bytes)))
 		return NULL;
 
-	return kvmalloc(bytes, flags);
+	return kvmalloc_node_noprof(bytes, flags, NUMA_NO_NODE);
 }
 
-static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags)
-{
-	return kvmalloc_array(n, size, flags | __GFP_ZERO);
-}
+#define kvmalloc_array(...)			alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
+#define kvcalloc(_n, _size, _flags)		kvmalloc_array(_n, _size, _flags|__GFP_ZERO)
 
-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 		      __realloc_size(3);
+#define kvrealloc(...)				alloc_hooks(kvrealloc_noprof(__VA_ARGS__))
+
 extern void kvfree(const void *addr);
 DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T))
 
diff --git a/include/linux/string.h b/include/linux/string.h
index 9ba8b4597009..793c27ad7c0d 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -282,7 +282,9 @@ extern void kfree_const(const void *x);
 extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
 extern const char *kstrdup_const(const char *s, gfp_t gfp);
 extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
-extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
+extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
+#define kmemdup(...)	alloc_hooks(kmemdup_noprof(__VA_ARGS__))
+
 extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
 extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f5234672f03c..3179a6aeffc5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1189,7 +1189,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
 		return (void *)p;
 	}
 
-	ret = kmalloc_track_caller(new_size, flags);
+	ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_);
 	if (ret && p) {
 		/* Disable KASAN checks as the object's redzone is accessed. */
 		kasan_disable_current();
@@ -1213,7 +1213,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
  *
  * Return: pointer to the allocated memory or %NULL in case of error
  */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
+void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
 {
 	void *ret;
 
@@ -1228,7 +1228,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
 
 	return ret;
 }
-EXPORT_SYMBOL(krealloc);
+EXPORT_SYMBOL(krealloc_noprof);
 
 /**
  * kfree_sensitive - Clear sensitive information in memory before freeing
diff --git a/mm/slub.c b/mm/slub.c
index 5840ab963319..0fc891d44cff 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4002,7 +4002,7 @@ out:
 	return object;
 }
 
-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
+void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags)
 {
 	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_,
 				    s->object_size);
@@ -4011,9 +4011,9 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 
 	return ret;
 }
-EXPORT_SYMBOL(kmem_cache_alloc);
+EXPORT_SYMBOL(kmem_cache_alloc_noprof);
 
-void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
+void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 			   gfp_t gfpflags)
 {
 	void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_,
@@ -4023,7 +4023,7 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
 
 	return ret;
 }
-EXPORT_SYMBOL(kmem_cache_alloc_lru);
+EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof);
 
 /**
  * kmem_cache_alloc_node - Allocate an object on the specified node
@@ -4038,7 +4038,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru);
  *
  * Return: pointer to the new object or %NULL in case of error
  */
-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
+void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
 	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
 
@@ -4046,7 +4046,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 
 	return ret;
 }
-EXPORT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_SYMBOL(kmem_cache_alloc_node_noprof);
 
 /*
  * To avoid unnecessary overhead, we pass through large allocation requests
@@ -4063,7 +4063,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
 		flags = kmalloc_fix_flags(flags);
 
 	flags |= __GFP_COMP;
-	folio = (struct folio *)alloc_pages_node(node, flags, order);
+	folio = (struct folio *)alloc_pages_node_noprof(node, flags, order);
 	if (folio) {
 		ptr = folio_address(folio);
 		lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B,
@@ -4078,7 +4078,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node)
 	return ptr;
 }
 
-void *kmalloc_large(size_t size, gfp_t flags)
+void *kmalloc_large_noprof(size_t size, gfp_t flags)
 {
 	void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE);
 
@@ -4086,9 +4086,9 @@ void *kmalloc_large(size_t size, gfp_t flags)
 		      flags, NUMA_NO_NODE);
 	return ret;
 }
-EXPORT_SYMBOL(kmalloc_large);
+EXPORT_SYMBOL(kmalloc_large_noprof);
 
-void *kmalloc_large_node(size_t size, gfp_t flags, int node)
+void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
 {
 	void *ret = __kmalloc_large_node(size, flags, node);
 
@@ -4096,7 +4096,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 		      flags, node);
 	return ret;
 }
-EXPORT_SYMBOL(kmalloc_large_node);
+EXPORT_SYMBOL(kmalloc_large_node_noprof);
 
 static __always_inline
 void *__do_kmalloc_node(size_t size, gfp_t flags, int node,
@@ -4123,26 +4123,26 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node,
 	return ret;
 }
 
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node)
 {
 	return __do_kmalloc_node(size, flags, node, _RET_IP_);
 }
-EXPORT_SYMBOL(__kmalloc_node);
+EXPORT_SYMBOL(__kmalloc_node_noprof);
 
-void *__kmalloc(size_t size, gfp_t flags)
+void *__kmalloc_noprof(size_t size, gfp_t flags)
 {
 	return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_);
 }
-EXPORT_SYMBOL(__kmalloc);
+EXPORT_SYMBOL(__kmalloc_noprof);
 
-void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
-				  int node, unsigned long caller)
+void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags,
+				       int node, unsigned long caller)
 {
 	return __do_kmalloc_node(size, flags, node, caller);
 }
-EXPORT_SYMBOL(__kmalloc_node_track_caller);
+EXPORT_SYMBOL(kmalloc_node_track_caller_noprof);
 
-void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
+void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE,
 					    _RET_IP_, size);
@@ -4152,9 +4152,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 	ret = kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
-EXPORT_SYMBOL(kmalloc_trace);
+EXPORT_SYMBOL(kmalloc_trace_noprof);
 
-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
+void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags,
 			 int node, size_t size)
 {
 	void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
@@ -4164,7 +4164,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags,
 	ret = kasan_kmalloc(s, ret, size, gfpflags);
 	return ret;
 }
-EXPORT_SYMBOL(kmalloc_node_trace);
+EXPORT_SYMBOL(kmalloc_node_trace_noprof);
 
 static noinline void free_to_partial_list(
 	struct kmem_cache *s, struct slab *slab,
@@ -4769,8 +4769,8 @@ error:
 #endif /* CONFIG_SLUB_TINY */
 
 /* Note that interrupts must be enabled when calling this function. */
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
-			  void **p)
+int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
+				 void **p)
 {
 	int i;
 	struct obj_cgroup *objcg = NULL;
@@ -4798,7 +4798,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 
 	return i;
 }
-EXPORT_SYMBOL(kmem_cache_alloc_bulk);
+EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof);
 
 
 /*
diff --git a/mm/util.c b/mm/util.c
index 669397235787..cee3563d2071 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -124,16 +124,16 @@ EXPORT_SYMBOL(kstrndup);
  * Return: newly allocated copy of @src or %NULL in case of error,
  * result is physically contiguous. Use kfree() to free.
  */
-void *kmemdup(const void *src, size_t len, gfp_t gfp)
+void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
 {
 	void *p;
 
-	p = kmalloc_track_caller(len, gfp);
+	p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);
 	if (p)
 		memcpy(p, src, len);
 	return p;
 }
-EXPORT_SYMBOL(kmemdup);
+EXPORT_SYMBOL(kmemdup_noprof);
 
 /**
  * kmemdup_array - duplicate a given array.
@@ -609,7 +609,7 @@ EXPORT_SYMBOL(vm_mmap);
  *
  * Return: pointer to the allocated memory of %NULL in case of failure
  */
-void *kvmalloc_node(size_t size, gfp_t flags, int node)
+void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node)
 {
 	gfp_t kmalloc_flags = flags;
 	void *ret;
@@ -631,7 +631,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 		kmalloc_flags &= ~__GFP_NOFAIL;
 	}
 
-	ret = kmalloc_node(size, kmalloc_flags, node);
+	ret = kmalloc_node_noprof(size, kmalloc_flags, node);
 
 	/*
 	 * It doesn't really make sense to fallback to vmalloc for sub page
@@ -660,7 +660,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
 			node, __builtin_return_address(0));
 }
-EXPORT_SYMBOL(kvmalloc_node);
+EXPORT_SYMBOL(kvmalloc_node_noprof);
 
 /**
  * kvfree() - Free memory.
@@ -699,7 +699,7 @@ void kvfree_sensitive(const void *addr, size_t len)
 }
 EXPORT_SYMBOL(kvfree_sensitive);
 
-void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 {
 	void *newp;
 
@@ -712,7 +712,7 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 	kvfree(p);
 	return newp;
 }
-EXPORT_SYMBOL(kvrealloc);
+EXPORT_SYMBOL(kvrealloc_noprof);
 
 /**
  * __vmalloc_array - allocate memory for a virtually contiguous array.
-- 
cgit v1.2.3


From e26d8769da6d4c7319fdd4ddb497caa7d87a6c5c Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 21 Mar 2024 09:36:48 -0700
Subject: mempool: hook up to memory allocation profiling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds hooks to mempools for correctly annotating mempool-backed
allocations at the correct source line, so they show up correctly in
/sys/kernel/debug/allocations.

Various inline functions are converted to wrappers so that we can invoke
alloc_hooks() in fewer places.

[surenb@google.com: undo _noprof additions in the documentation]
  Link: https://lkml.kernel.org/r/20240326231453.1206227-4-surenb@google.com
[surenb@google.com: add missing mempool_create_node documentation]
  Link: https://lkml.kernel.org/r/20240402180835.1661905-1-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-27-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mempool.h | 73 ++++++++++++++++++++++++-------------------------
 mm/mempool.c            | 36 ++++++++++--------------
 2 files changed, 50 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 16c5cc807ff6..7b151441341b 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,8 @@
 #ifndef _LINUX_MEMPOOL_H
 #define _LINUX_MEMPOOL_H
 
+#include <linux/sched.h>
+#include <linux/alloc_tag.h>
 #include <linux/wait.h>
 #include <linux/compiler.h>
 
@@ -39,18 +41,32 @@ void mempool_exit(mempool_t *pool);
 int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
 		      mempool_free_t *free_fn, void *pool_data,
 		      gfp_t gfp_mask, int node_id);
-int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+
+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
 		 mempool_free_t *free_fn, void *pool_data);
+#define mempool_init(...)						\
+	alloc_hooks(mempool_init_noprof(__VA_ARGS__))
 
 extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
 			mempool_free_t *free_fn, void *pool_data);
-extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+
+extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
 			mempool_free_t *free_fn, void *pool_data,
 			gfp_t gfp_mask, int nid);
+#define mempool_create_node(...)					\
+	alloc_hooks(mempool_create_node_noprof(__VA_ARGS__))
+
+#define mempool_create(_min_nr, _alloc_fn, _free_fn, _pool_data)	\
+	mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data,	\
+			    GFP_KERNEL, NUMA_NO_NODE)
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 extern void mempool_destroy(mempool_t *pool);
-extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
+
+extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc;
+#define mempool_alloc(...)						\
+	alloc_hooks(mempool_alloc_noprof(__VA_ARGS__))
+
 extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
 
@@ -62,19 +78,10 @@ extern void mempool_free(void *element, mempool_t *pool);
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
-static inline int
-mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc)
-{
-	return mempool_init(pool, min_nr, mempool_alloc_slab,
-			    mempool_free_slab, (void *) kc);
-}
-
-static inline mempool_t *
-mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
-{
-	return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab,
-			      (void *) kc);
-}
+#define mempool_init_slab_pool(_pool, _min_nr, _kc)			\
+	mempool_init(_pool, (_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc))
+#define mempool_create_slab_pool(_min_nr, _kc)			\
+	mempool_create((_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc))
 
 /*
  * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the
@@ -83,17 +90,12 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc)
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kfree(void *element, void *pool_data);
 
-static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
-	return mempool_init(pool, min_nr, mempool_kmalloc,
-			    mempool_kfree, (void *) size);
-}
-
-static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
-{
-	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
-			      (void *) size);
-}
+#define mempool_init_kmalloc_pool(_pool, _min_nr, _size)		\
+	mempool_init(_pool, (_min_nr), mempool_kmalloc, mempool_kfree,	\
+		     (void *)(unsigned long)(_size))
+#define mempool_create_kmalloc_pool(_min_nr, _size)			\
+	mempool_create((_min_nr), mempool_kmalloc, mempool_kfree,	\
+		       (void *)(unsigned long)(_size))
 
 void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kvfree(void *element, void *pool_data);
@@ -115,16 +117,11 @@ static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size)
 void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
 void mempool_free_pages(void *element, void *pool_data);
 
-static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order)
-{
-	return mempool_init(pool, min_nr, mempool_alloc_pages,
-			    mempool_free_pages, (void *)(long)order);
-}
-
-static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
-{
-	return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
-			      (void *)(long)order);
-}
+#define mempool_init_page_pool(_pool, _min_nr, _order)			\
+	mempool_init(_pool, (_min_nr), mempool_alloc_pages,		\
+		     mempool_free_pages, (void *)(long)(_order))
+#define mempool_create_page_pool(_min_nr, _order)			\
+	mempool_create((_min_nr), mempool_alloc_pages,			\
+		       mempool_free_pages, (void *)(long)(_order))
 
 #endif /* _LINUX_MEMPOOL_H */
diff --git a/mm/mempool.c b/mm/mempool.c
index 076c736f5f1f..6ece63a00acf 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -240,22 +240,24 @@ EXPORT_SYMBOL(mempool_init_node);
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
-		 mempool_free_t *free_fn, void *pool_data)
+int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
+			mempool_free_t *free_fn, void *pool_data)
 {
 	return mempool_init_node(pool, min_nr, alloc_fn, free_fn,
 				 pool_data, GFP_KERNEL, NUMA_NO_NODE);
 
 }
-EXPORT_SYMBOL(mempool_init);
+EXPORT_SYMBOL(mempool_init_noprof);
 
 /**
- * mempool_create - create a memory pool
+ * mempool_create_node - create a memory pool
  * @min_nr:    the minimum number of elements guaranteed to be
  *             allocated for this pool.
  * @alloc_fn:  user-defined element-allocation function.
  * @free_fn:   user-defined element-freeing function.
  * @pool_data: optional private data available to the user-defined functions.
+ * @gfp_mask:  memory allocation flags
+ * @node_id:   numa node to allocate on
  *
  * this function creates and allocates a guaranteed size, preallocated
  * memory pool. The pool can be used from the mempool_alloc() and mempool_free()
@@ -265,17 +267,9 @@ EXPORT_SYMBOL(mempool_init);
  *
  * Return: pointer to the created memory pool object or %NULL on error.
  */
-mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
-				mempool_free_t *free_fn, void *pool_data)
-{
-	return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data,
-				   GFP_KERNEL, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(mempool_create);
-
-mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
-			       mempool_free_t *free_fn, void *pool_data,
-			       gfp_t gfp_mask, int node_id)
+mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn,
+				      mempool_free_t *free_fn, void *pool_data,
+				      gfp_t gfp_mask, int node_id)
 {
 	mempool_t *pool;
 
@@ -291,7 +285,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 
 	return pool;
 }
-EXPORT_SYMBOL(mempool_create_node);
+EXPORT_SYMBOL(mempool_create_node_noprof);
 
 /**
  * mempool_resize - resize an existing memory pool
@@ -387,7 +381,7 @@ EXPORT_SYMBOL(mempool_resize);
  *
  * Return: pointer to the allocated element or %NULL on error.
  */
-void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask)
 {
 	void *element;
 	unsigned long flags;
@@ -454,7 +448,7 @@ repeat_alloc:
 	finish_wait(&pool->wait, &wait);
 	goto repeat_alloc;
 }
-EXPORT_SYMBOL(mempool_alloc);
+EXPORT_SYMBOL(mempool_alloc_noprof);
 
 /**
  * mempool_alloc_preallocated - allocate an element from preallocated elements
@@ -562,7 +556,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
 	struct kmem_cache *mem = pool_data;
 	VM_BUG_ON(mem->ctor);
-	return kmem_cache_alloc(mem, gfp_mask);
+	return kmem_cache_alloc_noprof(mem, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_alloc_slab);
 
@@ -580,7 +574,7 @@ EXPORT_SYMBOL(mempool_free_slab);
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
 	size_t size = (size_t)pool_data;
-	return kmalloc(size, gfp_mask);
+	return kmalloc_noprof(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
@@ -610,7 +604,7 @@ EXPORT_SYMBOL(mempool_kvfree);
 void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
 {
 	int order = (int)(long)pool_data;
-	return alloc_pages(gfp_mask, order);
+	return alloc_pages_noprof(gfp_mask, order);
 }
 EXPORT_SYMBOL(mempool_alloc_pages);
 
-- 
cgit v1.2.3


From 24e44cc22aa3112082f2ee23137d048c73ca96d5 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:51 -0700
Subject: mm: percpu: enable per-cpu allocation tagging
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Redefine __alloc_percpu, __alloc_percpu_gfp and __alloc_reserved_percpu
to record allocations and deallocations done by these functions.

[surenb@google.com: undo _noprof additions in the documentation]
  Link: https://lkml.kernel.org/r/20240326231453.1206227-6-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-30-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu.h | 23 +++++++++++++------
 mm/percpu.c            | 62 +++++---------------------------------------------
 2 files changed, 22 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9dfa7b526e70..13a82f11e4fd 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_PERCPU_H
 #define __LINUX_PERCPU_H
 
+#include <linux/alloc_tag.h>
 #include <linux/mmdebug.h>
 #include <linux/preempt.h>
 #include <linux/smp.h>
@@ -9,6 +10,7 @@
 #include <linux/pfn.h>
 #include <linux/init.h>
 #include <linux/cleanup.h>
+#include <linux/sched.h>
 
 #include <asm/percpu.h>
 
@@ -125,7 +127,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
 				pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn);
 #endif
 
-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1);
 extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr);
 extern bool is_kernel_percpu_address(unsigned long addr);
 
@@ -133,14 +134,16 @@ extern bool is_kernel_percpu_address(unsigned long addr);
 extern void __init setup_per_cpu_areas(void);
 #endif
 
-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1);
-extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1);
-extern void free_percpu(void __percpu *__pdata);
+extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
+				   gfp_t gfp) __alloc_size(1);
 extern size_t pcpu_alloc_size(void __percpu *__pdata);
 
-DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
-
-extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
+#define __alloc_percpu_gfp(_size, _align, _gfp)				\
+	alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp))
+#define __alloc_percpu(_size, _align)					\
+	alloc_hooks(pcpu_alloc_noprof(_size, _align, false, GFP_KERNEL))
+#define __alloc_reserved_percpu(_size, _align)				\
+	alloc_hooks(pcpu_alloc_noprof(_size, _align, true, GFP_KERNEL))
 
 #define alloc_percpu_gfp(type, gfp)					\
 	(typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type),	\
@@ -149,6 +152,12 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
 
+extern void free_percpu(void __percpu *__pdata);
+
+DEFINE_FREE(free_percpu, void __percpu *, free_percpu(_T))
+
+extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
+
 extern unsigned long pcpu_nr_pages(void);
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/mm/percpu.c b/mm/percpu.c
index 90e9e4004ac9..474e3683b74d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1740,7 +1740,7 @@ static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t s
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
+void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
 				 gfp_t gfp)
 {
 	gfp_t pcpu_gfp;
@@ -1907,6 +1907,8 @@ area_found:
 
 	pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
 
+	pcpu_alloc_tag_alloc_hook(chunk, off, size);
+
 	return ptr;
 
 fail_unlock:
@@ -1935,61 +1937,7 @@ fail:
 
 	return NULL;
 }
-
-/**
- * __alloc_percpu_gfp - allocate dynamic percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
- * @gfp: allocation flags
- *
- * Allocate zero-filled percpu area of @size bytes aligned at @align.  If
- * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
- * be called from any context but is a lot more likely to fail. If @gfp
- * has __GFP_NOWARN then no warning will be triggered on invalid or failed
- * allocation requests.
- *
- * RETURNS:
- * Percpu pointer to the allocated area on success, NULL on failure.
- */
-void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
-{
-	return pcpu_alloc(size, align, false, gfp);
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
-
-/**
- * __alloc_percpu - allocate dynamic percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
- *
- * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
- */
-void __percpu *__alloc_percpu(size_t size, size_t align)
-{
-	return pcpu_alloc(size, align, false, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-
-/**
- * __alloc_reserved_percpu - allocate reserved percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
- *
- * Allocate zero-filled percpu area of @size bytes aligned at @align
- * from reserved percpu area if arch has set it up; otherwise,
- * allocation is served from the same dynamic area.  Might sleep.
- * Might trigger writeouts.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
- *
- * RETURNS:
- * Percpu pointer to the allocated area on success, NULL on failure.
- */
-void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
-{
-	return pcpu_alloc(size, align, true, GFP_KERNEL);
-}
+EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);
 
 /**
  * pcpu_balance_free - manage the amount of free chunks
@@ -2328,6 +2276,8 @@ void free_percpu(void __percpu *ptr)
 	spin_lock_irqsave(&pcpu_lock, flags);
 	size = pcpu_free_area(chunk, off);
 
+	pcpu_alloc_tag_free_hook(chunk, off, size);
+
 	pcpu_memcg_free_hook(chunk, off, size);
 
 	/*
-- 
cgit v1.2.3


From 88ae5fb755b0d45f790b392f578924d1ca5693a7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 21 Mar 2024 09:36:52 -0700
Subject: mm: vmalloc: enable memory allocation profiling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This wrapps all external vmalloc allocation functions with the
alloc_hooks() wrapper, and switches internal allocations to _noprof
variants where appropriate, for the new memory allocation profiling
feature.

[surenb@google.com: arch/um: fix forward declaration for vmalloc]
  Link: https://lkml.kernel.org/r/20240326073750.726636-1-surenb@google.com
[surenb@google.com: undo _noprof additions in the documentation]
  Link: https://lkml.kernel.org/r/20240326231453.1206227-5-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-31-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/um/include/shared/um_malloc.h          |  3 +-
 drivers/staging/media/atomisp/pci/hmm/hmm.c |  2 +-
 include/linux/vmalloc.h                     | 60 +++++++++++++++++------
 kernel/kallsyms_selftest.c                  |  2 +-
 mm/nommu.c                                  | 56 +++++++++++-----------
 mm/util.c                                   | 18 +++----
 mm/vmalloc.c                                | 74 ++++++++++++++---------------
 7 files changed, 123 insertions(+), 92 deletions(-)

(limited to 'include')

diff --git a/arch/um/include/shared/um_malloc.h b/arch/um/include/shared/um_malloc.h
index 13da93284c2c..bf503658f08e 100644
--- a/arch/um/include/shared/um_malloc.h
+++ b/arch/um/include/shared/um_malloc.h
@@ -11,7 +11,8 @@
 extern void *uml_kmalloc(int size, int flags);
 extern void kfree(const void *ptr);
 
-extern void *vmalloc(unsigned long size);
+extern void *vmalloc_noprof(unsigned long size);
+#define vmalloc(...)		vmalloc_noprof(__VA_ARGS__)
 extern void vfree(void *ptr);
 
 #endif /* __UM_MALLOC_H__ */
diff --git a/drivers/staging/media/atomisp/pci/hmm/hmm.c b/drivers/staging/media/atomisp/pci/hmm/hmm.c
index bb12644fd033..3e2899ad8517 100644
--- a/drivers/staging/media/atomisp/pci/hmm/hmm.c
+++ b/drivers/staging/media/atomisp/pci/hmm/hmm.c
@@ -205,7 +205,7 @@ static ia_css_ptr __hmm_alloc(size_t bytes, enum hmm_bo_type type,
 	}
 
 	dev_dbg(atomisp_dev, "pages: 0x%08x (%zu bytes), type: %d, vmalloc %p\n",
-		bo->start, bytes, type, vmalloc);
+		bo->start, bytes, type, vmalloc_noprof);
 
 	return bo->start;
 
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 98ea90e90439..e4a631ec430b 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -2,6 +2,8 @@
 #ifndef _LINUX_VMALLOC_H
 #define _LINUX_VMALLOC_H
 
+#include <linux/alloc_tag.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/init.h>
 #include <linux/list.h>
@@ -138,26 +140,54 @@ extern unsigned long vmalloc_nr_pages(void);
 static inline unsigned long vmalloc_nr_pages(void) { return 0; }
 #endif
 
-extern void *vmalloc(unsigned long size) __alloc_size(1);
-extern void *vzalloc(unsigned long size) __alloc_size(1);
-extern void *vmalloc_user(unsigned long size) __alloc_size(1);
-extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1);
-extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1);
-extern void *vmalloc_32(unsigned long size) __alloc_size(1);
-extern void *vmalloc_32_user(unsigned long size) __alloc_size(1);
-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
-extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
+extern void *vmalloc_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc(...)		alloc_hooks(vmalloc_noprof(__VA_ARGS__))
+
+extern void *vzalloc_noprof(unsigned long size) __alloc_size(1);
+#define vzalloc(...)		alloc_hooks(vzalloc_noprof(__VA_ARGS__))
+
+extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_user(...)	alloc_hooks(vmalloc_user_noprof(__VA_ARGS__))
+
+extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1);
+#define vmalloc_node(...)	alloc_hooks(vmalloc_node_noprof(__VA_ARGS__))
+
+extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1);
+#define vzalloc_node(...)	alloc_hooks(vzalloc_node_noprof(__VA_ARGS__))
+
+extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_32(...)		alloc_hooks(vmalloc_32_noprof(__VA_ARGS__))
+
+extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1);
+#define vmalloc_32_user(...)	alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__))
+
+extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+#define __vmalloc(...)		alloc_hooks(__vmalloc_noprof(__VA_ARGS__))
+
+extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller) __alloc_size(1);
-void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+#define __vmalloc_node_range(...)	alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__))
+
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+#define __vmalloc_node(...)	alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__))
+
+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
+#define vmalloc_huge(...)	alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__))
+
+extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
+#define __vmalloc_array(...)	alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__))
+
+extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2);
+#define vmalloc_array(...)	alloc_hooks(vmalloc_array_noprof(__VA_ARGS__))
+
+extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
+#define __vcalloc(...)		alloc_hooks(__vcalloc_noprof(__VA_ARGS__))
 
-extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
-extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2);
-extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2);
-extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2);
+extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2);
+#define vcalloc(...)		alloc_hooks(vcalloc_noprof(__VA_ARGS__))
 
 extern void vfree(const void *addr);
 extern void vfree_atomic(const void *addr);
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index 8a689b4ff4f9..2f84896a7bcb 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -82,7 +82,7 @@ static struct test_item test_items[] = {
 	ITEM_FUNC(kallsyms_test_func_static),
 	ITEM_FUNC(kallsyms_test_func),
 	ITEM_FUNC(kallsyms_test_func_weak),
-	ITEM_FUNC(vmalloc),
+	ITEM_FUNC(vmalloc_noprof),
 	ITEM_FUNC(vfree),
 #ifdef CONFIG_KALLSYMS_ALL
 	ITEM_DATA(kallsyms_test_var_bss_static),
diff --git a/mm/nommu.c b/mm/nommu.c
index 5ec8f44e7ce9..88b646ea6d30 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -137,28 +137,28 @@ void vfree(const void *addr)
 }
 EXPORT_SYMBOL(vfree);
 
-void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
 {
 	/*
 	 *  You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
 	 * returns only a logical address.
 	 */
-	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
+	return kmalloc_noprof(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 }
-EXPORT_SYMBOL(__vmalloc);
+EXPORT_SYMBOL(__vmalloc_noprof);
 
-void *__vmalloc_node_range(unsigned long size, unsigned long align,
+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
 		unsigned long start, unsigned long end, gfp_t gfp_mask,
 		pgprot_t prot, unsigned long vm_flags, int node,
 		const void *caller)
 {
-	return __vmalloc(size, gfp_mask);
+	return __vmalloc_noprof(size, gfp_mask);
 }
 
-void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller)
 {
-	return __vmalloc(size, gfp_mask);
+	return __vmalloc_noprof(size, gfp_mask);
 }
 
 static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
@@ -179,11 +179,11 @@ static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
 	return ret;
 }
 
-void *vmalloc_user(unsigned long size)
+void *vmalloc_user_noprof(unsigned long size)
 {
 	return __vmalloc_user_flags(size, GFP_KERNEL | __GFP_ZERO);
 }
-EXPORT_SYMBOL(vmalloc_user);
+EXPORT_SYMBOL(vmalloc_user_noprof);
 
 struct page *vmalloc_to_page(const void *addr)
 {
@@ -217,13 +217,13 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
  *	For tight control over page level allocator and protection flags
  *	use __vmalloc() instead.
  */
-void *vmalloc(unsigned long size)
+void *vmalloc_noprof(unsigned long size)
 {
-	return __vmalloc(size, GFP_KERNEL);
+	return __vmalloc_noprof(size, GFP_KERNEL);
 }
-EXPORT_SYMBOL(vmalloc);
+EXPORT_SYMBOL(vmalloc_noprof);
 
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc);
+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc_noprof);
 
 /*
  *	vzalloc - allocate virtually contiguous memory with zero fill
@@ -237,11 +237,11 @@ void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __weak __alias(__vmalloc)
  *	For tight control over page level allocator and protection flags
  *	use __vmalloc() instead.
  */
-void *vzalloc(unsigned long size)
+void *vzalloc_noprof(unsigned long size)
 {
-	return __vmalloc(size, GFP_KERNEL | __GFP_ZERO);
+	return __vmalloc_noprof(size, GFP_KERNEL | __GFP_ZERO);
 }
-EXPORT_SYMBOL(vzalloc);
+EXPORT_SYMBOL(vzalloc_noprof);
 
 /**
  * vmalloc_node - allocate memory on a specific node
@@ -254,11 +254,11 @@ EXPORT_SYMBOL(vzalloc);
  * For tight control over page level allocator and protection flags
  * use __vmalloc() instead.
  */
-void *vmalloc_node(unsigned long size, int node)
+void *vmalloc_node_noprof(unsigned long size, int node)
 {
-	return vmalloc(size);
+	return vmalloc_noprof(size);
 }
-EXPORT_SYMBOL(vmalloc_node);
+EXPORT_SYMBOL(vmalloc_node_noprof);
 
 /**
  * vzalloc_node - allocate memory on a specific node with zero fill
@@ -272,11 +272,11 @@ EXPORT_SYMBOL(vmalloc_node);
  * For tight control over page level allocator and protection flags
  * use __vmalloc() instead.
  */
-void *vzalloc_node(unsigned long size, int node)
+void *vzalloc_node_noprof(unsigned long size, int node)
 {
-	return vzalloc(size);
+	return vzalloc_noprof(size);
 }
-EXPORT_SYMBOL(vzalloc_node);
+EXPORT_SYMBOL(vzalloc_node_noprof);
 
 /**
  * vmalloc_32  -  allocate virtually contiguous memory (32bit addressable)
@@ -285,11 +285,11 @@ EXPORT_SYMBOL(vzalloc_node);
  *	Allocate enough 32bit PA addressable pages to cover @size from the
  *	page level allocator and map them into contiguous kernel virtual space.
  */
-void *vmalloc_32(unsigned long size)
+void *vmalloc_32_noprof(unsigned long size)
 {
-	return __vmalloc(size, GFP_KERNEL);
+	return __vmalloc_noprof(size, GFP_KERNEL);
 }
-EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmalloc_32_noprof);
 
 /**
  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
@@ -301,15 +301,15 @@ EXPORT_SYMBOL(vmalloc_32);
  * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
  * remap_vmalloc_range() are permissible.
  */
-void *vmalloc_32_user(unsigned long size)
+void *vmalloc_32_user_noprof(unsigned long size)
 {
 	/*
 	 * We'll have to sort out the ZONE_DMA bits for 64-bit,
 	 * but for now this can simply use vmalloc_user() directly.
 	 */
-	return vmalloc_user(size);
+	return vmalloc_user_noprof(size);
 }
-EXPORT_SYMBOL(vmalloc_32_user);
+EXPORT_SYMBOL(vmalloc_32_user_noprof);
 
 void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot)
 {
diff --git a/mm/util.c b/mm/util.c
index cee3563d2071..a9e911b22b99 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -656,7 +656,7 @@ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node)
 	 * about the resulting pointer, and cannot play
 	 * protection games.
 	 */
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+	return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
 			flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
 			node, __builtin_return_address(0));
 }
@@ -720,7 +720,7 @@ EXPORT_SYMBOL(kvrealloc_noprof);
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
+void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
 {
 	size_t bytes;
 
@@ -728,18 +728,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags)
 		return NULL;
 	return __vmalloc(bytes, flags);
 }
-EXPORT_SYMBOL(__vmalloc_array);
+EXPORT_SYMBOL(__vmalloc_array_noprof);
 
 /**
  * vmalloc_array - allocate memory for a virtually contiguous array.
  * @n: number of elements.
  * @size: element size.
  */
-void *vmalloc_array(size_t n, size_t size)
+void *vmalloc_array_noprof(size_t n, size_t size)
 {
 	return __vmalloc_array(n, size, GFP_KERNEL);
 }
-EXPORT_SYMBOL(vmalloc_array);
+EXPORT_SYMBOL(vmalloc_array_noprof);
 
 /**
  * __vcalloc - allocate and zero memory for a virtually contiguous array.
@@ -747,22 +747,22 @@ EXPORT_SYMBOL(vmalloc_array);
  * @size: element size.
  * @flags: the type of memory to allocate (see kmalloc).
  */
-void *__vcalloc(size_t n, size_t size, gfp_t flags)
+void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
 {
 	return __vmalloc_array(n, size, flags | __GFP_ZERO);
 }
-EXPORT_SYMBOL(__vcalloc);
+EXPORT_SYMBOL(__vcalloc_noprof);
 
 /**
  * vcalloc - allocate and zero memory for a virtually contiguous array.
  * @n: number of elements.
  * @size: element size.
  */
-void *vcalloc(size_t n, size_t size)
+void *vcalloc_noprof(size_t n, size_t size)
 {
 	return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO);
 }
-EXPORT_SYMBOL(vcalloc);
+EXPORT_SYMBOL(vcalloc_noprof);
 
 struct anon_vma *folio_anon_vma(struct folio *folio)
 {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 68fa001648cc..61af431bb6b8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3523,12 +3523,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			 * but mempolicy wants to alloc memory by interleaving.
 			 */
 			if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
-				nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
+				nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp,
 							nr_pages_request,
 							pages + nr_allocated);
 
 			else
-				nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
+				nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid,
 							nr_pages_request,
 							pages + nr_allocated);
 
@@ -3558,9 +3558,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 			break;
 
 		if (nid == NUMA_NO_NODE)
-			page = alloc_pages(alloc_gfp, order);
+			page = alloc_pages_noprof(alloc_gfp, order);
 		else
-			page = alloc_pages_node(nid, alloc_gfp, order);
+			page = alloc_pages_node_noprof(nid, alloc_gfp, order);
 		if (unlikely(!page)) {
 			if (!nofail)
 				break;
@@ -3617,10 +3617,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
-		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
+		area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
 					area->caller);
 	} else {
-		area->pages = kmalloc_node(array_size, nested_gfp, node);
+		area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
 	}
 
 	if (!area->pages) {
@@ -3730,7 +3730,7 @@ fail:
  *
  * Return: the address of the area or %NULL on failure
  */
-void *__vmalloc_node_range(unsigned long size, unsigned long align,
+void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller)
@@ -3877,10 +3877,10 @@ fail:
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *__vmalloc_node(unsigned long size, unsigned long align,
+void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
 			    gfp_t gfp_mask, int node, const void *caller)
 {
-	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
+	return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END,
 				gfp_mask, PAGE_KERNEL, 0, node, caller);
 }
 /*
@@ -3889,15 +3889,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align,
  * than that.
  */
 #ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node);
+EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
 #endif
 
-void *__vmalloc(unsigned long size, gfp_t gfp_mask)
+void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
 {
-	return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
+	return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
-EXPORT_SYMBOL(__vmalloc);
+EXPORT_SYMBOL(__vmalloc_noprof);
 
 /**
  * vmalloc - allocate virtually contiguous memory
@@ -3911,12 +3911,12 @@ EXPORT_SYMBOL(__vmalloc);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc(unsigned long size)
+void *vmalloc_noprof(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+	return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
-EXPORT_SYMBOL(vmalloc);
+EXPORT_SYMBOL(vmalloc_noprof);
 
 /**
  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
@@ -3930,13 +3930,13 @@ EXPORT_SYMBOL(vmalloc);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
 {
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+	return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END,
 				    gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
 				    NUMA_NO_NODE, __builtin_return_address(0));
 }
-EXPORT_SYMBOL_GPL(vmalloc_huge);
+EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);
 
 /**
  * vzalloc - allocate virtually contiguous memory with zero fill
@@ -3951,12 +3951,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vzalloc(unsigned long size)
+void *vzalloc_noprof(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+	return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
-EXPORT_SYMBOL(vzalloc);
+EXPORT_SYMBOL(vzalloc_noprof);
 
 /**
  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
@@ -3967,14 +3967,14 @@ EXPORT_SYMBOL(vzalloc);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_user(unsigned long size)
+void *vmalloc_user_noprof(unsigned long size)
 {
-	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+	return __vmalloc_node_range_noprof(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
 				    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
 				    VM_USERMAP, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
-EXPORT_SYMBOL(vmalloc_user);
+EXPORT_SYMBOL(vmalloc_user_noprof);
 
 /**
  * vmalloc_node - allocate memory on a specific node
@@ -3989,12 +3989,12 @@ EXPORT_SYMBOL(vmalloc_user);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_node(unsigned long size, int node)
+void *vmalloc_node_noprof(unsigned long size, int node)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL, node,
+	return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node,
 			__builtin_return_address(0));
 }
-EXPORT_SYMBOL(vmalloc_node);
+EXPORT_SYMBOL(vmalloc_node_noprof);
 
 /**
  * vzalloc_node - allocate memory on a specific node with zero fill
@@ -4007,12 +4007,12 @@ EXPORT_SYMBOL(vmalloc_node);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vzalloc_node(unsigned long size, int node)
+void *vzalloc_node_noprof(unsigned long size, int node)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+	return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
 				__builtin_return_address(0));
 }
-EXPORT_SYMBOL(vzalloc_node);
+EXPORT_SYMBOL(vzalloc_node_noprof);
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
 #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
@@ -4035,12 +4035,12 @@ EXPORT_SYMBOL(vzalloc_node);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_32(unsigned long size)
+void *vmalloc_32_noprof(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+	return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
 			__builtin_return_address(0));
 }
-EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmalloc_32_noprof);
 
 /**
  * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
@@ -4051,14 +4051,14 @@ EXPORT_SYMBOL(vmalloc_32);
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-void *vmalloc_32_user(unsigned long size)
+void *vmalloc_32_user_noprof(unsigned long size)
 {
-	return __vmalloc_node_range(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+	return __vmalloc_node_range_noprof(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
 				    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
 				    VM_USERMAP, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
-EXPORT_SYMBOL(vmalloc_32_user);
+EXPORT_SYMBOL(vmalloc_32_user_noprof);
 
 /*
  * Atomically zero bytes in the iterator.
-- 
cgit v1.2.3


From 9e54dd8b64dccdf5336172bf905017a6f97ee18d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 21 Mar 2024 09:36:53 -0700
Subject: rhashtable: plumb through alloc tag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This gives better memory allocation profiling results; rhashtable
allocations will be accounted to the code that initialized the rhashtable.

[surenb@google.com: undo _noprof additions in the documentation]
  Link: https://lkml.kernel.org/r/20240326231453.1206227-1-surenb@google.com
Link: https://lkml.kernel.org/r/20240321163705.3067592-32-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h        |  3 +++
 include/linux/rhashtable-types.h | 11 +++++++++--
 lib/rhashtable.c                 | 22 ++++++++++++++--------
 3 files changed, 26 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 2aea7afa1fab..0567766e50bd 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -152,6 +152,8 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 	ref->ct = NULL;
 }
 
+#define alloc_tag_record(p)	((p) = current->alloc_tag)
+
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
 #define DEFINE_ALLOC_TAG(_alloc_tag)
@@ -159,6 +161,7 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; }
 static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
 				 size_t bytes) {}
 static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+#define alloc_tag_record(p)	do {} while (0)
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index b6f3797277ff..015c8298bebc 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -9,6 +9,7 @@
 #ifndef _LINUX_RHASHTABLE_TYPES_H
 #define _LINUX_RHASHTABLE_TYPES_H
 
+#include <linux/alloc_tag.h>
 #include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/mutex.h>
@@ -88,6 +89,9 @@ struct rhashtable {
 	struct mutex                    mutex;
 	spinlock_t			lock;
 	atomic_t			nelems;
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	struct alloc_tag		*alloc_tag;
+#endif
 };
 
 /**
@@ -127,9 +131,12 @@ struct rhashtable_iter {
 	bool end_of_table;
 };
 
-int rhashtable_init(struct rhashtable *ht,
+int rhashtable_init_noprof(struct rhashtable *ht,
 		    const struct rhashtable_params *params);
-int rhltable_init(struct rhltable *hlt,
+#define rhashtable_init(...)	alloc_hooks(rhashtable_init_noprof(__VA_ARGS__))
+
+int rhltable_init_noprof(struct rhltable *hlt,
 		  const struct rhashtable_params *params);
+#define rhltable_init(...)	alloc_hooks(rhltable_init_noprof(__VA_ARGS__))
 
 #endif /* _LINUX_RHASHTABLE_TYPES_H */
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 6ae2ba8e06a2..dbbed19f8fff 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -130,7 +130,8 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht,
 	if (ntbl)
 		return ntbl;
 
-	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);
+	ntbl = alloc_hooks_tag(ht->alloc_tag,
+			kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO));
 
 	if (ntbl && leaf) {
 		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
@@ -157,7 +158,8 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
 
 	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);
 
-	tbl = kzalloc(size, gfp);
+	tbl = alloc_hooks_tag(ht->alloc_tag,
+			kmalloc_noprof(size, gfp|__GFP_ZERO));
 	if (!tbl)
 		return NULL;
 
@@ -181,7 +183,9 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 	int i;
 	static struct lock_class_key __key;
 
-	tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp);
+	tbl = alloc_hooks_tag(ht->alloc_tag,
+			kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets),
+					     gfp|__GFP_ZERO, NUMA_NO_NODE));
 
 	size = nbuckets;
 
@@ -1016,7 +1020,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
  *	.obj_hashfn = my_hash_fn,
  * };
  */
-int rhashtable_init(struct rhashtable *ht,
+int rhashtable_init_noprof(struct rhashtable *ht,
 		    const struct rhashtable_params *params)
 {
 	struct bucket_table *tbl;
@@ -1031,6 +1035,8 @@ int rhashtable_init(struct rhashtable *ht,
 	spin_lock_init(&ht->lock);
 	memcpy(&ht->p, params, sizeof(*params));
 
+	alloc_tag_record(ht->alloc_tag);
+
 	if (params->min_size)
 		ht->p.min_size = roundup_pow_of_two(params->min_size);
 
@@ -1076,7 +1082,7 @@ int rhashtable_init(struct rhashtable *ht,
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(rhashtable_init);
+EXPORT_SYMBOL_GPL(rhashtable_init_noprof);
 
 /**
  * rhltable_init - initialize a new hash list table
@@ -1087,15 +1093,15 @@ EXPORT_SYMBOL_GPL(rhashtable_init);
  *
  * See documentation for rhashtable_init.
  */
-int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
+int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params)
 {
 	int err;
 
-	err = rhashtable_init(&hlt->ht, params);
+	err = rhashtable_init_noprof(&hlt->ht, params);
 	hlt->ht.rhlist = true;
 	return err;
 }
-EXPORT_SYMBOL_GPL(rhltable_init);
+EXPORT_SYMBOL_GPL(rhltable_init_noprof);
 
 static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
 				void (*free_fn)(void *ptr, void *arg),
-- 
cgit v1.2.3


From 1438d349d16b78d88f9e978a4a5496f078c8191b Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:54 -0700
Subject: lib: add memory allocations report in show_mem()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Include allocations in show_mem reports.

Link: https://lkml.kernel.org/r/20240321163705.3067592-33-surenb@google.com
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h |  7 +++++++
 include/linux/codetag.h   |  1 +
 lib/alloc_tag.c           | 38 ++++++++++++++++++++++++++++++++++++++
 lib/codetag.c             |  5 +++++
 mm/show_mem.c             | 26 ++++++++++++++++++++++++++
 5 files changed, 77 insertions(+)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 0567766e50bd..d18388c0e9e1 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -31,6 +31,13 @@ struct alloc_tag {
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING
 
+struct codetag_bytes {
+	struct codetag *ct;
+	s64 bytes;
+};
+
+size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep);
+
 static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
 {
 	return container_of(ct, struct alloc_tag, ct);
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index bfd0ba5c4185..c2a579ccd455 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -61,6 +61,7 @@ struct codetag_iterator {
 }
 
 void codetag_lock_module_list(struct codetag_type *cttype, bool lock);
+bool codetag_trylock_module_list(struct codetag_type *cttype);
 struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
 struct codetag *codetag_next_ct(struct codetag_iterator *iter);
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 6bd8e85beaf7..26af9982ddc4 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -89,6 +89,44 @@ static const struct seq_operations allocinfo_seq_op = {
 	.show	= allocinfo_show,
 };
 
+size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sleep)
+{
+	struct codetag_iterator iter;
+	struct codetag *ct;
+	struct codetag_bytes n;
+	unsigned int i, nr = 0;
+
+	if (can_sleep)
+		codetag_lock_module_list(alloc_tag_cttype, true);
+	else if (!codetag_trylock_module_list(alloc_tag_cttype))
+		return 0;
+
+	iter = codetag_get_ct_iter(alloc_tag_cttype);
+	while ((ct = codetag_next_ct(&iter))) {
+		struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
+
+		n.ct	= ct;
+		n.bytes = counter.bytes;
+
+		for (i = 0; i < nr; i++)
+			if (n.bytes > tags[i].bytes)
+				break;
+
+		if (i < count) {
+			nr -= nr == count;
+			memmove(&tags[i + 1],
+				&tags[i],
+				sizeof(tags[0]) * (nr - i));
+			nr++;
+			tags[i] = n;
+		}
+	}
+
+	codetag_lock_module_list(alloc_tag_cttype, false);
+
+	return nr;
+}
+
 static void __init procfs_init(void)
 {
 	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
diff --git a/lib/codetag.c b/lib/codetag.c
index 408062f722ce..5ace625f2328 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -36,6 +36,11 @@ void codetag_lock_module_list(struct codetag_type *cttype, bool lock)
 		up_read(&cttype->mod_lock);
 }
 
+bool codetag_trylock_module_list(struct codetag_type *cttype)
+{
+	return down_read_trylock(&cttype->mod_lock) != 0;
+}
+
 struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
 {
 	struct codetag_iterator iter = {
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 8dcfafbd283c..bdb439551eef 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -423,4 +423,30 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	{
+		struct codetag_bytes tags[10];
+		size_t i, nr;
+
+		nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false);
+		if (nr) {
+			pr_notice("Memory allocations:\n");
+			for (i = 0; i < nr; i++) {
+				struct codetag *ct = tags[i].ct;
+				struct alloc_tag *tag = ct_to_alloc_tag(ct);
+				struct alloc_tag_counters counter = alloc_tag_read(tag);
+
+				/* Same as alloc_tag_to_text() but w/o intermediate buffer */
+				if (ct->modname)
+					pr_notice("%12lli %8llu %s:%u [%s] func:%s\n",
+						  counter.bytes, counter.calls, ct->filename,
+						  ct->lineno, ct->modname, ct->function);
+				else
+					pr_notice("%12lli %8llu %s:%u func:%s\n",
+						  counter.bytes, counter.calls, ct->filename,
+						  ct->lineno, ct->function);
+			}
+		}
+	}
+#endif
 }
-- 
cgit v1.2.3


From 239d6c96d86f8a42d799b9a85c09cc7f36aef3f8 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:55 -0700
Subject: codetag: debug: skip objext checking when it's for objext itself
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

objext objects are created with __GFP_NO_OBJ_EXT flag and therefore have
no corresponding objext themselves (otherwise we would get an infinite
recursion). When freeing these objects their codetag will be empty and
when CONFIG_MEM_ALLOC_PROFILING_DEBUG is enabled this will lead to false
warnings. Introduce CODETAG_EMPTY special codetag value to mark
allocations which intentionally lack codetag to avoid these warnings.
Set objext codetags to CODETAG_EMPTY before freeing to indicate that
the codetag is expected to be empty.

Link: https://lkml.kernel.org/r/20240321163705.3067592-34-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h | 26 ++++++++++++++++++++++++++
 mm/slub.c                 | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index d18388c0e9e1..3af5d6a5ced4 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -29,6 +29,27 @@ struct alloc_tag {
 	struct alloc_tag_counters __percpu	*counters;
 } __aligned(8);
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+
+#define CODETAG_EMPTY	((void *)1)
+
+static inline bool is_codetag_empty(union codetag_ref *ref)
+{
+	return ref->ct == CODETAG_EMPTY;
+}
+
+static inline void set_codetag_empty(union codetag_ref *ref)
+{
+	if (ref)
+		ref->ct = CODETAG_EMPTY;
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
+static inline bool is_codetag_empty(union codetag_ref *ref) { return false; }
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
 #ifdef CONFIG_MEM_ALLOC_PROFILING
 
 struct codetag_bytes {
@@ -151,6 +172,11 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 	if (!ref || !ref->ct)
 		return;
 
+	if (is_codetag_empty(ref)) {
+		ref->ct = NULL;
+		return;
+	}
+
 	tag = ct_to_alloc_tag(ref->ct);
 
 	this_cpu_sub(tag->counters->bytes, bytes);
diff --git a/mm/slub.c b/mm/slub.c
index 0fc891d44cff..00f461826462 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1873,6 +1873,30 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
 
 #ifdef CONFIG_SLAB_OBJ_EXT
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+
+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
+{
+	struct slabobj_ext *slab_exts;
+	struct slab *obj_exts_slab;
+
+	obj_exts_slab = virt_to_slab(obj_exts);
+	slab_exts = slab_obj_exts(obj_exts_slab);
+	if (slab_exts) {
+		unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
+						 obj_exts_slab, obj_exts);
+		/* codetag should be NULL */
+		WARN_ON(slab_exts[offs].ref.ct);
+		set_codetag_empty(&slab_exts[offs].ref);
+	}
+}
+
+#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
+static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
+
+#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
+
 /*
  * The allocated objcg pointers array is not accounted directly.
  * Moreover, it should not come from DMA buffer and is not readily
@@ -1913,6 +1937,7 @@ static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 		 * assign slabobj_exts in parallel. In this case the existing
 		 * objcg vector should be reused.
 		 */
+		mark_objexts_empty(vec);
 		kfree(vec);
 		return 0;
 	}
@@ -1929,6 +1954,14 @@ static inline void free_slab_obj_exts(struct slab *slab)
 	if (!obj_exts)
 		return;
 
+	/*
+	 * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
+	 * corresponding extension will be NULL. alloc_tag_sub() will throw a
+	 * warning if slab has extensions but the extension of an object is
+	 * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that
+	 * the extension for obj_exts is expected to be NULL.
+	 */
+	mark_objexts_empty(obj_exts);
 	kfree(obj_exts);
 	slab->obj_exts = 0;
 }
-- 
cgit v1.2.3


From d224eb0287fbd84f4f13eca042c7f08f87138f3b Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:56 -0700
Subject: codetag: debug: mark codetags for reserved pages as empty
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To avoid debug warnings while freeing reserved pages which were not
allocated with usual allocators, mark their codetags as empty before
freeing.

Link: https://lkml.kernel.org/r/20240321163705.3067592-35-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/alloc_tag.h   |  1 +
 include/linux/mm.h          |  9 +++++++++
 include/linux/pgalloc_tag.h |  2 ++
 mm/mm_init.c                | 12 +++++++++++-
 4 files changed, 23 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 3af5d6a5ced4..afc9e259a2d3 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -47,6 +47,7 @@ static inline void set_codetag_empty(union codetag_ref *ref)
 #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
 static inline bool is_codetag_empty(union codetag_ref *ref) { return false; }
+static inline void set_codetag_empty(union codetag_ref *ref) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6bdaa18b9e9..0dbd737cfabd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -5,6 +5,7 @@
 #include <linux/errno.h>
 #include <linux/mmdebug.h>
 #include <linux/gfp.h>
+#include <linux/pgalloc_tag.h>
 #include <linux/bug.h>
 #include <linux/list.h>
 #include <linux/mmzone.h>
@@ -3134,6 +3135,14 @@ extern void reserve_bootmem_region(phys_addr_t start,
 /* Free the reserved page into the buddy system, so it gets managed. */
 static inline void free_reserved_page(struct page *page)
 {
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		if (ref) {
+			set_codetag_empty(ref);
+			put_page_tag_ref(ref);
+		}
+	}
 	ClearPageReserved(page);
 	init_page_count(page);
 	__free_page(page);
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 50d212330bbb..62d8dad74b37 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -120,6 +120,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
 
 #else /* CONFIG_MEM_ALLOC_PROFILING */
 
+static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; }
+static inline void put_page_tag_ref(union codetag_ref *ref) {}
 static inline void pgalloc_tag_add(struct page *page, struct task_struct *task,
 				   unsigned int nr) {}
 static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 2fd9bf044a79..f45c2b32ba82 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2567,7 +2567,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 void __init memblock_free_pages(struct page *page, unsigned long pfn,
 							unsigned int order)
 {
-
 	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT)) {
 		int nid = early_pfn_to_nid(pfn);
 
@@ -2579,6 +2578,17 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn,
 		/* KMSAN will take care of these pages. */
 		return;
 	}
+
+	/* pages were reserved and not allocated */
+	if (mem_alloc_profiling_enabled()) {
+		union codetag_ref *ref = get_page_tag_ref(page);
+
+		if (ref) {
+			set_codetag_empty(ref);
+			put_page_tag_ref(ref);
+		}
+	}
+
 	__free_pages_core(page, order);
 }
 
-- 
cgit v1.2.3


From 09c46563ff6d5f090211e48ff1fdba0ec7f4c97f Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Thu, 21 Mar 2024 09:36:57 -0700
Subject: codetag: debug: introduce OBJEXTS_ALLOC_FAIL to mark failed slab_ext
 allocations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If slabobj_ext vector allocation for a slab object fails and later on it
succeeds for another object in the same slab, the slabobj_ext for the
original object will be NULL and will be flagged in case when
CONFIG_MEM_ALLOC_PROFILING_DEBUG is enabled.

Mark failed slabobj_ext vector allocations using a new objext_flags flag
stored in the lower bits of slab->obj_exts.  When new allocation succeeds
it marks all tag references in the same slabobj_ext vector as empty to
avoid warnings implemented by CONFIG_MEM_ALLOC_PROFILING_DEBUG checks.

Link: https://lkml.kernel.org/r/20240321163705.3067592-36-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alex Gaynor <alex.gaynor@gmail.com>
Cc: Alice Ryhl <aliceryhl@google.com>
Cc: Andreas Hindborg <a.hindborg@samsung.com>
Cc: Benno Lossin <benno.lossin@proton.me>
Cc: "Björn Roy Baron" <bjorn3_gh@protonmail.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Gary Guo <gary@garyguo.net>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Miguel Ojeda <ojeda@kernel.org>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wedson Almeida Filho <wedsonaf@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  4 +++-
 mm/slub.c                  | 46 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 42 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 24a6df30be49..8f332b4ae84c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -366,8 +366,10 @@ enum page_memcg_data_flags {
 #endif /* CONFIG_MEMCG */
 
 enum objext_flags {
+	/* slabobj_ext vector failed to allocate */
+	OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
 	/* the next bit after the last actual flag */
-	__NR_OBJEXTS_FLAGS  = __FIRST_OBJEXT_FLAG,
+	__NR_OBJEXTS_FLAGS  = (__FIRST_OBJEXT_FLAG << 1),
 };
 
 #define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)
diff --git a/mm/slub.c b/mm/slub.c
index 00f461826462..be047279c9e9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1891,9 +1891,33 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
 	}
 }
 
+static inline void mark_failed_objexts_alloc(struct slab *slab)
+{
+	slab->obj_exts = OBJEXTS_ALLOC_FAIL;
+}
+
+static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
+			struct slabobj_ext *vec, unsigned int objects)
+{
+	/*
+	 * If vector previously failed to allocate then we have live
+	 * objects with no tag reference. Mark all references in this
+	 * vector as empty to avoid warnings later on.
+	 */
+	if (obj_exts & OBJEXTS_ALLOC_FAIL) {
+		unsigned int i;
+
+		for (i = 0; i < objects; i++)
+			set_codetag_empty(&vec[i].ref);
+	}
+}
+
 #else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
 static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {}
+static inline void mark_failed_objexts_alloc(struct slab *slab) {}
+static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
+			struct slabobj_ext *vec, unsigned int objects) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */
 
@@ -1909,29 +1933,37 @@ static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 			       gfp_t gfp, bool new_slab)
 {
 	unsigned int objects = objs_per_slab(s, slab);
-	unsigned long obj_exts;
-	void *vec;
+	unsigned long new_exts;
+	unsigned long old_exts;
+	struct slabobj_ext *vec;
 
 	gfp &= ~OBJCGS_CLEAR_MASK;
 	/* Prevent recursive extension vector allocation */
 	gfp |= __GFP_NO_OBJ_EXT;
 	vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
 			   slab_nid(slab));
-	if (!vec)
+	if (!vec) {
+		/* Mark vectors which failed to allocate */
+		if (new_slab)
+			mark_failed_objexts_alloc(slab);
+
 		return -ENOMEM;
+	}
 
-	obj_exts = (unsigned long)vec;
+	new_exts = (unsigned long)vec;
 #ifdef CONFIG_MEMCG
-	obj_exts |= MEMCG_DATA_OBJEXTS;
+	new_exts |= MEMCG_DATA_OBJEXTS;
 #endif
+	old_exts = slab->obj_exts;
+	handle_failed_objexts_alloc(old_exts, vec, objects);
 	if (new_slab) {
 		/*
 		 * If the slab is brand new and nobody can yet access its
 		 * obj_exts, no synchronization is required and obj_exts can
 		 * be simply assigned.
 		 */
-		slab->obj_exts = obj_exts;
-	} else if (cmpxchg(&slab->obj_exts, 0, obj_exts)) {
+		slab->obj_exts = new_exts;
+	} else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) {
 		/*
 		 * If the slab is already in use, somebody can allocate and
 		 * assign slabobj_exts in parallel. In this case the existing
-- 
cgit v1.2.3


From 2c321f3f70bc284510598f712b702ce8d60c4d14 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Sun, 14 Apr 2024 19:07:31 -0700
Subject: mm: change inlined allocation helpers to account at the call site

Main goal of memory allocation profiling patchset is to provide accounting
that is cheap enough to run in production.  To achieve that we inject
counters using codetags at the allocation call sites to account every time
allocation is made.  This injection allows us to perform accounting
efficiently because injected counters are immediately available as opposed
to the alternative methods, such as using _RET_IP_, which would require
counter lookup and appropriate locking that makes accounting much more
expensive.  This method requires all allocation functions to inject
separate counters at their call sites so that their callers can be
individually accounted.  Counter injection is implemented by allocation
hooks which should wrap all allocation functions.

Inlined functions which perform allocations but do not use allocation
hooks are directly charged for the allocations they perform.  In most
cases these functions are just specialized allocation wrappers used from
multiple places to allocate objects of a specific type.  It would be more
useful to do the accounting at their call sites instead.  Instrument these
helpers to do accounting at the call site.  Simple inlined allocation
wrappers are converted directly into macros.  More complex allocators or
allocators with documentation are converted into _noprof versions and
allocation hooks are added.  This allows memory allocation profiling
mechanism to charge allocations to the callers of these functions.

Link: https://lkml.kernel.org/r/20240415020731.1152108-1-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Jan Kara <jack@suse.cz>		[jbd2]
Cc: Anna Schumaker <anna@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jakub Sitnicki <jakub@cloudflare.com>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/iommu/amd/amd_iommu.h           |  5 +++--
 fs/nfs/iostat.h                         |  5 +----
 include/acpi/platform/aclinuxex.h       | 19 ++++++------------
 include/asm-generic/pgalloc.h           | 35 ++++++++++++++++++++-------------
 include/crypto/hash.h                   |  7 ++++---
 include/crypto/internal/acompress.h     |  5 +++--
 include/crypto/skcipher.h               |  7 ++++---
 include/linux/bpf.h                     | 33 ++++++++-----------------------
 include/linux/bpfptr.h                  |  5 +++--
 include/linux/dma-fence-chain.h         |  6 ++----
 include/linux/hid_bpf.h                 |  6 ++----
 include/linux/jbd2.h                    | 12 ++++-------
 include/linux/mm.h                      |  5 +++--
 include/linux/mm_types.h                |  5 +++--
 include/linux/percpu.h                  |  3 +++
 include/linux/ptr_ring.h                | 28 +++++++++++++++-----------
 include/linux/skb_array.h               | 19 ++++++++++--------
 include/linux/skbuff.h                  | 20 ++++++++-----------
 include/linux/skmsg.h                   |  8 +++-----
 include/linux/slab.h                    |  5 +++++
 include/linux/sockptr.h                 | 10 ++++++----
 include/net/netlabel.h                  | 16 +++++++++------
 include/net/netlink.h                   |  5 +++--
 include/net/request_sock.h              |  5 +++--
 include/net/tcx.h                       |  5 +++--
 net/sunrpc/auth_gss/auth_gss_internal.h |  6 ++++--
 26 files changed, 142 insertions(+), 143 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index f482aab420f7..52575ba9a141 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -134,13 +134,14 @@ static inline int get_pci_sbdf_id(struct pci_dev *pdev)
 	return PCI_SEG_DEVID_TO_SBDF(seg, devid);
 }
 
-static inline void *alloc_pgtable_page(int nid, gfp_t gfp)
+static inline void *alloc_pgtable_page_noprof(int nid, gfp_t gfp)
 {
 	struct page *page;
 
-	page = alloc_pages_node(nid, gfp | __GFP_ZERO, 0);
+	page = alloc_pages_node_noprof(nid, gfp | __GFP_ZERO, 0);
 	return page ? page_address(page) : NULL;
 }
+#define alloc_pgtable_page(...)	alloc_hooks(alloc_pgtable_page_noprof(__VA_ARGS__))
 
 /*
  * This must be called after device probe completes. During probe
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 5aa776b5a3e7..b17a9eb9b148 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -46,10 +46,7 @@ static inline void nfs_add_stats(const struct inode *inode,
 	nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
 }
 
-static inline struct nfs_iostats __percpu *nfs_alloc_iostats(void)
-{
-	return alloc_percpu(struct nfs_iostats);
-}
+#define nfs_alloc_iostats()	alloc_percpu(struct nfs_iostats)
 
 static inline void nfs_free_iostats(struct nfs_iostats __percpu *stats)
 {
diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h
index 600d4e2641da..62cac266a1c8 100644
--- a/include/acpi/platform/aclinuxex.h
+++ b/include/acpi/platform/aclinuxex.h
@@ -47,26 +47,19 @@ acpi_status acpi_os_terminate(void);
  * However, boot has  (system_state != SYSTEM_RUNNING)
  * to quiet __might_sleep() in kmalloc() and resume does not.
  */
-static inline void *acpi_os_allocate(acpi_size size)
-{
-	return kmalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
-}
+#define acpi_os_allocate(_size)	\
+		kmalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL)
 
-static inline void *acpi_os_allocate_zeroed(acpi_size size)
-{
-	return kzalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
-}
+#define acpi_os_allocate_zeroed(_size)	\
+		kzalloc(_size, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL)
 
 static inline void acpi_os_free(void *memory)
 {
 	kfree(memory);
 }
 
-static inline void *acpi_os_acquire_object(acpi_cache_t * cache)
-{
-	return kmem_cache_zalloc(cache,
-				 irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
-}
+#define acpi_os_acquire_object(_cache)	\
+		kmem_cache_zalloc(_cache, irqs_disabled() ? GFP_ATOMIC : GFP_KERNEL)
 
 static inline acpi_thread_id acpi_os_get_thread_id(void)
 {
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 879e5f8aa5e9..7c48f5fbf8aa 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -16,15 +16,16 @@
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
+static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 {
-	struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL &
+	struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL &
 			~__GFP_HIGHMEM, 0);
 
 	if (!ptdesc)
 		return NULL;
 	return ptdesc_address(ptdesc);
 }
+#define __pte_alloc_one_kernel(...)	alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))
 
 #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
 /**
@@ -33,10 +34,11 @@ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm)
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
+static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
 {
-	return __pte_alloc_one_kernel(mm);
+	return __pte_alloc_one_kernel_noprof(mm);
 }
+#define pte_alloc_one_kernel(...)	alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
 #endif
 
 /**
@@ -61,11 +63,11 @@ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
  *
  * Return: `struct page` referencing the ptdesc or %NULL on error
  */
-static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
+static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
 {
 	struct ptdesc *ptdesc;
 
-	ptdesc = pagetable_alloc(gfp, 0);
+	ptdesc = pagetable_alloc_noprof(gfp, 0);
 	if (!ptdesc)
 		return NULL;
 	if (!pagetable_pte_ctor(ptdesc)) {
@@ -75,6 +77,7 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
 
 	return ptdesc_page(ptdesc);
 }
+#define __pte_alloc_one(...)	alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__))
 
 #ifndef __HAVE_ARCH_PTE_ALLOC_ONE
 /**
@@ -85,10 +88,11 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp)
  *
  * Return: `struct page` referencing the ptdesc or %NULL on error
  */
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
+static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm)
 {
-	return __pte_alloc_one(mm, GFP_PGTABLE_USER);
+	return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER);
 }
+#define pte_alloc_one(...)	alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__))
 #endif
 
 /*
@@ -124,14 +128,14 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
 {
 	struct ptdesc *ptdesc;
 	gfp_t gfp = GFP_PGTABLE_USER;
 
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
-	ptdesc = pagetable_alloc(gfp, 0);
+	ptdesc = pagetable_alloc_noprof(gfp, 0);
 	if (!ptdesc)
 		return NULL;
 	if (!pagetable_pmd_ctor(ptdesc)) {
@@ -140,6 +144,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 	}
 	return ptdesc_address(ptdesc);
 }
+#define pmd_alloc_one(...)	alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
 #endif
 
 #ifndef __HAVE_ARCH_PMD_FREE
@@ -157,7 +162,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 
 #if CONFIG_PGTABLE_LEVELS > 3
 
-static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
 {
 	gfp_t gfp = GFP_PGTABLE_USER;
 	struct ptdesc *ptdesc;
@@ -166,13 +171,14 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 		gfp = GFP_PGTABLE_KERNEL;
 	gfp &= ~__GFP_HIGHMEM;
 
-	ptdesc = pagetable_alloc(gfp, 0);
+	ptdesc = pagetable_alloc_noprof(gfp, 0);
 	if (!ptdesc)
 		return NULL;
 
 	pagetable_pud_ctor(ptdesc);
 	return ptdesc_address(ptdesc);
 }
+#define __pud_alloc_one(...)	alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))
 
 #ifndef __HAVE_ARCH_PUD_ALLOC_ONE
 /**
@@ -184,10 +190,11 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr)
  *
  * Return: pointer to the allocated memory or %NULL on error
  */
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
 {
-	return __pud_alloc_one(mm, addr);
+	return __pud_alloc_one_noprof(mm, addr);
 }
+#define pud_alloc_one(...)	alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
 #endif
 
 static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index 5d61f576cfc8..e5181cc9b7c5 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -578,19 +578,20 @@ static inline void ahash_request_set_tfm(struct ahash_request *req,
  *
  * Return: allocated request handle in case of success, or NULL if out of memory
  */
-static inline struct ahash_request *ahash_request_alloc(
+static inline struct ahash_request *ahash_request_alloc_noprof(
 	struct crypto_ahash *tfm, gfp_t gfp)
 {
 	struct ahash_request *req;
 
-	req = kmalloc(sizeof(struct ahash_request) +
-		      crypto_ahash_reqsize(tfm), gfp);
+	req = kmalloc_noprof(sizeof(struct ahash_request) +
+			     crypto_ahash_reqsize(tfm), gfp);
 
 	if (likely(req))
 		ahash_request_set_tfm(req, tfm);
 
 	return req;
 }
+#define ahash_request_alloc(...)	alloc_hooks(ahash_request_alloc_noprof(__VA_ARGS__))
 
 /**
  * ahash_request_free() - zeroize and free the request data structure
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 4ac46bafba9d..2a67793f52ad 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -69,15 +69,16 @@ static inline void acomp_request_complete(struct acomp_req *req,
 	crypto_request_complete(&req->base, err);
 }
 
-static inline struct acomp_req *__acomp_request_alloc(struct crypto_acomp *tfm)
+static inline struct acomp_req *__acomp_request_alloc_noprof(struct crypto_acomp *tfm)
 {
 	struct acomp_req *req;
 
-	req = kzalloc(sizeof(*req) + crypto_acomp_reqsize(tfm), GFP_KERNEL);
+	req = kzalloc_noprof(sizeof(*req) + crypto_acomp_reqsize(tfm), GFP_KERNEL);
 	if (likely(req))
 		acomp_request_set_tfm(req, tfm);
 	return req;
 }
+#define __acomp_request_alloc(...)	alloc_hooks(__acomp_request_alloc_noprof(__VA_ARGS__))
 
 static inline void __acomp_request_free(struct acomp_req *req)
 {
diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h
index c8857d7bdb37..6c5330e316b0 100644
--- a/include/crypto/skcipher.h
+++ b/include/crypto/skcipher.h
@@ -861,19 +861,20 @@ static inline struct skcipher_request *skcipher_request_cast(
  *
  * Return: allocated request handle in case of success, or NULL if out of memory
  */
-static inline struct skcipher_request *skcipher_request_alloc(
+static inline struct skcipher_request *skcipher_request_alloc_noprof(
 	struct crypto_skcipher *tfm, gfp_t gfp)
 {
 	struct skcipher_request *req;
 
-	req = kmalloc(sizeof(struct skcipher_request) +
-		      crypto_skcipher_reqsize(tfm), gfp);
+	req = kmalloc_noprof(sizeof(struct skcipher_request) +
+			     crypto_skcipher_reqsize(tfm), gfp);
 
 	if (likely(req))
 		skcipher_request_set_tfm(req, tfm);
 
 	return req;
 }
+#define skcipher_request_alloc(...)	alloc_hooks(skcipher_request_alloc_noprof(__VA_ARGS__))
 
 /**
  * skcipher_request_free() - zeroize and free request data structure
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 890e152d553e..a86da1f38b3c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2244,31 +2244,14 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
 void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
 				    size_t align, gfp_t flags);
 #else
-static inline void *
-bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
-		     int node)
-{
-	return kmalloc_node(size, flags, node);
-}
-
-static inline void *
-bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
-{
-	return kzalloc(size, flags);
-}
-
-static inline void *
-bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags)
-{
-	return kvcalloc(n, size, flags);
-}
-
-static inline void __percpu *
-bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align,
-		     gfp_t flags)
-{
-	return __alloc_percpu_gfp(size, align, flags);
-}
+#define bpf_map_kmalloc_node(_map, _size, _flags, _node)	\
+		kmalloc_node(_size, _flags, _node)
+#define bpf_map_kzalloc(_map, _size, _flags)			\
+		kzalloc(_size, _flags)
+#define bpf_map_kvcalloc(_map, _n, _size, _flags)		\
+		kvcalloc(_n, _size, _flags)
+#define bpf_map_alloc_percpu(_map, _size, _align, _flags)	\
+		__alloc_percpu_gfp(_size, _align, _flags)
 #endif
 
 static inline int
diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h
index 79b2f78eec1a..1af241525a17 100644
--- a/include/linux/bpfptr.h
+++ b/include/linux/bpfptr.h
@@ -65,9 +65,9 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset,
 	return copy_to_sockptr_offset((sockptr_t) dst, offset, src, size);
 }
 
-static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
+static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len)
 {
-	void *p = kvmalloc(len, GFP_USER | __GFP_NOWARN);
+	void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN);
 
 	if (!p)
 		return ERR_PTR(-ENOMEM);
@@ -77,6 +77,7 @@ static inline void *kvmemdup_bpfptr(bpfptr_t src, size_t len)
 	}
 	return p;
 }
+#define kvmemdup_bpfptr(...)	alloc_hooks(kvmemdup_bpfptr_noprof(__VA_ARGS__))
 
 static inline long strncpy_from_bpfptr(char *dst, bpfptr_t src, size_t count)
 {
diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h
index 4bdf0b96da28..ad9e2506c2f4 100644
--- a/include/linux/dma-fence-chain.h
+++ b/include/linux/dma-fence-chain.h
@@ -86,10 +86,8 @@ dma_fence_chain_contained(struct dma_fence *fence)
  *
  * Returns a new struct dma_fence_chain object or NULL on failure.
  */
-static inline struct dma_fence_chain *dma_fence_chain_alloc(void)
-{
-	return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL);
-};
+#define dma_fence_chain_alloc()	\
+		((struct dma_fence_chain *)kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL))
 
 /**
  * dma_fence_chain_free
diff --git a/include/linux/hid_bpf.h b/include/linux/hid_bpf.h
index 7118ac28d468..ca70eeb6393e 100644
--- a/include/linux/hid_bpf.h
+++ b/include/linux/hid_bpf.h
@@ -149,10 +149,8 @@ static inline int hid_bpf_connect_device(struct hid_device *hdev) { return 0; }
 static inline void hid_bpf_disconnect_device(struct hid_device *hdev) {}
 static inline void hid_bpf_destroy_device(struct hid_device *hid) {}
 static inline void hid_bpf_device_init(struct hid_device *hid) {}
-static inline u8 *call_hid_bpf_rdesc_fixup(struct hid_device *hdev, u8 *rdesc, unsigned int *size)
-{
-	return kmemdup(rdesc, *size, GFP_KERNEL);
-}
+#define call_hid_bpf_rdesc_fixup(_hdev, _rdesc, _size)	\
+		((u8 *)kmemdup(_rdesc, *(_size), GFP_KERNEL))
 
 #endif /* CONFIG_HID_BPF */
 
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 971f3e826e15..9512fe332668 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1586,10 +1586,8 @@ void jbd2_journal_put_journal_head(struct journal_head *jh);
  */
 extern struct kmem_cache *jbd2_handle_cache;
 
-static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
-{
-	return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
-}
+#define jbd2_alloc_handle(_gfp_flags)	\
+		((handle_t *)kmem_cache_zalloc(jbd2_handle_cache, _gfp_flags))
 
 static inline void jbd2_free_handle(handle_t *handle)
 {
@@ -1602,10 +1600,8 @@ static inline void jbd2_free_handle(handle_t *handle)
  */
 extern struct kmem_cache *jbd2_inode_cache;
 
-static inline struct jbd2_inode *jbd2_alloc_inode(gfp_t gfp_flags)
-{
-	return kmem_cache_alloc(jbd2_inode_cache, gfp_flags);
-}
+#define jbd2_alloc_inode(_gfp_flags)	\
+		((struct jbd2_inode *)kmem_cache_alloc(jbd2_inode_cache, _gfp_flags))
 
 static inline void jbd2_free_inode(struct jbd2_inode *jinode)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0dbd737cfabd..60eabc6c8e00 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2860,12 +2860,13 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt)
  *
  * Return: The ptdesc describing the allocated page tables.
  */
-static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order)
+static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
 {
-	struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+	struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);
 
 	return page_ptdesc(page);
 }
+#define pagetable_alloc(...)	alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))
 
 /**
  * pagetable_free - Free pagetables
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4ae4684d1add..6fef56938a17 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1170,14 +1170,15 @@ static inline void mm_init_cid(struct mm_struct *mm)
 	cpumask_clear(mm_cidmask(mm));
 }
 
-static inline int mm_alloc_cid(struct mm_struct *mm)
+static inline int mm_alloc_cid_noprof(struct mm_struct *mm)
 {
-	mm->pcpu_cid = alloc_percpu(struct mm_cid);
+	mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
 	if (!mm->pcpu_cid)
 		return -ENOMEM;
 	mm_init_cid(mm);
 	return 0;
 }
+#define mm_alloc_cid(...)	alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
 
 static inline void mm_destroy_cid(struct mm_struct *mm)
 {
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 13a82f11e4fd..03053de557cf 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -151,6 +151,9 @@ extern size_t pcpu_alloc_size(void __percpu *__pdata);
 #define alloc_percpu(type)						\
 	(typeof(type) __percpu *)__alloc_percpu(sizeof(type),		\
 						__alignof__(type))
+#define alloc_percpu_noprof(type)					\
+	((typeof(type) __percpu *)pcpu_alloc_noprof(sizeof(type),	\
+					__alignof__(type), false, GFP_KERNEL))
 
 extern void free_percpu(void __percpu *__pdata);
 
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 808f9d3ee546..fd037c127bb0 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -464,11 +464,11 @@ static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
 /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See
  * documentation for vmalloc for which of them are legal.
  */
-static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp)
+static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp)
 {
 	if (size > KMALLOC_MAX_SIZE / sizeof(void *))
 		return NULL;
-	return kvmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO);
+	return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO);
 }
 
 static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
@@ -484,9 +484,9 @@ static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
 		r->batch = 1;
 }
 
-static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
+static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp)
 {
-	r->queue = __ptr_ring_init_queue_alloc(size, gfp);
+	r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp);
 	if (!r->queue)
 		return -ENOMEM;
 
@@ -497,6 +497,7 @@ static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp)
 
 	return 0;
 }
+#define ptr_ring_init(...)	alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__))
 
 /*
  * Return entries into ring. Destroy entries that don't fit.
@@ -587,11 +588,11 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
  * In particular if you consume ring in interrupt or BH context, you must
  * disable interrupts/BH when doing so.
  */
-static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
+static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp,
 				  void (*destroy)(void *))
 {
 	unsigned long flags;
-	void **queue = __ptr_ring_init_queue_alloc(size, gfp);
+	void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp);
 	void **old;
 
 	if (!queue)
@@ -609,6 +610,7 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
 
 	return 0;
 }
+#define ptr_ring_resize(...)	alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__))
 
 /*
  * Note: producer lock is nested within consumer lock, so if you
@@ -616,21 +618,21 @@ static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp,
  * In particular if you consume ring in interrupt or BH context, you must
  * disable interrupts/BH when doing so.
  */
-static inline int ptr_ring_resize_multiple(struct ptr_ring **rings,
-					   unsigned int nrings,
-					   int size,
-					   gfp_t gfp, void (*destroy)(void *))
+static inline int ptr_ring_resize_multiple_noprof(struct ptr_ring **rings,
+						  unsigned int nrings,
+						  int size,
+						  gfp_t gfp, void (*destroy)(void *))
 {
 	unsigned long flags;
 	void ***queues;
 	int i;
 
-	queues = kmalloc_array(nrings, sizeof(*queues), gfp);
+	queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp);
 	if (!queues)
 		goto noqueues;
 
 	for (i = 0; i < nrings; ++i) {
-		queues[i] = __ptr_ring_init_queue_alloc(size, gfp);
+		queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp);
 		if (!queues[i])
 			goto nomem;
 	}
@@ -660,6 +662,8 @@ nomem:
 noqueues:
 	return -ENOMEM;
 }
+#define ptr_ring_resize_multiple(...) \
+		alloc_hooks(ptr_ring_resize_multiple_noprof(__VA_ARGS__))
 
 static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
 {
diff --git a/include/linux/skb_array.h b/include/linux/skb_array.h
index e2d45b7cb619..926496c9cc9c 100644
--- a/include/linux/skb_array.h
+++ b/include/linux/skb_array.h
@@ -177,10 +177,11 @@ static inline int skb_array_peek_len_any(struct skb_array *a)
 	return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag);
 }
 
-static inline int skb_array_init(struct skb_array *a, int size, gfp_t gfp)
+static inline int skb_array_init_noprof(struct skb_array *a, int size, gfp_t gfp)
 {
-	return ptr_ring_init(&a->ring, size, gfp);
+	return ptr_ring_init_noprof(&a->ring, size, gfp);
 }
+#define skb_array_init(...)	alloc_hooks(skb_array_init_noprof(__VA_ARGS__))
 
 static void __skb_array_destroy_skb(void *ptr)
 {
@@ -198,15 +199,17 @@ static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
 	return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb);
 }
 
-static inline int skb_array_resize_multiple(struct skb_array **rings,
-					    int nrings, unsigned int size,
-					    gfp_t gfp)
+static inline int skb_array_resize_multiple_noprof(struct skb_array **rings,
+						   int nrings, unsigned int size,
+						   gfp_t gfp)
 {
 	BUILD_BUG_ON(offsetof(struct skb_array, ring));
-	return ptr_ring_resize_multiple((struct ptr_ring **)rings,
-					nrings, size, gfp,
-					__skb_array_destroy_skb);
+	return ptr_ring_resize_multiple_noprof((struct ptr_ring **)rings,
+					       nrings, size, gfp,
+					       __skb_array_destroy_skb);
 }
+#define skb_array_resize_multiple(...)	\
+		alloc_hooks(skb_array_resize_multiple_noprof(__VA_ARGS__))
 
 static inline void skb_array_cleanup(struct skb_array *a)
 {
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9d24aec064e8..b72920c9887b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3371,7 +3371,7 @@ void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);
  *
  * %NULL is returned if there is no free memory.
 */
-static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
+static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask,
 					     unsigned int order)
 {
 	/* This piece of code contains several assumptions.
@@ -3384,13 +3384,11 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 	 */
 	gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
 
-	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
+	return alloc_pages_node_noprof(NUMA_NO_NODE, gfp_mask, order);
 }
+#define __dev_alloc_pages(...)	alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__))
 
-static inline struct page *dev_alloc_pages(unsigned int order)
-{
-	return __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, order);
-}
+#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order)
 
 /**
  * __dev_alloc_page - allocate a page for network Rx
@@ -3400,15 +3398,13 @@ static inline struct page *dev_alloc_pages(unsigned int order)
  *
  * %NULL is returned if there is no free memory.
  */
-static inline struct page *__dev_alloc_page(gfp_t gfp_mask)
+static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask)
 {
-	return __dev_alloc_pages(gfp_mask, 0);
+	return __dev_alloc_pages_noprof(gfp_mask, 0);
 }
+#define __dev_alloc_page(...)	alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__))
 
-static inline struct page *dev_alloc_page(void)
-{
-	return dev_alloc_pages(0);
-}
+#define dev_alloc_page()	dev_alloc_pages(0)
 
 /**
  * dev_page_is_reusable - check whether a page can be reused for network Rx
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index e65ec3fd2799..78efc5b20284 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -410,11 +410,9 @@ void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
 int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
 			 struct sk_msg *msg);
 
-static inline struct sk_psock_link *sk_psock_init_link(void)
-{
-	return kzalloc(sizeof(struct sk_psock_link),
-		       GFP_ATOMIC | __GFP_NOWARN);
-}
+#define sk_psock_init_link()	\
+		((struct sk_psock_link *)kzalloc(sizeof(struct sk_psock_link),	\
+						 GFP_ATOMIC | __GFP_NOWARN))
 
 static inline void sk_psock_free_link(struct sk_psock_link *link)
 {
diff --git a/include/linux/slab.h b/include/linux/slab.h
index c0be1cd03cf6..4cc37ef22aae 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -744,6 +744,9 @@ void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node,
  */
 #define kmalloc_track_caller(...)		kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)
 
+#define kmalloc_track_caller_noprof(...)	\
+		kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_)
+
 static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
 							  int node)
 {
@@ -781,6 +784,7 @@ extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_si
 #define kvmalloc_node(...)			alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))
 
 #define kvmalloc(_size, _flags)			kvmalloc_node(_size, _flags, NUMA_NO_NODE)
+#define kvmalloc_noprof(_size, _flags)		kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
 #define kvzalloc(_size, _flags)			kvmalloc(_size, _flags|__GFP_ZERO)
 
 #define kvzalloc_node(_size, _flags, _node)	kvmalloc_node(_size, _flags|__GFP_ZERO, _node)
@@ -797,6 +801,7 @@ static inline __alloc_size(1, 2) void *kvmalloc_array_noprof(size_t n, size_t si
 
 #define kvmalloc_array(...)			alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
 #define kvcalloc(_n, _size, _flags)		kvmalloc_array(_n, _size, _flags|__GFP_ZERO)
+#define kvcalloc_noprof(_n, _size, _flags)	kvmalloc_array_noprof(_n, _size, _flags|__GFP_ZERO)
 
 extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 		      __realloc_size(3);
diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h
index 317200cd3a60..fc5a206c4043 100644
--- a/include/linux/sockptr.h
+++ b/include/linux/sockptr.h
@@ -117,9 +117,9 @@ static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size)
 	return copy_to_sockptr_offset(dst, 0, src, size);
 }
 
-static inline void *memdup_sockptr(sockptr_t src, size_t len)
+static inline void *memdup_sockptr_noprof(sockptr_t src, size_t len)
 {
-	void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
+	void *p = kmalloc_track_caller_noprof(len, GFP_USER | __GFP_NOWARN);
 
 	if (!p)
 		return ERR_PTR(-ENOMEM);
@@ -129,10 +129,11 @@ static inline void *memdup_sockptr(sockptr_t src, size_t len)
 	}
 	return p;
 }
+#define memdup_sockptr(...)	alloc_hooks(memdup_sockptr_noprof(__VA_ARGS__))
 
-static inline void *memdup_sockptr_nul(sockptr_t src, size_t len)
+static inline void *memdup_sockptr_nul_noprof(sockptr_t src, size_t len)
 {
-	char *p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+	char *p = kmalloc_track_caller_noprof(len + 1, GFP_KERNEL);
 
 	if (!p)
 		return ERR_PTR(-ENOMEM);
@@ -143,6 +144,7 @@ static inline void *memdup_sockptr_nul(sockptr_t src, size_t len)
 	p[len] = '\0';
 	return p;
 }
+#define memdup_sockptr_nul(...)	alloc_hooks(memdup_sockptr_nul_noprof(__VA_ARGS__))
 
 static inline long strncpy_from_sockptr(char *dst, sockptr_t src, size_t count)
 {
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index f3ab0b8a4b18..c31bd96dafdb 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -274,15 +274,17 @@ struct netlbl_calipso_ops {
  * on success, NULL on failure.
  *
  */
-static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags)
+static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc_noprof(gfp_t flags)
 {
 	struct netlbl_lsm_cache *cache;
 
-	cache = kzalloc(sizeof(*cache), flags);
+	cache = kzalloc_noprof(sizeof(*cache), flags);
 	if (cache)
 		refcount_set(&cache->refcount, 1);
 	return cache;
 }
+#define netlbl_secattr_cache_alloc(...)	\
+		alloc_hooks(netlbl_secattr_cache_alloc_noprof(__VA_ARGS__))
 
 /**
  * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct
@@ -311,10 +313,11 @@ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache)
  * on failure.
  *
  */
-static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags)
+static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc_noprof(gfp_t flags)
 {
-	return kzalloc(sizeof(struct netlbl_lsm_catmap), flags);
+	return kzalloc_noprof(sizeof(struct netlbl_lsm_catmap), flags);
 }
+#define netlbl_catmap_alloc(...)	alloc_hooks(netlbl_catmap_alloc_noprof(__VA_ARGS__))
 
 /**
  * netlbl_catmap_free - Free a LSM secattr catmap
@@ -376,10 +379,11 @@ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr)
  * pointer on success, or NULL on failure.
  *
  */
-static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags)
+static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc_noprof(gfp_t flags)
 {
-	return kzalloc(sizeof(struct netlbl_lsm_secattr), flags);
+	return kzalloc_noprof(sizeof(struct netlbl_lsm_secattr), flags);
 }
+#define netlbl_secattr_alloc(...)	alloc_hooks(netlbl_secattr_alloc_noprof(__VA_ARGS__))
 
 /**
  * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct
diff --git a/include/net/netlink.h b/include/net/netlink.h
index c19ff921b661..972b5484fa6f 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -1891,10 +1891,11 @@ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
  * @src: netlink attribute to duplicate from
  * @gfp: GFP mask
  */
-static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp)
+static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp)
 {
-	return kmemdup(nla_data(src), nla_len(src), gfp);
+	return kmemdup_noprof(nla_data(src), nla_len(src), gfp);
 }
+#define nla_memdup(...)	alloc_hooks(nla_memdup_noprof(__VA_ARGS__))
 
 /**
  * nla_nest_start_noflag - Start a new level of nested attributes
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..29495c331d20 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -127,12 +127,12 @@ static inline struct sock *skb_steal_sock(struct sk_buff *skb,
 }
 
 static inline struct request_sock *
-reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
+reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
 	    bool attach_listener)
 {
 	struct request_sock *req;
 
-	req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+	req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
 	if (!req)
 		return NULL;
 	req->rsk_listener = NULL;
@@ -157,6 +157,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
 
 	return req;
 }
+#define reqsk_alloc(...)	alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__))
 
 static inline void __reqsk_free(struct request_sock *req)
 {
diff --git a/include/net/tcx.h b/include/net/tcx.h
index 04be9377785d..72a3e75e539f 100644
--- a/include/net/tcx.h
+++ b/include/net/tcx.h
@@ -75,9 +75,9 @@ tcx_entry_fetch(struct net_device *dev, bool ingress)
 		return rcu_dereference_rtnl(dev->tcx_egress);
 }
 
-static inline struct bpf_mprog_entry *tcx_entry_create(void)
+static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void)
 {
-	struct tcx_entry *tcx = kzalloc(sizeof(*tcx), GFP_KERNEL);
+	struct tcx_entry *tcx = kzalloc_noprof(sizeof(*tcx), GFP_KERNEL);
 
 	if (tcx) {
 		bpf_mprog_bundle_init(&tcx->bundle);
@@ -85,6 +85,7 @@ static inline struct bpf_mprog_entry *tcx_entry_create(void)
 	}
 	return NULL;
 }
+#define tcx_entry_create(...)	alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__))
 
 static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
 {
diff --git a/net/sunrpc/auth_gss/auth_gss_internal.h b/net/sunrpc/auth_gss/auth_gss_internal.h
index c53b329092d4..4ebc1b7043d9 100644
--- a/net/sunrpc/auth_gss/auth_gss_internal.h
+++ b/net/sunrpc/auth_gss/auth_gss_internal.h
@@ -23,7 +23,7 @@ simple_get_bytes(const void *p, const void *end, void *res, size_t len)
 }
 
 static inline const void *
-simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
+simple_get_netobj_noprof(const void *p, const void *end, struct xdr_netobj *dest)
 {
 	const void *q;
 	unsigned int len;
@@ -35,7 +35,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
 	if (unlikely(q > end || q < p))
 		return ERR_PTR(-EFAULT);
 	if (len) {
-		dest->data = kmemdup(p, len, GFP_KERNEL);
+		dest->data = kmemdup_noprof(p, len, GFP_KERNEL);
 		if (unlikely(dest->data == NULL))
 			return ERR_PTR(-ENOMEM);
 	} else
@@ -43,3 +43,5 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest)
 	dest->len = len;
 	return q;
 }
+
+#define simple_get_netobj(...)	alloc_hooks(simple_get_netobj_noprof(__VA_ARGS__))
-- 
cgit v1.2.3


From 85edc15a4c606094a14c36ebf5bceea7f9a3e395 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 21 Mar 2024 14:24:41 +0000
Subject: mm: remove folio_prep_large_rmappable()

Now that prep_compound_page() initialises folio->_deferred_list,
folio_prep_large_rmappable()'s only purpose is to set the large_rmappable
flag, so inline it into the two callers.  Take the opportunity to convert
the large_rmappable definition from PAGEFLAG to FOLIO_FLAG and remove the
existance of PageTestLargeRmappable and friends.

Link: https://lkml.kernel.org/r/20240321142448.1645400-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h    | 3 ---
 include/linux/page-flags.h | 4 ++--
 mm/huge_memory.c           | 9 +--------
 mm/internal.h              | 3 ++-
 4 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index de0c89105076..0e16451adaba 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -263,7 +263,6 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 
-void folio_prep_large_rmappable(struct folio *folio);
 bool can_split_folio(struct folio *folio, int *pextra_pins);
 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 		unsigned int new_order);
@@ -411,8 +410,6 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 	return 0;
 }
 
-static inline void folio_prep_large_rmappable(struct folio *folio) {}
-
 #define transparent_hugepage_flags 0UL
 
 #define thp_get_unmapped_area	NULL
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 4bf1c25fd1dc..6fb3cd42ee59 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -868,9 +868,9 @@ static inline void ClearPageCompound(struct page *page)
 	BUG_ON(!PageHead(page));
 	ClearPageHead(page);
 }
-PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND)
+FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
 #else
-TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
+FOLIO_FLAG_FALSE(large_rmappable)
 #endif
 
 #define PG_head_mask ((1UL << PG_head))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4cc7133aaa4b..16b2c5622fb1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -789,13 +789,6 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
 }
 #endif
 
-void folio_prep_large_rmappable(struct folio *folio)
-{
-	if (!folio || !folio_test_large(folio))
-		return;
-	folio_set_large_rmappable(folio);
-}
-
 static inline bool is_transparent_hugepage(struct folio *folio)
 {
 	if (!folio_test_large(folio))
@@ -2862,7 +2855,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
 	clear_compound_head(page_tail);
 	if (new_order) {
 		prep_compound_page(page_tail, new_order);
-		folio_prep_large_rmappable(new_folio);
+		folio_set_large_rmappable(new_folio);
 	}
 
 	/* Finally unfreeze refcount. Additional reference from page cache. */
diff --git a/mm/internal.h b/mm/internal.h
index 5c0c57c9cd19..ab8250d8a591 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -513,7 +513,8 @@ static inline struct folio *page_rmappable_folio(struct page *page)
 {
 	struct folio *folio = (struct folio *)page;
 
-	folio_prep_large_rmappable(folio);
+	if (folio && folio_test_large(folio))
+		folio_set_large_rmappable(folio);
 	return folio;
 }
 
-- 
cgit v1.2.3


From 8682a7be36d8c6ebd484753034a716a13f8a1f54 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 21 Mar 2024 14:24:44 +0000
Subject: mm: remove a call to compound_head() from is_page_hwpoison()

We can call it only once instead of twice.

Link: https://lkml.kernel.org/r/20240321142448.1645400-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6fb3cd42ee59..94eb8a11a321 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1065,11 +1065,14 @@ static inline bool PageHuge(const struct page *page)
  * best effort only and inherently racy: there is no way to synchronize with
  * failing hardware.
  */
-static inline bool is_page_hwpoison(struct page *page)
+static inline bool is_page_hwpoison(const struct page *page)
 {
+	const struct folio *folio;
+
 	if (PageHWPoison(page))
 		return true;
-	return PageHuge(page) && PageHWPoison(compound_head(page));
+	folio = page_folio(page);
+	return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
 }
 
 extern bool is_free_buddy_page(struct page *page);
-- 
cgit v1.2.3


From 46df8e73a4a3f1445f2a8429111e72ede1f4d291 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Thu, 21 Mar 2024 14:24:45 +0000
Subject: mm: free up PG_slab

Reclaim the Slab page flag by using a spare bit in PageType.  We are
perennially short of page flags for various purposes, and now that the
original SLAB allocator has been retired, SLUB does not use the
mapcount/page_type field.  This lets us remove a number of special cases
for ignoring mapcount on Slab pages.

[willy@infradead.org: update vmcoreinfo]
  Link: https://lkml.kernel.org/r/ZgGV-O8WYQ_83kxp@casper.infradead.org
Link: https://lkml.kernel.org/r/20240321142448.1645400-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h     | 21 +++++++++++++++++----
 include/trace/events/mmflags.h |  2 +-
 kernel/vmcore_info.c           |  3 ++-
 mm/memory-failure.c            |  9 ---------
 mm/slab.h                      |  2 +-
 5 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 94eb8a11a321..73e0b17c7728 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,7 +109,6 @@ enum pageflags {
 	PG_active,
 	PG_workingset,
 	PG_error,
-	PG_slab,
 	PG_owner_priv_1,	/* Owner use. If pagecache, fs may use*/
 	PG_arch_1,
 	PG_reserved,
@@ -524,7 +523,6 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD)
 	TESTCLEARFLAG(Active, active, PF_HEAD)
 PAGEFLAG(Workingset, workingset, PF_HEAD)
 	TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
-__PAGEFLAG(Slab, slab, PF_NO_TAIL)
 PAGEFLAG(Checked, checked, PF_NO_COMPOUND)	   /* Used by some filesystems */
 
 /* Xen */
@@ -931,7 +929,7 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
 #endif
 
 /*
- * For pages that are never mapped to userspace (and aren't PageSlab),
+ * For pages that are never mapped to userspace,
  * page_type may be used.  Because it is initialised to -1, we invert the
  * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and
  * __ClearPageFoo *sets* the bit used for PageFoo.  We reserve a few high and
@@ -947,6 +945,7 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned)
 #define PG_table	0x00000200
 #define PG_guard	0x00000400
 #define PG_hugetlb	0x00000800
+#define PG_slab		0x00001000
 
 #define PageType(page, flag)						\
 	((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)
@@ -1041,6 +1040,20 @@ PAGE_TYPE_OPS(Table, table, pgtable)
  */
 PAGE_TYPE_OPS(Guard, guard, guard)
 
+FOLIO_TYPE_OPS(slab, slab)
+
+/**
+ * PageSlab - Determine if the page belongs to the slab allocator
+ * @page: The page to test.
+ *
+ * Context: Any context.
+ * Return: True for slab pages, false for any other kind of page.
+ */
+static inline bool PageSlab(const struct page *page)
+{
+	return folio_test_slab(page_folio(page));
+}
+
 #ifdef CONFIG_HUGETLB_PAGE
 FOLIO_TYPE_OPS(hugetlb, hugetlb)
 #else
@@ -1121,7 +1134,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
 	(1UL << PG_lru		| 1UL << PG_locked	|	\
 	 1UL << PG_private	| 1UL << PG_private_2	|	\
 	 1UL << PG_writeback	| 1UL << PG_reserved	|	\
-	 1UL << PG_slab		| 1UL << PG_active 	|	\
+	 1UL << PG_active 	|				\
 	 1UL << PG_unevictable	| __PG_MLOCKED | LRU_GEN_MASK)
 
 /*
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index d55e53ac91bd..e46d6e82765e 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -107,7 +107,6 @@
 	DEF_PAGEFLAG_NAME(lru),						\
 	DEF_PAGEFLAG_NAME(active),					\
 	DEF_PAGEFLAG_NAME(workingset),					\
-	DEF_PAGEFLAG_NAME(slab),					\
 	DEF_PAGEFLAG_NAME(owner_priv_1),				\
 	DEF_PAGEFLAG_NAME(arch_1),					\
 	DEF_PAGEFLAG_NAME(reserved),					\
@@ -135,6 +134,7 @@ IF_HAVE_PG_ARCH_X(arch_3)
 #define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) }
 
 #define __def_pagetype_names						\
+	DEF_PAGETYPE_NAME(slab),					\
 	DEF_PAGETYPE_NAME(hugetlb),					\
 	DEF_PAGETYPE_NAME(offline),					\
 	DEF_PAGETYPE_NAME(guard),					\
diff --git a/kernel/vmcore_info.c b/kernel/vmcore_info.c
index 23c125c2e243..1d5eadd9dd61 100644
--- a/kernel/vmcore_info.c
+++ b/kernel/vmcore_info.c
@@ -198,7 +198,8 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_NUMBER(PG_private);
 	VMCOREINFO_NUMBER(PG_swapcache);
 	VMCOREINFO_NUMBER(PG_swapbacked);
-	VMCOREINFO_NUMBER(PG_slab);
+#define PAGE_SLAB_MAPCOUNT_VALUE	(~PG_slab)
+	VMCOREINFO_NUMBER(PAGE_SLAB_MAPCOUNT_VALUE);
 #ifdef CONFIG_MEMORY_FAILURE
 	VMCOREINFO_NUMBER(PG_hwpoison);
 #endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9e62a00b46dd..0a7a8a4ba421 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1251,7 +1251,6 @@ static int me_huge_page(struct page_state *ps, struct page *p)
 #define mlock		(1UL << PG_mlocked)
 #define lru		(1UL << PG_lru)
 #define head		(1UL << PG_head)
-#define slab		(1UL << PG_slab)
 #define reserved	(1UL << PG_reserved)
 
 static struct page_state error_states[] = {
@@ -1261,13 +1260,6 @@ static struct page_state error_states[] = {
 	 * PG_buddy pages only make a small fraction of all free pages.
 	 */
 
-	/*
-	 * Could in theory check if slab page is free or if we can drop
-	 * currently unused objects without touching them. But just
-	 * treat it as standard kernel for now.
-	 */
-	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
-
 	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
 
 	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
@@ -1294,7 +1286,6 @@ static struct page_state error_states[] = {
 #undef mlock
 #undef lru
 #undef head
-#undef slab
 #undef reserved
 
 static void update_per_node_mf_stats(unsigned long pfn,
diff --git a/mm/slab.h b/mm/slab.h
index 65db525e93af..1343bfa12cee 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -84,8 +84,8 @@ struct slab {
 		};
 		struct rcu_head rcu_head;
 	};
-	unsigned int __unused;
 
+	unsigned int __page_type;
 	atomic_t __page_refcount;
 #ifdef CONFIG_SLAB_OBJ_EXT
 	unsigned long obj_exts;
-- 
cgit v1.2.3


From eff201b8755bbc95a65165960a6464ca2ae18587 Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Thu, 21 Mar 2024 11:22:56 +0800
Subject: mm/page-flags: make __PageMovable return bool

Make __PageMovable() return bool like __folio_test_movable().

Link: https://lkml.kernel.org/r/20240321032256.82063-1-gehao@kylinos.cn
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 73e0b17c7728..9c7ca28ac84c 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -707,7 +707,7 @@ static __always_inline bool __folio_test_movable(const struct folio *folio)
 			PAGE_MAPPING_MOVABLE;
 }
 
-static __always_inline int __PageMovable(const struct page *page)
+static __always_inline bool __PageMovable(const struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
 				PAGE_MAPPING_MOVABLE;
-- 
cgit v1.2.3


From 64c2e895df14c642fbb05e5ac7f55044cd4913a3 Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Thu, 21 Mar 2024 11:07:12 +0800
Subject: mm/page-flags: make PageMappingFlags return bool

Make PageMappingFlags() return bool like folio_mapping_flags().

Link: https://lkml.kernel.org/r/20240321030712.80618-1-gehao@kylinos.cn
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9c7ca28ac84c..0163d6da6283 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -686,7 +686,7 @@ static __always_inline bool folio_mapping_flags(const struct folio *folio)
 	return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
 
-static __always_inline int PageMappingFlags(const struct page *page)
+static __always_inline bool PageMappingFlags(const struct page *page)
 {
 	return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
 }
-- 
cgit v1.2.3


From c0cd6f557b9090525d288806cccbc73440ac235a Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 20 Mar 2024 14:02:11 -0400
Subject: mm: page_alloc: fix freelist movement during block conversion

Currently, page block type conversion during fallbacks, atomic
reservations and isolation can strand various amounts of free pages on
incorrect freelists.

For example, fallback stealing moves free pages in the block to the new
type's freelists, but then may not actually claim the block for that type
if there aren't enough compatible pages already allocated.

In all cases, free page moving might fail if the block straddles more than
one zone, in which case no free pages are moved at all, but the block type
is changed anyway.

This is detrimental to type hygiene on the freelists.  It encourages
incompatible page mixing down the line (ask for one type, get another) and
thus contributes to long-term fragmentation.

Split the process into a proper transaction: check first if conversion
will happen, then try to move the free pages, and only if that was
successful convert the block to the new type.

[baolin.wang@linux.alibaba.com: fix allocation failures with CONFIG_CMA]
  Link: https://lkml.kernel.org/r/a97697e0-45b0-4f71-b087-fdc7a1d43c0e@linux.alibaba.com
Link: https://lkml.kernel.org/r/20240320180429.678181-7-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Tested-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-isolation.h |   3 +-
 mm/page_alloc.c                | 174 +++++++++++++++++++++++++----------------
 mm/page_isolation.c            |  22 ++++--
 3 files changed, 121 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 4ac34392823a..8550b3c91480 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -34,8 +34,7 @@ static inline bool is_migrate_isolate(int migratetype)
 #define REPORT_FAILURE	0x2
 
 void set_pageblock_migratetype(struct page *page, int migratetype);
-int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype, int *num_movable);
+int move_freepages_block(struct zone *zone, struct page *page, int migratetype);
 
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     int migratetype, int flags, gfp_t gfp_flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e223d5b94fa..a7cfe65e45c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1601,9 +1601,8 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-static int move_freepages(struct zone *zone,
-			  unsigned long start_pfn, unsigned long end_pfn,
-			  int migratetype, int *num_movable)
+static int move_freepages(struct zone *zone, unsigned long start_pfn,
+			  unsigned long end_pfn, int migratetype)
 {
 	struct page *page;
 	unsigned long pfn;
@@ -1613,14 +1612,6 @@ static int move_freepages(struct zone *zone,
 	for (pfn = start_pfn; pfn <= end_pfn;) {
 		page = pfn_to_page(pfn);
 		if (!PageBuddy(page)) {
-			/*
-			 * We assume that pages that could be isolated for
-			 * migration are movable. But we don't actually try
-			 * isolating, as that would be expensive.
-			 */
-			if (num_movable &&
-					(PageLRU(page) || __PageMovable(page)))
-				(*num_movable)++;
 			pfn++;
 			continue;
 		}
@@ -1638,17 +1629,16 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype, int *num_movable)
+static bool prep_move_freepages_block(struct zone *zone, struct page *page,
+				      unsigned long *start_pfn,
+				      unsigned long *end_pfn,
+				      int *num_free, int *num_movable)
 {
-	unsigned long start_pfn, end_pfn, pfn;
-
-	if (num_movable)
-		*num_movable = 0;
+	unsigned long pfn, start, end;
 
 	pfn = page_to_pfn(page);
-	start_pfn = pageblock_start_pfn(pfn);
-	end_pfn = pageblock_end_pfn(pfn) - 1;
+	start = pageblock_start_pfn(pfn);
+	end = pageblock_end_pfn(pfn) - 1;
 
 	/*
 	 * The caller only has the lock for @zone, don't touch ranges
@@ -1657,13 +1647,50 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	 * accompanied by other operations such as migratetype updates
 	 * which also should be locked.
 	 */
-	if (!zone_spans_pfn(zone, start_pfn))
-		return 0;
-	if (!zone_spans_pfn(zone, end_pfn))
-		return 0;
+	if (!zone_spans_pfn(zone, start))
+		return false;
+	if (!zone_spans_pfn(zone, end))
+		return false;
+
+	*start_pfn = start;
+	*end_pfn = end;
 
-	return move_freepages(zone, start_pfn, end_pfn, migratetype,
-								num_movable);
+	if (num_free) {
+		*num_free = 0;
+		*num_movable = 0;
+		for (pfn = start; pfn <= end;) {
+			page = pfn_to_page(pfn);
+			if (PageBuddy(page)) {
+				int nr = 1 << buddy_order(page);
+
+				*num_free += nr;
+				pfn += nr;
+				continue;
+			}
+			/*
+			 * We assume that pages that could be isolated for
+			 * migration are movable. But we don't actually try
+			 * isolating, as that would be expensive.
+			 */
+			if (PageLRU(page) || __PageMovable(page))
+				(*num_movable)++;
+			pfn++;
+		}
+	}
+
+	return true;
+}
+
+int move_freepages_block(struct zone *zone, struct page *page,
+			 int migratetype)
+{
+	unsigned long start_pfn, end_pfn;
+
+	if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn,
+				       NULL, NULL))
+		return -1;
+
+	return move_freepages(zone, start_pfn, end_pfn, migratetype);
 }
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -1748,33 +1775,37 @@ static inline bool boost_watermark(struct zone *zone)
 }
 
 /*
- * This function implements actual steal behaviour. If order is large enough,
- * we can steal whole pageblock. If not, we first move freepages in this
- * pageblock to our migratetype and determine how many already-allocated pages
- * are there in the pageblock with a compatible migratetype. If at least half
- * of pages are free or compatible, we can change migratetype of the pageblock
- * itself, so pages freed in the future will be put on the correct free list.
+ * This function implements actual steal behaviour. If order is large enough, we
+ * can claim the whole pageblock for the requested migratetype. If not, we check
+ * the pageblock for constituent pages; if at least half of the pages are free
+ * or compatible, we can still claim the whole block, so pages freed in the
+ * future will be put on the correct free list. Otherwise, we isolate exactly
+ * the order we need from the fallback block and leave its migratetype alone.
  */
-static void steal_suitable_fallback(struct zone *zone, struct page *page,
-		unsigned int alloc_flags, int start_type, bool whole_block)
+static struct page *
+steal_suitable_fallback(struct zone *zone, struct page *page,
+			int current_order, int order, int start_type,
+			unsigned int alloc_flags, bool whole_block)
 {
-	unsigned int current_order = buddy_order(page);
 	int free_pages, movable_pages, alike_pages;
-	int old_block_type;
+	unsigned long start_pfn, end_pfn;
+	int block_type;
 
-	old_block_type = get_pageblock_migratetype(page);
+	block_type = get_pageblock_migratetype(page);
 
 	/*
 	 * This can happen due to races and we want to prevent broken
 	 * highatomic accounting.
 	 */
-	if (is_migrate_highatomic(old_block_type))
+	if (is_migrate_highatomic(block_type))
 		goto single_page;
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
+		del_page_from_free_list(page, zone, current_order);
 		change_pageblock_range(page, current_order, start_type);
-		goto single_page;
+		expand(zone, page, order, current_order, start_type);
+		return page;
 	}
 
 	/*
@@ -1789,10 +1820,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 	if (!whole_block)
 		goto single_page;
 
-	free_pages = move_freepages_block(zone, page, start_type,
-						&movable_pages);
 	/* moving whole block can fail due to zone boundary conditions */
-	if (!free_pages)
+	if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn,
+				       &free_pages, &movable_pages))
 		goto single_page;
 
 	/*
@@ -1810,7 +1840,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 		 * vice versa, be conservative since we can't distinguish the
 		 * exact migratetype of non-movable pages.
 		 */
-		if (old_block_type == MIGRATE_MOVABLE)
+		if (block_type == MIGRATE_MOVABLE)
 			alike_pages = pageblock_nr_pages
 						- (free_pages + movable_pages);
 		else
@@ -1821,13 +1851,16 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 	 * compatible migratability as our allocation, claim the whole block.
 	 */
 	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
-			page_group_by_mobility_disabled)
+			page_group_by_mobility_disabled) {
+		move_freepages(zone, start_pfn, end_pfn, start_type);
 		set_pageblock_migratetype(page, start_type);
-
-	return;
+		return __rmqueue_smallest(zone, order, start_type);
+	}
 
 single_page:
-	move_to_free_list(page, zone, current_order, start_type);
+	del_page_from_free_list(page, zone, current_order);
+	expand(zone, page, order, current_order, block_type);
+	return page;
 }
 
 /*
@@ -1895,9 +1928,10 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
 	mt = get_pageblock_migratetype(page);
 	/* Only reserve normal pageblocks (i.e., they can merge with others) */
 	if (migratetype_is_mergeable(mt)) {
-		zone->nr_reserved_highatomic += pageblock_nr_pages;
-		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
-		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
+		if (move_freepages_block(zone, page, MIGRATE_HIGHATOMIC) != -1) {
+			set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+			zone->nr_reserved_highatomic += pageblock_nr_pages;
+		}
 	}
 
 out_unlock:
@@ -1922,7 +1956,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 	struct zone *zone;
 	struct page *page;
 	int order;
-	bool ret;
+	int ret;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
 								ac->nodemask) {
@@ -1971,10 +2005,14 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * of pageblocks that cannot be completely freed
 			 * may increase.
 			 */
+			ret = move_freepages_block(zone, page, ac->migratetype);
+			/*
+			 * Reserving this block already succeeded, so this should
+			 * not fail on zone boundaries.
+			 */
+			WARN_ON_ONCE(ret == -1);
 			set_pageblock_migratetype(page, ac->migratetype);
-			ret = move_freepages_block(zone, page, ac->migratetype,
-									NULL);
-			if (ret) {
+			if (ret > 0) {
 				spin_unlock_irqrestore(&zone->lock, flags);
 				return ret;
 			}
@@ -1995,7 +2033,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
  * deviation from the rest of this file, to make the for loop
  * condition simpler.
  */
-static __always_inline bool
+static __always_inline struct page *
 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 						unsigned int alloc_flags)
 {
@@ -2042,7 +2080,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
 		goto do_steal;
 	}
 
-	return false;
+	return NULL;
 
 find_smallest:
 	for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
@@ -2062,14 +2100,14 @@ find_smallest:
 do_steal:
 	page = get_page_from_free_area(area, fallback_mt);
 
-	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
-								can_steal);
+	/* take off list, maybe claim block, expand remainder */
+	page = steal_suitable_fallback(zone, page, current_order, order,
+				       start_migratetype, alloc_flags, can_steal);
 
 	trace_mm_page_alloc_extfrag(page, order, current_order,
 		start_migratetype, fallback_mt);
 
-	return true;
-
+	return page;
 }
 
 /*
@@ -2096,15 +2134,15 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
 				return page;
 		}
 	}
-retry:
+
 	page = __rmqueue_smallest(zone, order, migratetype);
 	if (unlikely(!page)) {
 		if (alloc_flags & ALLOC_CMA)
 			page = __rmqueue_cma_fallback(zone, order);
 
-		if (!page && __rmqueue_fallback(zone, order, migratetype,
-								alloc_flags))
-			goto retry;
+		if (!page)
+			page = __rmqueue_fallback(zone, order, migratetype,
+						  alloc_flags);
 	}
 	return page;
 }
@@ -2659,12 +2697,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
 			 * Only change normal pageblocks (i.e., they can merge
 			 * with others)
 			 */
-			if (migratetype_is_mergeable(mt)) {
-				set_pageblock_migratetype(page,
-							  MIGRATE_MOVABLE);
-				move_freepages_block(zone, page,
-						     MIGRATE_MOVABLE, NULL);
-			}
+			if (migratetype_is_mergeable(mt) &&
+			    move_freepages_block(zone, page,
+						 MIGRATE_MOVABLE) != -1)
+				set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 		}
 	}
 
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a5c8fa4c2a75..71539d7b96cf 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -178,15 +178,18 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
 			migratetype, isol_flags);
 	if (!unmovable) {
-		unsigned long nr_pages;
+		int nr_pages;
 		int mt = get_pageblock_migratetype(page);
 
+		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+		/* Block spans zone boundaries? */
+		if (nr_pages == -1) {
+			spin_unlock_irqrestore(&zone->lock, flags);
+			return -EBUSY;
+		}
+		__mod_zone_freepage_state(zone, -nr_pages, mt);
 		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
 		zone->nr_isolate_pageblock++;
-		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
-									NULL);
-
-		__mod_zone_freepage_state(zone, -nr_pages, mt);
 		spin_unlock_irqrestore(&zone->lock, flags);
 		return 0;
 	}
@@ -206,7 +209,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 static void unset_migratetype_isolate(struct page *page, int migratetype)
 {
 	struct zone *zone;
-	unsigned long flags, nr_pages;
+	unsigned long flags;
 	bool isolated_page = false;
 	unsigned int order;
 	struct page *buddy;
@@ -252,7 +255,12 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
 	 * allocation.
 	 */
 	if (!isolated_page) {
-		nr_pages = move_freepages_block(zone, page, migratetype, NULL);
+		int nr_pages = move_freepages_block(zone, page, migratetype);
+		/*
+		 * Isolating this block already succeeded, so this
+		 * should not fail on zone boundaries.
+		 */
+		WARN_ON_ONCE(nr_pages == -1);
 		__mod_zone_freepage_state(zone, nr_pages, migratetype);
 	}
 	set_pageblock_migratetype(page, migratetype);
-- 
cgit v1.2.3


From fd919a85cd55be5d00a6a7372071f44c8eafb825 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 20 Mar 2024 14:02:14 -0400
Subject: mm: page_isolation: prepare for hygienic freelists

Page isolation currently sets MIGRATE_ISOLATE on a block, then drops
zone->lock and scans the block for straddling buddies to split up.
Because this happens non-atomically wrt the page allocator, it's possible
for allocations to get a buddy whose first block is a regular pcp
migratetype but whose tail is isolated.  This means that in certain cases
memory can still be allocated after isolation.  It will also trigger the
freelist type hygiene warnings in subsequent patches.

start_isolate_page_range()
  isolate_single_pageblock()
    set_migratetype_isolate(tail)
      lock zone->lock
      move_freepages_block(tail) // nop
      set_pageblock_migratetype(tail)
      unlock zone->lock
                                                     __rmqueue_smallest()
                                                       del_page_from_freelist(head)
                                                       expand(head, head_mt)
                                                         WARN(head_mt != tail_mt)
    start_pfn = ALIGN_DOWN(MAX_ORDER_NR_PAGES)
    for (pfn = start_pfn, pfn < end_pfn)
      if (PageBuddy())
        split_free_page(head)

Introduce a variant of move_freepages_block() provided by the allocator
specifically for page isolation; it moves free pages, converts the block,
and handles the splitting of straddling buddies while holding zone->lock.

The allocator knows that pageblocks and buddies are always naturally
aligned, which means that buddies can only straddle blocks if they're
actually >pageblock_order.  This means the search-and-split part can be
simplified compared to what page isolation used to do.

Also tighten up the page isolation code around the expectations of which
pages can be large, and how they are freed.

Based on extensive discussions with and invaluable input from Zi Yan.

[hannes@cmpxchg.org: work around older gcc warning]
  Link: https://lkml.kernel.org/r/20240321142426.GB777580@cmpxchg.org
Link: https://lkml.kernel.org/r/20240320180429.678181-10-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-isolation.h |   4 +-
 mm/internal.h                  |   4 -
 mm/page_alloc.c                | 204 ++++++++++++++++++++++++-----------------
 mm/page_isolation.c            | 106 +++++++--------------
 4 files changed, 155 insertions(+), 163 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h
index 8550b3c91480..c16db0067090 100644
--- a/include/linux/page-isolation.h
+++ b/include/linux/page-isolation.h
@@ -34,7 +34,9 @@ static inline bool is_migrate_isolate(int migratetype)
 #define REPORT_FAILURE	0x2
 
 void set_pageblock_migratetype(struct page *page, int migratetype);
-int move_freepages_block(struct zone *zone, struct page *page, int migratetype);
+
+bool move_freepages_block_isolate(struct zone *zone, struct page *page,
+				  int migratetype);
 
 int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
 			     int migratetype, int flags, gfp_t gfp_flags);
diff --git a/mm/internal.h b/mm/internal.h
index ab8250d8a591..cbc40f07537c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -562,10 +562,6 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
 void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
 		unsigned long, enum meminit_context, struct vmem_altmap *, int);
 
-
-int split_free_page(struct page *free_page,
-			unsigned int order, unsigned long split_pfn_offset);
-
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fee52ce8ab2d..fe7853eb3ea0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -833,64 +833,6 @@ done_merging:
 		page_reporting_notify_free(order);
 }
 
-/**
- * split_free_page() -- split a free page at split_pfn_offset
- * @free_page:		the original free page
- * @order:		the order of the page
- * @split_pfn_offset:	split offset within the page
- *
- * Return -ENOENT if the free page is changed, otherwise 0
- *
- * It is used when the free page crosses two pageblocks with different migratetypes
- * at split_pfn_offset within the page. The split free page will be put into
- * separate migratetype lists afterwards. Otherwise, the function achieves
- * nothing.
- */
-int split_free_page(struct page *free_page,
-			unsigned int order, unsigned long split_pfn_offset)
-{
-	struct zone *zone = page_zone(free_page);
-	unsigned long free_page_pfn = page_to_pfn(free_page);
-	unsigned long pfn;
-	unsigned long flags;
-	int free_page_order;
-	int mt;
-	int ret = 0;
-
-	if (split_pfn_offset == 0)
-		return ret;
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
-		ret = -ENOENT;
-		goto out;
-	}
-
-	mt = get_pfnblock_migratetype(free_page, free_page_pfn);
-	if (likely(!is_migrate_isolate(mt)))
-		__mod_zone_freepage_state(zone, -(1UL << order), mt);
-
-	del_page_from_free_list(free_page, zone, order);
-	for (pfn = free_page_pfn;
-	     pfn < free_page_pfn + (1UL << order);) {
-		int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
-
-		free_page_order = min_t(unsigned int,
-					pfn ? __ffs(pfn) : order,
-					__fls(split_pfn_offset));
-		__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
-				mt, FPI_NONE);
-		pfn += 1UL << free_page_order;
-		split_pfn_offset -= (1UL << free_page_order);
-		/* we have done the first part, now switch to second part */
-		if (split_pfn_offset == 0)
-			split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
-	}
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return ret;
-}
 /*
  * A bad page could be due to a number of fields. Instead of multiple branches,
  * try and check multiple fields with one check. The caller must do a detailed
@@ -1674,8 +1616,8 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
 	return true;
 }
 
-int move_freepages_block(struct zone *zone, struct page *page,
-			 int migratetype)
+static int move_freepages_block(struct zone *zone, struct page *page,
+				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
 
@@ -1686,6 +1628,123 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	return move_freepages(zone, start_pfn, end_pfn, migratetype);
 }
 
+#ifdef CONFIG_MEMORY_ISOLATION
+/* Look for a buddy that straddles start_pfn */
+static unsigned long find_large_buddy(unsigned long start_pfn)
+{
+	int order = 0;
+	struct page *page;
+	unsigned long pfn = start_pfn;
+
+	while (!PageBuddy(page = pfn_to_page(pfn))) {
+		/* Nothing found */
+		if (++order > MAX_PAGE_ORDER)
+			return start_pfn;
+		pfn &= ~0UL << order;
+	}
+
+	/*
+	 * Found a preceding buddy, but does it straddle?
+	 */
+	if (pfn + (1 << buddy_order(page)) > start_pfn)
+		return pfn;
+
+	/* Nothing found */
+	return start_pfn;
+}
+
+/* Split a multi-block free page into its individual pageblocks */
+static void split_large_buddy(struct zone *zone, struct page *page,
+			      unsigned long pfn, int order)
+{
+	unsigned long end_pfn = pfn + (1 << order);
+
+	VM_WARN_ON_ONCE(order <= pageblock_order);
+	VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1));
+
+	/* Caller removed page from freelist, buddy info cleared! */
+	VM_WARN_ON_ONCE(PageBuddy(page));
+
+	while (pfn != end_pfn) {
+		int mt = get_pfnblock_migratetype(page, pfn);
+
+		__free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE);
+		pfn += pageblock_nr_pages;
+		page = pfn_to_page(pfn);
+	}
+}
+
+/**
+ * move_freepages_block_isolate - move free pages in block for page isolation
+ * @zone: the zone
+ * @page: the pageblock page
+ * @migratetype: migratetype to set on the pageblock
+ *
+ * This is similar to move_freepages_block(), but handles the special
+ * case encountered in page isolation, where the block of interest
+ * might be part of a larger buddy spanning multiple pageblocks.
+ *
+ * Unlike the regular page allocator path, which moves pages while
+ * stealing buddies off the freelist, page isolation is interested in
+ * arbitrary pfn ranges that may have overlapping buddies on both ends.
+ *
+ * This function handles that. Straddling buddies are split into
+ * individual pageblocks. Only the block of interest is moved.
+ *
+ * Returns %true if pages could be moved, %false otherwise.
+ */
+bool move_freepages_block_isolate(struct zone *zone, struct page *page,
+				  int migratetype)
+{
+	unsigned long start_pfn, end_pfn, pfn;
+	int nr_moved, mt;
+
+	if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn,
+				       NULL, NULL))
+		return false;
+
+	/* No splits needed if buddies can't span multiple blocks */
+	if (pageblock_order == MAX_PAGE_ORDER)
+		goto move;
+
+	/* We're a tail block in a larger buddy */
+	pfn = find_large_buddy(start_pfn);
+	if (pfn != start_pfn) {
+		struct page *buddy = pfn_to_page(pfn);
+		int order = buddy_order(buddy);
+		int mt = get_pfnblock_migratetype(buddy, pfn);
+
+		if (!is_migrate_isolate(mt))
+			__mod_zone_freepage_state(zone, -(1UL << order), mt);
+		del_page_from_free_list(buddy, zone, order);
+		set_pageblock_migratetype(page, migratetype);
+		split_large_buddy(zone, buddy, pfn, order);
+		return true;
+	}
+
+	/* We're the starting block of a larger buddy */
+	if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
+		int mt = get_pfnblock_migratetype(page, pfn);
+		int order = buddy_order(page);
+
+		if (!is_migrate_isolate(mt))
+			__mod_zone_freepage_state(zone, -(1UL << order), mt);
+		del_page_from_free_list(page, zone, order);
+		set_pageblock_migratetype(page, migratetype);
+		split_large_buddy(zone, page, pfn, order);
+		return true;
+	}
+move:
+	mt = get_pfnblock_migratetype(page, start_pfn);
+	nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype);
+	if (!is_migrate_isolate(mt))
+		__mod_zone_freepage_state(zone, -nr_moved, mt);
+	else if (!is_migrate_isolate(migratetype))
+		__mod_zone_freepage_state(zone, nr_moved, migratetype);
+	return true;
+}
+#endif /* CONFIG_MEMORY_ISOLATION */
+
 static void change_pageblock_range(struct page *pageblock_page,
 					int start_order, int migratetype)
 {
@@ -6365,7 +6424,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 		       unsigned migratetype, gfp_t gfp_mask)
 {
 	unsigned long outer_start, outer_end;
-	int order;
 	int ret = 0;
 
 	struct compact_control cc = {
@@ -6438,29 +6496,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
 	 * We don't have to hold zone->lock here because the pages are
 	 * isolated thus they won't get removed from buddy.
 	 */
-
-	order = 0;
-	outer_start = start;
-	while (!PageBuddy(pfn_to_page(outer_start))) {
-		if (++order > MAX_PAGE_ORDER) {
-			outer_start = start;
-			break;
-		}
-		outer_start &= ~0UL << order;
-	}
-
-	if (outer_start != start) {
-		order = buddy_order(pfn_to_page(outer_start));
-
-		/*
-		 * outer_start page could be small order buddy page and
-		 * it doesn't include start page. Adjust outer_start
-		 * in this case to report failed page properly
-		 * on tracepoint in test_pages_isolated()
-		 */
-		if (outer_start + (1UL << order) <= start)
-			outer_start = start;
-	}
+	outer_start = find_large_buddy(start);
 
 	/* Make sure the range is really isolated. */
 	if (test_pages_isolated(outer_start, end, 0)) {
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f84f0981b2df..042937d5abe4 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -178,16 +178,10 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
 	unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end,
 			migratetype, isol_flags);
 	if (!unmovable) {
-		int nr_pages;
-		int mt = get_pageblock_migratetype(page);
-
-		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
-		/* Block spans zone boundaries? */
-		if (nr_pages == -1) {
+		if (!move_freepages_block_isolate(zone, page, MIGRATE_ISOLATE)) {
 			spin_unlock_irqrestore(&zone->lock, flags);
 			return -EBUSY;
 		}
-		__mod_zone_freepage_state(zone, -nr_pages, mt);
 		zone->nr_isolate_pageblock++;
 		spin_unlock_irqrestore(&zone->lock, flags);
 		return 0;
@@ -254,13 +248,11 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
 	 * allocation.
 	 */
 	if (!isolated_page) {
-		int nr_pages = move_freepages_block(zone, page, migratetype);
 		/*
 		 * Isolating this block already succeeded, so this
 		 * should not fail on zone boundaries.
 		 */
-		WARN_ON_ONCE(nr_pages == -1);
-		__mod_zone_freepage_state(zone, nr_pages, migratetype);
+		WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
 	} else {
 		set_pageblock_migratetype(page, migratetype);
 		__putback_isolated_page(page, order, migratetype);
@@ -374,26 +366,29 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 
 		VM_BUG_ON(!page);
 		pfn = page_to_pfn(page);
-		/*
-		 * start_pfn is MAX_ORDER_NR_PAGES aligned, if there is any
-		 * free pages in [start_pfn, boundary_pfn), its head page will
-		 * always be in the range.
-		 */
+
 		if (PageBuddy(page)) {
 			int order = buddy_order(page);
 
-			if (pfn + (1UL << order) > boundary_pfn) {
-				/* free page changed before split, check it again */
-				if (split_free_page(page, order, boundary_pfn - pfn))
-					continue;
-			}
+			/* move_freepages_block_isolate() handled this */
+			VM_WARN_ON_ONCE(pfn + (1 << order) > boundary_pfn);
 
 			pfn += 1UL << order;
 			continue;
 		}
+
 		/*
-		 * migrate compound pages then let the free page handling code
-		 * above do the rest. If migration is not possible, just fail.
+		 * If a compound page is straddling our block, attempt
+		 * to migrate it out of the way.
+		 *
+		 * We don't have to worry about this creating a large
+		 * free page that straddles into our block: gigantic
+		 * pages are freed as order-0 chunks, and LRU pages
+		 * (currently) do not exceed pageblock_order.
+		 *
+		 * The block of interest has already been marked
+		 * MIGRATE_ISOLATE above, so when migration is done it
+		 * will free its pages onto the correct freelists.
 		 */
 		if (PageCompound(page)) {
 			struct page *head = compound_head(page);
@@ -404,16 +399,10 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				pfn = head_pfn + nr_pages;
 				continue;
 			}
+
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
-			/*
-			 * hugetlb, lru compound (THP), and movable compound pages
-			 * can be migrated. Otherwise, fail the isolation.
-			 */
-			if (PageHuge(page) || PageLRU(page) || __PageMovable(page)) {
-				int order;
-				unsigned long outer_pfn;
+			if (PageHuge(page)) {
 				int page_mt = get_pageblock_migratetype(page);
-				bool isolate_page = !is_migrate_isolate_page(page);
 				struct compact_control cc = {
 					.nr_migratepages = 0,
 					.order = -1,
@@ -426,56 +415,25 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
 				};
 				INIT_LIST_HEAD(&cc.migratepages);
 
-				/*
-				 * XXX: mark the page as MIGRATE_ISOLATE so that
-				 * no one else can grab the freed page after migration.
-				 * Ideally, the page should be freed as two separate
-				 * pages to be added into separate migratetype free
-				 * lists.
-				 */
-				if (isolate_page) {
-					ret = set_migratetype_isolate(page, page_mt,
-						flags, head_pfn, head_pfn + nr_pages);
-					if (ret)
-						goto failed;
-				}
-
 				ret = __alloc_contig_migrate_range(&cc, head_pfn,
 							head_pfn + nr_pages, page_mt);
-
-				/*
-				 * restore the page's migratetype so that it can
-				 * be split into separate migratetype free lists
-				 * later.
-				 */
-				if (isolate_page)
-					unset_migratetype_isolate(page, page_mt);
-
 				if (ret)
 					goto failed;
-				/*
-				 * reset pfn to the head of the free page, so
-				 * that the free page handling code above can split
-				 * the free page to the right migratetype list.
-				 *
-				 * head_pfn is not used here as a hugetlb page order
-				 * can be bigger than MAX_PAGE_ORDER, but after it is
-				 * freed, the free page order is not. Use pfn within
-				 * the range to find the head of the free page.
-				 */
-				order = 0;
-				outer_pfn = pfn;
-				while (!PageBuddy(pfn_to_page(outer_pfn))) {
-					/* stop if we cannot find the free page */
-					if (++order > MAX_PAGE_ORDER)
-						goto failed;
-					outer_pfn &= ~0UL << order;
-				}
-				pfn = outer_pfn;
+				pfn = head_pfn + nr_pages;
 				continue;
-			} else
+			}
+
+			/*
+			 * These pages are movable too, but they're
+			 * not expected to exceed pageblock_order.
+			 *
+			 * Let us know when they do, so we can add
+			 * proper free and split handling for them.
+			 */
+			VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
+			VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
 #endif
-				goto failed;
+			goto failed;
 		}
 
 		pfn++;
-- 
cgit v1.2.3


From e0932b6c1f942fa747258e152cdce0d0b2b5be5c Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 20 Mar 2024 14:02:15 -0400
Subject: mm: page_alloc: consolidate free page accounting

Free page accounting currently happens a bit too high up the call stack,
where it has to deal with guard pages, compaction capturing, block
stealing and even page isolation.  This is subtle and fragile, and makes
it difficult to hack on the code.

Now that type violations on the freelists have been fixed, push the
accounting down to where pages enter and leave the freelist.

[hannes@cmpxchg.org: undo unrelated drive-by line wrap]
  Link: https://lkml.kernel.org/r/20240327185736.GA7597@cmpxchg.org
[hannes@cmpxchg.org: remove unused page parameter from account_freepages()]
  Link: https://lkml.kernel.org/r/20240327185831.GB7597@cmpxchg.org
[baolin.wang@linux.alibaba.com: fix free page accounting]
  Link: https://lkml.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang@linux.alibaba.com
[andriy.shevchenko@linux.intel.com: avoid defining unused function]
  Link: https://lkml.kernel.org/r/20240423161506.2637177-1-andriy.shevchenko@linux.intel.com
Link: https://lkml.kernel.org/r/20240320180429.678181-11-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h     |  18 +++--
 include/linux/vmstat.h |   8 ---
 mm/debug_page_alloc.c  |  12 +---
 mm/internal.h          |   5 --
 mm/page_alloc.c        | 192 +++++++++++++++++++++++++++----------------------
 5 files changed, 118 insertions(+), 117 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 60eabc6c8e00..1588fe15a38e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3797,24 +3797,22 @@ static inline bool page_is_guard(struct page *page)
 	return PageGuard(page);
 }
 
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
-		      int migratetype);
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
 static inline bool set_page_guard(struct zone *zone, struct page *page,
-				  unsigned int order, int migratetype)
+				  unsigned int order)
 {
 	if (!debug_guardpage_enabled())
 		return false;
-	return __set_page_guard(zone, page, order, migratetype);
+	return __set_page_guard(zone, page, order);
 }
 
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
-			int migratetype);
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
 static inline void clear_page_guard(struct zone *zone, struct page *page,
-				    unsigned int order, int migratetype)
+				    unsigned int order)
 {
 	if (!debug_guardpage_enabled())
 		return;
-	__clear_page_guard(zone, page, order, migratetype);
+	__clear_page_guard(zone, page, order);
 }
 
 #else	/* CONFIG_DEBUG_PAGEALLOC */
@@ -3824,9 +3822,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
 static inline bool debug_guardpage_enabled(void) { return false; }
 static inline bool page_is_guard(struct page *page) { return false; }
 static inline bool set_page_guard(struct zone *zone, struct page *page,
-			unsigned int order, int migratetype) { return false; }
+			unsigned int order) { return false; }
 static inline void clear_page_guard(struct zone *zone, struct page *page,
-				unsigned int order, int migratetype) {}
+				unsigned int order) {}
 #endif	/* CONFIG_DEBUG_PAGEALLOC */
 
 #ifdef __HAVE_ARCH_GATE_AREA
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 343906a98d6e..735eae6e272c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio,
 	mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
 }
 
-static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
-					     int migratetype)
-{
-	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
-	if (is_migrate_cma(migratetype))
-		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
-}
-
 extern const char * const vmstat_text[];
 
 static inline const char *zone_stat_name(enum zone_stat_item item)
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index 6755f0c9d4a3..d46acf989dde 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
 }
 early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
 
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
-		      int migratetype)
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order)
 {
 	if (order >= debug_guardpage_minorder())
 		return false;
@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
 	__SetPageGuard(page);
 	INIT_LIST_HEAD(&page->buddy_list);
 	set_page_private(page, order);
-	/* Guard pages are not available for any usage */
-	if (!is_migrate_isolate(migratetype))
-		__mod_zone_freepage_state(zone, -(1 << order), migratetype);
 
 	return true;
 }
 
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
-		      int migratetype)
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order)
 {
 	__ClearPageGuard(page);
-
 	set_page_private(page, 0);
-	if (!is_migrate_isolate(migratetype))
-		__mod_zone_freepage_state(zone, (1 << order), migratetype);
 }
diff --git a/mm/internal.h b/mm/internal.h
index cbc40f07537c..fb219e31f0f0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1039,11 +1039,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype)
 	return migratetype == MIGRATE_HIGHATOMIC;
 }
 
-static inline bool is_migrate_highatomic_page(struct page *page)
-{
-	return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
-}
-
 void setup_zone_pageset(struct zone *zone);
 
 struct migration_target_control {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe7853eb3ea0..b3b98a91d6a7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -643,23 +643,33 @@ compaction_capture(struct capture_control *capc, struct page *page,
 }
 #endif /* CONFIG_COMPACTION */
 
-/* Used for pages not on another list */
-static inline void add_to_free_list(struct page *page, struct zone *zone,
-				    unsigned int order, int migratetype)
+static inline void account_freepages(struct zone *zone, int nr_pages,
+				     int migratetype)
 {
-	struct free_area *area = &zone->free_area[order];
+	if (is_migrate_isolate(migratetype))
+		return;
 
-	list_add(&page->buddy_list, &area->free_list[migratetype]);
-	area->nr_free++;
+	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
+
+	if (is_migrate_cma(migratetype))
+		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
 }
 
 /* Used for pages not on another list */
-static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
-					 unsigned int order, int migratetype)
+static inline void __add_to_free_list(struct page *page, struct zone *zone,
+				      unsigned int order, int migratetype,
+				      bool tail)
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
+	VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
+		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
+		     get_pageblock_migratetype(page), migratetype, 1 << order);
+
+	if (tail)
+		list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
+	else
+		list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
 
@@ -669,16 +679,28 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
  * allocation again (e.g., optimization for memory onlining).
  */
 static inline void move_to_free_list(struct page *page, struct zone *zone,
-				     unsigned int order, int migratetype)
+				     unsigned int order, int old_mt, int new_mt)
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
+	/* Free page moving can fail, so it happens before the type update */
+	VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
+		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
+		     get_pageblock_migratetype(page), old_mt, 1 << order);
+
+	list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
+
+	account_freepages(zone, -(1 << order), old_mt);
+	account_freepages(zone, 1 << order, new_mt);
 }
 
-static inline void del_page_from_free_list(struct page *page, struct zone *zone,
-					   unsigned int order)
+static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
+					     unsigned int order, int migratetype)
 {
+        VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
+		     "page type is %lu, passed migratetype is %d (nr=%d)\n",
+		     get_pageblock_migratetype(page), migratetype, 1 << order);
+
 	/* clear reported state and update reported page count */
 	if (page_reported(page))
 		__ClearPageReported(page);
@@ -689,6 +711,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
 	zone->free_area[order].nr_free--;
 }
 
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+					   unsigned int order, int migratetype)
+{
+	__del_page_from_free_list(page, zone, order, migratetype);
+	account_freepages(zone, -(1 << order), migratetype);
+}
+
 static inline struct page *get_page_from_free_area(struct free_area *area,
 					    int migratetype)
 {
@@ -760,16 +789,16 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
 
 	VM_BUG_ON(migratetype == -1);
-	if (likely(!is_migrate_isolate(migratetype)))
-		__mod_zone_freepage_state(zone, 1 << order, migratetype);
-
 	VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
+	account_freepages(zone, 1 << order, migratetype);
+
 	while (order < MAX_PAGE_ORDER) {
+		int buddy_mt = migratetype;
+
 		if (compaction_capture(capc, page, order, migratetype)) {
-			__mod_zone_freepage_state(zone, -(1 << order),
-								migratetype);
+			account_freepages(zone, -(1 << order), migratetype);
 			return;
 		}
 
@@ -784,19 +813,12 @@ static inline void __free_one_page(struct page *page,
 			 * pageblock isolation could cause incorrect freepage or CMA
 			 * accounting or HIGHATOMIC accounting.
 			 */
-			int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
+			buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
 
-			if (migratetype != buddy_mt) {
-				if (!migratetype_is_mergeable(migratetype) ||
-				    !migratetype_is_mergeable(buddy_mt))
-					goto done_merging;
-				/*
-				 * Match buddy type. This ensures that
-				 * an expand() down the line puts the
-				 * sub-blocks on the right freelists.
-				 */
-				set_pageblock_migratetype(buddy, migratetype);
-			}
+			if (migratetype != buddy_mt &&
+			    (!migratetype_is_mergeable(migratetype) ||
+			     !migratetype_is_mergeable(buddy_mt)))
+				goto done_merging;
 		}
 
 		/*
@@ -804,9 +826,19 @@ static inline void __free_one_page(struct page *page,
 		 * merge with it and move up one order.
 		 */
 		if (page_is_guard(buddy))
-			clear_page_guard(zone, buddy, order, migratetype);
+			clear_page_guard(zone, buddy, order);
 		else
-			del_page_from_free_list(buddy, zone, order);
+			__del_page_from_free_list(buddy, zone, order, buddy_mt);
+
+		if (unlikely(buddy_mt != migratetype)) {
+			/*
+			 * Match buddy type. This ensures that an
+			 * expand() down the line puts the sub-blocks
+			 * on the right freelists.
+			 */
+			set_pageblock_migratetype(buddy, migratetype);
+		}
+
 		combined_pfn = buddy_pfn & pfn;
 		page = page + (combined_pfn - pfn);
 		pfn = combined_pfn;
@@ -823,10 +855,7 @@ done_merging:
 	else
 		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
 
-	if (to_tail)
-		add_to_free_list_tail(page, zone, order, migratetype);
-	else
-		add_to_free_list(page, zone, order, migratetype);
+	__add_to_free_list(page, zone, order, migratetype, to_tail);
 
 	/* Notify page reporting subsystem of freed page */
 	if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
@@ -1318,10 +1347,10 @@ static inline void expand(struct zone *zone, struct page *page,
 		 * Corresponding page table entries will not be touched,
 		 * pages will stay not present in virtual address space
 		 */
-		if (set_page_guard(zone, &page[size], high, migratetype))
+		if (set_page_guard(zone, &page[size], high))
 			continue;
 
-		add_to_free_list(&page[size], zone, high, migratetype);
+		add_to_free_list(&page[size], zone, high, migratetype, false);
 		set_buddy_order(&page[size], high);
 	}
 }
@@ -1492,7 +1521,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 		page = get_page_from_free_area(area, migratetype);
 		if (!page)
 			continue;
-		del_page_from_free_list(page, zone, current_order);
+		del_page_from_free_list(page, zone, current_order, migratetype);
 		expand(zone, page, order, current_order, migratetype);
 		trace_mm_page_alloc_zone_locked(page, order, migratetype,
 				pcp_allowed_order(order) &&
@@ -1532,7 +1561,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * type's freelist.
  */
 static int move_freepages(struct zone *zone, unsigned long start_pfn,
-			  unsigned long end_pfn, int migratetype)
+			  unsigned long end_pfn, int old_mt, int new_mt)
 {
 	struct page *page;
 	unsigned long pfn;
@@ -1554,12 +1583,14 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn,
 		VM_BUG_ON_PAGE(page_zone(page) != zone, page);
 
 		order = buddy_order(page);
-		move_to_free_list(page, zone, order, migratetype);
+
+		move_to_free_list(page, zone, order, old_mt, new_mt);
+
 		pfn += 1 << order;
 		pages_moved += 1 << order;
 	}
 
-	set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype);
+	set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
 
 	return pages_moved;
 }
@@ -1617,7 +1648,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
 }
 
 static int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+				int old_mt, int new_mt)
 {
 	unsigned long start_pfn, end_pfn;
 
@@ -1625,7 +1656,7 @@ static int move_freepages_block(struct zone *zone, struct page *page,
 				       NULL, NULL))
 		return -1;
 
-	return move_freepages(zone, start_pfn, end_pfn, migratetype);
+	return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt);
 }
 
 #ifdef CONFIG_MEMORY_ISOLATION
@@ -1697,7 +1728,6 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 				  int migratetype)
 {
 	unsigned long start_pfn, end_pfn, pfn;
-	int nr_moved, mt;
 
 	if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn,
 				       NULL, NULL))
@@ -1712,11 +1742,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 	if (pfn != start_pfn) {
 		struct page *buddy = pfn_to_page(pfn);
 		int order = buddy_order(buddy);
-		int mt = get_pfnblock_migratetype(buddy, pfn);
 
-		if (!is_migrate_isolate(mt))
-			__mod_zone_freepage_state(zone, -(1UL << order), mt);
-		del_page_from_free_list(buddy, zone, order);
+		del_page_from_free_list(buddy, zone, order,
+					get_pfnblock_migratetype(buddy, pfn));
 		set_pageblock_migratetype(page, migratetype);
 		split_large_buddy(zone, buddy, pfn, order);
 		return true;
@@ -1724,23 +1752,17 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
 
 	/* We're the starting block of a larger buddy */
 	if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
-		int mt = get_pfnblock_migratetype(page, pfn);
 		int order = buddy_order(page);
 
-		if (!is_migrate_isolate(mt))
-			__mod_zone_freepage_state(zone, -(1UL << order), mt);
-		del_page_from_free_list(page, zone, order);
+		del_page_from_free_list(page, zone, order,
+					get_pfnblock_migratetype(page, pfn));
 		set_pageblock_migratetype(page, migratetype);
 		split_large_buddy(zone, page, pfn, order);
 		return true;
 	}
 move:
-	mt = get_pfnblock_migratetype(page, start_pfn);
-	nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype);
-	if (!is_migrate_isolate(mt))
-		__mod_zone_freepage_state(zone, -nr_moved, mt);
-	else if (!is_migrate_isolate(migratetype))
-		__mod_zone_freepage_state(zone, nr_moved, migratetype);
+	move_freepages(zone, start_pfn, end_pfn,
+		       get_pfnblock_migratetype(page, start_pfn), migratetype);
 	return true;
 }
 #endif /* CONFIG_MEMORY_ISOLATION */
@@ -1854,7 +1876,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
-		del_page_from_free_list(page, zone, current_order);
+		del_page_from_free_list(page, zone, current_order, block_type);
 		change_pageblock_range(page, current_order, start_type);
 		expand(zone, page, order, current_order, start_type);
 		return page;
@@ -1904,12 +1926,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
 	 */
 	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled) {
-		move_freepages(zone, start_pfn, end_pfn, start_type);
+		move_freepages(zone, start_pfn, end_pfn, block_type, start_type);
 		return __rmqueue_smallest(zone, order, start_type);
 	}
 
 single_page:
-	del_page_from_free_list(page, zone, current_order);
+	del_page_from_free_list(page, zone, current_order, block_type);
 	expand(zone, page, order, current_order, block_type);
 	return page;
 }
@@ -1979,7 +2001,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
 	mt = get_pageblock_migratetype(page);
 	/* Only reserve normal pageblocks (i.e., they can merge with others) */
 	if (migratetype_is_mergeable(mt))
-		if (move_freepages_block(zone, page,
+		if (move_freepages_block(zone, page, mt,
 					 MIGRATE_HIGHATOMIC) != -1)
 			zone->nr_reserved_highatomic += pageblock_nr_pages;
 
@@ -2020,11 +2042,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < NR_PAGE_ORDERS; order++) {
 			struct free_area *area = &(zone->free_area[order]);
+			int mt;
 
 			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
 			if (!page)
 				continue;
 
+			mt = get_pageblock_migratetype(page);
 			/*
 			 * In page freeing path, migratetype change is racy so
 			 * we can counter several free pages in a pageblock
@@ -2032,7 +2056,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * from highatomic to ac->migratetype. So we should
 			 * adjust the count once.
 			 */
-			if (is_migrate_highatomic_page(page)) {
+			if (is_migrate_highatomic(mt)) {
 				/*
 				 * It should never happen but changes to
 				 * locking could inadvertently allow a per-cpu
@@ -2054,7 +2078,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * of pageblocks that cannot be completely freed
 			 * may increase.
 			 */
-			ret = move_freepages_block(zone, page, ac->migratetype);
+			ret = move_freepages_block(zone, page, mt,
+						   ac->migratetype);
 			/*
 			 * Reserving this block already succeeded, so this should
 			 * not fail on zone boundaries.
@@ -2225,12 +2250,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * pages are ordered properly.
 		 */
 		list_add_tail(&page->pcp_list, list);
-		if (is_migrate_cma(get_pageblock_migratetype(page)))
-			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-					      -(1 << order));
 	}
-
-	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
 	spin_unlock_irqrestore(&zone->lock, flags);
 
 	return i;
@@ -2723,11 +2743,9 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
 		if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
 			return 0;
-
-		__mod_zone_freepage_state(zone, -(1UL << order), mt);
 	}
 
-	del_page_from_free_list(page, zone, order);
+	del_page_from_free_list(page, zone, order, mt);
 
 	/*
 	 * Set the pageblock if the isolated page is at least half of a
@@ -2742,7 +2760,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 			 * with others)
 			 */
 			if (migratetype_is_mergeable(mt))
-				move_freepages_block(zone, page,
+				move_freepages_block(zone, page, mt,
 						     MIGRATE_MOVABLE);
 		}
 	}
@@ -2827,8 +2845,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 				return NULL;
 			}
 		}
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pageblock_migratetype(page));
 		spin_unlock_irqrestore(&zone->lock, flags);
 	} while (check_new_pages(page, order));
 
@@ -6716,8 +6732,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 
 		BUG_ON(page_count(page));
 		BUG_ON(!PageBuddy(page));
+		VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE);
 		order = buddy_order(page);
-		del_page_from_free_list(page, zone, order);
+		del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
 		pfn += (1 << order);
 	}
 	spin_unlock_irqrestore(&zone->lock, flags);
@@ -6745,6 +6762,14 @@ bool is_free_buddy_page(struct page *page)
 EXPORT_SYMBOL(is_free_buddy_page);
 
 #ifdef CONFIG_MEMORY_FAILURE
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+				    unsigned int order, int migratetype,
+				    bool tail)
+{
+	__add_to_free_list(page, zone, order, migratetype, tail);
+	account_freepages(zone, 1 << order, migratetype);
+}
+
 /*
  * Break down a higher-order page in sub-pages, and keep our target out of
  * buddy allocator.
@@ -6767,10 +6792,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
 			current_buddy = page + size;
 		}
 
-		if (set_page_guard(zone, current_buddy, high, migratetype))
+		if (set_page_guard(zone, current_buddy, high))
 			continue;
 
-		add_to_free_list(current_buddy, zone, high, migratetype);
+		add_to_free_list(current_buddy, zone, high, migratetype, false);
 		set_buddy_order(current_buddy, high);
 	}
 }
@@ -6796,12 +6821,11 @@ bool take_page_off_buddy(struct page *page)
 			int migratetype = get_pfnblock_migratetype(page_head,
 								   pfn_head);
 
-			del_page_from_free_list(page_head, zone, page_order);
+			del_page_from_free_list(page_head, zone, page_order,
+						migratetype);
 			break_down_buddy_pages(zone, page_head, page, 0,
 						page_order, migratetype);
 			SetPageHWPoisonTakenOff(page);
-			if (!is_migrate_isolate(migratetype))
-				__mod_zone_freepage_state(zone, -1, migratetype);
 			ret = true;
 			break;
 		}
@@ -6909,7 +6933,7 @@ static bool try_to_accept_memory_one(struct zone *zone)
 	list_del(&page->lru);
 	last = list_empty(&zone->unaccepted_pages);
 
-	__mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+	account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
 	__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
@@ -6961,7 +6985,7 @@ static bool __free_unaccepted(struct page *page)
 	spin_lock_irqsave(&zone->lock, flags);
 	first = list_empty(&zone->unaccepted_pages);
 	list_add_tail(&page->lru, &zone->unaccepted_pages);
-	__mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+	account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
 	__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-- 
cgit v1.2.3


From 42d0c3fbb5811fbfb663d8ede1d7ffba02e7ae18 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Wed, 6 Mar 2024 18:13:27 +0800
Subject: mm: hugetlb: make the hugetlb migration strategy consistent

As discussed in previous thread [1], there is an inconsistency when
handing hugetlb migration.  When handling the migration of freed hugetlb,
it prevents fallback to other NUMA nodes in
alloc_and_dissolve_hugetlb_folio().  However, when dealing with in-use
hugetlb, it allows fallback to other NUMA nodes in
alloc_hugetlb_folio_nodemask(), which can break the per-node hugetlb pool
and might result in unexpected failures when node bound workloads doesn't
get what is asssumed available.

To make hugetlb migration strategy more clear, we should list all the scenarios
of hugetlb migration and analyze whether allocation fallback is permitted:

1) Memory offline: will call dissolve_free_huge_pages() to free the
   freed hugetlb, and call do_migrate_range() to migrate the in-use
   hugetlb.  Both can break the per-node hugetlb pool, but as this is an
   explicit offlining operation, no better choice.  So should allow the
   hugetlb allocation fallback.

2) Memory failure: same as memory offline.  Should allow fallback to a
   different node might be the only option to handle it, otherwise the
   impact of poisoned memory can be amplified.

3) Longterm pinning: will call migrate_longterm_unpinnable_pages() to
   migrate in-use and not-longterm-pinnable hugetlb, which can break the
   per-node pool.  But we should fail to longterm pinning if can not
   allocate on current node to avoid breaking the per-node pool.

4) Syscalls (mbind, migrate_pages, move_pages): these are explicit
   users operation to move pages to other nodes, so fallback to other
   nodes should not be prohibited.

5) alloc_contig_range: used by CMA allocation and virtio-mem
   fake-offline to allocate given range of pages.  Now the freed hugetlb
   migration is not allowed to fallback, to keep consistency, the in-use
   hugetlb migration should be also not allowed to fallback.

6) alloc_contig_pages: used by kfence, pgtable_debug etc.  The strategy
   should be consistent with that of alloc_contig_range().

Based on the analysis of the various scenarios above, introducing a new
helper to determine whether fallback is permitted according to the
migration reason..

[1] https://lore.kernel.org/all/6f26ce22d2fcd523418a085f2c588fe0776d46e7.1706794035.git.baolin.wang@linux.alibaba.com/
Link: https://lkml.kernel.org/r/3519fcd41522817307a05b40fb551e2e17e68101.1709719720.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 35 +++++++++++++++++++++++++++++++++--
 mm/hugetlb.c            | 14 ++++++++++++--
 mm/mempolicy.c          |  3 ++-
 mm/migrate.c            |  3 ++-
 4 files changed, 49 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 300de33c6fde..d748628efc5e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -723,7 +723,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
 struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
-				nodemask_t *nmask, gfp_t gfp_mask);
+				nodemask_t *nmask, gfp_t gfp_mask,
+				bool allow_alloc_fallback);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -946,6 +947,30 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return modified_mask;
 }
 
+static inline bool htlb_allow_alloc_fallback(int reason)
+{
+	bool allowed_fallback = false;
+
+	/*
+	 * Note: the memory offline, memory failure and migration syscalls will
+	 * be allowed to fallback to other nodes due to lack of a better chioce,
+	 * that might break the per-node hugetlb pool. While other cases will
+	 * set the __GFP_THISNODE to avoid breaking the per-node hugetlb pool.
+	 */
+	switch (reason) {
+	case MR_MEMORY_HOTPLUG:
+	case MR_MEMORY_FAILURE:
+	case MR_SYSCALL:
+	case MR_MEMPOLICY_MBIND:
+		allowed_fallback = true;
+		break;
+	default:
+		break;
+	}
+
+	return allowed_fallback;
+}
+
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 					   struct mm_struct *mm, pte_t *pte)
 {
@@ -1041,7 +1066,8 @@ static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 
 static inline struct folio *
 alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
-			nodemask_t *nmask, gfp_t gfp_mask)
+			nodemask_t *nmask, gfp_t gfp_mask,
+			bool allow_alloc_fallback)
 {
 	return NULL;
 }
@@ -1157,6 +1183,11 @@ static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
 	return 0;
 }
 
+static inline bool htlb_allow_alloc_fallback(int reason)
+{
+	return false;
+}
+
 static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
 					   struct mm_struct *mm, pte_t *pte)
 {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e4dd0c802489..2e984554ec63 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2598,7 +2598,7 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
 
 /* folio migration callback function */
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
-		nodemask_t *nmask, gfp_t gfp_mask)
+		nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
 {
 	spin_lock_irq(&hugetlb_lock);
 	if (available_huge_pages(h)) {
@@ -2613,6 +2613,10 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	}
 	spin_unlock_irq(&hugetlb_lock);
 
+	/* We cannot fallback to other nodes, as we could break the per-node pool. */
+	if (!allow_alloc_fallback)
+		gfp_mask |= __GFP_THISNODE;
+
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
@@ -6636,7 +6640,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
 
 	gfp_mask = htlb_alloc_mask(h);
 	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+	/*
+	 * This is used to allocate a temporary hugetlb to hold the copied
+	 * content, which will then be copied again to the final hugetlb
+	 * consuming a reservation. Set the alloc_fallback to false to indicate
+	 * that breaking the per-node hugetlb pool is not allowed in this case.
+	 */
+	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false);
 	mpol_cond_put(mpol);
 
 	return folio;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e128e6b7bbcb..72626b2cefa9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1228,7 +1228,8 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
 		h = folio_hstate(src);
 		gfp = htlb_alloc_mask(h);
 		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
-		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
+		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
+				htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
 	}
 
 	if (folio_test_large(src))
diff --git a/mm/migrate.c b/mm/migrate.c
index bde63010a3cf..ab9856f5931b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2022,7 +2022,8 @@ struct folio *alloc_migration_target(struct folio *src, unsigned long private)
 
 		gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
 		return alloc_hugetlb_folio_nodemask(h, nid,
-						mtc->nmask, gfp_mask);
+						mtc->nmask, gfp_mask,
+						htlb_allow_alloc_fallback(mtc->reason));
 	}
 
 	if (folio_test_large(src)) {
-- 
cgit v1.2.3


From f238b8c33c6738f146bbfbb09b78870ea157c2b7 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Sat, 23 Mar 2024 00:41:36 +1300
Subject: arm64: mm: swap: support THP_SWAP on hardware with MTE

Commit d0637c505f8a1 ("arm64: enable THP_SWAP for arm64") brings up
THP_SWAP on ARM64, but it doesn't enable THP_SWP on hardware with MTE as
the MTE code works with the assumption tags save/restore is always
handling a folio with only one page.

The limitation should be removed as more and more ARM64 SoCs have this
feature.  Co-existence of MTE and THP_SWAP becomes more and more
important.

This patch makes MTE tags saving support large folios, then we don't need
to split large folios into base pages for swapping out on ARM64 SoCs with
MTE any more.

arch_prepare_to_swap() should take folio rather than page as parameter
because we support THP swap-out as a whole.  It saves tags for all pages
in a large folio.

As now we are restoring tags based-on folio, in arch_swap_restore(), we
may increase some extra loops and early-exitings while refaulting a large
folio which is still in swapcache in do_swap_page().  In case a large
folio has nr pages, do_swap_page() will only set the PTE of the particular
page which is causing the page fault.  Thus do_swap_page() runs nr times,
and each time, arch_swap_restore() will loop nr times for those subpages
in the folio.  So right now the algorithmic complexity becomes O(nr^2).

Once we support mapping large folios in do_swap_page(), extra loops and
early-exitings will decrease while not being completely removed as a large
folio might get partially tagged in corner cases such as, 1.  a large
folio in swapcache can be partially unmapped, thus, MTE tags for the
unmapped pages will be invalidated; 2.  users might use mprotect() to set
MTEs on a part of a large folio.

arch_thp_swp_supported() is dropped since ARM64 MTE was the only one who
needed it.

Link: https://lkml.kernel.org/r/20240322114136.61386-2-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Acked-by: Chris Li <chrisl@kernel.org>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Peter Collingbourne <pcc@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/pgtable.h | 19 ++---------------
 arch/arm64/mm/mteswap.c          | 45 ++++++++++++++++++++++++++++++++++++++++
 include/linux/huge_mm.h          | 12 -----------
 include/linux/pgtable.h          |  2 +-
 mm/internal.h                    | 14 +++++++++++++
 mm/memory.c                      |  2 +-
 mm/page_io.c                     |  2 +-
 mm/shmem.c                       |  2 +-
 mm/swap_slots.c                  |  2 +-
 mm/swapfile.c                    |  2 +-
 10 files changed, 67 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 6870b60158fc..9fd8613b2db2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -49,12 +49,6 @@
 	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static inline bool arch_thp_swp_supported(void)
-{
-	return !system_supports_mte();
-}
-#define arch_thp_swp_supported arch_thp_swp_supported
-
 /*
  * Outside of a few very special situations (e.g. hibernation), we always
  * use broadcast TLB invalidation instructions, therefore a spurious page
@@ -1282,12 +1276,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 #ifdef CONFIG_ARM64_MTE
 
 #define __HAVE_ARCH_PREPARE_TO_SWAP
-static inline int arch_prepare_to_swap(struct page *page)
-{
-	if (system_supports_mte())
-		return mte_save_tags(page);
-	return 0;
-}
+extern int arch_prepare_to_swap(struct folio *folio);
 
 #define __HAVE_ARCH_SWAP_INVALIDATE
 static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
@@ -1303,11 +1292,7 @@ static inline void arch_swap_invalidate_area(int type)
 }
 
 #define __HAVE_ARCH_SWAP_RESTORE
-static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
-{
-	if (system_supports_mte())
-		mte_restore_tags(entry, &folio->page);
-}
+extern void arch_swap_restore(swp_entry_t entry, struct folio *folio);
 
 #endif /* CONFIG_ARM64_MTE */
 
diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c
index a31833e3ddc5..63e8d72f202a 100644
--- a/arch/arm64/mm/mteswap.c
+++ b/arch/arm64/mm/mteswap.c
@@ -68,6 +68,13 @@ void mte_invalidate_tags(int type, pgoff_t offset)
 	mte_free_tag_storage(tags);
 }
 
+static inline void __mte_invalidate_tags(struct page *page)
+{
+	swp_entry_t entry = page_swap_entry(page);
+
+	mte_invalidate_tags(swp_type(entry), swp_offset(entry));
+}
+
 void mte_invalidate_tags_area(int type)
 {
 	swp_entry_t entry = swp_entry(type, 0);
@@ -83,3 +90,41 @@ void mte_invalidate_tags_area(int type)
 	}
 	xa_unlock(&mte_pages);
 }
+
+int arch_prepare_to_swap(struct folio *folio)
+{
+	long i, nr;
+	int err;
+
+	if (!system_supports_mte())
+		return 0;
+
+	nr = folio_nr_pages(folio);
+
+	for (i = 0; i < nr; i++) {
+		err = mte_save_tags(folio_page(folio, i));
+		if (err)
+			goto out;
+	}
+	return 0;
+
+out:
+	while (i--)
+		__mte_invalidate_tags(folio_page(folio, i));
+	return err;
+}
+
+void arch_swap_restore(swp_entry_t entry, struct folio *folio)
+{
+	long i, nr;
+
+	if (!system_supports_mte())
+		return;
+
+	nr = folio_nr_pages(folio);
+
+	for (i = 0; i < nr; i++) {
+		mte_restore_tags(entry, folio_page(folio, i));
+		entry.val++;
+	}
+}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 0e16451adaba..7576025db55d 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -532,16 +532,4 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
 #define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0)
 #define split_folio(f) split_folio_to_order(f, 0)
 
-/*
- * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to
- * limitations in the implementation like arm64 MTE can override this to
- * false
- */
-#ifndef arch_thp_swp_supported
-static inline bool arch_thp_swp_supported(void)
-{
-	return true;
-}
-#endif
-
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2a1c044ae467..600e17d03659 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1050,7 +1050,7 @@ static inline int arch_unmap_one(struct mm_struct *mm,
  * prototypes must be defined in the arch-specific asm/pgtable.h file.
  */
 #ifndef __HAVE_ARCH_PREPARE_TO_SWAP
-static inline int arch_prepare_to_swap(struct page *page)
+static inline int arch_prepare_to_swap(struct folio *folio)
 {
 	return 0;
 }
diff --git a/mm/internal.h b/mm/internal.h
index 63bdac6d0413..6c8d3844b6a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -76,6 +76,20 @@ static inline int folio_nr_pages_mapped(struct folio *folio)
 	return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
 }
 
+/*
+ * Retrieve the first entry of a folio based on a provided entry within the
+ * folio. We cannot rely on folio->swap as there is no guarantee that it has
+ * been initialized. Used for calling arch_swap_restore()
+ */
+static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio)
+{
+	swp_entry_t swap = {
+		.val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
+	};
+
+	return swap;
+}
+
 static inline void *folio_raw_mapping(struct folio *folio)
 {
 	unsigned long mapping = (unsigned long)folio->mapping;
diff --git a/mm/memory.c b/mm/memory.c
index c859a09b4f72..805cebb6fd72 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4190,7 +4190,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 * when reading from swap. This metadata may be indexed by swap entry
 	 * so this must be called before swap_free().
 	 */
-	arch_swap_restore(entry, folio);
+	arch_swap_restore(folio_swap(entry, folio), folio);
 
 	/*
 	 * Remove the swap entry and conditionally try to free up the swapcache.
diff --git a/mm/page_io.c b/mm/page_io.c
index ae2b49055e43..a9a7c236aecc 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -189,7 +189,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 	 * Arch code may have to preserve more data than just the page
 	 * contents, e.g. memory tags.
 	 */
-	ret = arch_prepare_to_swap(&folio->page);
+	ret = arch_prepare_to_swap(folio);
 	if (ret) {
 		folio_mark_dirty(folio);
 		folio_unlock(folio);
diff --git a/mm/shmem.c b/mm/shmem.c
index 94ab99b6b574..98985179f495 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1907,7 +1907,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	 * Some architectures may have to restore extra metadata to the
 	 * folio after reading from swap.
 	 */
-	arch_swap_restore(swap, folio);
+	arch_swap_restore(folio_swap(swap, folio), folio);
 
 	if (shmem_should_replace_folio(folio, gfp)) {
 		error = shmem_replace_folio(&folio, gfp, info, index);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 90973ce7881d..53abeaf1371d 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -310,7 +310,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 	entry.val = 0;
 
 	if (folio_test_large(folio)) {
-		if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
+		if (IS_ENABLED(CONFIG_THP_SWAP))
 			get_swap_pages(1, &entry, folio_nr_pages(folio));
 		goto out;
 	}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4919423cce76..5e6d2304a2a4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1806,7 +1806,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	 * when reading from swap. This metadata may be indexed by swap entry
 	 * so this must be called before swap_free().
 	 */
-	arch_swap_restore(entry, folio);
+	arch_swap_restore(folio_swap(entry, folio), folio);
 
 	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
 	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
-- 
cgit v1.2.3


From ebb34f78d72c2320620ba6d55cb22a52949047a1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 27 Feb 2024 21:15:48 +0100
Subject: mm: convert folio_estimated_sharers() to folio_likely_mapped_shared()

Callers of folio_estimated_sharers() only care about "mapped shared vs.
mapped exclusively", not the exact estimate of sharers.  Let's consolidate
and unify the condition users are checking.  While at it clarify the
semantics and extend the discussion on the fuzziness.

Use the "likely mapped shared" terminology to better express what the
(adjusted) function actually checks.

Whether a partially-mappable folio is more likely to not be partially
mapped than partially mapped is debatable.  In the future, we might be
able to improve our estimate for partially-mappable folios, though.

Note that we will now consistently detect "mapped shared" only if the
first subpage is actually mapped multiple times.  When the first subpage
is not mapped, we will consistently detect it as "mapped exclusively".
This change should currently only affect the usage in
madvise_free_pte_range() and queue_folios_pte_range() for large folios: if
the first page was already unmapped, we would have skipped the folio.

[david@redhat.com: folio_likely_mapped_shared() kerneldoc fixup]
  Link: https://lkml.kernel.org/r/dd0ad9f2-2d7a-45f3-9ba3-979488c7dd27@redhat.com
Link: https://lkml.kernel.org/r/20240227201548.857831-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Khalid Aziz <khalid.aziz@oracle.com>
Acked-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 48 ++++++++++++++++++++++++++++++++++++++----------
 mm/huge_memory.c   |  2 +-
 mm/madvise.c       |  6 +++---
 mm/memory.c        |  2 +-
 mm/mempolicy.c     | 14 ++++++--------
 mm/migrate.c       |  8 ++++----
 6 files changed, 53 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1588fe15a38e..fe66b515aaab 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2165,21 +2165,49 @@ static inline size_t folio_size(struct folio *folio)
 }
 
 /**
- * folio_estimated_sharers - Estimate the number of sharers of a folio.
+ * folio_likely_mapped_shared - Estimate if the folio is mapped into the page
+ *				tables of more than one MM
  * @folio: The folio.
  *
- * folio_estimated_sharers() aims to serve as a function to efficiently
- * estimate the number of processes sharing a folio. This is done by
- * looking at the precise mapcount of the first subpage in the folio, and
- * assuming the other subpages are the same. This may not be true for large
- * folios. If you want exact mapcounts for exact calculations, look at
- * page_mapcount() or folio_total_mapcount().
+ * This function checks if the folio is currently mapped into more than one
+ * MM ("mapped shared"), or if the folio is only mapped into a single MM
+ * ("mapped exclusively").
  *
- * Return: The estimated number of processes sharing a folio.
+ * As precise information is not easily available for all folios, this function
+ * estimates the number of MMs ("sharers") that are currently mapping a folio
+ * using the number of times the first page of the folio is currently mapped
+ * into page tables.
+ *
+ * For small anonymous folios (except KSM folios) and anonymous hugetlb folios,
+ * the return value will be exactly correct, because they can only be mapped
+ * at most once into an MM, and they cannot be partially mapped.
+ *
+ * For other folios, the result can be fuzzy:
+ *    #. For partially-mappable large folios (THP), the return value can wrongly
+ *       indicate "mapped exclusively" (false negative) when the folio is
+ *       only partially mapped into at least one MM.
+ *    #. For pagecache folios (including hugetlb), the return value can wrongly
+ *       indicate "mapped shared" (false positive) when two VMAs in the same MM
+ *       cover the same file range.
+ *    #. For (small) KSM folios, the return value can wrongly indicate "mapped
+ *       shared" (false negative), when the folio is mapped multiple times into
+ *       the same MM.
+ *
+ * Further, this function only considers current page table mappings that
+ * are tracked using the folio mapcount(s).
+ *
+ * This function does not consider:
+ *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
+ *       pagecache, temporary unmapping for migration).
+ *    #. If the folio is mapped differently (VM_PFNMAP).
+ *    #. If hugetlb page table sharing applies. Callers might want to check
+ *       hugetlb_pmd_shared().
+ *
+ * Return: Whether the folio is estimated to be mapped into more than one MM.
  */
-static inline int folio_estimated_sharers(struct folio *folio)
+static inline bool folio_likely_mapped_shared(struct folio *folio)
 {
-	return page_mapcount(folio_page(folio, 0));
+	return page_mapcount(folio_page(folio, 0)) > 1;
 }
 
 #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 16b2c5622fb1..1170fc22ed89 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1821,7 +1821,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * If other processes are mapping this folio, we couldn't discard
 	 * the folio unless they all do MADV_FREE so let's skip the folio.
 	 */
-	if (folio_estimated_sharers(folio) != 1)
+	if (folio_likely_mapped_shared(folio))
 		goto out;
 
 	if (!folio_trylock(folio))
diff --git a/mm/madvise.c b/mm/madvise.c
index a2dd70c4a2e6..7625830d6ae9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -366,7 +366,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 		folio = pfn_folio(pmd_pfn(orig_pmd));
 
 		/* Do not interfere with other mappings of this folio */
-		if (folio_estimated_sharers(folio) != 1)
+		if (folio_likely_mapped_shared(folio))
 			goto huge_unlock;
 
 		if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -453,7 +453,7 @@ restart:
 		if (folio_test_large(folio)) {
 			int err;
 
-			if (folio_estimated_sharers(folio) > 1)
+			if (folio_likely_mapped_shared(folio))
 				break;
 			if (pageout_anon_only_filter && !folio_test_anon(folio))
 				break;
@@ -677,7 +677,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		if (folio_test_large(folio)) {
 			int err;
 
-			if (folio_estimated_sharers(folio) != 1)
+			if (folio_likely_mapped_shared(folio))
 				break;
 			if (!folio_trylock(folio))
 				break;
diff --git a/mm/memory.c b/mm/memory.c
index 805cebb6fd72..bcd29104f5ea 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5112,7 +5112,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * Flag if the folio is shared between multiple address spaces. This
 	 * is later used when determining whether to group tasks together
 	 */
-	if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
+	if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
 		flags |= TNF_SHARED;
 
 	nid = folio_nid(folio);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72626b2cefa9..913cff5da5a3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -642,12 +642,11 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
 	 * Choosing not to migrate a shared folio is not counted as a failure.
 	 *
-	 * To check if the folio is shared, ideally we want to make sure
-	 * every page is mapped to the same process. Doing that is very
-	 * expensive, so check the estimated sharers of the folio instead.
+	 * See folio_likely_mapped_shared() on possible imprecision when we
+	 * cannot easily detect if a folio is shared.
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) ||
-	    (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte)))
+	    (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
 		if (!isolate_hugetlb(folio, qp->pagelist))
 			qp->nr_failed++;
 unlock:
@@ -1032,11 +1031,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 	 * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
 	 * Choosing not to migrate a shared folio is not counted as a failure.
 	 *
-	 * To check if the folio is shared, ideally we want to make sure
-	 * every page is mapped to the same process. Doing that is very
-	 * expensive, so check the estimated sharers of the folio instead.
+	 * See folio_likely_mapped_shared() on possible imprecision when we
+	 * cannot easily detect if a folio is shared.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) {
+	if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
 		if (folio_isolate_lru(folio)) {
 			list_add_tail(&folio->lru, foliolist);
 			node_stat_mod_folio(folio,
diff --git a/mm/migrate.c b/mm/migrate.c
index 63c97bb639d7..a31aa75d223d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2593,11 +2593,11 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
 	/*
 	 * Don't migrate file folios that are mapped in multiple processes
 	 * with execute permissions as they are probably shared libraries.
-	 * To check if the folio is shared, ideally we want to make sure
-	 * every page is mapped to the same process. Doing that is very
-	 * expensive, so check the estimated mapcount of the folio instead.
+	 *
+	 * See folio_likely_mapped_shared() on possible imprecision when we
+	 * cannot easily detect if a folio is shared.
 	 */
-	if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) &&
+	if (folio_likely_mapped_shared(folio) && folio_is_file_lru(folio) &&
 	    (vma->vm_flags & VM_EXEC))
 		goto out;
 
-- 
cgit v1.2.3


From a4864671ca0bf51c8e78242951741df52c06766f Mon Sep 17 00:00:00 2001
From: Kairui Song <kasong@tencent.com>
Date: Tue, 16 Apr 2024 01:18:55 +0800
Subject: lib/xarray: introduce a new helper xas_get_order

It can be used after xas_load to check the order of loaded entries.
Compared to xa_get_order, it saves an XA_STATE and avoid a rewalk.

Added new test for xas_get_order, to make the test work, we have to export
xas_get_order with EXPORT_SYMBOL_GPL.

Also fix a sparse warning by checking the slot value with xa_entry instead
of accessing it directly, as suggested by Matthew Wilcox.

[kasong@tencent.com: simplify comment, sparse warning fix, per Matthew Wilcox]
  Link: https://lkml.kernel.org/r/20240416071722.45997-4-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240415171857.19244-4-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xarray.h |  6 ++++++
 lib/test_xarray.c      | 34 ++++++++++++++++++++++++++++++++++
 lib/xarray.c           | 49 +++++++++++++++++++++++++++++++------------------
 3 files changed, 71 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index cb571dfcf4b1..d9d479334c9e 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1548,6 +1548,7 @@ void xas_create_range(struct xa_state *);
 
 #ifdef CONFIG_XARRAY_MULTI
 int xa_get_order(struct xarray *, unsigned long index);
+int xas_get_order(struct xa_state *xas);
 void xas_split(struct xa_state *, void *entry, unsigned int order);
 void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
 #else
@@ -1556,6 +1557,11 @@ static inline int xa_get_order(struct xarray *xa, unsigned long index)
 	return 0;
 }
 
+static inline int xas_get_order(struct xa_state *xas)
+{
+	return 0;
+}
+
 static inline void xas_split(struct xa_state *xas, void *entry,
 		unsigned int order)
 {
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index ebe2af2e072d..0efde8f93490 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -1984,6 +1984,39 @@ static noinline void check_get_order(struct xarray *xa)
 	}
 }
 
+static noinline void check_xas_get_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+
+	unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1;
+	unsigned int order;
+	unsigned long i, j;
+
+	for (order = 0; order < max_order; order++) {
+		for (i = 0; i < 10; i++) {
+			xas_set_order(&xas, i << order, order);
+			do {
+				xas_lock(&xas);
+				xas_store(&xas, xa_mk_value(i));
+				xas_unlock(&xas);
+			} while (xas_nomem(&xas, GFP_KERNEL));
+
+			for (j = i << order; j < (i + 1) << order; j++) {
+				xas_set_order(&xas, j, 0);
+				rcu_read_lock();
+				xas_load(&xas);
+				XA_BUG_ON(xa, xas_get_order(&xas) != order);
+				rcu_read_unlock();
+			}
+
+			xas_lock(&xas);
+			xas_set_order(&xas, i << order, order);
+			xas_store(&xas, NULL);
+			xas_unlock(&xas);
+		}
+	}
+}
+
 static noinline void check_destroy(struct xarray *xa)
 {
 	unsigned long index;
@@ -2035,6 +2068,7 @@ static int xarray_checks(void)
 	check_multi_store(&array);
 	check_multi_store_advanced(&array);
 	check_get_order(&array);
+	check_xas_get_order(&array);
 	check_xa_alloc();
 	check_find(&array);
 	check_find_entry(&array);
diff --git a/lib/xarray.c b/lib/xarray.c
index 39f07bfc4dcc..da79128ad754 100644
--- a/lib/xarray.c
+++ b/lib/xarray.c
@@ -1750,39 +1750,52 @@ unlock:
 EXPORT_SYMBOL(xa_store_range);
 
 /**
- * xa_get_order() - Get the order of an entry.
- * @xa: XArray.
- * @index: Index of the entry.
+ * xas_get_order() - Get the order of an entry.
+ * @xas: XArray operation state.
+ *
+ * Called after xas_load, the xas should not be in an error state.
  *
  * Return: A number between 0 and 63 indicating the order of the entry.
  */
-int xa_get_order(struct xarray *xa, unsigned long index)
+int xas_get_order(struct xa_state *xas)
 {
-	XA_STATE(xas, xa, index);
-	void *entry;
 	int order = 0;
 
-	rcu_read_lock();
-	entry = xas_load(&xas);
-
-	if (!entry)
-		goto unlock;
-
-	if (!xas.xa_node)
-		goto unlock;
+	if (!xas->xa_node)
+		return 0;
 
 	for (;;) {
-		unsigned int slot = xas.xa_offset + (1 << order);
+		unsigned int slot = xas->xa_offset + (1 << order);
 
 		if (slot >= XA_CHUNK_SIZE)
 			break;
-		if (!xa_is_sibling(xas.xa_node->slots[slot]))
+		if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot)))
 			break;
 		order++;
 	}
 
-	order += xas.xa_node->shift;
-unlock:
+	order += xas->xa_node->shift;
+	return order;
+}
+EXPORT_SYMBOL_GPL(xas_get_order);
+
+/**
+ * xa_get_order() - Get the order of an entry.
+ * @xa: XArray.
+ * @index: Index of the entry.
+ *
+ * Return: A number between 0 and 63 indicating the order of the entry.
+ */
+int xa_get_order(struct xarray *xa, unsigned long index)
+{
+	XA_STATE(xas, xa, index);
+	int order = 0;
+	void *entry;
+
+	rcu_read_lock();
+	entry = xas_load(&xas);
+	if (entry)
+		order = xas_get_order(&xas);
 	rcu_read_unlock();
 
 	return order;
-- 
cgit v1.2.3


From 6600a6b10c3210280e7623b8ba7feac9619cc2cd Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Mon, 25 Mar 2024 22:56:42 +0800
Subject: mm/mm_init.c: remove the useless dma_reserve

Now nobody calls set_dma_reserve() to set value for dma_reserve, remove
set_dma_reserve(), global variable dma_reserve and the codes using it.

Link: https://lkml.kernel.org/r/20240325145646.1044760-3-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  1 -
 mm/mm_init.c       | 23 -----------------------
 2 files changed, 24 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fe66b515aaab..22ae6ab621da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3250,7 +3250,6 @@ static inline int early_pfn_to_nid(unsigned long pfn)
 extern int __meminit early_pfn_to_nid(unsigned long pfn);
 #endif
 
-extern void set_dma_reserve(unsigned long new_dma_reserve);
 extern void mem_init(void);
 extern void __init mmap_init(void);
 
diff --git a/mm/mm_init.c b/mm/mm_init.c
index f45c2b32ba82..91bb8600c0fa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -227,7 +227,6 @@ static unsigned long required_movablecore_percent __initdata;
 
 static unsigned long nr_kernel_pages __initdata;
 static unsigned long nr_all_pages __initdata;
-static unsigned long dma_reserve __initdata;
 
 static bool deferred_struct_pages __meminitdata;
 
@@ -1584,12 +1583,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
 					zone_names[j], memmap_pages, freesize);
 		}
 
-		/* Account for reserved pages */
-		if (j == 0 && freesize > dma_reserve) {
-			freesize -= dma_reserve;
-			pr_debug("  %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
-		}
-
 		if (!is_highmem_idx(j))
 			nr_kernel_pages += freesize;
 		/* Charge for highmem memmap if there are enough kernel pages */
@@ -2548,22 +2541,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 	return table;
 }
 
-/**
- * set_dma_reserve - set the specified number of pages reserved in the first zone
- * @new_dma_reserve: The number of pages to mark reserved
- *
- * The per-cpu batchsize and zone watermarks are determined by managed_pages.
- * In the DMA zone, a significant percentage may be consumed by kernel image
- * and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in the
- * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
- * smaller per-cpu batchsize.
- */
-void __init set_dma_reserve(unsigned long new_dma_reserve)
-{
-	dma_reserve = new_dma_reserve;
-}
-
 void __init memblock_free_pages(struct page *page, unsigned long pfn,
 							unsigned int order)
 {
-- 
cgit v1.2.3


From 0b52663f7547520fbf54e2c07f61f8bd1b5cb6eb Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Mon, 25 Mar 2024 22:56:46 +0800
Subject: mm/mm_init.c: remove arch_reserved_kernel_pages()

Since the current calculation of calc_nr_kernel_pages() has taken into
consideration of kernel reserved memory, no need to have
arch_reserved_kernel_pages() any more.

Link: https://lkml.kernel.org/r/20240325145646.1044760-7-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/powerpc/include/asm/mmu.h |  4 ----
 arch/powerpc/kernel/fadump.c   |  5 -----
 include/linux/mm.h             |  3 ---
 mm/mm_init.c                   | 12 ------------
 4 files changed, 24 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 3b72c7ed24cf..aa5c0fd5edb1 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -406,9 +406,5 @@ extern void *abatron_pteptrs[2];
 #include <asm/nohash/mmu.h>
 #endif
 
-#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
-#define __HAVE_ARCH_RESERVED_KERNEL_PAGES
-#endif
-
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_MMU_H_ */
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d14eda1e8589..ae8c7619e597 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1735,8 +1735,3 @@ static void __init fadump_reserve_crash_area(u64 base)
 		memblock_reserve(mstart, msize);
 	}
 }
-
-unsigned long __init arch_reserved_kernel_pages(void)
-{
-	return memblock_reserved_size() / PAGE_SIZE;
-}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22ae6ab621da..dd47ba7a1c04 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3261,9 +3261,6 @@ static inline void show_mem(void)
 extern long si_mem_available(void);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
-#ifdef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-extern unsigned long arch_reserved_kernel_pages(void);
-#endif
 
 extern __printf(3, 4)
 void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 8c261572ca6e..0d32bcc301e2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2374,17 +2374,6 @@ void __init page_alloc_init_late(void)
 	page_alloc_sysctl_init();
 }
 
-#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
-/*
- * Returns the number of pages that arch has reserved but
- * is not known to alloc_large_system_hash().
- */
-static unsigned long __init arch_reserved_kernel_pages(void)
-{
-	return 0;
-}
-#endif
-
 /*
  * Adaptive scale is meant to reduce sizes of hash tables on large memory
  * machines. As memory size is increased the scale is also increased but at
@@ -2427,7 +2416,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
 		numentries = nr_kernel_pages;
-		numentries -= arch_reserved_kernel_pages();
 
 		/* It isn't necessary when PAGE_SIZE >= 1MB */
 		if (PAGE_SIZE < SZ_1M)
-- 
cgit v1.2.3


From cb10c28ac82c9b7a5e9b3b1dc7157036c20c36dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 25 Mar 2024 07:45:41 +0800
Subject: mm: remove follow_pfn

Remove follow_pfn now that the last user is gone.

Link: https://lkml.kernel.org/r/20240324234542.2038726-3-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fei Li <fei1.li@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  2 --
 mm/memory.c        | 36 ++----------------------------------
 mm/nommu.c         | 21 ---------------------
 3 files changed, 2 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dd47ba7a1c04..cb8e6158b577 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2424,8 +2424,6 @@ int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
 int follow_pte(struct mm_struct *mm, unsigned long address,
 	       pte_t **ptepp, spinlock_t **ptlp);
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-	unsigned long *pfn);
 int follow_phys(struct vm_area_struct *vma, unsigned long address,
 		unsigned int flags, unsigned long *prot, resource_size_t *phys);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index bcd29104f5ea..652370703e41 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5884,8 +5884,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
  * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
  * should be taken for read.
  *
- * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
- * it is not a good general-purpose API.
+ * KVM uses this function.  While it is arguably less bad than the historic
+ * ``follow_pfn``, it is not a good general-purpose API.
  *
  * Return: zero on success, -ve otherwise.
  */
@@ -5927,38 +5927,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * This function does not allow the caller to read the permissions
- * of the PTE.  Do not use it.
- *
- * Return: zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-	unsigned long *pfn)
-{
-	int ret = -EINVAL;
-	spinlock_t *ptl;
-	pte_t *ptep;
-
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return ret;
-
-	ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
-	if (ret)
-		return ret;
-	*pfn = pte_pfn(ptep_get(ptep));
-	pte_unmap_unlock(ptep, ptl);
-	return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
 #ifdef CONFIG_HAVE_IOREMAP_PROT
 int follow_phys(struct vm_area_struct *vma,
 		unsigned long address, unsigned int flags,
diff --git a/mm/nommu.c b/mm/nommu.c
index 88b646ea6d30..331d2f778695 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -110,27 +110,6 @@ unsigned int kobjsize(const void *objp)
 	return page_size(page);
 }
 
-/**
- * follow_pfn - look up PFN at a user virtual address
- * @vma: memory mapping
- * @address: user virtual address
- * @pfn: location to store found PFN
- *
- * Only IO mappings and raw PFN mappings are allowed.
- *
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
- */
-int follow_pfn(struct vm_area_struct *vma, unsigned long address,
-	unsigned long *pfn)
-{
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return -EINVAL;
-
-	*pfn = address >> PAGE_SHIFT;
-	return 0;
-}
-EXPORT_SYMBOL(follow_pfn);
-
 void vfree(const void *addr)
 {
 	kfree(addr);
-- 
cgit v1.2.3


From 5b34b76cb0cd8a21dee5c7677eae98480b0d05cc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 25 Mar 2024 07:45:42 +0800
Subject: mm: move follow_phys to arch/x86/mm/pat/memtype.c

follow_phys is only used by two callers in arch/x86/mm/pat/memtype.c.
Move it there and hardcode the two arguments that get the same values
passed by both callers.

[david@redhat.com: conflict resolutions]
Link: https://lkml.kernel.org/r/20240403212131.929421-4-david@redhat.com
Link: https://lkml.kernel.org/r/20240324234542.2038726-4-hch@lst.de
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fei Li <fei1.li@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/x86/mm/pat/memtype.c | 29 ++++++++++++++++++++++++++++-
 include/linux/mm.h        |  2 --
 mm/memory.c               | 32 --------------------------------
 3 files changed, 28 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index 36b603d0cdde..d01c3b0bd6eb 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -39,6 +39,7 @@
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/rbtree.h>
 
@@ -947,6 +948,32 @@ static void free_pfn_range(u64 paddr, unsigned long size)
 		memtype_free(paddr, paddr + size);
 }
 
+static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
+		resource_size_t *phys)
+{
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+	if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl))
+		return -EINVAL;
+
+	pte = ptep_get(ptep);
+
+	/* Never return PFNs of anon folios in COW mappings. */
+	if (vm_normal_folio(vma, vma->vm_start, pte)) {
+		pte_unmap_unlock(ptep, ptl);
+		return -EINVAL;
+	}
+
+	*prot = pgprot_val(pte_pgprot(pte));
+	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+
 static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
 		pgprot_t *pgprot)
 {
@@ -964,7 +991,7 @@ static int get_pat_info(struct vm_area_struct *vma, resource_size_t *paddr,
 	 * detect the PFN. If we need the cachemode as well, we're out of luck
 	 * for now and have to fail fork().
 	 */
-	if (!follow_phys(vma, vma->vm_start, 0, &prot, paddr)) {
+	if (!follow_phys(vma, &prot, paddr)) {
 		if (pgprot)
 			*pgprot = __pgprot(prot);
 		return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb8e6158b577..5dc65618e386 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2424,8 +2424,6 @@ int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
 int follow_pte(struct mm_struct *mm, unsigned long address,
 	       pte_t **ptepp, spinlock_t **ptlp);
-int follow_phys(struct vm_area_struct *vma, unsigned long address,
-		unsigned int flags, unsigned long *prot, resource_size_t *phys);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
 
diff --git a/mm/memory.c b/mm/memory.c
index 652370703e41..62ee4a15092a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5928,38 +5928,6 @@ out:
 EXPORT_SYMBOL_GPL(follow_pte);
 
 #ifdef CONFIG_HAVE_IOREMAP_PROT
-int follow_phys(struct vm_area_struct *vma,
-		unsigned long address, unsigned int flags,
-		unsigned long *prot, resource_size_t *phys)
-{
-	int ret = -EINVAL;
-	pte_t *ptep, pte;
-	spinlock_t *ptl;
-
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		goto out;
-
-	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
-		goto out;
-	pte = ptep_get(ptep);
-
-	/* Never return PFNs of anon folios in COW mappings. */
-	if (vm_normal_folio(vma, address, pte))
-		goto unlock;
-
-	if ((flags & FOLL_WRITE) && !pte_write(pte))
-		goto unlock;
-
-	*prot = pgprot_val(pte_pgprot(pte));
-	*phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
-
-	ret = 0;
-unlock:
-	pte_unmap_unlock(ptep, ptl);
-out:
-	return ret;
-}
-
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
-- 
cgit v1.2.3


From f002882ca369aba3eece5006f3346ccf75ede7c5 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 26 Mar 2024 15:32:10 +0100
Subject: mm: merge folio_is_secretmem() and folio_fast_pin_allowed() into
 gup_fast_folio_allowed()

folio_is_secretmem() is currently only used during GUP-fast.  Nowadays,
folio_fast_pin_allowed() performs similar checks during GUP-fast and
contains a lot of careful handling -- READ_ONCE() -- , sanity checks --
lockdep_assert_irqs_disabled() -- and helpful comments on how this
handling is safe and correct.

So let's merge folio_is_secretmem() into folio_fast_pin_allowed().  Rename
folio_fast_pin_allowed() to gup_fast_folio_allowed(), to better match the
new semantics.

Link: https://lkml.kernel.org/r/20240326143210.291116-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Miklos Szeredi <mszeredi@redhat.com>
Cc: xingwei lee <xrivendell7@gmail.com>
Cc: yue sun <samsun1006219@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/secretmem.h | 21 ++-------------------
 mm/gup.c                  | 48 +++++++++++++++++++++++++++--------------------
 2 files changed, 30 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h
index acf7e1a3f3de..e918f96881f5 100644
--- a/include/linux/secretmem.h
+++ b/include/linux/secretmem.h
@@ -6,25 +6,8 @@
 
 extern const struct address_space_operations secretmem_aops;
 
-static inline bool folio_is_secretmem(struct folio *folio)
+static inline bool secretmem_mapping(struct address_space *mapping)
 {
-	struct address_space *mapping;
-
-	/*
-	 * Using folio_mapping() is quite slow because of the actual call
-	 * instruction.
-	 * We know that secretmem pages are not compound, so we can
-	 * save a couple of cycles here.
-	 */
-	if (folio_test_large(folio))
-		return false;
-
-	mapping = (struct address_space *)
-		((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS);
-
-	if (!mapping || mapping != folio->mapping)
-		return false;
-
 	return mapping->a_ops == &secretmem_aops;
 }
 
@@ -38,7 +21,7 @@ static inline bool vma_is_secretmem(struct vm_area_struct *vma)
 	return false;
 }
 
-static inline bool folio_is_secretmem(struct folio *folio)
+static inline bool secretmem_mapping(struct address_space *mapping)
 {
 	return false;
 }
diff --git a/mm/gup.c b/mm/gup.c
index 6d8d15f8c7f9..7b21fcda3a98 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2468,12 +2468,14 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
 #ifdef CONFIG_HAVE_FAST_GUP
 
 /*
- * Used in the GUP-fast path to determine whether a pin is permitted for a
- * specific folio.
+ * Used in the GUP-fast path to determine whether GUP is permitted to work on
+ * a specific folio.
  *
  * This call assumes the caller has pinned the folio, that the lowest page table
  * level still points to this folio, and that interrupts have been disabled.
  *
+ * GUP-fast must reject all secretmem folios.
+ *
  * Writing to pinned file-backed dirty tracked folios is inherently problematic
  * (see comment describing the writable_file_mapping_allowed() function). We
  * therefore try to avoid the most egregious case of a long-term mapping doing
@@ -2483,25 +2485,34 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  * in the fast path, so instead we whitelist known good cases and if in doubt,
  * fall back to the slow path.
  */
-static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
+static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
 {
+	bool reject_file_backed = false;
 	struct address_space *mapping;
+	bool check_secretmem = false;
 	unsigned long mapping_flags;
 
 	/*
 	 * If we aren't pinning then no problematic write can occur. A long term
 	 * pin is the most egregious case so this is the one we disallow.
 	 */
-	if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) !=
+	if ((flags & (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE)) ==
 	    (FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE))
-		return true;
+		reject_file_backed = true;
+
+	/* We hold a folio reference, so we can safely access folio fields. */
 
-	/* The folio is pinned, so we can safely access folio fields. */
+	/* secretmem folios are always order-0 folios. */
+	if (IS_ENABLED(CONFIG_SECRETMEM) && !folio_test_large(folio))
+		check_secretmem = true;
+
+	if (!reject_file_backed && !check_secretmem)
+		return true;
 
 	if (WARN_ON_ONCE(folio_test_slab(folio)))
 		return false;
 
-	/* hugetlb mappings do not require dirty-tracking. */
+	/* hugetlb neither requires dirty-tracking nor can be secretmem. */
 	if (folio_test_hugetlb(folio))
 		return true;
 
@@ -2537,10 +2548,12 @@ static bool folio_fast_pin_allowed(struct folio *folio, unsigned int flags)
 
 	/*
 	 * At this point, we know the mapping is non-null and points to an
-	 * address_space object. The only remaining whitelisted file system is
-	 * shmem.
+	 * address_space object.
 	 */
-	return shmem_mapping(mapping);
+	if (check_secretmem && secretmem_mapping(mapping))
+		return false;
+	/* The only remaining allowed file system is shmem. */
+	return !reject_file_backed || shmem_mapping(mapping);
 }
 
 static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
@@ -2626,18 +2639,13 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 		if (!folio)
 			goto pte_unmap;
 
-		if (unlikely(folio_is_secretmem(folio))) {
-			gup_put_folio(folio, 1, flags);
-			goto pte_unmap;
-		}
-
 		if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
 		    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
 			gup_put_folio(folio, 1, flags);
 			goto pte_unmap;
 		}
 
-		if (!folio_fast_pin_allowed(folio, flags)) {
+		if (!gup_fast_folio_allowed(folio, flags)) {
 			gup_put_folio(folio, 1, flags);
 			goto pte_unmap;
 		}
@@ -2834,7 +2842,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 		return 0;
 	}
 
-	if (!folio_fast_pin_allowed(folio, flags)) {
+	if (!gup_fast_folio_allowed(folio, flags)) {
 		gup_put_folio(folio, refs, flags);
 		return 0;
 	}
@@ -2905,7 +2913,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		return 0;
 	}
 
-	if (!folio_fast_pin_allowed(folio, flags)) {
+	if (!gup_fast_folio_allowed(folio, flags)) {
 		gup_put_folio(folio, refs, flags);
 		return 0;
 	}
@@ -2949,7 +2957,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		return 0;
 	}
 
-	if (!folio_fast_pin_allowed(folio, flags)) {
+	if (!gup_fast_folio_allowed(folio, flags)) {
 		gup_put_folio(folio, refs, flags);
 		return 0;
 	}
@@ -2994,7 +3002,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 		return 0;
 	}
 
-	if (!folio_fast_pin_allowed(folio, flags)) {
+	if (!gup_fast_folio_allowed(folio, flags)) {
 		gup_put_folio(folio, refs, flags);
 		return 0;
 	}
-- 
cgit v1.2.3


From 6e65aa55cdf4c7cc0490b156b4aae2035c157140 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 17:10:25 +0000
Subject: mm: make page_ext_get() take a const argument

In order to constify other functions, we need page_ext_get() to be const.
This is no problem as lookup_page_ext() already takes a const argument.

Link: https://lkml.kernel.org/r/20240326171045.410737-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_ext.h    | 4 ++--
 include/linux/pgalloc_tag.h | 2 --
 mm/page_ext.c               | 2 +-
 3 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 07e0656898f9..e4b48a0dda24 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -77,7 +77,7 @@ static inline void page_ext_init(void)
 }
 #endif
 
-extern struct page_ext *page_ext_get(struct page *page);
+extern struct page_ext *page_ext_get(const struct page *page);
 extern void page_ext_put(struct page_ext *page_ext);
 
 static inline void *page_ext_data(struct page_ext *page_ext,
@@ -117,7 +117,7 @@ static inline void page_ext_init_flatmem(void)
 {
 }
 
-static inline struct page_ext *page_ext_get(struct page *page)
+static inline struct page_ext *page_ext_get(const struct page *page)
 {
 	return NULL;
 }
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 62d8dad74b37..86ba5d33e43b 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -12,8 +12,6 @@
 #include <linux/page_ext.h>
 
 extern struct page_ext_operations page_alloc_tagging_ops;
-extern struct page_ext *page_ext_get(struct page *page);
-extern void page_ext_put(struct page_ext *page_ext);
 
 static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext)
 {
diff --git a/mm/page_ext.c b/mm/page_ext.c
index e7d8f1a5589e..95dd8ffeaf81 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -514,7 +514,7 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
  * Context: Any context.  Caller may not sleep until they have called
  * page_ext_put().
  */
-struct page_ext *page_ext_get(struct page *page)
+struct page_ext *page_ext_get(const struct page *page)
 {
 	struct page_ext *page_ext;
 
-- 
cgit v1.2.3


From e3089fd0b0199a368db4039122b241139cc665f3 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 17:10:26 +0000
Subject: mm: make folio_test_idle and folio_test_young take a const argument

If these functions are defined in page-flags.h, they already take a const
argument; make it true for these alternate definitions too.

Link: https://lkml.kernel.org/r/20240326171045.410737-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_idle.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index d8f344840643..511e22ef459f 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -13,7 +13,7 @@
  * If there is not enough space to store Idle and Young bits in page flags, use
  * page ext flags instead.
  */
-static inline bool folio_test_young(struct folio *folio)
+static inline bool folio_test_young(const struct folio *folio)
 {
 	struct page_ext *page_ext = page_ext_get(&folio->page);
 	bool page_young;
@@ -52,7 +52,7 @@ static inline bool folio_test_clear_young(struct folio *folio)
 	return page_young;
 }
 
-static inline bool folio_test_idle(struct folio *folio)
+static inline bool folio_test_idle(const struct folio *folio)
 {
 	struct page_ext *page_ext = page_ext_get(&folio->page);
 	bool page_idle;
@@ -60,7 +60,7 @@ static inline bool folio_test_idle(struct folio *folio)
 	if (unlikely(!page_ext))
 		return false;
 
-	page_idle =  test_bit(PAGE_EXT_IDLE, &page_ext->flags);
+	page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags);
 	page_ext_put(page_ext);
 
 	return page_idle;
@@ -91,7 +91,7 @@ static inline void folio_clear_idle(struct folio *folio)
 
 #else /* !CONFIG_PAGE_IDLE_FLAG */
 
-static inline bool folio_test_young(struct folio *folio)
+static inline bool folio_test_young(const struct folio *folio)
 {
 	return false;
 }
@@ -105,7 +105,7 @@ static inline bool folio_test_clear_young(struct folio *folio)
 	return false;
 }
 
-static inline bool folio_test_idle(struct folio *folio)
+static inline bool folio_test_idle(const struct folio *folio)
 {
 	return false;
 }
-- 
cgit v1.2.3


From 2ace5a670e2a0f3a6029bc6cf91dc14202074e7a Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 17:10:27 +0000
Subject: mm: make is_free_buddy_page() take a const argument

This function does not modify its argument; let the callers know that so
they can make better optimisation decisions.

Link: https://lkml.kernel.org/r/20240326171045.410737-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 mm/page_alloc.c            | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0163d6da6283..eaecf544039f 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1088,7 +1088,7 @@ static inline bool is_page_hwpoison(const struct page *page)
 	return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
 }
 
-extern bool is_free_buddy_page(struct page *page);
+bool is_free_buddy_page(const struct page *page);
 
 PAGEFLAG(Isolated, isolated, PF_ANY);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index daab8cab91cc..6a8593aef297 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6745,16 +6745,16 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 /*
  * This function returns a stable result only if called under zone lock.
  */
-bool is_free_buddy_page(struct page *page)
+bool is_free_buddy_page(const struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
 	unsigned int order;
 
 	for (order = 0; order < NR_PAGE_ORDERS; order++) {
-		struct page *page_head = page - (pfn & ((1 << order) - 1));
+		const struct page *head = page - (pfn & ((1 << order) - 1));
 
-		if (PageBuddy(page_head) &&
-		    buddy_order_unsafe(page_head) >= order)
+		if (PageBuddy(head) &&
+		    buddy_order_unsafe(head) >= order)
 			break;
 	}
 
-- 
cgit v1.2.3


From b84fd2835c70e0149e2522fd746d3fb7049a1e19 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 17:10:28 +0000
Subject: mm: make page_mapped() take a const argument

None of the functions called by page_mapped() modify the page or folio, so
mark them all as const.

Link: https://lkml.kernel.org/r/20240326171045.410737-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 ++++++------
 mm/internal.h      |  7 ++++---
 mm/rmap.c          |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5dc65618e386..9c0e441664d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1200,7 +1200,7 @@ static inline int is_vmalloc_or_module_addr(const void *x)
  * debugging purposes - it does not include PTE-mapped sub-pages; look
  * at folio_mapcount() or page_mapcount() instead.
  */
-static inline int folio_entire_mapcount(struct folio *folio)
+static inline int folio_entire_mapcount(const struct folio *folio)
 {
 	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
 	return atomic_read(&folio->_entire_mapcount) + 1;
@@ -1240,7 +1240,7 @@ static inline int page_mapcount(struct page *page)
 	return mapcount;
 }
 
-int folio_total_mapcount(struct folio *folio);
+int folio_total_mapcount(const struct folio *folio);
 
 /**
  * folio_mapcount() - Calculate the number of mappings of this folio.
@@ -1253,14 +1253,14 @@ int folio_total_mapcount(struct folio *folio);
  *
  * Return: The number of times this folio is mapped.
  */
-static inline int folio_mapcount(struct folio *folio)
+static inline int folio_mapcount(const struct folio *folio)
 {
 	if (likely(!folio_test_large(folio)))
 		return atomic_read(&folio->_mapcount) + 1;
 	return folio_total_mapcount(folio);
 }
 
-static inline bool folio_large_is_mapped(struct folio *folio)
+static inline bool folio_large_is_mapped(const struct folio *folio)
 {
 	/*
 	 * Reading _entire_mapcount below could be omitted if hugetlb
@@ -1288,7 +1288,7 @@ static inline bool folio_mapped(struct folio *folio)
  * For compound page it returns true if any sub-page of compound page is mapped,
  * even if this particular sub-page is not itself mapped by any PTE or PMD.
  */
-static inline bool page_mapped(struct page *page)
+static inline bool page_mapped(const struct page *page)
 {
 	if (likely(!PageCompound(page)))
 		return atomic_read(&page->_mapcount) >= 0;
@@ -2070,7 +2070,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
  *
  * Return: A positive power of two.
  */
-static inline long folio_nr_pages(struct folio *folio)
+static inline long folio_nr_pages(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
 		return 1;
diff --git a/mm/internal.h b/mm/internal.h
index 5dbfa1c12e89..8e11f7b2da21 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -71,7 +71,7 @@ void page_writeback_init(void);
  * How many individual pages have an elevated _mapcount.  Excludes
  * the folio's entire_mapcount.
  */
-static inline int folio_nr_pages_mapped(struct folio *folio)
+static inline int folio_nr_pages_mapped(const struct folio *folio)
 {
 	return atomic_read(&folio->_nr_pages_mapped) & FOLIO_PAGES_MAPPED;
 }
@@ -81,7 +81,8 @@ static inline int folio_nr_pages_mapped(struct folio *folio)
  * folio. We cannot rely on folio->swap as there is no guarantee that it has
  * been initialized. Used for calling arch_swap_restore()
  */
-static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio)
+static inline swp_entry_t folio_swap(swp_entry_t entry,
+		const struct folio *folio)
 {
 	swp_entry_t swap = {
 		.val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
@@ -90,7 +91,7 @@ static inline swp_entry_t folio_swap(swp_entry_t entry, struct folio *folio)
 	return swap;
 }
 
-static inline void *folio_raw_mapping(struct folio *folio)
+static inline void *folio_raw_mapping(const struct folio *folio)
 {
 	unsigned long mapping = (unsigned long)folio->mapping;
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 3746a5531018..d52759aa3ff7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1134,7 +1134,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 	return page_vma_mkclean_one(&pvmw);
 }
 
-int folio_total_mapcount(struct folio *folio)
+int folio_total_mapcount(const struct folio *folio)
 {
 	int mapcount = folio_entire_mapcount(folio);
 	int nr_pages;
-- 
cgit v1.2.3


From 51718e25c53f58fe9fa515f219367632cce4914d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 17:10:29 +0000
Subject: mm: convert arch_clear_hugepage_flags to take a folio

All implementations that aren't no-ops just set a bit in the flags, and we
want to use the folio flags rather than the page flags for that.  Rename
it to arch_clear_hugetlb_flags() while we're touching it so nobody thinks
it's used for THP.

[willy@infradead.org: fix arm64 build]
  Link: https://lkml.kernel.org/r/ZgQvNKGdlDkwhQEX@casper.infradead.org
Link: https://lkml.kernel.org/r/20240326171045.410737-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/include/asm/hugetlb.h   | 6 +++---
 arch/arm64/include/asm/hugetlb.h | 6 +++---
 arch/riscv/include/asm/hugetlb.h | 6 +++---
 arch/s390/include/asm/hugetlb.h  | 6 +++---
 arch/sh/include/asm/hugetlb.h    | 6 +++---
 include/linux/hugetlb.h          | 6 +++---
 mm/hugetlb.c                     | 4 ++--
 7 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index a3a82b7158d4..b766c4b373f6 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -15,10 +15,10 @@
 #include <asm/hugetlb-3level.h>
 #include <asm-generic/hugetlb.h>
 
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
 {
-	clear_bit(PG_dcache_clean, &page->flags);
+	clear_bit(PG_dcache_clean, &folio->flags);
 }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
 
 #endif /* _ASM_ARM_HUGETLB_H */
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 2ddc33d93b13..3954cbd2ff56 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -18,11 +18,11 @@
 extern bool arch_hugetlb_migration_supported(struct hstate *h);
 #endif
 
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
 {
-	clear_bit(PG_dcache_clean, &page->flags);
+	clear_bit(PG_dcache_clean, &folio->flags);
 }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
 
 pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
 #define arch_make_huge_pte arch_make_huge_pte
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 22deb7a2a6ec..b1ce97a9dbfc 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -5,11 +5,11 @@
 #include <asm/cacheflush.h>
 #include <asm/page.h>
 
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
 {
-	clear_bit(PG_dcache_clean, &page->flags);
+	clear_bit(PG_dcache_clean, &folio->flags);
 }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
 
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 bool arch_hugetlb_migration_supported(struct hstate *h);
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index deb198a61039..ce5f4fe8be4d 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -39,11 +39,11 @@ static inline int prepare_hugepage_range(struct file *file,
 	return 0;
 }
 
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
 {
-	clear_bit(PG_arch_1, &page->flags);
+	clear_bit(PG_arch_1, &folio->flags);
 }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
 
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
 				  pte_t *ptep, unsigned long sz)
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index 4d3ba39e681c..75028bd568ba 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -27,11 +27,11 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
 	return *ptep;
 }
 
-static inline void arch_clear_hugepage_flags(struct page *page)
+static inline void arch_clear_hugetlb_flags(struct folio *folio)
 {
-	clear_bit(PG_dcache_clean, &page->flags);
+	clear_bit(PG_dcache_clean, &folio->flags);
 }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
 
 #include <asm-generic/hugetlb.h>
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d748628efc5e..03fc6d625068 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -836,9 +836,9 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
 #define is_hugepage_only_range is_hugepage_only_range
 #endif
 
-#ifndef arch_clear_hugepage_flags
-static inline void arch_clear_hugepage_flags(struct page *page) { }
-#define arch_clear_hugepage_flags arch_clear_hugepage_flags
+#ifndef arch_clear_hugetlb_flags
+static inline void arch_clear_hugetlb_flags(struct folio *folio) { }
+#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
 #endif
 
 #ifndef arch_make_huge_pte
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2e984554ec63..88bb38df1701 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1726,7 +1726,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
 		 */
 		return;
 
-	arch_clear_hugepage_flags(&folio->page);
+	arch_clear_hugetlb_flags(folio);
 	enqueue_hugetlb_folio(h, folio);
 }
 
@@ -2024,7 +2024,7 @@ void free_huge_folio(struct folio *folio)
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
 		update_and_free_hugetlb_folio(h, folio, true);
 	} else {
-		arch_clear_hugepage_flags(&folio->page);
+		arch_clear_hugetlb_flags(folio);
 		enqueue_hugetlb_folio(h, folio);
 		spin_unlock_irqrestore(&hugetlb_lock, flags);
 	}
-- 
cgit v1.2.3


From dee3d0bef2b00772be430425832ead6aa9d707f9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 17:10:32 +0000
Subject: proc: rewrite stable_page_flags()

Reduce the usage of PageFlag tests and reduce the number of
compound_head() calls.

For multi-page folios, we'll now show all pages as having the flags that
apply to them, e.g.  if it's dirty, all pages will have the dirty flag set
instead of just the head page.  The mapped flag is still per page, as is
the hwpoison flag.

[willy@infradead.org: fix up some bits vs masks]
  Link: https://lkml.kernel.org/r/20240403173112.1450721-1-willy@infradead.org
[willy@infradead.org: fix warnings]
  Link: https://lkml.kernel.org/r/ZhBPtCYfSuFuUMEz@casper.infradead.org
Link: https://lkml.kernel.org/r/20240326171045.410737-11-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Svetly Todorov <svetly.todorov@memverge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/page.c                 | 69 ++++++++++++++++++++++--------------------
 include/linux/huge_mm.h        |  4 +--
 include/linux/page-flags.h     |  2 +-
 tools/cgroup/memcg_slabinfo.py |  5 ++-
 4 files changed, 42 insertions(+), 38 deletions(-)

(limited to 'include')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 9223856c934b..05120263af2a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -107,10 +107,13 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
 	return ((kflags >> kbit) & 1) << ubit;
 }
 
-u64 stable_page_flags(struct page *page)
+u64 stable_page_flags(const struct page *page)
 {
-	u64 k;
-	u64 u;
+	const struct folio *folio;
+	unsigned long k;
+	unsigned long mapping;
+	bool is_anon;
+	u64 u = 0;
 
 	/*
 	 * pseudo flag: KPF_NOPAGE
@@ -118,49 +121,47 @@ u64 stable_page_flags(struct page *page)
 	 */
 	if (!page)
 		return 1 << KPF_NOPAGE;
+	folio = page_folio(page);
 
-	k = page->flags;
-	u = 0;
+	k = folio->flags;
+	mapping = (unsigned long)folio->mapping;
+	is_anon = mapping & PAGE_MAPPING_ANON;
 
 	/*
 	 * pseudo flags for the well known (anonymous) memory mapped pages
 	 */
 	if (page_mapped(page))
 		u |= 1 << KPF_MMAP;
-	if (PageAnon(page))
+	if (is_anon) {
 		u |= 1 << KPF_ANON;
-	if (PageKsm(page))
-		u |= 1 << KPF_KSM;
+		if (mapping & PAGE_MAPPING_KSM)
+			u |= 1 << KPF_KSM;
+	}
 
 	/*
 	 * compound pages: export both head/tail info
 	 * they together define a compound page's start/end pos and order
 	 */
-	if (PageHead(page))
-		u |= 1 << KPF_COMPOUND_HEAD;
-	if (PageTail(page))
+	if (page == &folio->page)
+		u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
+	else
 		u |= 1 << KPF_COMPOUND_TAIL;
-	if (PageHuge(page))
+	if (folio_test_hugetlb(folio))
 		u |= 1 << KPF_HUGE;
 	/*
-	 * PageTransCompound can be true for non-huge compound pages (slab
-	 * pages or pages allocated by drivers with __GFP_COMP) because it
-	 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+	 * We need to check PageLRU/PageAnon
 	 * to make sure a given page is a thp, not a non-huge compound page.
 	 */
-	else if (PageTransCompound(page)) {
-		struct page *head = compound_head(page);
-
-		if (PageLRU(head) || PageAnon(head))
+	else if (folio_test_large(folio)) {
+		if ((k & (1 << PG_lru)) || is_anon)
 			u |= 1 << KPF_THP;
-		else if (is_huge_zero_page(head)) {
+		else if (is_huge_zero_page(&folio->page)) {
 			u |= 1 << KPF_ZERO_PAGE;
 			u |= 1 << KPF_THP;
 		}
 	} else if (is_zero_pfn(page_to_pfn(page)))
 		u |= 1 << KPF_ZERO_PAGE;
 
-
 	/*
 	 * Caveats on high order pages: PG_buddy and PG_slab will only be set
 	 * on the head page.
@@ -174,16 +175,17 @@ u64 stable_page_flags(struct page *page)
 		u |= 1 << KPF_OFFLINE;
 	if (PageTable(page))
 		u |= 1 << KPF_PGTABLE;
+	if (folio_test_slab(folio))
+		u |= 1 << KPF_SLAB;
 
-	if (page_is_idle(page))
+#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+	u |= kpf_copy_bit(k, KPF_IDLE,          PG_idle);
+#else
+	if (folio_test_idle(folio))
 		u |= 1 << KPF_IDLE;
+#endif
 
 	u |= kpf_copy_bit(k, KPF_LOCKED,	PG_locked);
-
-	u |= kpf_copy_bit(k, KPF_SLAB,		PG_slab);
-	if (PageTail(page) && PageSlab(page))
-		u |= 1 << KPF_SLAB;
-
 	u |= kpf_copy_bit(k, KPF_ERROR,		PG_error);
 	u |= kpf_copy_bit(k, KPF_DIRTY,		PG_dirty);
 	u |= kpf_copy_bit(k, KPF_UPTODATE,	PG_uptodate);
@@ -194,7 +196,8 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_ACTIVE,	PG_active);
 	u |= kpf_copy_bit(k, KPF_RECLAIM,	PG_reclaim);
 
-	if (PageSwapCache(page))
+#define SWAPCACHE ((1 << PG_swapbacked) | (1 << PG_swapcache))
+	if ((k & SWAPCACHE) == SWAPCACHE)
 		u |= 1 << KPF_SWAPCACHE;
 	u |= kpf_copy_bit(k, KPF_SWAPBACKED,	PG_swapbacked);
 
@@ -202,7 +205,10 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_MLOCKED,	PG_mlocked);
 
 #ifdef CONFIG_MEMORY_FAILURE
-	u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
+	if (u & (1 << KPF_HUGE))
+		u |= kpf_copy_bit(k, KPF_HWPOISON,	PG_hwpoison);
+	else
+		u |= kpf_copy_bit(page->flags, KPF_HWPOISON,	PG_hwpoison);
 #endif
 
 #ifdef CONFIG_ARCH_USES_PG_UNCACHED
@@ -228,7 +234,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 {
 	const unsigned long max_dump_pfn = get_max_dump_pfn();
 	u64 __user *out = (u64 __user *)buf;
-	struct page *ppage;
 	unsigned long src = *ppos;
 	unsigned long pfn;
 	ssize_t ret = 0;
@@ -245,9 +250,9 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 		 * TODO: ZONE_DEVICE support requires to identify
 		 * memmaps that were actually initialized.
 		 */
-		ppage = pfn_to_online_page(pfn);
+		struct page *page = pfn_to_online_page(pfn);
 
-		if (put_user(stable_page_flags(ppage), out)) {
+		if (put_user(stable_page_flags(page), out)) {
 			ret = -EFAULT;
 			break;
 		}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7576025db55d..1540a1481daf 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -351,7 +351,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 extern struct page *huge_zero_page;
 extern unsigned long huge_zero_pfn;
 
-static inline bool is_huge_zero_page(struct page *page)
+static inline bool is_huge_zero_page(const struct page *page)
 {
 	return READ_ONCE(huge_zero_page) == page;
 }
@@ -480,7 +480,7 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
-static inline bool is_huge_zero_page(struct page *page)
+static inline bool is_huge_zero_page(const struct page *page)
 {
 	return false;
 }
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index eaecf544039f..888353c209c0 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -734,7 +734,7 @@ static __always_inline bool PageKsm(const struct page *page)
 TESTPAGEFLAG_FALSE(Ksm, ksm)
 #endif
 
-u64 stable_page_flags(struct page *page);
+u64 stable_page_flags(const struct page *page);
 
 /**
  * folio_xor_flags_has_waiters - Change some folio flags.
diff --git a/tools/cgroup/memcg_slabinfo.py b/tools/cgroup/memcg_slabinfo.py
index 1d3a90d93fe2..270c28a0d098 100644
--- a/tools/cgroup/memcg_slabinfo.py
+++ b/tools/cgroup/memcg_slabinfo.py
@@ -146,12 +146,11 @@ def detect_kernel_config():
 
 
 def for_each_slab(prog):
-    PGSlab = 1 << prog.constant('PG_slab')
-    PGHead = 1 << prog.constant('PG_head')
+    PGSlab = ~prog.constant('PG_slab')
 
     for page in for_each_page(prog):
         try:
-            if page.flags.value_() & PGSlab:
+            if page.page_type.value_() == PGSlab:
                 yield cast('struct slab *', page)
         except FaultError:
             pass
-- 
cgit v1.2.3


From b6dd94596f7fafd162da236b438e31b12d352187 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 26 Mar 2024 14:11:29 +0800
Subject: mm: make __absent_pages_in_range() as static

It's only called in mm/mm_init.c now.

Link: https://lkml.kernel.org/r/20240326061134.1055295-4-bhe@redhat.com
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 --
 mm/mm_init.c       | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9c0e441664d4..00a46a5d5c9d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3229,8 +3229,6 @@ static inline unsigned long get_num_physpages(void)
  */
 void free_area_init(unsigned long *max_zone_pfn);
 unsigned long node_map_pfn_alignment(void);
-unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
-						unsigned long end_pfn);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
 						unsigned long end_pfn);
 extern void get_pfn_range_for_nid(unsigned int nid,
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 0fe466abde43..19c78b54fc8a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1144,7 +1144,7 @@ static void __init adjust_zone_range_for_zone_movable(int nid,
  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
  * then all holes in the requested range will be accounted for.
  */
-unsigned long __init __absent_pages_in_range(int nid,
+static unsigned long __init __absent_pages_in_range(int nid,
 				unsigned long range_start_pfn,
 				unsigned long range_end_pfn)
 {
-- 
cgit v1.2.3


From 5beaee54a324ba1fe307e341ec825d5d099f4091 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 20:28:22 +0000
Subject: mm: add is_huge_zero_folio()

This is the folio equivalent of is_huge_zero_page().  It doesn't add any
efficiency, but it does prevent the caller from passing a tail page and
getting confused when the predicate returns false.

Link: https://lkml.kernel.org/r/20240326202833.523759-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/page.c          |  2 +-
 include/linux/huge_mm.h | 10 ++++++++++
 mm/huge_memory.c        |  6 +++---
 mm/mempolicy.c          |  2 +-
 mm/swap.c               |  2 +-
 mm/swap_state.c         |  2 +-
 mm/userfaultfd.c        |  2 +-
 7 files changed, 18 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 05120263af2a..2fb64bdb64eb 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -155,7 +155,7 @@ u64 stable_page_flags(const struct page *page)
 	else if (folio_test_large(folio)) {
 		if ((k & (1 << PG_lru)) || is_anon)
 			u |= 1 << KPF_THP;
-		else if (is_huge_zero_page(&folio->page)) {
+		else if (is_huge_zero_folio(folio)) {
 			u |= 1 << KPF_ZERO_PAGE;
 			u |= 1 << KPF_THP;
 		}
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1540a1481daf..600c6008262b 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -356,6 +356,11 @@ static inline bool is_huge_zero_page(const struct page *page)
 	return READ_ONCE(huge_zero_page) == page;
 }
 
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+	return READ_ONCE(huge_zero_page) == &folio->page;
+}
+
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
 	return pmd_present(pmd) && READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd);
@@ -485,6 +490,11 @@ static inline bool is_huge_zero_page(const struct page *page)
 	return false;
 }
 
+static inline bool is_huge_zero_folio(const struct folio *folio)
+{
+	return false;
+}
+
 static inline bool is_huge_zero_pmd(pmd_t pmd)
 {
 	return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 75ad971ca45e..5c043c7b5062 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -789,12 +789,12 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
 }
 #endif
 
-static inline bool is_transparent_hugepage(struct folio *folio)
+static inline bool is_transparent_hugepage(const struct folio *folio)
 {
 	if (!folio_test_large(folio))
 		return false;
 
-	return is_huge_zero_page(&folio->page) ||
+	return is_huge_zero_folio(folio) ||
 		folio_test_large_rmappable(folio);
 }
 
@@ -3085,7 +3085,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
 	}
 
 
-	is_hzp = is_huge_zero_page(&folio->page);
+	is_hzp = is_huge_zero_folio(folio);
 	if (is_hzp) {
 		pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
 		return -EBUSY;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 913cff5da5a3..5743028a63a5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -510,7 +510,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 		return;
 	}
 	folio = pfn_folio(pmd_pfn(*pmd));
-	if (is_huge_zero_page(&folio->page)) {
+	if (is_huge_zero_folio(folio)) {
 		walk->action = ACTION_CONTINUE;
 		return;
 	}
diff --git a/mm/swap.c b/mm/swap.c
index 500a09a48dfd..f72364e92d5f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -985,7 +985,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
 		struct folio *folio = folios->folios[i];
 		unsigned int nr_refs = refs ? refs[i] : 1;
 
-		if (is_huge_zero_page(&folio->page))
+		if (is_huge_zero_folio(folio))
 			continue;
 
 		if (folio_is_zone_device(folio)) {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bfc7e8c58a6d..2deac23633cd 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -301,7 +301,7 @@ void free_page_and_swap_cache(struct page *page)
 	struct folio *folio = page_folio(page);
 
 	free_swap_cache(folio);
-	if (!is_huge_zero_page(page))
+	if (!is_huge_zero_folio(folio))
 		folio_put(folio);
 }
 
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3c3539c573e7..a0ec14553fbe 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1664,7 +1664,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 			    !pmd_none(dst_pmdval)) {
 				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
 
-				if (!folio || (!is_huge_zero_page(&folio->page) &&
+				if (!folio || (!is_huge_zero_folio(folio) &&
 					       !PageAnonExclusive(&folio->page))) {
 					spin_unlock(ptl);
 					err = -EBUSY;
-- 
cgit v1.2.3


From e06d03d5590ae1c257b8aa2cfbfe6765e0755c14 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 20:28:23 +0000
Subject: mm: add pmd_folio()

Convert directly from a pmd to a folio without going through another
representation first.  For now this is just a slightly shorter way to
write it, but it might end up being more efficient later.

Link: https://lkml.kernel.org/r/20240326202833.523759-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 2 ++
 mm/huge_memory.c        | 6 +++---
 mm/madvise.c            | 2 +-
 mm/mempolicy.c          | 2 +-
 mm/mlock.c              | 2 +-
 mm/userfaultfd.c        | 2 +-
 6 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 600e17d03659..09c85c7bf9c2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -50,6 +50,8 @@
 #define pmd_pgtable(pmd) pmd_page(pmd)
 #endif
 
+#define pmd_folio(pmd) page_folio(pmd_page(pmd))
+
 /*
  * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD]
  *
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5c043c7b5062..712263e3b1f6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1816,7 +1816,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	folio = pfn_folio(pmd_pfn(orig_pmd));
+	folio = pmd_folio(orig_pmd);
 	/*
 	 * If other processes are mapping this folio, we couldn't discard
 	 * the folio unless they all do MADV_FREE so let's skip the folio.
@@ -2086,7 +2086,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (pmd_protnone(*pmd))
 			goto unlock;
 
-		folio = page_folio(pmd_page(*pmd));
+		folio = pmd_folio(*pmd);
 		toptier = node_is_toptier(folio_nid(folio));
 		/*
 		 * Skip scanning top tier node if normal numa
@@ -2663,7 +2663,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		 * It's safe to call pmd_page when folio is set because it's
 		 * guaranteed that pmd is present.
 		 */
-		if (folio && folio != page_folio(pmd_page(*pmd)))
+		if (folio && folio != pmd_folio(*pmd))
 			goto out;
 		__split_huge_pmd_locked(vma, pmd, range.start, freeze);
 	}
diff --git a/mm/madvise.c b/mm/madvise.c
index 7625830d6ae9..1f77a51baaac 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -363,7 +363,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 			goto huge_unlock;
 		}
 
-		folio = pfn_folio(pmd_pfn(orig_pmd));
+		folio = pmd_folio(orig_pmd);
 
 		/* Do not interfere with other mappings of this folio */
 		if (folio_likely_mapped_shared(folio))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5743028a63a5..aec756ae5637 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -509,7 +509,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
 		qp->nr_failed++;
 		return;
 	}
-	folio = pfn_folio(pmd_pfn(*pmd));
+	folio = pmd_folio(*pmd);
 	if (is_huge_zero_folio(folio)) {
 		walk->action = ACTION_CONTINUE;
 		return;
diff --git a/mm/mlock.c b/mm/mlock.c
index 1ed2f2ab37cd..30b51cdea89d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -378,7 +378,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 			goto out;
 		if (is_huge_zero_pmd(*pmd))
 			goto out;
-		folio = page_folio(pmd_page(*pmd));
+		folio = pmd_folio(*pmd);
 		if (vma->vm_flags & VM_LOCKED)
 			mlock_folio(folio);
 		else
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a0ec14553fbe..b70618e8dcd2 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1662,7 +1662,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 			/* Check if we can move the pmd without splitting it. */
 			if (move_splits_huge_pmd(dst_addr, src_addr, src_start + len) ||
 			    !pmd_none(dst_pmdval)) {
-				struct folio *folio = pfn_folio(pmd_pfn(*src_pmd));
+				struct folio *folio = pmd_folio(*src_pmd);
 
 				if (!folio || (!is_huge_zero_folio(folio) &&
 					       !PageAnonExclusive(&folio->page))) {
-- 
cgit v1.2.3


From 5691753d73a233a97c13f7d389f15d20596d062d Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 20:28:25 +0000
Subject: mm: convert huge_zero_page to huge_zero_folio

With all callers of is_huge_zero_page() converted, we can now switch the
huge_zero_page itself from being a compound page to a folio.

Link: https://lkml.kernel.org/r/20240326202833.523759-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 21 ++++++++-------------
 mm/huge_memory.c        | 28 ++++++++++++++--------------
 2 files changed, 22 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 600c6008262b..7ba59ba36354 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -348,17 +348,12 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
-extern struct page *huge_zero_page;
+extern struct folio *huge_zero_folio;
 extern unsigned long huge_zero_pfn;
 
-static inline bool is_huge_zero_page(const struct page *page)
-{
-	return READ_ONCE(huge_zero_page) == page;
-}
-
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
-	return READ_ONCE(huge_zero_page) == &folio->page;
+	return READ_ONCE(huge_zero_folio) == folio;
 }
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
@@ -371,9 +366,14 @@ static inline bool is_huge_zero_pud(pud_t pud)
 	return false;
 }
 
-struct page *mm_get_huge_zero_page(struct mm_struct *mm);
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_page(struct mm_struct *mm);
 
+static inline struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+{
+	return &mm_get_huge_zero_folio(mm)->page;
+}
+
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
 static inline bool thp_migration_supported(void)
@@ -485,11 +485,6 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
-static inline bool is_huge_zero_page(const struct page *page)
-{
-	return false;
-}
-
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
 	return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 712263e3b1f6..6d510d635b1d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -74,7 +74,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 					 struct shrink_control *sc);
 
 static atomic_t huge_zero_refcount;
-struct page *huge_zero_page __read_mostly;
+struct folio *huge_zero_folio __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
@@ -192,24 +192,24 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 
 static bool get_huge_zero_page(void)
 {
-	struct page *zero_page;
+	struct folio *zero_folio;
 retry:
 	if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
 		return true;
 
-	zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+	zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
 			HPAGE_PMD_ORDER);
-	if (!zero_page) {
+	if (!zero_folio) {
 		count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
 		return false;
 	}
 	preempt_disable();
-	if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
+	if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
 		preempt_enable();
-		__free_pages(zero_page, compound_order(zero_page));
+		folio_put(zero_folio);
 		goto retry;
 	}
-	WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
+	WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));
 
 	/* We take additional reference here. It will be put back by shrinker */
 	atomic_set(&huge_zero_refcount, 2);
@@ -227,10 +227,10 @@ static void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-struct page *mm_get_huge_zero_page(struct mm_struct *mm)
+struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 {
 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
-		return READ_ONCE(huge_zero_page);
+		return READ_ONCE(huge_zero_folio);
 
 	if (!get_huge_zero_page())
 		return NULL;
@@ -238,7 +238,7 @@ struct page *mm_get_huge_zero_page(struct mm_struct *mm)
 	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 		put_huge_zero_page();
 
-	return READ_ONCE(huge_zero_page);
+	return READ_ONCE(huge_zero_folio);
 }
 
 void mm_put_huge_zero_page(struct mm_struct *mm)
@@ -258,10 +258,10 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 				       struct shrink_control *sc)
 {
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
-		struct page *zero_page = xchg(&huge_zero_page, NULL);
-		BUG_ON(zero_page == NULL);
+		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
+		BUG_ON(zero_folio == NULL);
 		WRITE_ONCE(huge_zero_pfn, ~0UL);
-		__free_pages(zero_page, compound_order(zero_page));
+		folio_put(zero_folio);
 		return HPAGE_PMD_NR;
 	}
 
@@ -1340,7 +1340,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		 * since we already have a zero page to copy. It just takes a
 		 * reference.
 		 */
-		mm_get_huge_zero_page(dst_mm);
+		mm_get_huge_zero_folio(dst_mm);
 		goto out_zero_page;
 	}
 
-- 
cgit v1.2.3


From c93012d849c9e3fab9540458623b98794c4bf0a1 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 20:28:27 +0000
Subject: dax: use huge_zero_folio

Convert from huge_zero_page to huge_zero_folio.

Link: https://lkml.kernel.org/r/20240326202833.523759-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/dax.c                      | 14 +++++++-------
 include/trace/events/fs_dax.h | 16 ++++++++--------
 2 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/dax.c b/fs/dax.c
index 423fc1607dfa..becb4a6920c6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1207,17 +1207,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = mapping->host;
 	pgtable_t pgtable = NULL;
-	struct page *zero_page;
+	struct folio *zero_folio;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
 	pfn_t pfn;
 
-	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
+	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
 
-	if (unlikely(!zero_page))
+	if (unlikely(!zero_folio))
 		goto fallback;
 
-	pfn = page_to_pfn_t(zero_page);
+	pfn = page_to_pfn_t(&zero_folio->page);
 	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
 				  DAX_PMD | DAX_ZERO_PAGE);
 
@@ -1237,17 +1237,17 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		mm_inc_nr_ptes(vma->vm_mm);
 	}
-	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
+	pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
 	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
+	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
 	return VM_FAULT_NOPAGE;
 
 fallback:
 	if (pgtable)
 		pte_free(vma->vm_mm, pgtable);
-	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
 	return VM_FAULT_FALLBACK;
 }
 #else
diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h
index 97b09fcf7e52..86fe6aecff1e 100644
--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -62,14 +62,14 @@ DEFINE_PMD_FAULT_EVENT(dax_pmd_fault_done);
 
 DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf,
-		struct page *zero_page,
+		struct folio *zero_folio,
 		void *radix_entry),
-	TP_ARGS(inode, vmf, zero_page, radix_entry),
+	TP_ARGS(inode, vmf, zero_folio, radix_entry),
 	TP_STRUCT__entry(
 		__field(unsigned long, ino)
 		__field(unsigned long, vm_flags)
 		__field(unsigned long, address)
-		__field(struct page *, zero_page)
+		__field(struct folio *, zero_folio)
 		__field(void *, radix_entry)
 		__field(dev_t, dev)
 	),
@@ -78,17 +78,17 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 		__entry->ino = inode->i_ino;
 		__entry->vm_flags = vmf->vma->vm_flags;
 		__entry->address = vmf->address;
-		__entry->zero_page = zero_page;
+		__entry->zero_folio = zero_folio;
 		__entry->radix_entry = radix_entry;
 	),
-	TP_printk("dev %d:%d ino %#lx %s address %#lx zero_page %p "
+	TP_printk("dev %d:%d ino %#lx %s address %#lx zero_folio %p "
 			"radix_entry %#lx",
 		MAJOR(__entry->dev),
 		MINOR(__entry->dev),
 		__entry->ino,
 		__entry->vm_flags & VM_SHARED ? "shared" : "private",
 		__entry->address,
-		__entry->zero_page,
+		__entry->zero_folio,
 		(unsigned long)__entry->radix_entry
 	)
 )
@@ -96,8 +96,8 @@ DECLARE_EVENT_CLASS(dax_pmd_load_hole_class,
 #define DEFINE_PMD_LOAD_HOLE_EVENT(name) \
 DEFINE_EVENT(dax_pmd_load_hole_class, name, \
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf, \
-		struct page *zero_page, void *radix_entry), \
-	TP_ARGS(inode, vmf, zero_page, radix_entry))
+		struct folio *zero_folio, void *radix_entry), \
+	TP_ARGS(inode, vmf, zero_folio, radix_entry))
 
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole);
 DEFINE_PMD_LOAD_HOLE_EVENT(dax_pmd_load_hole_fallback);
-- 
cgit v1.2.3


From 632230ff193954b514c908fdb45320f64ac9dc72 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 26 Mar 2024 20:28:28 +0000
Subject: mm: rename mm_put_huge_zero_page to mm_put_huge_zero_folio

Also remove mm_get_huge_zero_page() now it has no users.

Link: https://lkml.kernel.org/r/20240326202833.523759-9-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 9 ++-------
 kernel/fork.c           | 2 +-
 mm/huge_memory.c        | 2 +-
 3 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7ba59ba36354..9d11e886aa9a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -367,12 +367,7 @@ static inline bool is_huge_zero_pud(pud_t pud)
 }
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
-void mm_put_huge_zero_page(struct mm_struct *mm);
-
-static inline struct page *mm_get_huge_zero_page(struct mm_struct *mm)
-{
-	return &mm_get_huge_zero_folio(mm)->page;
-}
+void mm_put_huge_zero_folio(struct mm_struct *mm);
 
 #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
 
@@ -500,7 +495,7 @@ static inline bool is_huge_zero_pud(pud_t pud)
 	return false;
 }
 
-static inline void mm_put_huge_zero_page(struct mm_struct *mm)
+static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
 	return;
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index aebb3e6c96dc..99076dbe27d8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1344,7 +1344,7 @@ static inline void __mmput(struct mm_struct *mm)
 	ksm_exit(mm);
 	khugepaged_exit(mm); /* must run before exit_mmap */
 	exit_mmap(mm);
-	mm_put_huge_zero_page(mm);
+	mm_put_huge_zero_folio(mm);
 	set_mm_exe_file(mm, NULL);
 	if (!list_empty(&mm->mmlist)) {
 		spin_lock(&mmlist_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 098b8ad4bcc7..c4820cd4749e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -241,7 +241,7 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 	return READ_ONCE(huge_zero_folio);
 }
 
-void mm_put_huge_zero_page(struct mm_struct *mm)
+void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
 	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 		put_huge_zero_page();
-- 
cgit v1.2.3


From 24334e78e8e3ca6c8e51dad67d779b1ed40988a6 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 27 Mar 2024 11:23:21 -0400
Subject: mm/hugetlb: declare hugetlbfs_pagecache_present() non-static

It will be used outside hugetlb.c soon.

Link: https://lkml.kernel.org/r/20240327152332.950956-3-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 9 +++++++++
 mm/hugetlb.c            | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 03fc6d625068..a7988c78d69f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -174,6 +174,9 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);
 
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
 		      unsigned long addr, pud_t *pud);
+bool hugetlbfs_pagecache_present(struct hstate *h,
+				 struct vm_area_struct *vma,
+				 unsigned long address);
 
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
@@ -1228,6 +1231,12 @@ static inline void hugetlb_register_node(struct node *node)
 static inline void hugetlb_unregister_node(struct node *node)
 {
 }
+
+static inline bool hugetlbfs_pagecache_present(
+    struct hstate *h, struct vm_area_struct *vma, unsigned long address)
+{
+	return false;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 static inline spinlock_t *huge_pte_lock(struct hstate *h,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88bb38df1701..740435567be8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6113,8 +6113,8 @@ out_release_old:
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  */
-static bool hugetlbfs_pagecache_present(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+bool hugetlbfs_pagecache_present(struct hstate *h,
+				 struct vm_area_struct *vma, unsigned long address)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	pgoff_t idx = linear_page_index(vma, address);
-- 
cgit v1.2.3


From b979db1611a63b207dbc47706de989ac113ea898 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 27 Mar 2024 11:23:22 -0400
Subject: mm: make HPAGE_PXD_* macros even if !THP

These macros can be helpful when we plan to merge hugetlb code into
generic code.  Move them out and define them as long as
PGTABLE_HAS_HUGE_LEAVES is selected, because there are systems that only
define HUGETLB_PAGE not THP.

One note here is HPAGE_PMD_SHIFT must be defined even if PMD_SHIFT is not
defined (e.g.  !CONFIG_MMU case); it (or in other forms, like
HPAGE_PMD_NR) is already used in lots of common codes without ifdef
guards.  Use the old trick to let complations work.

Here we only need to differenciate HPAGE_PXD_SHIFT definitions.  All the
rest macros will be defined based on it.  When at it, move HPAGE_PMD_NR /
HPAGE_PMD_ORDER over together.

Link: https://lkml.kernel.org/r/20240327152332.950956-4-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 9d11e886aa9a..ff72ee19a125 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -64,9 +64,6 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj,
 				  enum transparent_hugepage_flag flag);
 extern struct kobj_attribute shmem_enabled_attr;
 
-#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
-#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
-
 /*
  * Mask of all large folio orders supported for anonymous THP; all orders up to
  * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
@@ -87,14 +84,25 @@ extern struct kobj_attribute shmem_enabled_attr;
 #define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
 	(!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
 #define HPAGE_PMD_SHIFT PMD_SHIFT
-#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
+#define HPAGE_PUD_SHIFT PUD_SHIFT
+#else
+#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
+#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
+#endif
+
+#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
 #define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
+#define HPAGE_PMD_SIZE	((1UL) << HPAGE_PMD_SHIFT)
 
-#define HPAGE_PUD_SHIFT PUD_SHIFT
-#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
+#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
+#define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
@@ -377,13 +385,6 @@ static inline bool thp_migration_supported(void)
 }
 
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
-#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
-
-#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
-#define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; })
-#define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; })
 
 static inline bool folio_test_pmd_mappable(struct folio *folio)
 {
-- 
cgit v1.2.3


From 239e9a90c887d33e9339870a650d2bd621b95674 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 27 Mar 2024 11:23:23 -0400
Subject: mm: introduce vma_pgtable_walk_{begin|end}()

Introduce per-vma begin()/end() helpers for pgtable walks.  This is a
preparation work to merge hugetlb pgtable walkers with generic mm.

The helpers need to be called before and after a pgtable walk, will start
to be needed if the pgtable walker code supports hugetlb pages.  It's a
hook point for any type of VMA, but for now only hugetlb uses it to
stablize the pgtable pages from getting away (due to possible pmd
unsharing).

Link: https://lkml.kernel.org/r/20240327152332.950956-5-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Christoph Hellwig <hch@infradead.org>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  3 +++
 mm/memory.c        | 12 ++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00a46a5d5c9d..3dda0dbcae14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4230,4 +4230,7 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
 	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
 }
 
+void vma_pgtable_walk_begin(struct vm_area_struct *vma);
+void vma_pgtable_walk_end(struct vm_area_struct *vma);
+
 #endif /* _LINUX_MM_H */
diff --git a/mm/memory.c b/mm/memory.c
index 62ee4a15092a..0b92336bcebd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6378,3 +6378,15 @@ void ptlock_free(struct ptdesc *ptdesc)
 	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+void vma_pgtable_walk_begin(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_vma_lock_read(vma);
+}
+
+void vma_pgtable_walk_end(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_vma_unlock_read(vma);
+}
-- 
cgit v1.2.3


From 35a76f5c0863fc6b570bf02ab6dbf4cd65d1e001 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 27 Mar 2024 11:23:24 -0400
Subject: mm/arch: provide pud_pfn() fallback

The comment in the code explains the reasons.  We took a different
approach comparing to pmd_pfn() by providing a fallback function.

Another option is to provide some lower level config options (compare to
HUGETLB_PAGE or THP) to identify which layer an arch can support for such
huge mappings.  However that can be an overkill.

[peterx@redhat.com: fix loongson defconfig]
  Link: https://lkml.kernel.org/r/20240403013249.1418299-4-peterx@redhat.com
Link: https://lkml.kernel.org/r/20240327152332.950956-6-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/riscv/include/asm/pgtable.h    |  1 +
 arch/s390/include/asm/pgtable.h     |  1 +
 arch/sparc/include/asm/pgtable_64.h |  1 +
 arch/x86/include/asm/pgtable.h      |  1 +
 include/linux/pgtable.h             | 14 ++++++++++++++
 5 files changed, 18 insertions(+)

(limited to 'include')

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 9f8ea0e33eb1..661b2b4fe758 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -648,6 +648,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
 
 #define __pud_to_phys(pud)  (__page_val_to_pfn(pud_val(pud)) << PAGE_SHIFT)
 
+#define pud_pfn pud_pfn
 static inline unsigned long pud_pfn(pud_t pud)
 {
 	return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 60950e7a25f5..2cb2a2e7b34b 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1405,6 +1405,7 @@ static inline unsigned long pud_deref(pud_t pud)
 	return (unsigned long)__va(pud_val(pud) & origin_mask);
 }
 
+#define pud_pfn pud_pfn
 static inline unsigned long pud_pfn(pud_t pud)
 {
 	return __pa(pud_deref(pud)) >> PAGE_SHIFT;
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 4d1bafaba942..26efc9bb644a 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -875,6 +875,7 @@ static inline bool pud_leaf(pud_t pud)
 	return pte_val(pte) & _PAGE_PMD_HUGE;
 }
 
+#define pud_pfn pud_pfn
 static inline unsigned long pud_pfn(pud_t pud)
 {
 	pte_t pte = __pte(pud_val(pud));
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index cefc7a84f7a4..273f7557218c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -234,6 +234,7 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
 	return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
 }
 
+#define pud_pfn pud_pfn
 static inline unsigned long pud_pfn(pud_t pud)
 {
 	phys_addr_t pfn = pud_val(pud);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 09c85c7bf9c2..f108c7a3c1d9 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1819,6 +1819,20 @@ typedef unsigned int pgtbl_mod_mask;
 #define pte_leaf_size(x) PAGE_SIZE
 #endif
 
+/*
+ * We always define pmd_pfn for all archs as it's used in lots of generic
+ * code.  Now it happens too for pud_pfn (and can happen for larger
+ * mappings too in the future; we're not there yet).  Instead of defining
+ * it for all archs (like pmd_pfn), provide a fallback.
+ *
+ * Note that returning 0 here means any arch that didn't define this can
+ * get severely wrong when it hits a real pud leaf.  It's arch's
+ * responsibility to properly define it when a huge pud is possible.
+ */
+#ifndef pud_pfn
+#define pud_pfn(x) 0
+#endif
+
 /*
  * Some architectures have MMUs that are configurable or selectable at boot
  * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
-- 
cgit v1.2.3


From 1b1676180246232308885c4f37fee01cf898fdb2 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 27 Mar 2024 11:23:29 -0400
Subject: mm/gup: handle huge pud for follow_pud_mask()

Teach follow_pud_mask() to be able to handle normal PUD pages like
hugetlb.

Rename follow_devmap_pud() to follow_huge_pud() so that it can process
either huge devmap or hugetlb.  Move it out of TRANSPARENT_HUGEPAGE_PUD
and and huge_memory.c (which relies on CONFIG_THP).  Switch to pud_leaf()
to detect both cases in the slow gup.

In the new follow_huge_pud(), taking care of possible CoR for hugetlb if
necessary.  touch_pud() needs to be moved out of huge_memory.c to be
accessable from gup.c even if !THP.

Since at it, optimize the non-present check by adding a pud_present()
early check before taking the pgtable lock, failing the follow_page()
early if PUD is not present: that is required by both devmap or hugetlb.
Use pud_huge() to also cover the pud_devmap() case.

One more trivial thing to mention is, introduce "pud_t pud" in the code
paths along the way, so the code doesn't dereference *pudp multiple time.
Not only because that looks less straightforward, but also because if the
dereference really happened, it's not clear whether there can be race to
see different *pudp values when it's being modified at the same time.

Setting ctx->page_mask properly for a PUD entry.  As a side effect, this
patch should also be able to optimize devmap GUP on PUD to be able to jump
over the whole PUD range, but not yet verified.  Hugetlb already can do so
prior to this patch.

Link: https://lkml.kernel.org/r/20240327152332.950956-11-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h |  8 ------
 mm/gup.c                | 70 ++++++++++++++++++++++++++++++++++++++++++++++---
 mm/huge_memory.c        | 47 ++-------------------------------
 mm/internal.h           |  2 ++
 4 files changed, 71 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ff72ee19a125..ab26d9e65ec3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -351,8 +351,6 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
 
 struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap);
 
 vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
@@ -507,12 +505,6 @@ static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
 	return NULL;
 }
 
-static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
-	unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-	return NULL;
-}
-
 static inline bool thp_migration_supported(void)
 {
 	return false;
diff --git a/mm/gup.c b/mm/gup.c
index 39224b5fe62f..2b06d59f2fa3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -525,6 +525,70 @@ static struct page *no_page_table(struct vm_area_struct *vma,
 	return NULL;
 }
 
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
+static struct page *follow_huge_pud(struct vm_area_struct *vma,
+				    unsigned long addr, pud_t *pudp,
+				    int flags, struct follow_page_context *ctx)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
+	pud_t pud = *pudp;
+	unsigned long pfn = pud_pfn(pud);
+	int ret;
+
+	assert_spin_locked(pud_lockptr(mm, pudp));
+
+	if ((flags & FOLL_WRITE) && !pud_write(pud))
+		return NULL;
+
+	if (!pud_present(pud))
+		return NULL;
+
+	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
+
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
+	    pud_devmap(pud)) {
+		/*
+		 * device mapped pages can only be returned if the caller
+		 * will manage the page reference count.
+		 *
+		 * At least one of FOLL_GET | FOLL_PIN must be set, so
+		 * assert that here:
+		 */
+		if (!(flags & (FOLL_GET | FOLL_PIN)))
+			return ERR_PTR(-EEXIST);
+
+		if (flags & FOLL_TOUCH)
+			touch_pud(vma, addr, pudp, flags & FOLL_WRITE);
+
+		ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
+		if (!ctx->pgmap)
+			return ERR_PTR(-EFAULT);
+	}
+
+	page = pfn_to_page(pfn);
+
+	if (!pud_devmap(pud) && !pud_write(pud) &&
+	    gup_must_unshare(vma, flags, page))
+		return ERR_PTR(-EMLINK);
+
+	ret = try_grab_page(page, flags);
+	if (ret)
+		page = ERR_PTR(ret);
+	else
+		ctx->page_mask = HPAGE_PUD_NR - 1;
+
+	return page;
+}
+#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
+static struct page *follow_huge_pud(struct vm_area_struct *vma,
+				    unsigned long addr, pud_t *pudp,
+				    int flags, struct follow_page_context *ctx)
+{
+	return NULL;
+}
+#endif	/* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
+
 static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
 		pte_t *pte, unsigned int flags)
 {
@@ -760,11 +824,11 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 
 	pudp = pud_offset(p4dp, address);
 	pud = READ_ONCE(*pudp);
-	if (pud_none(pud))
+	if (!pud_present(pud))
 		return no_page_table(vma, flags, address);
-	if (pud_devmap(pud)) {
+	if (pud_leaf(pud)) {
 		ptl = pud_lock(mm, pudp);
-		page = follow_devmap_pud(vma, address, pudp, flags, &ctx->pgmap);
+		page = follow_huge_pud(vma, address, pudp, flags, ctx);
 		spin_unlock(ptl);
 		if (page)
 			return page;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c4820cd4749e..249318f367e4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1378,8 +1378,8 @@ out:
 }
 
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
-		      pud_t *pud, bool write)
+void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+	       pud_t *pud, bool write)
 {
 	pud_t _pud;
 
@@ -1391,49 +1391,6 @@ static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 		update_mmu_cache_pud(vma, addr, pud);
 }
 
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-		pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-	unsigned long pfn = pud_pfn(*pud);
-	struct mm_struct *mm = vma->vm_mm;
-	struct page *page;
-	int ret;
-
-	assert_spin_locked(pud_lockptr(mm, pud));
-
-	if (flags & FOLL_WRITE && !pud_write(*pud))
-		return NULL;
-
-	if (pud_present(*pud) && pud_devmap(*pud))
-		/* pass */;
-	else
-		return NULL;
-
-	if (flags & FOLL_TOUCH)
-		touch_pud(vma, addr, pud, flags & FOLL_WRITE);
-
-	/*
-	 * device mapped pages can only be returned if the
-	 * caller will manage the page reference count.
-	 *
-	 * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
-	 */
-	if (!(flags & (FOLL_GET | FOLL_PIN)))
-		return ERR_PTR(-EEXIST);
-
-	pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
-	*pgmap = get_dev_pagemap(pfn, *pgmap);
-	if (!*pgmap)
-		return ERR_PTR(-EFAULT);
-	page = pfn_to_page(pfn);
-
-	ret = try_grab_page(page, flags);
-	if (ret)
-		page = ERR_PTR(ret);
-
-	return page;
-}
-
 int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
 		  struct vm_area_struct *vma)
diff --git a/mm/internal.h b/mm/internal.h
index 8e11f7b2da21..d06072f06a8d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1112,6 +1112,8 @@ int __must_check try_grab_page(struct page *page, unsigned int flags);
 /*
  * mm/huge_memory.c
  */
+void touch_pud(struct vm_area_struct *vma, unsigned long addr,
+	       pud_t *pud, bool write);
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr, pmd_t *pmd,
 				   unsigned int flags);
-- 
cgit v1.2.3


From 9cb28da54643ad464c47585cd5866c30b0218e67 Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Wed, 27 Mar 2024 11:23:32 -0400
Subject: mm/gup: handle hugetlb in the generic follow_page_mask code

Now follow_page() is ready to handle hugetlb pages in whatever form, and
over all architectures.  Switch to the generic code path.

Time to retire hugetlb_follow_page_mask(), following the previous
retirement of follow_hugetlb_page() in 4849807114b8.

There may be a slight difference of how the loops run when processing slow
GUP over a large hugetlb range on cont_pte/cont_pmd supported archs: each
loop of __get_user_pages() will resolve one pgtable entry with the patch
applied, rather than relying on the size of hugetlb hstate, the latter may
cover multiple entries in one loop.

A quick performance test on an aarch64 VM on M1 chip shows 15% degrade
over a tight loop of slow gup after the path switched.  That shouldn't be
a problem because slow-gup should not be a hot path for GUP in general:
when page is commonly present, fast-gup will already succeed, while when
the page is indeed missing and require a follow up page fault, the slow
gup degrade will probably buried in the fault paths anyway.  It also
explains why slow gup for THP used to be very slow before 57edfcfd3419
("mm/gup: accelerate thp gup even for "pages != NULL"") lands, the latter
not part of a performance analysis but a side benefit.  If the performance
will be a concern, we can consider handle CONT_PTE in follow_page().

Before that is justified to be necessary, keep everything clean and simple.

Link: https://lkml.kernel.org/r/20240327152332.950956-14-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Jones <andrew.jones@linux.dev>
Cc: Aneesh Kumar K.V (IBM) <aneesh.kumar@kernel.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: "Mike Rapoport (IBM)" <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  7 -----
 mm/gup.c                | 15 ++++-------
 mm/hugetlb.c            | 71 -------------------------------------------------
 3 files changed, 5 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a7988c78d69f..3f3e62880279 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -328,13 +328,6 @@ static inline void hugetlb_zap_end(
 {
 }
 
-static inline struct page *hugetlb_follow_page_mask(
-    struct vm_area_struct *vma, unsigned long address, unsigned int flags,
-    unsigned int *page_mask)
-{
-	BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 					  struct mm_struct *src,
 					  struct vm_area_struct *dst_vma,
diff --git a/mm/gup.c b/mm/gup.c
index a572a1169aa7..fbf80e16bc94 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1132,18 +1132,11 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 {
 	pgd_t *pgd;
 	struct mm_struct *mm = vma->vm_mm;
+	struct page *page;
 
-	ctx->page_mask = 0;
-
-	/*
-	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
-	 * special hugetlb page table walking code.  This eliminates the
-	 * need to check for hugetlb entries in the general walking code.
-	 */
-	if (is_vm_hugetlb_page(vma))
-		return hugetlb_follow_page_mask(vma, address, flags,
-						&ctx->page_mask);
+	vma_pgtable_walk_begin(vma);
 
+	ctx->page_mask = 0;
 	pgd = pgd_offset(mm, address);
 
 	if (unlikely(is_hugepd(__hugepd(pgd_val(*pgd)))))
@@ -1154,6 +1147,8 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 	else
 		page = follow_p4d_mask(vma, address, pgd, flags, ctx);
 
+	vma_pgtable_walk_end(vma);
+
 	return page;
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 740435567be8..8af2fd48f785 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6876,77 +6876,6 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */
 
-struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
-				      unsigned long address, unsigned int flags,
-				      unsigned int *page_mask)
-{
-	struct hstate *h = hstate_vma(vma);
-	struct mm_struct *mm = vma->vm_mm;
-	unsigned long haddr = address & huge_page_mask(h);
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t *pte, entry;
-	int ret;
-
-	hugetlb_vma_lock_read(vma);
-	pte = hugetlb_walk(vma, haddr, huge_page_size(h));
-	if (!pte)
-		goto out_unlock;
-
-	ptl = huge_pte_lock(h, mm, pte);
-	entry = huge_ptep_get(pte);
-	if (pte_present(entry)) {
-		page = pte_page(entry);
-
-		if (!huge_pte_write(entry)) {
-			if (flags & FOLL_WRITE) {
-				page = NULL;
-				goto out;
-			}
-
-			if (gup_must_unshare(vma, flags, page)) {
-				/* Tell the caller to do unsharing */
-				page = ERR_PTR(-EMLINK);
-				goto out;
-			}
-		}
-
-		page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
-
-		/*
-		 * Note that page may be a sub-page, and with vmemmap
-		 * optimizations the page struct may be read only.
-		 * try_grab_page() will increase the ref count on the
-		 * head page, so this will be OK.
-		 *
-		 * try_grab_page() should always be able to get the page here,
-		 * because we hold the ptl lock and have verified pte_present().
-		 */
-		ret = try_grab_page(page, flags);
-
-		if (WARN_ON_ONCE(ret)) {
-			page = ERR_PTR(ret);
-			goto out;
-		}
-
-		*page_mask = (1U << huge_page_order(h)) - 1;
-	}
-out:
-	spin_unlock(ptl);
-out_unlock:
-	hugetlb_vma_unlock_read(vma);
-
-	/*
-	 * Fixup retval for dump requests: if pagecache doesn't exist,
-	 * don't try to allocate a new page but just skip it.
-	 */
-	if (!page && (flags & FOLL_DUMP) &&
-	    !hugetlbfs_pagecache_present(h, vma, address))
-		page = ERR_PTR(-EFAULT);
-
-	return page;
-}
-
 long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end,
 		pgprot_t newprot, unsigned long cp_flags)
-- 
cgit v1.2.3


From c0bff412e67b781d761e330ff9578aa9ed2be79e Mon Sep 17 00:00:00 2001
From: Peter Xu <peterx@redhat.com>
Date: Tue, 2 Apr 2024 21:32:47 -0400
Subject: mm: allow anon exclusive check over hugetlb tail pages

PageAnonExclusive() used to forbid tail pages for hugetlbfs, as that used
to be called mostly in hugetlb specific paths and the head page was
guaranteed.

As we move forward towards merging hugetlb paths into generic mm, we may
start to pass in tail hugetlb pages (when with cont-pte/cont-pmd huge
pages) for such check.  Allow it to properly fetch the head, in which case
the anon-exclusiveness of the head will always represents the tail page.

There's already a sign of it when we look at the GUP-fast which already
contain the hugetlb processing altogether: we used to have a specific
commit 5805192c7b72 ("mm/gup: handle cont-PTE hugetlb pages correctly in
gup_must_unshare() via GUP-fast") covering that area.  Now with this more
generic change, that can also go away.

[akpm@linux-foundation.org: simplify PageAnonExclusive(), per Matthew]
  Link: https://lkml.kernel.org/r/Zg3u5Sh9EbbYPhaI@casper.infradead.org
Link: https://lkml.kernel.org/r/20240403013249.1418299-2-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  7 ++++++-
 mm/internal.h              | 10 ----------
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 888353c209c0..7577fe7debaf 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -1095,7 +1095,12 @@ PAGEFLAG(Isolated, isolated, PF_ANY);
 static __always_inline int PageAnonExclusive(const struct page *page)
 {
 	VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
-	VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
+	/*
+	 * HugeTLB stores this information on the head page; THP keeps it per
+	 * page
+	 */
+	if (PageHuge(page))
+		page = compound_head(page);
 	return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 5933a8e7d2dd..6614ba4ca9de 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1203,16 +1203,6 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
 	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
 		smp_rmb();
 
-	/*
-	 * During GUP-fast we might not get called on the head page for a
-	 * hugetlb page that is mapped using cont-PTE, because GUP-fast does
-	 * not work with the abstracted hugetlb PTEs that always point at the
-	 * head page. For hugetlb, PageAnonExclusive only applies on the head
-	 * page (as it cannot be partially COW-shared), so lookup the head page.
-	 */
-	if (unlikely(!PageHead(page) && PageHuge(page)))
-		page = compound_head(page);
-
 	/*
 	 * Note that PageKsm() pages cannot be exclusive, and consequently,
 	 * cannot get pinned.
-- 
cgit v1.2.3


From ba168b52bf8ecb877af294769b0ab32b06bd9293 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 27 Mar 2024 19:06:59 +0000
Subject: mm: use rwsem assertion macros for mmap_lock

This slightly strengthens our write assertion when lockdep is disabled.
It also downgrades us from BUG_ON to WARN_ON, but I think that's an
improvement.  I don't think dumping the mm_struct was all that valuable;
the call chain is what's important.

Link: https://lkml.kernel.org/r/20240327190701.1082560-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mmap_lock.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 8d38dcb6d044..de9dc20b01ba 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -60,16 +60,14 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
 
 #endif /* CONFIG_TRACING */
 
-static inline void mmap_assert_locked(struct mm_struct *mm)
+static inline void mmap_assert_locked(const struct mm_struct *mm)
 {
-	lockdep_assert_held(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+	rwsem_assert_held(&mm->mmap_lock);
 }
 
-static inline void mmap_assert_write_locked(struct mm_struct *mm)
+static inline void mmap_assert_write_locked(const struct mm_struct *mm)
 {
-	lockdep_assert_held_write(&mm->mmap_lock);
-	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
+	rwsem_assert_held_write(&mm->mmap_lock);
 }
 
 #ifdef CONFIG_PER_VMA_LOCK
-- 
cgit v1.2.3


From 07db63a2161a66a2198bc20c7580dd95af887ff4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 27 Mar 2024 18:54:29 +0000
Subject: filemap: remove __set_page_dirty()

All callers have been converted to use folios; remove this wrapper.

Link: https://lkml.kernel.org/r/20240327185447.1076689-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 35636e67e2e1..ec8b83928c30 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1147,11 +1147,6 @@ void folio_end_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
 void folio_wait_stable(struct folio *folio);
 void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
-static inline void __set_page_dirty(struct page *page,
-		struct address_space *mapping, int warn)
-{
-	__folio_mark_dirty(page_folio(page), mapping, warn);
-}
 void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
 void __folio_cancel_dirty(struct folio *folio);
 static inline void folio_cancel_dirty(struct folio *folio)
-- 
cgit v1.2.3


From 3b612c8f069fecb84af277c8b90e14832749ceca Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 27 Mar 2024 15:35:48 +0100
Subject: mm: optimize CONFIG_PER_VMA_LOCK member placement in vm_area_struct

Currently, we end up wasting some memory in each vm_area_struct. Pahole
states that:
	[...]
	int                        vm_lock_seq;          /*    40     4 */

	/* XXX 4 bytes hole, try to pack */

	struct vma_lock *          vm_lock;              /*    48     8 */
	bool                       detached;             /*    56     1 */

	/* XXX 7 bytes hole, try to pack */
	[...]

Let's reduce the holes and memory wastage by moving the bool:
	[...]
	bool                       detached;             /*    40     1 */

	/* XXX 3 bytes hole, try to pack */

	int                        vm_lock_seq;          /*    44     4 */
	struct vma_lock *          vm_lock;              /*    48     8 */
	[...]

Effectively shrinking the vm_area_struct with CONFIG_PER_VMA_LOCK by
8 byte.

Likely, we could place "detached" in the lowest bit of vm_lock, but at
least on 64bit that won't really make a difference, so keep it simple.

Link: https://lkml.kernel.org/r/20240327143548.744070-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6fef56938a17..fee6561feb7e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -671,6 +671,9 @@ struct vm_area_struct {
 	};
 
 #ifdef CONFIG_PER_VMA_LOCK
+	/* Flag to indicate areas detached from the mm->mm_mt tree */
+	bool detached;
+
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 *  - mmap_lock (in write mode)
@@ -687,9 +690,6 @@ struct vm_area_struct {
 	 */
 	int vm_lock_seq;
 	struct vma_lock *vm_lock;
-
-	/* Flag to indicate areas detached from the mm->mm_mt tree */
-	bool detached;
 #endif
 
 	/*
-- 
cgit v1.2.3


From 82a616d0f33b77b1cadd0652efbe11874771320f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 27 Mar 2024 15:33:01 +0100
Subject: mm: remove "prot" parameter from move_pte()

The "prot" parameter is unused, and using it instead of what's stored in
that particular PTE would very likely be wrong.  Let's simply remove it.

Link: https://lkml.kernel.org/r/20240327143301.741807-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andreas Larsson <andreas@gaisler.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/sparc/include/asm/pgtable_64.h | 2 +-
 include/linux/pgtable.h             | 2 +-
 mm/mremap.c                         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 26efc9bb644a..3fe429d73a65 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -957,7 +957,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 #define __HAVE_ARCH_MOVE_PTE
-#define move_pte(pte, prot, old_addr, new_addr)				\
+#define move_pte(pte, old_addr, new_addr)				\
 ({									\
 	pte_t newpte = (pte);						\
 	if (tlb_type != hypervisor && pte_present(pte)) {		\
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index f108c7a3c1d9..a3fc8150b047 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1079,7 +1079,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
 #endif
 
 #ifndef __HAVE_ARCH_MOVE_PTE
-#define move_pte(pte, prot, old_addr, new_addr)	(pte)
+#define move_pte(pte, old_addr, new_addr)	(pte)
 #endif
 
 #ifndef pte_accessible
diff --git a/mm/mremap.c b/mm/mremap.c
index 38d98465f3d8..f5aba752d35f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -205,7 +205,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		 */
 		if (pte_present(pte))
 			force_flush = true;
-		pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
+		pte = move_pte(pte, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
-- 
cgit v1.2.3


From 08b8247ebd2b70367ab3a87657efb237662bd8f6 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Wed, 27 Mar 2024 22:30:08 +0800
Subject: mm: remove __set_page_dirty_nobuffers()

There are no more callers of __set_page_dirty_nobuffers(), remove it.

Link: https://lkml.kernel.org/r/20240327143008.3739435-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 1 -
 mm/folio-compat.c       | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ec8b83928c30..65e332df4d76 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1158,7 +1158,6 @@ static inline void folio_cancel_dirty(struct folio *folio)
 bool folio_clear_dirty_for_io(struct folio *folio);
 bool clear_page_dirty_for_io(struct page *page);
 void folio_invalidate(struct folio *folio, size_t offset, size_t length);
-int __set_page_dirty_nobuffers(struct page *page);
 bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);
 
 #ifdef CONFIG_MIGRATION
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 50412014f16f..f31e0ce65b11 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -58,12 +58,6 @@ bool set_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty);
 
-int __set_page_dirty_nobuffers(struct page *page)
-{
-	return filemap_dirty_folio(page_mapping(page), page_folio(page));
-}
-EXPORT_SYMBOL(__set_page_dirty_nobuffers);
-
 bool clear_page_dirty_for_io(struct page *page)
 {
 	return folio_clear_dirty_for_io(page_folio(page));
-- 
cgit v1.2.3


From 529ce23a764f25d172198b4c6ba90f1e2ad17f93 Mon Sep 17 00:00:00 2001
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Date: Mon, 25 Mar 2024 19:16:44 -0700
Subject: mm: switch mm->get_unmapped_area() to a flag

The mm_struct contains a function pointer *get_unmapped_area(), which is
set to either arch_get_unmapped_area() or arch_get_unmapped_area_topdown()
during the initialization of the mm.

Since the function pointer only ever points to two functions that are
named the same across all arch's, a function pointer is not really
required.  In addition future changes will want to add versions of the
functions that take additional arguments.  So to save a pointers worth of
bytes in mm_struct, and prevent adding additional function pointers to
mm_struct in future changes, remove it and keep the information about
which get_unmapped_area() to use in a flag.

Add the new flag to MMF_INIT_MASK so it doesn't get clobbered on fork by
mmf_init_flags().  Most MM flags get clobbered on fork.  In the
pre-existing behavior mm->get_unmapped_area() would get copied to the new
mm in dup_mm(), so not clobbering the flag preserves the existing behavior
around inheriting the topdown-ness.

Introduce a helper, mm_get_unmapped_area(), to easily convert code that
refers to the old function pointer to instead select and call either
arch_get_unmapped_area() or arch_get_unmapped_area_topdown() based on the
flag.  Then drop the mm->get_unmapped_area() function pointer.  Leave the
get_unmapped_area() pointer in struct file_operations alone.  The main
purpose of this change is to reorganize in preparation for future changes,
but it also converts the calls of mm->get_unmapped_area() from indirect
branches into a direct ones.

The stress-ng bigheap benchmark calls realloc a lot, which calls through
get_unmapped_area() in the kernel.  On x86, the change yielded a ~1%
improvement there on a retpoline config.

In testing a few x86 configs, removing the pointer unfortunately didn't
result in any actual size reductions in the compiled layout of mm_struct.
But depending on compiler or arch alignment requirements, the change could
shrink the size of mm_struct.

Link: https://lkml.kernel.org/r/20240326021656.202649-3-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/mm/hugetlbpage.c       |  2 +-
 arch/s390/mm/mmap.c              |  4 ++--
 arch/sparc/kernel/sys_sparc_64.c | 15 ++++++---------
 arch/sparc/mm/hugetlbpage.c      |  2 +-
 arch/x86/kernel/cpu/sgx/driver.c |  2 +-
 arch/x86/mm/hugetlbpage.c        |  2 +-
 arch/x86/mm/mmap.c               |  4 ++--
 drivers/char/mem.c               |  2 +-
 drivers/dax/device.c             |  6 +++---
 fs/hugetlbfs/inode.c             |  4 ++--
 fs/proc/inode.c                  |  3 ++-
 fs/ramfs/file-mmu.c              |  2 +-
 include/linux/mm_types.h         |  6 +-----
 include/linux/sched/coredump.h   |  5 ++++-
 include/linux/sched/mm.h         |  5 +++++
 io_uring/io_uring.c              |  2 +-
 kernel/bpf/arena.c               |  2 +-
 kernel/bpf/syscall.c             |  2 +-
 mm/debug.c                       |  6 ------
 mm/huge_memory.c                 |  9 ++++-----
 mm/mmap.c                        | 21 ++++++++++++++++++---
 mm/shmem.c                       | 11 +++++------
 mm/util.c                        |  6 +++---
 23 files changed, 66 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index ca43b6fce71c..7d948e243f4b 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -318,7 +318,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 			goto check_asce_limit;
 	}
 
-	if (mm->get_unmapped_area == arch_get_unmapped_area)
+	if (!test_bit(MMF_TOPDOWN, &mm->flags))
 		addr = hugetlb_get_unmapped_area_bottomup(file, addr, len,
 				pgoff, flags);
 	else
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index b14fc0887654..6b2e4436ad4a 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -185,10 +185,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 	 */
 	if (mmap_is_legacy(rlim_stack)) {
 		mm->mmap_base = mmap_base_legacy(random_factor);
-		mm->get_unmapped_area = arch_get_unmapped_area;
+		clear_bit(MMF_TOPDOWN, &mm->flags);
 	} else {
 		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		set_bit(MMF_TOPDOWN, &mm->flags);
 	}
 }
 
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 1e9a9e016237..1dbf7211666e 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -218,14 +218,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
 	unsigned long align_goal, addr = -ENOMEM;
-	unsigned long (*get_area)(struct file *, unsigned long,
-				  unsigned long, unsigned long, unsigned long);
-
-	get_area = current->mm->get_unmapped_area;
 
 	if (flags & MAP_FIXED) {
 		/* Ok, don't mess with it. */
-		return get_area(NULL, orig_addr, len, pgoff, flags);
+		return mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
 	}
 	flags &= ~MAP_SHARED;
 
@@ -238,7 +234,8 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 		align_goal = (64UL * 1024);
 
 	do {
-		addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags);
+		addr = mm_get_unmapped_area(current->mm, NULL, orig_addr,
+					    len + (align_goal - PAGE_SIZE), pgoff, flags);
 		if (!(addr & ~PAGE_MASK)) {
 			addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL);
 			break;
@@ -256,7 +253,7 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 	 * be obtained.
 	 */
 	if (addr & ~PAGE_MASK)
-		addr = get_area(NULL, orig_addr, len, pgoff, flags);
+		addr = mm_get_unmapped_area(current->mm, NULL, orig_addr, len, pgoff, flags);
 
 	return addr;
 }
@@ -292,7 +289,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 	    gap == RLIM_INFINITY ||
 	    sysctl_legacy_va_layout) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
+		clear_bit(MMF_TOPDOWN, &mm->flags);
 	} else {
 		/* We know it's 32-bit */
 		unsigned long task_size = STACK_TOP32;
@@ -303,7 +300,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 			gap = (task_size / 6 * 5);
 
 		mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		set_bit(MMF_TOPDOWN, &mm->flags);
 	}
 }
 
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 8ed5bdf95d25..c23012e3a353 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -123,7 +123,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
-	if (mm->get_unmapped_area == arch_get_unmapped_area)
+	if (!test_bit(MMF_TOPDOWN, &mm->flags))
 		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
 				pgoff, flags);
 	else
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 262f5fb18d74..22b65a5f5ec6 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -113,7 +113,7 @@ static unsigned long sgx_get_unmapped_area(struct file *file,
 	if (flags & MAP_FIXED)
 		return addr;
 
-	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index dab6db288e5b..06ca9a60bac2 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -115,7 +115,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	}
 
 get_unmapped_area:
-	if (mm->get_unmapped_area == arch_get_unmapped_area)
+	if (!test_bit(MMF_TOPDOWN, &mm->flags))
 		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
 				pgoff, flags);
 	else
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..a2cabb1c81e1 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -129,9 +129,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 {
 	if (mmap_is_legacy())
-		mm->get_unmapped_area = arch_get_unmapped_area;
+		clear_bit(MMF_TOPDOWN, &mm->flags);
 	else
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		set_bit(MMF_TOPDOWN, &mm->flags);
 
 	arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
 			arch_rnd(mmap64_rnd_bits), task_size_64bit(0),
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 3c6670cf905f..9b80e622ae80 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -544,7 +544,7 @@ static unsigned long get_unmapped_area_zero(struct file *file,
 	}
 
 	/* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */
-	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
 #else
 	return -ENOSYS;
 #endif
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 93ebedc5ec8c..47c126d37b59 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -329,14 +329,14 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
 	if ((off + len_align) < off)
 		goto out;
 
-	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
-			pgoff, flags);
+	addr_align = mm_get_unmapped_area(current->mm, filp, addr, len_align,
+					  pgoff, flags);
 	if (!IS_ERR_VALUE(addr_align)) {
 		addr_align += (off - addr_align) & (align - 1);
 		return addr_align;
 	}
  out:
-	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
 }
 
 static const struct address_space_operations dev_dax_aops = {
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6502c7e776d1..3dee18bf47ed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -249,11 +249,11 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	}
 
 	/*
-	 * Use mm->get_unmapped_area value as a hint to use topdown routine.
+	 * Use MMF_TOPDOWN flag as a hint to use topdown routine.
 	 * If architectures have special needs, they should define their own
 	 * version of hugetlb_get_unmapped_area.
 	 */
-	if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
+	if (test_bit(MMF_TOPDOWN, &mm->flags))
 		return hugetlb_get_unmapped_area_topdown(file, addr, len,
 				pgoff, flags);
 	return hugetlb_get_unmapped_area_bottomup(file, addr, len,
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 75396a24fd8c..d19434e2a58e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -455,8 +455,9 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo
 		return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags);
 
 #ifdef CONFIG_MMU
-	return current->mm->get_unmapped_area(file, orig_addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags);
 #endif
+
 	return orig_addr;
 }
 
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index c7a1aa3c882b..b45c7edc3225 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
 		unsigned long addr, unsigned long len, unsigned long pgoff,
 		unsigned long flags)
 {
-	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
 }
 
 const struct file_operations ramfs_file_operations = {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fee6561feb7e..fa0d6995706f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -777,11 +777,7 @@ struct mm_struct {
 		} ____cacheline_aligned_in_smp;
 
 		struct maple_tree mm_mt;
-#ifdef CONFIG_MMU
-		unsigned long (*get_unmapped_area) (struct file *filp,
-				unsigned long addr, unsigned long len,
-				unsigned long pgoff, unsigned long flags);
-#endif
+
 		unsigned long mmap_base;	/* base of mmap area */
 		unsigned long mmap_legacy_base;	/* base of mmap area in bottom-up allocations */
 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 02f5090ffea2..e62ff805cfc9 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -92,9 +92,12 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_VM_MERGE_ANY	30
 #define MMF_VM_MERGE_ANY_MASK	(1 << MMF_VM_MERGE_ANY)
 
+#define MMF_TOPDOWN		31	/* mm searches top down by default */
+#define MMF_TOPDOWN_MASK	(1 << MMF_TOPDOWN)
+
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
-				 MMF_VM_MERGE_ANY_MASK)
+				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
 
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index b6543f9d78d6..ed1caa26c8be 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -8,6 +8,7 @@
 #include <linux/mm_types.h>
 #include <linux/gfp.h>
 #include <linux/sync_core.h>
+#include <linux/sched/coredump.h>
 
 /*
  * Routines for handling mm_structs
@@ -186,6 +187,10 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
 			  unsigned long flags);
 
+unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
+				   unsigned long addr, unsigned long len,
+				   unsigned long pgoff, unsigned long flags);
+
 unsigned long
 generic_get_unmapped_area(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c170a2b8d2cf..d5edfb8444d7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3525,7 +3525,7 @@ static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
 #else
 	addr = 0UL;
 #endif
-	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
 }
 
 #else /* !CONFIG_MMU */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 343c3456c8dd..16dbf4f6b77f 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -314,7 +314,7 @@ static unsigned long arena_get_unmapped_area(struct file *filp, unsigned long ad
 			return -EINVAL;
 	}
 
-	ret = current->mm->get_unmapped_area(filp, addr, len * 2, 0, flags);
+	ret = mm_get_unmapped_area(current->mm, filp, addr, len * 2, 0, flags);
 	if (IS_ERR_VALUE(ret))
 		return ret;
 	if ((ret >> 32) == ((ret + len - 1) >> 32))
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c287925471f6..9cb89e875f0d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -980,7 +980,7 @@ static unsigned long bpf_get_unmapped_area(struct file *filp, unsigned long addr
 	if (map->ops->map_get_unmapped_area)
 		return map->ops->map_get_unmapped_area(filp, addr, len, pgoff, flags);
 #ifdef CONFIG_MMU
-	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
 #else
 	return addr;
 #endif
diff --git a/mm/debug.c b/mm/debug.c
index e8a96b8b7197..b71186f1fb0b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -177,9 +177,6 @@ EXPORT_SYMBOL(dump_vma);
 void dump_mm(const struct mm_struct *mm)
 {
 	pr_emerg("mm %px task_size %lu\n"
-#ifdef CONFIG_MMU
-		"get_unmapped_area %px\n"
-#endif
 		"mmap_base %lu mmap_legacy_base %lu\n"
 		"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
@@ -205,9 +202,6 @@ void dump_mm(const struct mm_struct *mm)
 		"def_flags: %#lx(%pGv)\n",
 
 		mm, mm->task_size,
-#ifdef CONFIG_MMU
-		mm->get_unmapped_area,
-#endif
 		mm->mmap_base, mm->mmap_legacy_base,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 157cee64850c..7fdb2275a1e0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -816,8 +816,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 	if (len_pad < len || (off + len_pad) < off)
 		return 0;
 
-	ret = current->mm->get_unmapped_area(filp, addr, len_pad,
-					      off >> PAGE_SHIFT, flags);
+	ret = mm_get_unmapped_area(current->mm, filp, addr, len_pad,
+				   off >> PAGE_SHIFT, flags);
 
 	/*
 	 * The failure might be due to length padding. The caller will retry
@@ -835,8 +835,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 
 	off_sub = (off - ret) & (size - 1);
 
-	if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
-	    !off_sub)
+	if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
 		return ret + size;
 
 	ret += off_sub;
@@ -853,7 +852,7 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (ret)
 		return ret;
 
-	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 77a625e13ec1..dfb8a518e8c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1812,7 +1812,8 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
-				  unsigned long, unsigned long, unsigned long);
+				  unsigned long, unsigned long, unsigned long)
+				  = NULL;
 
 	unsigned long error = arch_mmap_check(addr, len, flags);
 	if (error)
@@ -1822,7 +1823,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	get_area = current->mm->get_unmapped_area;
 	if (file) {
 		if (file->f_op->get_unmapped_area)
 			get_area = file->f_op->get_unmapped_area;
@@ -1841,7 +1841,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (!file)
 		pgoff = 0;
 
-	addr = get_area(file, addr, len, pgoff, flags);
+	if (get_area)
+		addr = get_area(file, addr, len, pgoff, flags);
+	else
+		addr = mm_get_unmapped_area(current->mm, file, addr, len,
+					    pgoff, flags);
 	if (IS_ERR_VALUE(addr))
 		return addr;
 
@@ -1856,6 +1860,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 
 EXPORT_SYMBOL(get_unmapped_area);
 
+unsigned long
+mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
+		     unsigned long addr, unsigned long len,
+		     unsigned long pgoff, unsigned long flags)
+{
+	if (test_bit(MMF_TOPDOWN, &mm->flags))
+		return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags);
+	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
+}
+EXPORT_SYMBOL(mm_get_unmapped_area);
+
 /**
  * find_vma_intersection() - Look up the first VMA which intersects the interval
  * @mm: The process address space.
diff --git a/mm/shmem.c b/mm/shmem.c
index 98985179f495..fa2a0ed97507 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2267,8 +2267,6 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 				      unsigned long uaddr, unsigned long len,
 				      unsigned long pgoff, unsigned long flags)
 {
-	unsigned long (*get_area)(struct file *,
-		unsigned long, unsigned long, unsigned long, unsigned long);
 	unsigned long addr;
 	unsigned long offset;
 	unsigned long inflated_len;
@@ -2278,8 +2276,8 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 	if (len > TASK_SIZE)
 		return -ENOMEM;
 
-	get_area = current->mm->get_unmapped_area;
-	addr = get_area(file, uaddr, len, pgoff, flags);
+	addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
+				    flags);
 
 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
 		return addr;
@@ -2336,7 +2334,8 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 	if (inflated_len < len)
 		return addr;
 
-	inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
+	inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
+					     inflated_len, 0, flags);
 	if (IS_ERR_VALUE(inflated_addr))
 		return addr;
 	if (inflated_addr & ~PAGE_MASK)
@@ -4801,7 +4800,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 				      unsigned long addr, unsigned long len,
 				      unsigned long pgoff, unsigned long flags)
 {
-	return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+	return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
 }
 #endif
 
diff --git a/mm/util.c b/mm/util.c
index a9e911b22b99..c9e519e6811f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -469,17 +469,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 
 	if (mmap_is_legacy(rlim_stack)) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
+		clear_bit(MMF_TOPDOWN, &mm->flags);
 	} else {
 		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		set_bit(MMF_TOPDOWN, &mm->flags);
 	}
 }
 #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
 void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
 {
 	mm->mmap_base = TASK_UNMAPPED_BASE;
-	mm->get_unmapped_area = arch_get_unmapped_area;
+	clear_bit(MMF_TOPDOWN, &mm->flags);
 }
 #endif
 
-- 
cgit v1.2.3


From 961148704acd814a4a5acaa4687a45e2315dd857 Mon Sep 17 00:00:00 2001
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Date: Mon, 25 Mar 2024 19:16:45 -0700
Subject: mm: introduce arch_get_unmapped_area_vmflags()

When memory is being placed, mmap() will take care to respect the guard
gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and
VM_GROWSDOWN).  In order to ensure guard gaps between mappings, mmap()
needs to consider two things:

 1. That the new mapping isn't placed in an any existing mappings guard
    gaps.
 2. That the new mapping isn't placed such that any existing mappings
    are not in *its* guard gaps.

The longstanding behavior of mmap() is to ensure 1, but not take any care
around 2.  So for example, if there is a PAGE_SIZE free area, and a mmap()
with a PAGE_SIZE size, and a type that has a guard gap is being placed,
mmap() may place the shadow stack in the PAGE_SIZE free area.  Then the
mapping that is supposed to have a guard gap will not have a gap to the
adjacent VMA.

In order to take the start gap into account, the maple tree search needs
to know the size of start gap the new mapping will need.  The call chain
from do_mmap() to the actual maple tree search looks like this:

do_mmap(size, vm_flags, map_flags, ..)
	mm/mmap.c:get_unmapped_area(size, map_flags, ...)
		arch_get_unmapped_area(size, map_flags, ...)
			vm_unmapped_area(struct vm_unmapped_area_info)

One option would be to add another MAP_ flag to mean a one page start gap
(as is for shadow stack), but this consumes a flag unnecessarily.  Another
option could be to simply increase the size passed in do_mmap() by the
start gap size, and adjust after the fact, but this will interfere with
the alignment requirements passed in struct vm_unmapped_area_info, and
unknown to mmap.c.  Instead, introduce variants of
arch_get_unmapped_area/_topdown() that take vm_flags.  In future changes,
these variants can be used in mmap.c:get_unmapped_area() to allow the
vm_flags to be passed through to vm_unmapped_area(), while preserving the
normal arch_get_unmapped_area/_topdown() for the existing callers.

Link: https://lkml.kernel.org/r/20240326021656.202649-4-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched/mm.h | 17 +++++++++++++++++
 mm/mmap.c                | 28 ++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include')

diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index ed1caa26c8be..91546493c43d 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -191,6 +191,23 @@ unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
 				   unsigned long addr, unsigned long len,
 				   unsigned long pgoff, unsigned long flags);
 
+unsigned long
+arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+			       unsigned long len, unsigned long pgoff,
+			       unsigned long flags, vm_flags_t vm_flags);
+unsigned long
+arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
+				       unsigned long len, unsigned long pgoff,
+				       unsigned long flags, vm_flags_t);
+
+unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
+					   struct file *filp,
+					   unsigned long addr,
+					   unsigned long len,
+					   unsigned long pgoff,
+					   unsigned long flags,
+					   vm_flags_t vm_flags);
+
 unsigned long
 generic_get_unmapped_area(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
diff --git a/mm/mmap.c b/mm/mmap.c
index dfb8a518e8c9..b33055d1561b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1807,6 +1807,34 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 }
 #endif
 
+#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS
+unsigned long
+arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
+			       unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
+{
+	return arch_get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+unsigned long
+arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
+				       unsigned long len, unsigned long pgoff,
+				       unsigned long flags, vm_flags_t vm_flags)
+{
+	return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
+}
+#endif
+
+unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
+					   unsigned long addr, unsigned long len,
+					   unsigned long pgoff, unsigned long flags,
+					   vm_flags_t vm_flags)
+{
+	if (test_bit(MMF_TOPDOWN, &mm->flags))
+		return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff,
+							      flags, vm_flags);
+	return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags);
+}
+
 unsigned long
 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		unsigned long pgoff, unsigned long flags)
-- 
cgit v1.2.3


From 8a0fe564bb2e744f21ca94e3eabe96d232cf5f41 Mon Sep 17 00:00:00 2001
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Date: Mon, 25 Mar 2024 19:16:47 -0700
Subject: mm: use get_unmapped_area_vmflags()

When memory is being placed, mmap() will take care to respect the guard
gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and
VM_GROWSDOWN).  In order to ensure guard gaps between mappings, mmap()
needs to consider two things:

 1. That the new mapping isn't placed in an any existing mappings guard
    gaps.
 2. That the new mapping isn't placed such that any existing mappings
    are not in *its* guard gaps.

The long standing behavior of mmap() is to ensure 1, but not take any care
around 2.  So for example, if there is a PAGE_SIZE free area, and a mmap()
with a PAGE_SIZE size, and a type that has a guard gap is being placed,
mmap() may place the shadow stack in the PAGE_SIZE free area.  Then the
mapping that is supposed to have a guard gap will not have a gap to the
adjacent VMA.

Use mm_get_unmapped_area_vmflags() in the do_mmap() so future changes can
cause shadow stack mappings to be placed with a guard gap.  Also use the
THP variant that takes vm_flags, such that THP shadow stack can get the
same treatment.  Adjust the vm_flags calculation to happen earlier so that
the vm_flags can be passed into __get_unmapped_area().

Link: https://lkml.kernel.org/r/20240326021656.202649-6-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 11 ++++++++++-
 mm/mmap.c          | 32 ++++++++++++++++----------------
 2 files changed, 26 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dda0dbcae14..1b6903f4c57b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3413,7 +3413,16 @@ extern int install_special_mapping(struct mm_struct *mm,
 unsigned long randomize_stack_top(unsigned long stack_top);
 unsigned long randomize_page(unsigned long start, unsigned long range);
 
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+unsigned long
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);
+
+static inline unsigned long
+get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		  unsigned long pgoff, unsigned long flags)
+{
+	return __get_unmapped_area(file, addr, len, pgoff, flags, 0);
+}
 
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
diff --git a/mm/mmap.c b/mm/mmap.c
index 30baba087c65..a37c8dd29d64 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1255,18 +1255,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
-	/* Obtain the address to map to. we verify (or select) it and ensure
-	 * that it represents a valid section of the address space.
-	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
-	if (IS_ERR_VALUE(addr))
-		return addr;
-
-	if (flags & MAP_FIXED_NOREPLACE) {
-		if (find_vma_intersection(mm, addr, addr + len))
-			return -EEXIST;
-	}
-
 	if (prot == PROT_EXEC) {
 		pkey = execute_only_pkey(mm);
 		if (pkey < 0)
@@ -1280,6 +1268,18 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
+	/* Obtain the address to map to. we verify (or select) it and ensure
+	 * that it represents a valid section of the address space.
+	 */
+	addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
+	if (IS_ERR_VALUE(addr))
+		return addr;
+
+	if (flags & MAP_FIXED_NOREPLACE) {
+		if (find_vma_intersection(mm, addr, addr + len))
+			return -EEXIST;
+	}
+
 	if (flags & MAP_LOCKED)
 		if (!can_do_mlock())
 			return -EPERM;
@@ -1836,8 +1836,8 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
 }
 
 unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
+__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long)
@@ -1872,8 +1872,8 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (get_area)
 		addr = get_area(file, addr, len, pgoff, flags);
 	else
-		addr = mm_get_unmapped_area(current->mm, file, addr, len,
-					    pgoff, flags);
+		addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
+						    pgoff, flags, vm_flags);
 	if (IS_ERR_VALUE(addr))
 		return addr;
 
-- 
cgit v1.2.3


From ed48e87c7df3651accfa3088076830727b070f54 Mon Sep 17 00:00:00 2001
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Date: Mon, 25 Mar 2024 19:16:48 -0700
Subject: thp: add thp_get_unmapped_area_vmflags()

When memory is being placed, mmap() will take care to respect the guard
gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and
VM_GROWSDOWN).  In order to ensure guard gaps between mappings, mmap()
needs to consider two things:

 1. That the new mapping isn't placed in an any existing mappings guard
    gaps.
 2. That the new mapping isn't placed such that any existing mappings
    are not in *its* guard gaps.

The longstanding behavior of mmap() is to ensure 1, but not take any care
around 2.  So for example, if there is a PAGE_SIZE free area, and a mmap()
with a PAGE_SIZE size, and a type that has a guard gap is being placed,
mmap() may place the shadow stack in the PAGE_SIZE free area.  Then the
mapping that is supposed to have a guard gap will not have a gap to the
adjacent VMA.

Add a THP implementations of the vm_flags variant of get_unmapped_area().
Future changes will call this from mmap.c in the do_mmap() path to allow
shadow stacks to be placed with consideration taken for the start guard
gap.  Shadow stack memory is always private and anonymous and so special
guard gap logic is not needed in a lot of caseis, but it can be mapped by
THP, so needs to be handled.

Link: https://lkml.kernel.org/r/20240326021656.202649-7-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 11 +++++++++++
 mm/huge_memory.c        | 23 ++++++++++++++++-------
 mm/mmap.c               | 12 +++++++-----
 3 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ab26d9e65ec3..e896ca4760f6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -270,6 +270,9 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 
 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
+unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags,
+		vm_flags_t vm_flags);
 
 bool can_split_folio(struct folio *folio, int *pextra_pins);
 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
@@ -413,6 +416,14 @@ static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 
 #define thp_get_unmapped_area	NULL
 
+static inline unsigned long
+thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+			      unsigned long len, unsigned long pgoff,
+			      unsigned long flags, vm_flags_t vm_flags)
+{
+	return 0;
+}
+
 static inline bool
 can_split_folio(struct folio *folio, int *pextra_pins)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7fdb2275a1e0..b757d9be310c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -800,7 +800,8 @@ static inline bool is_transparent_hugepage(const struct folio *folio)
 
 static unsigned long __thp_get_unmapped_area(struct file *filp,
 		unsigned long addr, unsigned long len,
-		loff_t off, unsigned long flags, unsigned long size)
+		loff_t off, unsigned long flags, unsigned long size,
+		vm_flags_t vm_flags)
 {
 	loff_t off_end = off + len;
 	loff_t off_align = round_up(off, size);
@@ -816,8 +817,8 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 	if (len_pad < len || (off + len_pad) < off)
 		return 0;
 
-	ret = mm_get_unmapped_area(current->mm, filp, addr, len_pad,
-				   off >> PAGE_SHIFT, flags);
+	ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
+					   off >> PAGE_SHIFT, flags, vm_flags);
 
 	/*
 	 * The failure might be due to length padding. The caller will retry
@@ -842,17 +843,25 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
 	return ret;
 }
 
-unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
-		unsigned long len, unsigned long pgoff, unsigned long flags)
+unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags,
+		vm_flags_t vm_flags)
 {
 	unsigned long ret;
 	loff_t off = (loff_t)pgoff << PAGE_SHIFT;
 
-	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
+	ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE, vm_flags);
 	if (ret)
 		return ret;
 
-	return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags);
+	return mm_get_unmapped_area_vmflags(current->mm, filp, addr, len, pgoff, flags,
+					    vm_flags);
+}
+
+unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index a37c8dd29d64..0ef1191e3be5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1860,20 +1860,22 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 		 * so use shmem's get_unmapped_area in case it can be huge.
 		 */
 		get_area = shmem_get_unmapped_area;
-	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
-		/* Ensures that larger anonymous mappings are THP aligned. */
-		get_area = thp_get_unmapped_area;
 	}
 
 	/* Always treat pgoff as zero for anonymous memory. */
 	if (!file)
 		pgoff = 0;
 
-	if (get_area)
+	if (get_area) {
 		addr = get_area(file, addr, len, pgoff, flags);
-	else
+	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+		/* Ensures that larger anonymous mappings are THP aligned. */
+		addr = thp_get_unmapped_area_vmflags(file, addr, len,
+						     pgoff, flags, vm_flags);
+	} else {
 		addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
 						    pgoff, flags, vm_flags);
+	}
 	if (IS_ERR_VALUE(addr))
 		return addr;
 
-- 
cgit v1.2.3


From 44bd7ace9fd6490dce3b001b21d5c6bac869171b Mon Sep 17 00:00:00 2001
From: Rick Edgecombe <rick.p.edgecombe@intel.com>
Date: Mon, 25 Mar 2024 19:16:53 -0700
Subject: mm: take placement mappings gap into account

When memory is being placed, mmap() will take care to respect the guard
gaps of certain types of memory (VM_SHADOWSTACK, VM_GROWSUP and
VM_GROWSDOWN).  In order to ensure guard gaps between mappings, mmap()
needs to consider two things:

 1. That the new mapping isn't placed in an any existing mappings guard
    gaps.
 2. That the new mapping isn't placed such that any existing mappings
    are not in *its* guard gaps.

The longstanding behavior of mmap() is to ensure 1, but not take any care
around 2.  So for example, if there is a PAGE_SIZE free area, and a mmap()
with a PAGE_SIZE size, and a type that has a guard gap is being placed,
mmap() may place the shadow stack in the PAGE_SIZE free area.  Then the
mapping that is supposed to have a guard gap will not have a gap to the
adjacent VMA.

For MAP_GROWSDOWN/VM_GROWSDOWN and MAP_GROWSUP/VM_GROWSUP this has not
been a problem in practice because applications place these kinds of
mappings very early, when there is not many mappings to find a space
between.  But for shadow stacks, they may be placed throughout the
lifetime of the application.

Use the start_gap field to find a space that includes the guard gap for
the new mapping.  Take care to not interfere with the alignment.

Link: https://lkml.kernel.org/r/20240326021656.202649-12-rick.p.edgecombe@intel.com
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  1 +
 mm/mmap.c          | 12 +++++++++---
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1b6903f4c57b..2d5e492ef57f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3468,6 +3468,7 @@ struct vm_unmapped_area_info {
 	unsigned long high_limit;
 	unsigned long align_mask;
 	unsigned long align_offset;
+	unsigned long start_gap;
 };
 
 extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);
diff --git a/mm/mmap.c b/mm/mmap.c
index 4ad386f3f63f..d3c2ca8efa53 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1579,7 +1579,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
 	VMA_ITERATOR(vmi, current->mm, 0);
 
 	/* Adjust search length to account for worst case alignment overhead */
-	length = info->length + info->align_mask;
+	length = info->length + info->align_mask + info->start_gap;
 	if (length < info->length)
 		return -ENOMEM;
 
@@ -1591,7 +1591,13 @@ retry:
 	if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
 		return -ENOMEM;
 
-	gap = vma_iter_addr(&vmi);
+	/*
+	 * Adjust for the gap first so it doesn't interfere with the
+	 * later alignment. The first step is the minimum needed to
+	 * fulill the start gap, the next steps is the minimum to align
+	 * that. It is the minimum needed to fulill both.
+	 */
+	gap = vma_iter_addr(&vmi) + info->start_gap;
 	gap += (info->align_offset - gap) & info->align_mask;
 	tmp = vma_next(&vmi);
 	if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
@@ -1630,7 +1636,7 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
 	VMA_ITERATOR(vmi, current->mm, 0);
 
 	/* Adjust search length to account for worst case alignment overhead */
-	length = info->length + info->align_mask;
+	length = info->length + info->align_mask + info->start_gap;
 	if (length < info->length)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 3a9e567ca45fb5280065283d10d9a11f0db61d2b Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang@huawei.com>
Date: Thu, 28 Mar 2024 19:10:08 +0800
Subject: mm/ksm: fix ksm exec support for prctl

Patch series "mm/ksm: fix ksm exec support for prctl", v4.

commit 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") inherits
MMF_VM_MERGE_ANY flag when a task calls execve().  However, it doesn't
create the mm_slot, so ksmd will not try to scan this task.  The first
patch fixes the issue.

The second patch refactors to prepare for the third patch.  The third
patch extends the selftests of ksm to verfity the deduplication really
happens after fork/exec inherits ths KSM setting.


This patch (of 3):

commit 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") inherits
MMF_VM_MERGE_ANY flag when a task calls execve().  Howerver, it doesn't
create the mm_slot, so ksmd will not try to scan this task.

To fix it, allocate and add the mm_slot to ksm_mm_head in __bprm_mm_init()
when the mm has MMF_VM_MERGE_ANY flag.

Link: https://lkml.kernel.org/r/20240328111010.1502191-1-tujinjiang@huawei.com
Link: https://lkml.kernel.org/r/20240328111010.1502191-2-tujinjiang@huawei.com
Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl")
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Stefan Roesch <shr@devkernel.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/exec.c           | 11 +++++++++++
 include/linux/ksm.h | 13 +++++++++++++
 2 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index cf1df7f16e55..0c5f06d08c35 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -67,6 +67,7 @@
 #include <linux/time_namespace.h>
 #include <linux/user_events.h>
 #include <linux/rseq.h>
+#include <linux/ksm.h>
 
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -267,6 +268,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 		goto err_free;
 	}
 
+	/*
+	 * Need to be called with mmap write lock
+	 * held, to avoid race with ksmd.
+	 */
+	err = ksm_execve(mm);
+	if (err)
+		goto err_ksm;
+
 	/*
 	 * Place the stack at the largest stack address the architecture
 	 * supports. Later, we'll move this to an appropriate place. We don't
@@ -288,6 +297,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	bprm->p = vma->vm_end - sizeof(void *);
 	return 0;
 err:
+	ksm_exit(mm);
+err_ksm:
 	mmap_write_unlock(mm);
 err_free:
 	bprm->vma = NULL;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 401348e9f92b..7e2b1de3996a 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -59,6 +59,14 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 	return 0;
 }
 
+static inline int ksm_execve(struct mm_struct *mm)
+{
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		return __ksm_enter(mm);
+
+	return 0;
+}
+
 static inline void ksm_exit(struct mm_struct *mm)
 {
 	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
@@ -107,6 +115,11 @@ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 	return 0;
 }
 
+static inline int ksm_execve(struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void ksm_exit(struct mm_struct *mm)
 {
 }
-- 
cgit v1.2.3


From ba42b524a0408b5f92bd41edaee1ea84309ab9ae Mon Sep 17 00:00:00 2001
From: York Jasper Niebuhr <yjnworkstation@gmail.com>
Date: Fri, 29 Mar 2024 15:56:05 +0100
Subject: mm: init_mlocked_on_free_v3

Implements the "init_mlocked_on_free" boot option. When this boot option
is enabled, any mlock'ed pages are zeroed on free. If
the pages are munlock'ed beforehand, no initialization takes place.
This boot option is meant to combat the performance hit of
"init_on_free" as reported in commit 6471384af2a6 ("mm: security:
introduce init_on_alloc=1 and init_on_free=1 boot options"). With
"init_mlocked_on_free=1" only relevant data is freed while everything
else is left untouched by the kernel. Correspondingly, this patch
introduces no performance hit for unmapping non-mlock'ed memory. The
unmapping overhead for purely mlocked memory was measured to be
approximately 13%. Realistically, most systems mlock only a fraction of
the total memory so the real-world system overhead should be close to
zero.

Optimally, userspace programs clear any key material or other
confidential memory before exit and munlock the according memory
regions. If a program crashes, userspace key managers fail to do this
job. Accordingly, no munlock operations are performed so the data is
caught and zeroed by the kernel. Should the program not crash, all
memory will ideally be munlocked so no overhead is caused.

CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON can be set to enable
"init_mlocked_on_free" by default.

Link: https://lkml.kernel.org/r/20240329145605.149917-1-yjnworkstation@gmail.com
Signed-off-by: York Jasper Niebuhr <yjnworkstation@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: York Jasper Niebuhr <yjnworkstation@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  6 ++++
 include/linux/mm.h                              |  9 +++++-
 mm/internal.h                                   |  1 +
 mm/memory.c                                     |  6 ++++
 mm/mm_init.c                                    | 43 +++++++++++++++++++++----
 mm/page_alloc.c                                 |  2 +-
 security/Kconfig.hardening                      | 15 +++++++++
 7 files changed, 73 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 902ecd92a29f..3ff97de349da 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2148,6 +2148,12 @@
 			Format: 0 | 1
 			Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON.
 
+	init_mlocked_on_free=	[MM] Fill freed userspace memory with zeroes if
+				it was mlock'ed and not explicitly munlock'ed
+				afterwards.
+				Format: 0 | 1
+				Default set by CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON
+
 	init_pkru=	[X86] Specify the default memory protection keys rights
 			register contents for all processes.  0x55555554 by
 			default (disallow access to all but pkey 0).  Can
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2d5e492ef57f..4f4e460d7853 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3762,7 +3762,14 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 static inline bool want_init_on_free(void)
 {
 	return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
-				   &init_on_free);
+				&init_on_free);
+}
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+static inline bool want_init_mlocked_on_free(void)
+{
+	return static_branch_maybe(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON,
+				&init_mlocked_on_free);
 }
 
 extern bool _debug_pagealloc_enabled_early;
diff --git a/mm/internal.h b/mm/internal.h
index 6614ba4ca9de..cf7799e29391 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -506,6 +506,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order,
 extern void memblock_free_pages(struct page *page, unsigned long pfn,
 					unsigned int order);
 extern void __free_pages_core(struct page *page, unsigned int order);
+extern void kernel_init_pages(struct page *page, int numpages);
 
 /*
  * This will have no effect, other than possibly generating a warning, if the
diff --git a/mm/memory.c b/mm/memory.c
index 0b92336bcebd..80944acb5b4e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1506,6 +1506,12 @@ static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
 		if (unlikely(page_mapcount(page) < 0))
 			print_bad_pte(vma, addr, ptent, page);
 	}
+
+	if (want_init_mlocked_on_free() && folio_test_mlocked(folio) &&
+	    !delay_rmap && folio_test_anon(folio)) {
+		kernel_init_pages(page, folio_nr_pages(folio));
+	}
+
 	if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
 		*force_flush = true;
 		*force_break = true;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index d01912b8a597..2c8f3af4430f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2522,6 +2522,9 @@ EXPORT_SYMBOL(init_on_alloc);
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
 EXPORT_SYMBOL(init_on_free);
 
+DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON, init_mlocked_on_free);
+EXPORT_SYMBOL(init_mlocked_on_free);
+
 static bool _init_on_alloc_enabled_early __read_mostly
 				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
 static int __init early_init_on_alloc(char *buf)
@@ -2539,6 +2542,14 @@ static int __init early_init_on_free(char *buf)
 }
 early_param("init_on_free", early_init_on_free);
 
+static bool _init_mlocked_on_free_enabled_early __read_mostly
+				= IS_ENABLED(CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON);
+static int __init early_init_mlocked_on_free(char *buf)
+{
+	return kstrtobool(buf, &_init_mlocked_on_free_enabled_early);
+}
+early_param("init_mlocked_on_free", early_init_mlocked_on_free);
+
 DEFINE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);
 
 /*
@@ -2566,12 +2577,21 @@ static void __init mem_debugging_and_hardening_init(void)
 	}
 #endif
 
-	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) &&
+	if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early ||
+	    _init_mlocked_on_free_enabled_early) &&
 	    page_poisoning_requested) {
 		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
-			"will take precedence over init_on_alloc and init_on_free\n");
+			"will take precedence over init_on_alloc, init_on_free "
+			"and init_mlocked_on_free\n");
 		_init_on_alloc_enabled_early = false;
 		_init_on_free_enabled_early = false;
+		_init_mlocked_on_free_enabled_early = false;
+	}
+
+	if (_init_mlocked_on_free_enabled_early && _init_on_free_enabled_early) {
+		pr_info("mem auto-init: init_on_free is on, "
+			"will take precedence over init_mlocked_on_free\n");
+		_init_mlocked_on_free_enabled_early = false;
 	}
 
 	if (_init_on_alloc_enabled_early) {
@@ -2588,9 +2608,17 @@ static void __init mem_debugging_and_hardening_init(void)
 		static_branch_disable(&init_on_free);
 	}
 
-	if (IS_ENABLED(CONFIG_KMSAN) &&
-	    (_init_on_alloc_enabled_early || _init_on_free_enabled_early))
-		pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n");
+	if (_init_mlocked_on_free_enabled_early) {
+		want_check_pages = true;
+		static_branch_enable(&init_mlocked_on_free);
+	} else {
+		static_branch_disable(&init_mlocked_on_free);
+	}
+
+	if (IS_ENABLED(CONFIG_KMSAN) && (_init_on_alloc_enabled_early ||
+	    _init_on_free_enabled_early || _init_mlocked_on_free_enabled_early))
+		pr_info("mem auto-init: please make sure init_on_alloc, init_on_free and "
+			"init_mlocked_on_free are disabled when running KMSAN\n");
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	if (debug_pagealloc_enabled()) {
@@ -2629,9 +2657,10 @@ static void __init report_meminit(void)
 	else
 		stack = "off";
 
-	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s\n",
+	pr_info("mem auto-init: stack:%s, heap alloc:%s, heap free:%s, mlocked free:%s\n",
 		stack, want_init_on_alloc(GFP_KERNEL) ? "on" : "off",
-		want_init_on_free() ? "on" : "off");
+		want_init_on_free() ? "on" : "off",
+		want_init_mlocked_on_free() ? "on" : "off");
 	if (want_init_on_free())
 		pr_info("mem auto-init: clearing system memory may take some time...\n");
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 47ab0297838a..e030ccf9d5bc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,7 +1032,7 @@ static inline bool should_skip_kasan_poison(struct page *page)
 	return page_kasan_tag(page) == KASAN_TAG_KERNEL;
 }
 
-static void kernel_init_pages(struct page *page, int numpages)
+void kernel_init_pages(struct page *page, int numpages)
 {
 	int i;
 
diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening
index 2cff851ebfd7..effbf5982be1 100644
--- a/security/Kconfig.hardening
+++ b/security/Kconfig.hardening
@@ -255,6 +255,21 @@ config INIT_ON_FREE_DEFAULT_ON
 	  touching "cold" memory areas. Most cases see 3-5% impact. Some
 	  synthetic workloads have measured as high as 8%.
 
+config INIT_MLOCKED_ON_FREE_DEFAULT_ON
+	bool "Enable mlocked memory zeroing on free"
+	depends on !KMSAN
+	help
+	  This config has the effect of setting "init_mlocked_on_free=1"
+	  on the kernel command line. If it is enabled, all mlocked process
+	  memory is zeroed when freed. This restriction to mlocked memory
+	  improves performance over "init_on_free" but can still be used to
+	  protect confidential data like key material from content exposures
+	  to other processes, as well as live forensics and cold boot attacks.
+	  Any non-mlocked memory is not cleared before it is reassigned. This
+	  configuration can be overwritten by setting "init_mlocked_on_free=0"
+	  on the command line. The "init_on_free" boot option takes
+	  precedence over "init_mlocked_on_free".
+
 config CC_HAS_ZERO_CALL_USED_REGS
 	def_bool $(cc-option,-fzero-call-used-regs=used-gpr)
 	# https://github.com/ClangBuiltLinux/linux/issues/1766
-- 
cgit v1.2.3


From 610ff817b981921213ae51e5c5f38c76c6f0405e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 3 Apr 2024 18:18:34 +0100
Subject: khugepaged: remove hpage from collapse_file()

Use new_folio throughout where we had been using hpage.

Link: https://lkml.kernel.org/r/20240403171838.1445826-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h |  6 +--
 mm/khugepaged.c                    | 77 +++++++++++++++++++-------------------
 2 files changed, 42 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 6e2ef1d4b002..dc6eeef2d3da 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -207,10 +207,10 @@ TRACE_EVENT(mm_khugepaged_scan_file,
 );
 
 TRACE_EVENT(mm_khugepaged_collapse_file,
-	TP_PROTO(struct mm_struct *mm, struct page *hpage, pgoff_t index,
+	TP_PROTO(struct mm_struct *mm, struct folio *new_folio, pgoff_t index,
 			bool is_shmem, unsigned long addr, struct file *file,
 			int nr, int result),
-	TP_ARGS(mm, hpage, index, addr, is_shmem, file, nr, result),
+	TP_ARGS(mm, new_folio, index, addr, is_shmem, file, nr, result),
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
 		__field(unsigned long, hpfn)
@@ -224,7 +224,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
 
 	TP_fast_assign(
 		__entry->mm = mm;
-		__entry->hpfn = hpage ? page_to_pfn(hpage) : -1;
+		__entry->hpfn = new_folio ? folio_pfn(new_folio) : -1;
 		__entry->index = index;
 		__entry->addr = addr;
 		__entry->is_shmem = is_shmem;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 71a4119ce3a8..d44584b5e004 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1780,30 +1780,27 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 			 struct collapse_control *cc)
 {
 	struct address_space *mapping = file->f_mapping;
-	struct page *hpage;
 	struct page *page;
-	struct page *tmp;
+	struct page *tmp, *dst;
 	struct folio *folio, *new_folio;
 	pgoff_t index = 0, end = start + HPAGE_PMD_NR;
 	LIST_HEAD(pagelist);
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
 	int nr_none = 0, result = SCAN_SUCCEED;
 	bool is_shmem = shmem_file(file);
-	int nr = 0;
 
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
 	result = alloc_charge_folio(&new_folio, mm, cc);
-	hpage = &new_folio->page;
 	if (result != SCAN_SUCCEED)
 		goto out;
 
-	__SetPageLocked(hpage);
+	__folio_set_locked(new_folio);
 	if (is_shmem)
-		__SetPageSwapBacked(hpage);
-	hpage->index = start;
-	hpage->mapping = mapping;
+		__folio_set_swapbacked(new_folio);
+	new_folio->index = start;
+	new_folio->mapping = mapping;
 
 	/*
 	 * Ensure we have slots for all the pages in the range.  This is
@@ -2036,20 +2033,24 @@ xa_unlocked:
 	 * The old pages are locked, so they won't change anymore.
 	 */
 	index = start;
+	dst = folio_page(new_folio, 0);
 	list_for_each_entry(page, &pagelist, lru) {
 		while (index < page->index) {
-			clear_highpage(hpage + (index % HPAGE_PMD_NR));
+			clear_highpage(dst);
 			index++;
+			dst++;
 		}
-		if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
+		if (copy_mc_highpage(dst, page) > 0) {
 			result = SCAN_COPY_MC;
 			goto rollback;
 		}
 		index++;
+		dst++;
 	}
 	while (index < end) {
-		clear_highpage(hpage + (index % HPAGE_PMD_NR));
+		clear_highpage(dst);
 		index++;
+		dst++;
 	}
 
 	if (nr_none) {
@@ -2077,16 +2078,17 @@ xa_unlocked:
 		}
 
 		/*
-		 * If userspace observed a missing page in a VMA with a MODE_MISSING
-		 * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
-		 * page. If so, we need to roll back to avoid suppressing such an
-		 * event. Since wp/minor userfaultfds don't give userspace any
-		 * guarantees that the kernel doesn't fill a missing page with a zero
-		 * page, so they don't matter here.
+		 * If userspace observed a missing page in a VMA with
+		 * a MODE_MISSING userfaultfd, then it might expect a
+		 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
+		 * roll back to avoid suppressing such an event. Since
+		 * wp/minor userfaultfds don't give userspace any
+		 * guarantees that the kernel doesn't fill a missing
+		 * page with a zero page, so they don't matter here.
 		 *
-		 * Any userfaultfds registered after this point will not be able to
-		 * observe any missing pages due to the previously inserted retry
-		 * entries.
+		 * Any userfaultfds registered after this point will
+		 * not be able to observe any missing pages due to the
+		 * previously inserted retry entries.
 		 */
 		vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
 			if (userfaultfd_missing(vma)) {
@@ -2111,33 +2113,32 @@ immap_locked:
 		xas_lock_irq(&xas);
 	}
 
-	folio = page_folio(hpage);
-	nr = folio_nr_pages(folio);
 	if (is_shmem)
-		__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
+		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
 	else
-		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
+		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
 
 	if (nr_none) {
-		__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_none);
+		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
 		/* nr_none is always 0 for non-shmem. */
-		__lruvec_stat_mod_folio(folio, NR_SHMEM, nr_none);
+		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
 	}
 
 	/*
-	 * Mark hpage as uptodate before inserting it into the page cache so
-	 * that it isn't mistaken for an fallocated but unwritten page.
+	 * Mark new_folio as uptodate before inserting it into the
+	 * page cache so that it isn't mistaken for an fallocated but
+	 * unwritten page.
 	 */
-	folio_mark_uptodate(folio);
-	folio_ref_add(folio, HPAGE_PMD_NR - 1);
+	folio_mark_uptodate(new_folio);
+	folio_ref_add(new_folio, HPAGE_PMD_NR - 1);
 
 	if (is_shmem)
-		folio_mark_dirty(folio);
-	folio_add_lru(folio);
+		folio_mark_dirty(new_folio);
+	folio_add_lru(new_folio);
 
 	/* Join all the small entries into a single multi-index entry. */
 	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
-	xas_store(&xas, folio);
+	xas_store(&xas, new_folio);
 	WARN_ON_ONCE(xas_error(&xas));
 	xas_unlock_irq(&xas);
 
@@ -2148,7 +2149,7 @@ immap_locked:
 	retract_page_tables(mapping, start);
 	if (cc && !cc->is_khugepaged)
 		result = SCAN_PTE_MAPPED_HUGEPAGE;
-	folio_unlock(folio);
+	folio_unlock(new_folio);
 
 	/*
 	 * The collapse has succeeded, so free the old pages.
@@ -2193,13 +2194,13 @@ rollback:
 		smp_mb();
 	}
 
-	hpage->mapping = NULL;
+	new_folio->mapping = NULL;
 
-	unlock_page(hpage);
-	put_page(hpage);
+	folio_unlock(new_folio);
+	folio_put(new_folio);
 out:
 	VM_BUG_ON(!list_empty(&pagelist));
-	trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
+	trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
 	return result;
 }
 
-- 
cgit v1.2.3


From 43849758fdc976a6d6108ed6dfccdb136fdeec39 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 3 Apr 2024 18:18:36 +0100
Subject: khugepaged: use a folio throughout hpage_collapse_scan_file()

Replace the use of pages with folios.  Saves a few calls to
compound_head() and removes some uses of obsolete functions.

Link: https://lkml.kernel.org/r/20240403171838.1445826-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/huge_memory.h |  6 +++---
 mm/khugepaged.c                    | 33 ++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index dc6eeef2d3da..ab576898a126 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -174,10 +174,10 @@ TRACE_EVENT(mm_collapse_huge_page_swapin,
 
 TRACE_EVENT(mm_khugepaged_scan_file,
 
-	TP_PROTO(struct mm_struct *mm, struct page *page, struct file *file,
+	TP_PROTO(struct mm_struct *mm, struct folio *folio, struct file *file,
 		 int present, int swap, int result),
 
-	TP_ARGS(mm, page, file, present, swap, result),
+	TP_ARGS(mm, folio, file, present, swap, result),
 
 	TP_STRUCT__entry(
 		__field(struct mm_struct *, mm)
@@ -190,7 +190,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,
 
 	TP_fast_assign(
 		__entry->mm = mm;
-		__entry->pfn = page ? page_to_pfn(page) : -1;
+		__entry->pfn = folio ? folio_pfn(folio) : -1;
 		__assign_str(filename, file->f_path.dentry->d_iname);
 		__entry->present = present;
 		__entry->swap = swap;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f3116eba4dfd..8380ee0d0410 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -2203,7 +2203,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 				    struct file *file, pgoff_t start,
 				    struct collapse_control *cc)
 {
-	struct page *page = NULL;
+	struct folio *folio = NULL;
 	struct address_space *mapping = file->f_mapping;
 	XA_STATE(xas, &mapping->i_pages, start);
 	int present, swap;
@@ -2215,11 +2215,11 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 	memset(cc->node_load, 0, sizeof(cc->node_load));
 	nodes_clear(cc->alloc_nmask);
 	rcu_read_lock();
-	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
-		if (xas_retry(&xas, page))
+	xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
+		if (xas_retry(&xas, folio))
 			continue;
 
-		if (xa_is_value(page)) {
+		if (xa_is_value(folio)) {
 			++swap;
 			if (cc->is_khugepaged &&
 			    swap > khugepaged_max_ptes_swap) {
@@ -2234,11 +2234,9 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 		 * TODO: khugepaged should compact smaller compound pages
 		 * into a PMD sized page
 		 */
-		if (PageTransCompound(page)) {
-			struct page *head = compound_head(page);
-
-			result = compound_order(head) == HPAGE_PMD_ORDER &&
-					head->index == start
+		if (folio_test_large(folio)) {
+			result = folio_order(folio) == HPAGE_PMD_ORDER &&
+					folio->index == start
 					/* Maybe PMD-mapped */
 					? SCAN_PTE_MAPPED_HUGEPAGE
 					: SCAN_PAGE_COMPOUND;
@@ -2251,28 +2249,29 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 			break;
 		}
 
-		node = page_to_nid(page);
+		node = folio_nid(folio);
 		if (hpage_collapse_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
 			break;
 		}
 		cc->node_load[node]++;
 
-		if (!PageLRU(page)) {
+		if (!folio_test_lru(folio)) {
 			result = SCAN_PAGE_LRU;
 			break;
 		}
 
-		if (page_count(page) !=
-		    1 + page_mapcount(page) + page_has_private(page)) {
+		if (folio_ref_count(folio) !=
+		    1 + folio_mapcount(folio) + folio_test_private(folio)) {
 			result = SCAN_PAGE_COUNT;
 			break;
 		}
 
 		/*
-		 * We probably should check if the page is referenced here, but
-		 * nobody would transfer pte_young() to PageReferenced() for us.
-		 * And rmap walk here is just too costly...
+		 * We probably should check if the folio is referenced
+		 * here, but nobody would transfer pte_young() to
+		 * folio_test_referenced() for us.  And rmap walk here
+		 * is just too costly...
 		 */
 
 		present++;
@@ -2294,7 +2293,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
 		}
 	}
 
-	trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
+	trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
 	return result;
 }
 #else
-- 
cgit v1.2.3


From 1ade67cd22e4aeb74ed0b86bab6760bbcf9e2f06 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 2 Apr 2024 21:12:50 +0100
Subject: mm: remove page_idle and page_young wrappers

All users have now been converted to the folio equivalents, so remove the
page wrappers.

Link: https://lkml.kernel.org/r/20240402201252.917342-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_idle.h | 25 -------------------------
 1 file changed, 25 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 511e22ef459f..6357f1e7918a 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -119,29 +119,4 @@ static inline void folio_clear_idle(struct folio *folio)
 }
 
 #endif /* CONFIG_PAGE_IDLE_FLAG */
-
-static inline bool page_is_young(struct page *page)
-{
-	return folio_test_young(page_folio(page));
-}
-
-static inline void set_page_young(struct page *page)
-{
-	folio_set_young(page_folio(page));
-}
-
-static inline bool test_and_clear_page_young(struct page *page)
-{
-	return folio_test_clear_young(page_folio(page));
-}
-
-static inline bool page_is_idle(struct page *page)
-{
-	return folio_test_idle(page_folio(page));
-}
-
-static inline void set_page_idle(struct page *page)
-{
-	folio_set_idle(page_folio(page));
-}
 #endif /* _LINUX_MM_PAGE_IDLE_H */
-- 
cgit v1.2.3


From 8c9e8381ea4bcca4305cf0d24f87f4d39514ab35 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 2 Apr 2024 21:12:51 +0100
Subject: mm: generate PAGE_IDLE_FLAG definitions

If CONFIG_PAGE_IDLE_FLAG is not set, we can use FOLIO_FLAG_FALSE() to
generate these definitions.

Link: https://lkml.kernel.org/r/20240402201252.917342-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h |  9 ++++++++-
 include/linux/page_idle.h  | 37 ++-----------------------------------
 2 files changed, 10 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 7577fe7debaf..f83331684e47 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -626,12 +626,19 @@ PAGEFLAG_FALSE(HWPoison, hwpoison)
 #define __PG_HWPOISON 0
 #endif
 
-#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
+#ifdef CONFIG_PAGE_IDLE_FLAG
+#ifdef CONFIG_64BIT
 FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
 FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
 FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
 FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
 #endif
+/* See page_idle.h for !64BIT workaround */
+#else /* !CONFIG_PAGE_IDLE_FLAG */
+FOLIO_FLAG_FALSE(young)
+FOLIO_TEST_CLEAR_FLAG_FALSE(young)
+FOLIO_FLAG_FALSE(idle)
+#endif
 
 /*
  * PageReported() is used to track reported free pages within the Buddy
diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h
index 6357f1e7918a..89ca0d5dc1e7 100644
--- a/include/linux/page_idle.h
+++ b/include/linux/page_idle.h
@@ -6,9 +6,7 @@
 #include <linux/page-flags.h>
 #include <linux/page_ext.h>
 
-#ifdef CONFIG_PAGE_IDLE_FLAG
-
-#ifndef CONFIG_64BIT
+#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 /*
  * If there is not enough space to store Idle and Young bits in page flags, use
  * page ext flags instead.
@@ -87,36 +85,5 @@ static inline void folio_clear_idle(struct folio *folio)
 	clear_bit(PAGE_EXT_IDLE, &page_ext->flags);
 	page_ext_put(page_ext);
 }
-#endif /* !CONFIG_64BIT */
-
-#else /* !CONFIG_PAGE_IDLE_FLAG */
-
-static inline bool folio_test_young(const struct folio *folio)
-{
-	return false;
-}
-
-static inline void folio_set_young(struct folio *folio)
-{
-}
-
-static inline bool folio_test_clear_young(struct folio *folio)
-{
-	return false;
-}
-
-static inline bool folio_test_idle(const struct folio *folio)
-{
-	return false;
-}
-
-static inline void folio_set_idle(struct folio *folio)
-{
-}
-
-static inline void folio_clear_idle(struct folio *folio)
-{
-}
-
-#endif /* CONFIG_PAGE_IDLE_FLAG */
+#endif /* CONFIG_PAGE_IDLE_FLAG && !64BIT */
 #endif /* _LINUX_MM_PAGE_IDLE_H */
-- 
cgit v1.2.3


From d7d0d389ff90644546ffcb8e15ea3ccaf6138958 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 8 Apr 2024 19:39:40 +0100
Subject: mm: swap: remove CLUSTER_FLAG_HUGE from swap_cluster_info:flags

Patch series "Swap-out mTHP without splitting", v7.

This series adds support for swapping out multi-size THP (mTHP) without
needing to first split the large folio via
split_huge_page_to_list_to_order().  It closely follows the approach
already used to swap-out PMD-sized THP.

There are a couple of reasons for swapping out mTHP without splitting:

  - Performance: It is expensive to split a large folio and under
    extreme memory pressure some workloads regressed performance when
    using 64K mTHP vs 4K small folios because of this extra cost in the
    swap-out path.  This series not only eliminates the regression but
    makes it faster to swap out 64K mTHP vs 4K small folios.

  - Memory fragmentation avoidance: If we can avoid splitting a large
    folio memory is less likely to become fragmented, making it easier to
    re-allocate a large folio in future.

  - Performance: Enables a separate series [7] to swap-in whole mTHPs,
    which means we won't lose the TLB-efficiency benefits of mTHP once the
    memory has been through a swap cycle.

I've done what I thought was the smallest change possible, and as a
result, this approach is only employed when the swap is backed by a
non-rotating block device (just as PMD-sized THP is supported today).
Discussion against the RFC concluded that this is sufficient.


Performance Testing
===================

I've run some swap performance tests on Ampere Altra VM (arm64) with 8
CPUs.  The VM is set up with a 35G block ram device as the swap device and
the test is run from inside a memcg limited to 40G memory.  I've then run
`usemem` from vm-scalability with 70 processes, each allocating and
writing 1G of memory.  I've repeated everything 6 times and taken the mean
performance improvement relative to 4K page baseline:

| alloc size |                baseline |           + this series |
|            | mm-unstable (~v6.9-rc1) |                         |
|:-----------|------------------------:|------------------------:|
| 4K Page    |                    0.0% |                    1.3% |
| 64K THP    |                  -13.6% |                   46.3% |
| 2M THP     |                   91.4% |                   89.6% |

So with this change, the 64K swap performance goes from a 14% regression to a
46% improvement. While 2M shows a small regression I'm confident that this is
just noise.

[1] https://lore.kernel.org/linux-mm/20231010142111.3997780-1-ryan.roberts@arm.com/
[2] https://lore.kernel.org/linux-mm/20231017161302.2518826-1-ryan.roberts@arm.com/
[3] https://lore.kernel.org/linux-mm/20231025144546.577640-1-ryan.roberts@arm.com/
[4] https://lore.kernel.org/linux-mm/20240311150058.1122862-1-ryan.roberts@arm.com/
[5] https://lore.kernel.org/linux-mm/20240327144537.4165578-1-ryan.roberts@arm.com/
[6] https://lore.kernel.org/linux-mm/20240403114032.1162100-1-ryan.roberts@arm.com/
[7] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/
[8] https://lore.kernel.org/linux-mm/CAGsJ_4yMOow27WDvN2q=E4HAtDd2PJ=OQ5Pj9DG+6FLWwNuXUw@mail.gmail.com/
[9] https://lore.kernel.org/linux-mm/579d5127-c763-4001-9625-4563a9316ac3@redhat.com/


This patch (of 7):

As preparation for supporting small-sized THP in the swap-out path,
without first needing to split to order-0, Remove the CLUSTER_FLAG_HUGE,
which, when present, always implies PMD-sized THP, which is the same as
the cluster size.

The only use of the flag was to determine whether a swap entry refers to a
single page or a PMD-sized THP in swap_page_trans_huge_swapped().  Instead
of relying on the flag, we now pass in order, which originates from the
folio's order.  This allows the logic to work for folios of any order.

The one snag is that one of the swap_page_trans_huge_swapped() call sites
does not have the folio.  But it was only being called there to shortcut a
call __try_to_reclaim_swap() in some cases.  __try_to_reclaim_swap() gets
the folio and (via some other functions) calls
swap_page_trans_huge_swapped().  So I've removed the problematic call site
and believe the new logic should be functionally equivalent.

That said, removing the fast path means that we will take a reference and
trylock a large folio much more often, which we would like to avoid.  The
next patch will solve this.

Removing CLUSTER_FLAG_HUGE also means we can remove split_swap_cluster()
which used to be called during folio splitting, since
split_swap_cluster()'s only job was to remove the flag.

Link: https://lkml.kernel.org/r/20240408183946.2991168-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20240408183946.2991168-2-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Acked-by: Chris Li <chrisl@kernel.org>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h | 10 ----------
 mm/huge_memory.c     |  3 ---
 mm/swapfile.c        | 47 ++++++++---------------------------------------
 3 files changed, 8 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f53d608daa01..a803de0ac24f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -259,7 +259,6 @@ struct swap_cluster_info {
 };
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
-#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */
 
 /*
  * We assign a cluster to each CPU, so each CPU can allocate swap entry from
@@ -587,15 +586,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
 }
 #endif /* CONFIG_SWAP */
 
-#ifdef CONFIG_THP_SWAP
-extern int split_swap_cluster(swp_entry_t entry);
-#else
-static inline int split_swap_cluster(swp_entry_t entry)
-{
-	return 0;
-}
-#endif
-
 #ifdef CONFIG_MEMCG
 static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4065bf8bfcc4..14f04fbc22d9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2844,9 +2844,6 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		shmem_uncharge(folio->mapping->host, nr_dropped);
 	remap_page(folio, nr);
 
-	if (folio_test_swapcache(folio))
-		split_swap_cluster(folio->swap);
-
 	/*
 	 * set page to its compound_head when split to non order-0 pages, so
 	 * we can skip unlocking it below, since PG_locked is transferred to
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5e6d2304a2a4..1ded6d1dcab4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -343,18 +343,6 @@ static inline void cluster_set_null(struct swap_cluster_info *info)
 	info->data = 0;
 }
 
-static inline bool cluster_is_huge(struct swap_cluster_info *info)
-{
-	if (IS_ENABLED(CONFIG_THP_SWAP))
-		return info->flags & CLUSTER_FLAG_HUGE;
-	return false;
-}
-
-static inline void cluster_clear_huge(struct swap_cluster_info *info)
-{
-	info->flags &= ~CLUSTER_FLAG_HUGE;
-}
-
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
 						     unsigned long offset)
 {
@@ -1027,7 +1015,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
 	offset = idx * SWAPFILE_CLUSTER;
 	ci = lock_cluster(si, offset);
 	alloc_cluster(si, idx);
-	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
+	cluster_set_count(ci, SWAPFILE_CLUSTER);
 
 	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
 	unlock_cluster(ci);
@@ -1365,7 +1353,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
 
 	ci = lock_cluster_or_swap_info(si, offset);
 	if (size == SWAPFILE_CLUSTER) {
-		VM_BUG_ON(!cluster_is_huge(ci));
 		map = si->swap_map + offset;
 		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
 			val = map[i];
@@ -1373,7 +1360,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
 			if (val == SWAP_HAS_CACHE)
 				free_entries++;
 		}
-		cluster_clear_huge(ci);
 		if (free_entries == SWAPFILE_CLUSTER) {
 			unlock_cluster_or_swap_info(si, ci);
 			spin_lock(&si->lock);
@@ -1395,23 +1381,6 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
 	unlock_cluster_or_swap_info(si, ci);
 }
 
-#ifdef CONFIG_THP_SWAP
-int split_swap_cluster(swp_entry_t entry)
-{
-	struct swap_info_struct *si;
-	struct swap_cluster_info *ci;
-	unsigned long offset = swp_offset(entry);
-
-	si = _swap_info_get(entry);
-	if (!si)
-		return -EBUSY;
-	ci = lock_cluster(si, offset);
-	cluster_clear_huge(ci);
-	unlock_cluster(ci);
-	return 0;
-}
-#endif
-
 static int swp_entry_cmp(const void *ent1, const void *ent2)
 {
 	const swp_entry_t *e1 = ent1, *e2 = ent2;
@@ -1519,22 +1488,23 @@ out:
 }
 
 static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
-					 swp_entry_t entry)
+					 swp_entry_t entry, int order)
 {
 	struct swap_cluster_info *ci;
 	unsigned char *map = si->swap_map;
+	unsigned int nr_pages = 1 << order;
 	unsigned long roffset = swp_offset(entry);
-	unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
+	unsigned long offset = round_down(roffset, nr_pages);
 	int i;
 	bool ret = false;
 
 	ci = lock_cluster_or_swap_info(si, offset);
-	if (!ci || !cluster_is_huge(ci)) {
+	if (!ci || nr_pages == 1) {
 		if (swap_count(map[roffset]))
 			ret = true;
 		goto unlock_out;
 	}
-	for (i = 0; i < SWAPFILE_CLUSTER; i++) {
+	for (i = 0; i < nr_pages; i++) {
 		if (swap_count(map[offset + i])) {
 			ret = true;
 			break;
@@ -1556,7 +1526,7 @@ static bool folio_swapped(struct folio *folio)
 	if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
 		return swap_swapcount(si, entry) != 0;
 
-	return swap_page_trans_huge_swapped(si, entry);
+	return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
 }
 
 /**
@@ -1622,8 +1592,7 @@ int free_swap_and_cache(swp_entry_t entry)
 		}
 
 		count = __swap_entry_free(p, entry);
-		if (count == SWAP_HAS_CACHE &&
-		    !swap_page_trans_huge_swapped(p, entry))
+		if (count == SWAP_HAS_CACHE)
 			__try_to_reclaim_swap(p, swp_offset(entry),
 					      TTRS_UNMAPPED | TTRS_FULL);
 		put_swap_device(p);
-- 
cgit v1.2.3


From a62fb92ac12ed39df4930dca599a3b427552882a Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 8 Apr 2024 19:39:41 +0100
Subject: mm: swap: free_swap_and_cache_nr() as batched free_swap_and_cache()

Now that we no longer have a convenient flag in the cluster to determine
if a folio is large, free_swap_and_cache() will take a reference and lock
a large folio much more often, which could lead to contention and (e.g.)
failure to split large folios, etc.

Let's solve that problem by batch freeing swap and cache with a new
function, free_swap_and_cache_nr(), to free a contiguous range of swap
entries together.  This allows us to first drop a reference to each swap
slot before we try to release the cache folio.  This means we only try to
release the folio once, only taking the reference and lock once - much
better than the previous 512 times for the 2M THP case.

Contiguous swap entries are gathered in zap_pte_range() and
madvise_free_pte_range() in a similar way to how present ptes are already
gathered in zap_pte_range().

While we are at it, let's simplify by converting the return type of both
functions to void.  The return value was used only by zap_pte_range() to
print a bad pte, and was ignored by everyone else, so the extra reporting
wasn't exactly guaranteed.  We will still get the warning with most of the
information from get_swap_device().  With the batch version, we wouldn't
know which pte was bad anyway so could print the wrong one.

[ryan.roberts@arm.com: fix a build warning on parisc]
  Link: https://lkml.kernel.org/r/20240409111840.3173122-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20240408183946.2991168-3-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 29 +++++++++++++++
 include/linux/swap.h    | 12 ++++--
 mm/internal.h           | 64 ++++++++++++++++++++++++++++++++
 mm/madvise.c            | 12 ++++--
 mm/memory.c             | 13 ++++---
 mm/swapfile.c           | 97 ++++++++++++++++++++++++++++++++++++++++---------
 6 files changed, 196 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index a3fc8150b047..75096025fe52 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -708,6 +708,35 @@ static inline void pte_clear_not_present_full(struct mm_struct *mm,
 }
 #endif
 
+#ifndef clear_not_present_full_ptes
+/**
+ * clear_not_present_full_ptes - Clear multiple not present PTEs which are
+ *				 consecutive in the pgtable.
+ * @mm: Address space the ptes represent.
+ * @addr: Address of the first pte.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to clear.
+ * @full: Whether we are clearing a full mm.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over pte_clear_not_present_full().
+ *
+ * Context: The caller holds the page table lock.  The PTEs are all not present.
+ * The PTEs are all in the same PMD.
+ */
+static inline void clear_not_present_full_ptes(struct mm_struct *mm,
+		unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+	for (;;) {
+		pte_clear_not_present_full(mm, addr, ptep, full);
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
 extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
 			      unsigned long address,
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a803de0ac24f..2d8f2b950ddf 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -468,7 +468,7 @@ extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
 extern void swapcache_free_entries(swp_entry_t *entries, int n);
-extern int free_swap_and_cache(swp_entry_t);
+extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
 int swap_type_of(dev_t device, sector_t offset);
 int find_first_swap(dev_t *device);
 extern unsigned int count_swap_pages(int, int);
@@ -517,8 +517,9 @@ static inline void put_swap_device(struct swap_info_struct *si)
 #define free_pages_and_swap_cache(pages, nr) \
 	release_pages((pages), (nr));
 
-/* used to sanity check ptes in zap_pte_range when CONFIG_SWAP=0 */
-#define free_swap_and_cache(e) is_pfn_swap_entry(e)
+static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
+{
+}
 
 static inline void free_swap_cache(struct folio *folio)
 {
@@ -586,6 +587,11 @@ static inline int add_swap_extent(struct swap_info_struct *sis,
 }
 #endif /* CONFIG_SWAP */
 
+static inline void free_swap_and_cache(swp_entry_t entry)
+{
+	free_swap_and_cache_nr(entry, 1);
+}
+
 #ifdef CONFIG_MEMCG
 static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
diff --git a/mm/internal.h b/mm/internal.h
index d567381b12cc..d34df04b11f6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -11,6 +11,8 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 #include <linux/tracepoint-defs.h>
 
 struct folio_batch;
@@ -189,6 +191,68 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 
 	return min(ptep - start_ptep, max_nr);
 }
+
+/**
+ * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
+ * @pte: The initial pte state; is_swap_pte(pte) must be true and
+ *	 non_swap_entry() must be false.
+ *
+ * Increments the swap offset, while maintaining all other fields, including
+ * swap type, and any swp pte bits. The resulting pte is returned.
+ */
+static inline pte_t pte_next_swp_offset(pte_t pte)
+{
+	swp_entry_t entry = pte_to_swp_entry(pte);
+	pte_t new = __swp_entry_to_pte(__swp_entry(swp_type(entry),
+						   (swp_offset(entry) + 1)));
+
+	if (pte_swp_soft_dirty(pte))
+		new = pte_swp_mksoft_dirty(new);
+	if (pte_swp_exclusive(pte))
+		new = pte_swp_mkexclusive(new);
+	if (pte_swp_uffd_wp(pte))
+		new = pte_swp_mkuffd_wp(new);
+
+	return new;
+}
+
+/**
+ * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
+ * @start_ptep: Page table pointer for the first entry.
+ * @max_nr: The maximum number of table entries to consider.
+ * @pte: Page table entry for the first entry.
+ *
+ * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
+ * containing swap entries all with consecutive offsets and targeting the same
+ * swap type, all with matching swp pte bits.
+ *
+ * max_nr must be at least one and must be limited by the caller so scanning
+ * cannot exceed a single page table.
+ *
+ * Return: the number of table entries in the batch.
+ */
+static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
+{
+	pte_t expected_pte = pte_next_swp_offset(pte);
+	const pte_t *end_ptep = start_ptep + max_nr;
+	pte_t *ptep = start_ptep + 1;
+
+	VM_WARN_ON(max_nr < 1);
+	VM_WARN_ON(!is_swap_pte(pte));
+	VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+
+	while (ptep < end_ptep) {
+		pte = ptep_get(ptep);
+
+		if (!pte_same(pte, expected_pte))
+			break;
+
+		expected_pte = pte_next_swp_offset(expected_pte);
+		ptep++;
+	}
+
+	return ptep - start_ptep;
+}
 #endif /* CONFIG_MMU */
 
 void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
diff --git a/mm/madvise.c b/mm/madvise.c
index 1f77a51baaac..5011ecb24344 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -628,6 +628,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 	struct folio *folio;
 	int nr_swap = 0;
 	unsigned long next;
+	int nr, max_nr;
 
 	next = pmd_addr_end(addr, end);
 	if (pmd_trans_huge(*pmd))
@@ -640,7 +641,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		return 0;
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
-	for (; addr != end; pte++, addr += PAGE_SIZE) {
+	for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+		nr = 1;
 		ptent = ptep_get(pte);
 
 		if (pte_none(ptent))
@@ -655,9 +657,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 			entry = pte_to_swp_entry(ptent);
 			if (!non_swap_entry(entry)) {
-				nr_swap--;
-				free_swap_and_cache(entry);
-				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+				max_nr = (end - addr) / PAGE_SIZE;
+				nr = swap_pte_batch(pte, max_nr, ptent);
+				nr_swap -= nr;
+				free_swap_and_cache_nr(entry, nr);
+				clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
 			} else if (is_hwpoison_entry(entry) ||
 				   is_poisoned_swp_entry(entry)) {
 				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
diff --git a/mm/memory.c b/mm/memory.c
index 694e18837cd8..7880400370c8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1637,12 +1637,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				folio_remove_rmap_pte(folio, page, vma);
 			folio_put(folio);
 		} else if (!non_swap_entry(entry)) {
-			/* Genuine swap entry, hence a private anon page */
+			max_nr = (end - addr) / PAGE_SIZE;
+			nr = swap_pte_batch(pte, max_nr, ptent);
+			/* Genuine swap entries, hence a private anon pages */
 			if (!should_zap_cows(details))
 				continue;
-			rss[MM_SWAPENTS]--;
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
+			rss[MM_SWAPENTS] -= nr;
+			free_swap_and_cache_nr(entry, nr);
 		} else if (is_migration_entry(entry)) {
 			folio = pfn_swap_entry_folio(entry);
 			if (!should_zap_folio(details, folio))
@@ -1665,8 +1666,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
 			WARN_ON_ONCE(1);
 		}
-		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-		zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+		clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+		zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);
 	} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
 
 	add_mm_rss_vec(mm, rss);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1ded6d1dcab4..20c45757f2b2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -130,7 +130,11 @@ static inline unsigned char swap_count(unsigned char ent)
 /* Reclaim the swap entry if swap is getting full*/
 #define TTRS_FULL		0x4
 
-/* returns 1 if swap entry is freed */
+/*
+ * returns number of pages in the folio that backs the swap entry. If positive,
+ * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
+ * folio was associated with the swap entry.
+ */
 static int __try_to_reclaim_swap(struct swap_info_struct *si,
 				 unsigned long offset, unsigned long flags)
 {
@@ -155,6 +159,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 			ret = folio_free_swap(folio);
 		folio_unlock(folio);
 	}
+	ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
 	folio_put(folio);
 	return ret;
 }
@@ -895,7 +900,7 @@ checks:
 		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
 		spin_lock(&si->lock);
 		/* entry was freed successfully, try to use this again */
-		if (swap_was_freed)
+		if (swap_was_freed > 0)
 			goto checks;
 		goto scan; /* check next one */
 	}
@@ -1572,32 +1577,88 @@ bool folio_free_swap(struct folio *folio)
 	return true;
 }
 
-/*
- * Free the swap entry like above, but also try to
- * free the page cache entry if it is the last user.
+/**
+ * free_swap_and_cache_nr() - Release reference on range of swap entries and
+ *                            reclaim their cache if no more references remain.
+ * @entry: First entry of range.
+ * @nr: Number of entries in range.
+ *
+ * For each swap entry in the contiguous range, release a reference. If any swap
+ * entries become free, try to reclaim their underlying folios, if present. The
+ * offset range is defined by [entry.offset, entry.offset + nr).
  */
-int free_swap_and_cache(swp_entry_t entry)
+void free_swap_and_cache_nr(swp_entry_t entry, int nr)
 {
-	struct swap_info_struct *p;
+	const unsigned long start_offset = swp_offset(entry);
+	const unsigned long end_offset = start_offset + nr;
+	unsigned int type = swp_type(entry);
+	struct swap_info_struct *si;
+	bool any_only_cache = false;
+	unsigned long offset;
 	unsigned char count;
 
 	if (non_swap_entry(entry))
-		return 1;
+		return;
 
-	p = get_swap_device(entry);
-	if (p) {
-		if (WARN_ON(data_race(!p->swap_map[swp_offset(entry)]))) {
-			put_swap_device(p);
-			return 0;
+	si = get_swap_device(entry);
+	if (!si)
+		return;
+
+	if (WARN_ON(end_offset > si->max))
+		goto out;
+
+	/*
+	 * First free all entries in the range.
+	 */
+	for (offset = start_offset; offset < end_offset; offset++) {
+		if (data_race(si->swap_map[offset])) {
+			count = __swap_entry_free(si, swp_entry(type, offset));
+			if (count == SWAP_HAS_CACHE)
+				any_only_cache = true;
+		} else {
+			WARN_ON_ONCE(1);
 		}
+	}
+
+	/*
+	 * Short-circuit the below loop if none of the entries had their
+	 * reference drop to zero.
+	 */
+	if (!any_only_cache)
+		goto out;
 
-		count = __swap_entry_free(p, entry);
-		if (count == SWAP_HAS_CACHE)
-			__try_to_reclaim_swap(p, swp_offset(entry),
+	/*
+	 * Now go back over the range trying to reclaim the swap cache. This is
+	 * more efficient for large folios because we will only try to reclaim
+	 * the swap once per folio in the common case. If we do
+	 * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
+	 * latter will get a reference and lock the folio for every individual
+	 * page but will only succeed once the swap slot for every subpage is
+	 * zero.
+	 */
+	for (offset = start_offset; offset < end_offset; offset += nr) {
+		nr = 1;
+		if (READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
+			/*
+			 * Folios are always naturally aligned in swap so
+			 * advance forward to the next boundary. Zero means no
+			 * folio was found for the swap entry, so advance by 1
+			 * in this case. Negative value means folio was found
+			 * but could not be reclaimed. Here we can still advance
+			 * to the next boundary.
+			 */
+			nr = __try_to_reclaim_swap(si, offset,
 					      TTRS_UNMAPPED | TTRS_FULL);
-		put_swap_device(p);
+			if (nr == 0)
+				nr = 1;
+			else if (nr < 0)
+				nr = -nr;
+			nr = ALIGN(offset + 1, nr) - offset;
+		}
 	}
-	return p != NULL;
+
+out:
+	put_swap_device(si);
 }
 
 #ifdef CONFIG_HIBERNATION
-- 
cgit v1.2.3


From 14c62da21b2b865f4fc0c49edd74ed7299927d35 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 8 Apr 2024 19:39:42 +0100
Subject: mm: swap: simplify struct percpu_cluster

struct percpu_cluster stores the index of cpu's current cluster and the
offset of the next entry that will be allocated for the cpu.  These two
pieces of information are redundant because the cluster index is just
(offset / SWAPFILE_CLUSTER).  The only reason for explicitly keeping the
cluster index is because the structure used for it also has a flag to
indicate "no cluster".  However this data structure also contains a spin
lock, which is never used in this context, as a side effect the code
copies the spinlock_t structure, which is questionable coding practice in
my view.

So let's clean this up and store only the next offset, and use a sentinal
value (SWAP_NEXT_INVALID) to indicate "no cluster".  SWAP_NEXT_INVALID is
chosen to be 0, because 0 will never be seen legitimately; The first page
in the swap file is the swap header, which is always marked bad to prevent
it from being allocated as an entry.  This also prevents the cluster to
which it belongs being marked free, so it will never appear on the free
list.

This change saves 16 bytes per cpu.  And given we are shortly going to
extend this mechanism to be per-cpu-AND-per-order, we will end up saving
16 * 9 = 144 bytes per cpu, which adds up if you have 256 cpus in the
system.

Link: https://lkml.kernel.org/r/20240408183946.2991168-4-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h |  9 ++++++++-
 mm/swapfile.c        | 22 +++++++++++-----------
 2 files changed, 19 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2d8f2b950ddf..ce35fd4abdf0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -260,13 +260,20 @@ struct swap_cluster_info {
 #define CLUSTER_FLAG_FREE 1 /* This cluster is free */
 #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
 
+/*
+ * The first page in the swap file is the swap header, which is always marked
+ * bad to prevent it from being allocated as an entry. This also prevents the
+ * cluster to which it belongs being marked free. Therefore 0 is safe to use as
+ * a sentinel to indicate next is not valid in percpu_cluster.
+ */
+#define SWAP_NEXT_INVALID	0
+
 /*
  * We assign a cluster to each CPU, so each CPU can allocate swap entry from
  * its own cluster and swapout sequentially. The purpose is to optimize swapout
  * throughput.
  */
 struct percpu_cluster {
-	struct swap_cluster_info index; /* Current cluster index */
 	unsigned int next; /* Likely next allocation offset */
 };
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 20c45757f2b2..e3f855475278 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -609,7 +609,7 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 		return false;
 
 	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
-	cluster_set_null(&percpu_cluster->index);
+	percpu_cluster->next = SWAP_NEXT_INVALID;
 	return true;
 }
 
@@ -622,14 +622,14 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
 {
 	struct percpu_cluster *cluster;
 	struct swap_cluster_info *ci;
-	unsigned long tmp, max;
+	unsigned int tmp, max;
 
 new_cluster:
 	cluster = this_cpu_ptr(si->percpu_cluster);
-	if (cluster_is_null(&cluster->index)) {
+	tmp = cluster->next;
+	if (tmp == SWAP_NEXT_INVALID) {
 		if (!cluster_list_empty(&si->free_clusters)) {
-			cluster->index = si->free_clusters.head;
-			cluster->next = cluster_next(&cluster->index) *
+			tmp = cluster_next(&si->free_clusters.head) *
 					SWAPFILE_CLUSTER;
 		} else if (!cluster_list_empty(&si->discard_clusters)) {
 			/*
@@ -649,9 +649,7 @@ new_cluster:
 	 * Other CPUs can use our cluster if they can't find a free cluster,
 	 * check if there is still free entry in the cluster
 	 */
-	tmp = cluster->next;
-	max = min_t(unsigned long, si->max,
-		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
+	max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
 	if (tmp < max) {
 		ci = lock_cluster(si, tmp);
 		while (tmp < max) {
@@ -662,12 +660,13 @@ new_cluster:
 		unlock_cluster(ci);
 	}
 	if (tmp >= max) {
-		cluster_set_null(&cluster->index);
+		cluster->next = SWAP_NEXT_INVALID;
 		goto new_cluster;
 	}
-	cluster->next = tmp + 1;
 	*offset = tmp;
 	*scan_base = tmp;
+	tmp += 1;
+	cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID;
 	return true;
 }
 
@@ -3163,8 +3162,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		}
 		for_each_possible_cpu(cpu) {
 			struct percpu_cluster *cluster;
+
 			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
-			cluster_set_null(&cluster->index);
+			cluster->next = SWAP_NEXT_INVALID;
 		}
 	} else {
 		atomic_inc(&nr_rotate_swap);
-- 
cgit v1.2.3


From 9faaa0f8168bfcd81469b0724b25ba3093097a08 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 8 Apr 2024 19:39:43 +0100
Subject: mm: swap: update get_swap_pages() to take folio order

We are about to allow swap storage of any mTHP size.  To prepare for that,
let's change get_swap_pages() to take a folio order parameter instead of
nr_pages.  This makes the interface self-documenting; a power-of-2 number
of pages must be provided.  We will also need the order internally so this
simplifies accessing it.

Link: https://lkml.kernel.org/r/20240408183946.2991168-5-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h |  2 +-
 mm/swap_slots.c      |  6 +++---
 mm/swapfile.c        | 13 +++++++------
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ce35fd4abdf0..b69b06733247 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -468,7 +468,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio);
 bool folio_free_swap(struct folio *folio);
 void put_swap_folio(struct folio *folio, swp_entry_t entry);
 extern swp_entry_t get_swap_page_of_type(int);
-extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
+extern int get_swap_pages(int n, swp_entry_t swp_entries[], int order);
 extern int add_swap_count_continuation(swp_entry_t, gfp_t);
 extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 53abeaf1371d..13ab3b771409 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -264,7 +264,7 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache)
 	cache->cur = 0;
 	if (swap_slot_cache_active)
 		cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE,
-					   cache->slots, 1);
+					   cache->slots, 0);
 
 	return cache->nr;
 }
@@ -311,7 +311,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
 
 	if (folio_test_large(folio)) {
 		if (IS_ENABLED(CONFIG_THP_SWAP))
-			get_swap_pages(1, &entry, folio_nr_pages(folio));
+			get_swap_pages(1, &entry, folio_order(folio));
 		goto out;
 	}
 
@@ -343,7 +343,7 @@ repeat:
 			goto out;
 	}
 
-	get_swap_pages(1, &entry, 1);
+	get_swap_pages(1, &entry, 0);
 out:
 	if (mem_cgroup_try_charge_swap(folio, entry)) {
 		put_swap_folio(folio, entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e3f855475278..d2e3d3cd439f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -278,15 +278,15 @@ static void discard_swap_cluster(struct swap_info_struct *si,
 #ifdef CONFIG_THP_SWAP
 #define SWAPFILE_CLUSTER	HPAGE_PMD_NR
 
-#define swap_entry_size(size)	(size)
+#define swap_entry_order(order)	(order)
 #else
 #define SWAPFILE_CLUSTER	256
 
 /*
- * Define swap_entry_size() as constant to let compiler to optimize
+ * Define swap_entry_order() as constant to let compiler to optimize
  * out some code if !CONFIG_THP_SWAP
  */
-#define swap_entry_size(size)	1
+#define swap_entry_order(order)	0
 #endif
 #define LATENCY_LIMIT		256
 
@@ -1042,9 +1042,10 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 	swap_range_free(si, offset, SWAPFILE_CLUSTER);
 }
 
-int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
+int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
-	unsigned long size = swap_entry_size(entry_size);
+	int order = swap_entry_order(entry_order);
+	unsigned long size = 1 << order;
 	struct swap_info_struct *si, *next;
 	long avail_pgs;
 	int n_ret = 0;
@@ -1349,7 +1350,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
 	unsigned char *map;
 	unsigned int i, free_entries = 0;
 	unsigned char val;
-	int size = swap_entry_size(folio_nr_pages(folio));
+	int size = 1 << swap_entry_order(folio_order(folio));
 
 	si = _swap_info_get(entry);
 	if (!si)
-- 
cgit v1.2.3


From 845982eb264bc64b0c3242ace217fb574f56a299 Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 8 Apr 2024 19:39:44 +0100
Subject: mm: swap: allow storage of all mTHP orders

Multi-size THP enables performance improvements by allocating large,
pte-mapped folios for anonymous memory.  However I've observed that on an
arm64 system running a parallel workload (e.g.  kernel compilation) across
many cores, under high memory pressure, the speed regresses.  This is due
to bottlenecking on the increased number of TLBIs added due to all the
extra folio splitting when the large folios are swapped out.

Therefore, solve this regression by adding support for swapping out mTHP
without needing to split the folio, just like is already done for
PMD-sized THP.  This change only applies when CONFIG_THP_SWAP is enabled,
and when the swap backing store is a non-rotating block device.  These are
the same constraints as for the existing PMD-sized THP swap-out support.

Note that no attempt is made to swap-in (m)THP here - this is still done
page-by-page, like for PMD-sized THP.  But swapping-out mTHP is a
prerequisite for swapping-in mTHP.

The main change here is to improve the swap entry allocator so that it can
allocate any power-of-2 number of contiguous entries between [1, (1 <<
PMD_ORDER)].  This is done by allocating a cluster for each distinct order
and allocating sequentially from it until the cluster is full.  This
ensures that we don't need to search the map and we get no fragmentation
due to alignment padding for different orders in the cluster.  If there is
no current cluster for a given order, we attempt to allocate a free
cluster from the list.  If there are no free clusters, we fail the
allocation and the caller can fall back to splitting the folio and
allocates individual entries (as per existing PMD-sized THP fallback).

The per-order current clusters are maintained per-cpu using the existing
infrastructure.  This is done to avoid interleving pages from different
tasks, which would prevent IO being batched.  This is already done for the
order-0 allocations so we follow the same pattern.

As is done for order-0 per-cpu clusters, the scanner now can steal order-0
entries from any per-cpu-per-order reserved cluster.  This ensures that
when the swap file is getting full, space doesn't get tied up in the
per-cpu reserves.

This change only modifies swap to be able to accept any order mTHP.  It
doesn't change the callers to elide doing the actual split.  That will be
done in separate changes.

Link: https://lkml.kernel.org/r/20240408183946.2991168-6-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Gao Xiang <xiang@kernel.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/swap.h |   8 ++-
 mm/swapfile.c        | 162 +++++++++++++++++++++++++++++----------------------
 2 files changed, 98 insertions(+), 72 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index b69b06733247..6b672a67ae5d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -268,13 +268,19 @@ struct swap_cluster_info {
  */
 #define SWAP_NEXT_INVALID	0
 
+#ifdef CONFIG_THP_SWAP
+#define SWAP_NR_ORDERS		(PMD_ORDER + 1)
+#else
+#define SWAP_NR_ORDERS		1
+#endif
+
 /*
  * We assign a cluster to each CPU, so each CPU can allocate swap entry from
  * its own cluster and swapout sequentially. The purpose is to optimize swapout
  * throughput.
  */
 struct percpu_cluster {
-	unsigned int next; /* Likely next allocation offset */
+	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 
 struct swap_cluster_list {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d2e3d3cd439f..148ef08f19dd 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -551,10 +551,12 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
 
 /*
  * The cluster corresponding to page_nr will be used. The cluster will be
- * removed from free cluster list and its usage counter will be increased.
+ * removed from free cluster list and its usage counter will be increased by
+ * count.
  */
-static void inc_cluster_info_page(struct swap_info_struct *p,
-	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static void add_cluster_info_page(struct swap_info_struct *p,
+	struct swap_cluster_info *cluster_info, unsigned long page_nr,
+	unsigned long count)
 {
 	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
 
@@ -563,9 +565,19 @@ static void inc_cluster_info_page(struct swap_info_struct *p,
 	if (cluster_is_free(&cluster_info[idx]))
 		alloc_cluster(p, idx);
 
-	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
+	VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
 	cluster_set_count(&cluster_info[idx],
-		cluster_count(&cluster_info[idx]) + 1);
+		cluster_count(&cluster_info[idx]) + count);
+}
+
+/*
+ * The cluster corresponding to page_nr will be used. The cluster will be
+ * removed from free cluster list and its usage counter will be increased by 1.
+ */
+static void inc_cluster_info_page(struct swap_info_struct *p,
+	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+{
+	add_cluster_info_page(p, cluster_info, page_nr, 1);
 }
 
 /*
@@ -595,7 +607,7 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
  */
 static bool
 scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
-	unsigned long offset)
+	unsigned long offset, int order)
 {
 	struct percpu_cluster *percpu_cluster;
 	bool conflict;
@@ -609,24 +621,39 @@ scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
 		return false;
 
 	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
-	percpu_cluster->next = SWAP_NEXT_INVALID;
+	percpu_cluster->next[order] = SWAP_NEXT_INVALID;
+	return true;
+}
+
+static inline bool swap_range_empty(char *swap_map, unsigned int start,
+				    unsigned int nr_pages)
+{
+	unsigned int i;
+
+	for (i = 0; i < nr_pages; i++) {
+		if (swap_map[start + i])
+			return false;
+	}
+
 	return true;
 }
 
 /*
- * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
- * might involve allocating a new cluster for current CPU too.
+ * Try to get swap entries with specified order from current cpu's swap entry
+ * pool (a cluster). This might involve allocating a new cluster for current CPU
+ * too.
  */
 static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
-	unsigned long *offset, unsigned long *scan_base)
+	unsigned long *offset, unsigned long *scan_base, int order)
 {
+	unsigned int nr_pages = 1 << order;
 	struct percpu_cluster *cluster;
 	struct swap_cluster_info *ci;
 	unsigned int tmp, max;
 
 new_cluster:
 	cluster = this_cpu_ptr(si->percpu_cluster);
-	tmp = cluster->next;
+	tmp = cluster->next[order];
 	if (tmp == SWAP_NEXT_INVALID) {
 		if (!cluster_list_empty(&si->free_clusters)) {
 			tmp = cluster_next(&si->free_clusters.head) *
@@ -647,26 +674,27 @@ new_cluster:
 
 	/*
 	 * Other CPUs can use our cluster if they can't find a free cluster,
-	 * check if there is still free entry in the cluster
+	 * check if there is still free entry in the cluster, maintaining
+	 * natural alignment.
 	 */
 	max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
 	if (tmp < max) {
 		ci = lock_cluster(si, tmp);
 		while (tmp < max) {
-			if (!si->swap_map[tmp])
+			if (swap_range_empty(si->swap_map, tmp, nr_pages))
 				break;
-			tmp++;
+			tmp += nr_pages;
 		}
 		unlock_cluster(ci);
 	}
 	if (tmp >= max) {
-		cluster->next = SWAP_NEXT_INVALID;
+		cluster->next[order] = SWAP_NEXT_INVALID;
 		goto new_cluster;
 	}
 	*offset = tmp;
 	*scan_base = tmp;
-	tmp += 1;
-	cluster->next = tmp < max ? tmp : SWAP_NEXT_INVALID;
+	tmp += nr_pages;
+	cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
 	return true;
 }
 
@@ -796,13 +824,14 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si,
 
 static int scan_swap_map_slots(struct swap_info_struct *si,
 			       unsigned char usage, int nr,
-			       swp_entry_t slots[])
+			       swp_entry_t slots[], int order)
 {
 	struct swap_cluster_info *ci;
 	unsigned long offset;
 	unsigned long scan_base;
 	unsigned long last_in_cluster = 0;
 	int latency_ration = LATENCY_LIMIT;
+	unsigned int nr_pages = 1 << order;
 	int n_ret = 0;
 	bool scanned_many = false;
 
@@ -817,6 +846,25 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 	 * And we let swap pages go all over an SSD partition.  Hugh
 	 */
 
+	if (order > 0) {
+		/*
+		 * Should not even be attempting large allocations when huge
+		 * page swap is disabled.  Warn and fail the allocation.
+		 */
+		if (!IS_ENABLED(CONFIG_THP_SWAP) ||
+		    nr_pages > SWAPFILE_CLUSTER) {
+			VM_WARN_ON_ONCE(1);
+			return 0;
+		}
+
+		/*
+		 * Swapfile is not block device or not using clusters so unable
+		 * to allocate large entries.
+		 */
+		if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
+			return 0;
+	}
+
 	si->flags += SWP_SCANNING;
 	/*
 	 * Use percpu scan base for SSD to reduce lock contention on
@@ -831,8 +879,11 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 
 	/* SSD algorithm */
 	if (si->cluster_info) {
-		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
+			if (order > 0)
+				goto no_page;
 			goto scan;
+		}
 	} else if (unlikely(!si->cluster_nr--)) {
 		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
@@ -874,13 +925,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
 
 checks:
 	if (si->cluster_info) {
-		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
+		while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
 		/* take a break if we already got some slots */
 			if (n_ret)
 				goto done;
 			if (!scan_swap_map_try_ssd_cluster(si, &offset,
-							&scan_base))
+							&scan_base, order)) {
+				if (order > 0)
+					goto no_page;
 				goto scan;
+			}
 		}
 	}
 	if (!(si->flags & SWP_WRITEOK))
@@ -911,11 +965,11 @@ checks:
 		else
 			goto done;
 	}
-	WRITE_ONCE(si->swap_map[offset], usage);
-	inc_cluster_info_page(si, si->cluster_info, offset);
+	memset(si->swap_map + offset, usage, nr_pages);
+	add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
 	unlock_cluster(ci);
 
-	swap_range_alloc(si, offset, 1);
+	swap_range_alloc(si, offset, nr_pages);
 	slots[n_ret++] = swp_entry(si->type, offset);
 
 	/* got enough slots or reach max slots? */
@@ -936,8 +990,10 @@ checks:
 
 	/* try to get more slots in cluster */
 	if (si->cluster_info) {
-		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
+		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
 			goto checks;
+		if (order > 0)
+			goto done;
 	} else if (si->cluster_nr && !si->swap_map[++offset]) {
 		/* non-ssd case, still more slots in cluster? */
 		--si->cluster_nr;
@@ -964,11 +1020,13 @@ checks:
 	}
 
 done:
-	set_cluster_next(si, offset + 1);
+	if (order == 0)
+		set_cluster_next(si, offset + 1);
 	si->flags -= SWP_SCANNING;
 	return n_ret;
 
 scan:
+	VM_WARN_ON(order > 0);
 	spin_unlock(&si->lock);
 	while (++offset <= READ_ONCE(si->highest_bit)) {
 		if (unlikely(--latency_ration < 0)) {
@@ -997,38 +1055,6 @@ no_page:
 	return n_ret;
 }
 
-static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
-{
-	unsigned long idx;
-	struct swap_cluster_info *ci;
-	unsigned long offset;
-
-	/*
-	 * Should not even be attempting cluster allocations when huge
-	 * page swap is disabled.  Warn and fail the allocation.
-	 */
-	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
-		VM_WARN_ON_ONCE(1);
-		return 0;
-	}
-
-	if (cluster_list_empty(&si->free_clusters))
-		return 0;
-
-	idx = cluster_list_first(&si->free_clusters);
-	offset = idx * SWAPFILE_CLUSTER;
-	ci = lock_cluster(si, offset);
-	alloc_cluster(si, idx);
-	cluster_set_count(ci, SWAPFILE_CLUSTER);
-
-	memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
-	unlock_cluster(ci);
-	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
-	*slot = swp_entry(si->type, offset);
-
-	return 1;
-}
-
 static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
 {
 	unsigned long offset = idx * SWAPFILE_CLUSTER;
@@ -1051,9 +1077,6 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 	int n_ret = 0;
 	int node;
 
-	/* Only single cluster request supported */
-	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
-
 	spin_lock(&swap_avail_lock);
 
 	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
@@ -1089,14 +1112,10 @@ start_over:
 			spin_unlock(&si->lock);
 			goto nextsi;
 		}
-		if (size == SWAPFILE_CLUSTER) {
-			if (si->flags & SWP_BLKDEV)
-				n_ret = swap_alloc_cluster(si, swp_entries);
-		} else
-			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-						    n_goal, swp_entries);
+		n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
+					    n_goal, swp_entries, order);
 		spin_unlock(&si->lock);
-		if (n_ret || size == SWAPFILE_CLUSTER)
+		if (n_ret || size > 1)
 			goto check_out;
 		cond_resched();
 
@@ -1673,7 +1692,7 @@ swp_entry_t get_swap_page_of_type(int type)
 
 	/* This is called for allocating swap entry, not cache */
 	spin_lock(&si->lock);
-	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry))
+	if ((si->flags & SWP_WRITEOK) && scan_swap_map_slots(si, 1, 1, &entry, 0))
 		atomic_long_dec(&nr_swap_pages);
 	spin_unlock(&si->lock);
 fail:
@@ -3127,7 +3146,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		p->flags |= SWP_SYNCHRONOUS_IO;
 
 	if (p->bdev && bdev_nonrot(p->bdev)) {
-		int cpu;
+		int cpu, i;
 		unsigned long ci, nr_cluster;
 
 		p->flags |= SWP_SOLIDSTATE;
@@ -3165,7 +3184,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			struct percpu_cluster *cluster;
 
 			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
-			cluster->next = SWAP_NEXT_INVALID;
+			for (i = 0; i < SWAP_NR_ORDERS; i++)
+				cluster->next[i] = SWAP_NEXT_INVALID;
 		}
 	} else {
 		atomic_inc(&nr_rotate_swap);
-- 
cgit v1.2.3


From 3931b871c4936c00c4e27c469056d8da47a3493f Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Mon, 8 Apr 2024 19:39:46 +0100
Subject: mm: madvise: avoid split during MADV_PAGEOUT and MADV_COLD

Rework madvise_cold_or_pageout_pte_range() to avoid splitting any large
folio that is fully and contiguously mapped in the pageout/cold vm range.
This change means that large folios will be maintained all the way to swap
storage.  This both improves performance during swap-out, by eliding the
cost of splitting the folio, and sets us up nicely for maintaining the
large folio when it is swapped back in (to be covered in a separate
series).

Folios that are not fully mapped in the target range are still split, but
note that behavior is changed so that if the split fails for any reason
(folio locked, shared, etc) we now leave it as is and move to the next pte
in the range and continue work on the proceeding folios.  Previously any
failure of this sort would cause the entire operation to give up and no
folios mapped at higher addresses were paged out or made cold.  Given
large folios are becoming more common, this old behavior would have likely
lead to wasted opportunities.

While we are at it, change the code that clears young from the ptes to use
ptep_test_and_clear_young(), via the new mkold_ptes() batch helper
function.  This is more efficent than get_and_clear/modify/set, especially
for contpte mappings on arm64, where the old approach would require
unfolding/refolding and the new approach can be done in place.

Link: https://lkml.kernel.org/r/20240408183946.2991168-8-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Reviewed-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Gao Xiang <xiang@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yu Zhao <yuzhao@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pgtable.h | 30 +++++++++++++++++
 mm/internal.h           | 12 +++++--
 mm/madvise.c            | 87 ++++++++++++++++++++++++++++---------------------
 mm/memory.c             |  4 +--
 4 files changed, 92 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 75096025fe52..e2f45e22a6d1 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -361,6 +361,36 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 }
 #endif
 
+#ifndef mkold_ptes
+/**
+ * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old.
+ * @vma: VMA the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to mark old.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_test_and_clear_young().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
+		pte_t *ptep, unsigned int nr)
+{
+	for (;;) {
+		ptep_test_and_clear_young(vma, addr, ptep);
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+#endif
+
 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
diff --git a/mm/internal.h b/mm/internal.h
index d34df04b11f6..640298a98a3c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -130,6 +130,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
  * @flags: Flags to modify the PTE batch semantics.
  * @any_writable: Optional pointer to indicate whether any entry except the
  *		  first one is writable.
+ * @any_young: Optional pointer to indicate whether any entry except the
+ *		  first one is young.
  *
  * Detect a PTE batch: consecutive (present) PTEs that map consecutive
  * pages of the same large folio.
@@ -145,16 +147,18 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
  */
 static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 		pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
-		bool *any_writable)
+		bool *any_writable, bool *any_young)
 {
 	unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
 	const pte_t *end_ptep = start_ptep + max_nr;
 	pte_t expected_pte, *ptep;
-	bool writable;
+	bool writable, young;
 	int nr;
 
 	if (any_writable)
 		*any_writable = false;
+	if (any_young)
+		*any_young = false;
 
 	VM_WARN_ON_FOLIO(!pte_present(pte), folio);
 	VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
@@ -168,6 +172,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 		pte = ptep_get(ptep);
 		if (any_writable)
 			writable = !!pte_write(pte);
+		if (any_young)
+			young = !!pte_young(pte);
 		pte = __pte_batch_clear_ignored(pte, flags);
 
 		if (!pte_same(pte, expected_pte))
@@ -183,6 +189,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
 
 		if (any_writable)
 			*any_writable |= writable;
+		if (any_young)
+			*any_young |= young;
 
 		nr = pte_batch_hint(ptep, pte);
 		expected_pte = pte_advance_pfn(expected_pte, nr);
diff --git a/mm/madvise.c b/mm/madvise.c
index 5011ecb24344..f59169888b8e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -336,6 +336,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	LIST_HEAD(folio_list);
 	bool pageout_anon_only_filter;
 	unsigned int batch_count = 0;
+	int nr;
 
 	if (fatal_signal_pending(current))
 		return -EINTR;
@@ -423,7 +424,8 @@ restart:
 		return 0;
 	flush_tlb_batched_pending(mm);
 	arch_enter_lazy_mmu_mode();
-	for (; addr < end; pte++, addr += PAGE_SIZE) {
+	for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
+		nr = 1;
 		ptent = ptep_get(pte);
 
 		if (++batch_count == SWAP_CLUSTER_MAX) {
@@ -447,55 +449,66 @@ restart:
 			continue;
 
 		/*
-		 * Creating a THP page is expensive so split it only if we
-		 * are sure it's worth. Split it if we are only owner.
+		 * If we encounter a large folio, only split it if it is not
+		 * fully mapped within the range we are operating on. Otherwise
+		 * leave it as is so that it can be swapped out whole. If we
+		 * fail to split a folio, leave it in place and advance to the
+		 * next pte in the range.
 		 */
 		if (folio_test_large(folio)) {
-			int err;
-
-			if (folio_likely_mapped_shared(folio))
-				break;
-			if (pageout_anon_only_filter && !folio_test_anon(folio))
-				break;
-			if (!folio_trylock(folio))
-				break;
-			folio_get(folio);
-			arch_leave_lazy_mmu_mode();
-			pte_unmap_unlock(start_pte, ptl);
-			start_pte = NULL;
-			err = split_folio(folio);
-			folio_unlock(folio);
-			folio_put(folio);
-			if (err)
-				break;
-			start_pte = pte =
-				pte_offset_map_lock(mm, pmd, addr, &ptl);
-			if (!start_pte)
-				break;
-			arch_enter_lazy_mmu_mode();
-			pte--;
-			addr -= PAGE_SIZE;
-			continue;
+			const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
+						FPB_IGNORE_SOFT_DIRTY;
+			int max_nr = (end - addr) / PAGE_SIZE;
+			bool any_young;
+
+			nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
+					     fpb_flags, NULL, &any_young);
+			if (any_young)
+				ptent = pte_mkyoung(ptent);
+
+			if (nr < folio_nr_pages(folio)) {
+				int err;
+
+				if (folio_likely_mapped_shared(folio))
+					continue;
+				if (pageout_anon_only_filter && !folio_test_anon(folio))
+					continue;
+				if (!folio_trylock(folio))
+					continue;
+				folio_get(folio);
+				arch_leave_lazy_mmu_mode();
+				pte_unmap_unlock(start_pte, ptl);
+				start_pte = NULL;
+				err = split_folio(folio);
+				folio_unlock(folio);
+				folio_put(folio);
+				start_pte = pte =
+					pte_offset_map_lock(mm, pmd, addr, &ptl);
+				if (!start_pte)
+					break;
+				arch_enter_lazy_mmu_mode();
+				if (!err)
+					nr = 0;
+				continue;
+			}
 		}
 
 		/*
 		 * Do not interfere with other mappings of this folio and
-		 * non-LRU folio.
+		 * non-LRU folio. If we have a large folio at this point, we
+		 * know it is fully mapped so if its mapcount is the same as its
+		 * number of pages, it must be exclusive.
 		 */
-		if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+		if (!folio_test_lru(folio) ||
+		    folio_mapcount(folio) != folio_nr_pages(folio))
 			continue;
 
 		if (pageout_anon_only_filter && !folio_test_anon(folio))
 			continue;
 
-		VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
 		if (!pageout && pte_young(ptent)) {
-			ptent = ptep_get_and_clear_full(mm, addr, pte,
-							tlb->fullmm);
-			ptent = pte_mkold(ptent);
-			set_pte_at(mm, addr, pte, ptent);
-			tlb_remove_tlb_entry(tlb, pte, addr);
+			mkold_ptes(vma, addr, pte, nr);
+			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 		}
 
 		/*
diff --git a/mm/memory.c b/mm/memory.c
index 7880400370c8..59c05dc8b18a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 			flags |= FPB_IGNORE_SOFT_DIRTY;
 
 		nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
-				     &any_writable);
+				     &any_writable, NULL);
 		folio_ref_add(folio, nr);
 		if (folio_test_anon(folio)) {
 			if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1559,7 +1559,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
 	 */
 	if (unlikely(folio_test_large(folio) && max_nr != 1)) {
 		nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
-				     NULL);
+				     NULL, NULL);
 
 		zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
 				       addr, details, rss, force_flush,
-- 
cgit v1.2.3


From 25176ad09ca395fcc83b1fc78adf25c8eb1bd964 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 2 Apr 2024 14:55:15 +0200
Subject: mm/treewide: rename CONFIG_HAVE_FAST_GUP to CONFIG_HAVE_GUP_FAST

Nowadays, we call it "GUP-fast", the external interface includes functions
like "get_user_pages_fast()", and we renamed all internal functions to
reflect that as well.

Let's make the config option reflect that.

Link: https://lkml.kernel.org/r/20240402125516.223131-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm/Kconfig       |  2 +-
 arch/arm64/Kconfig     |  2 +-
 arch/loongarch/Kconfig |  2 +-
 arch/mips/Kconfig      |  2 +-
 arch/powerpc/Kconfig   |  2 +-
 arch/riscv/Kconfig     |  2 +-
 arch/s390/Kconfig      |  2 +-
 arch/sh/Kconfig        |  2 +-
 arch/x86/Kconfig       |  2 +-
 include/linux/rmap.h   |  8 ++++----
 kernel/events/core.c   |  4 ++--
 mm/Kconfig             |  2 +-
 mm/gup.c               | 10 +++++-----
 mm/internal.h          |  2 +-
 14 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b14aed3a17ab..817918f6635a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -99,7 +99,7 @@ config ARM
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU
 	select HAVE_EXIT_THREAD
-	select HAVE_FAST_GUP if ARM_LPAE
+	select HAVE_GUP_FAST if ARM_LPAE
 	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b11c98b3e84..de076a191e9f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -205,7 +205,7 @@ config ARM64
 	select HAVE_SAMPLE_FTRACE_DIRECT
 	select HAVE_SAMPLE_FTRACE_DIRECT_MULTI
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select HAVE_FAST_GUP
+	select HAVE_GUP_FAST
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index a5f300ec6f28..cd642eefd9e5 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -119,7 +119,7 @@ config LOONGARCH
 	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !ARCH_STRICT_ALIGN
 	select HAVE_EXIT_THREAD
-	select HAVE_FAST_GUP
+	select HAVE_GUP_FAST
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUNCTION_ERROR_INJECTION
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 516dc7022bd7..f1aa1bf11166 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -68,7 +68,7 @@ config MIPS
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_EBPF_JIT if !CPU_MICROMIPS
 	select HAVE_EXIT_THREAD
-	select HAVE_FAST_GUP
+	select HAVE_GUP_FAST
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1c4be3373686..e42cc8cd415f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -236,7 +236,7 @@ config PPC
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS	if ARCH_USING_PATCHABLE_FUNCTION_ENTRY || MPROFILE_KERNEL || PPC32
 	select HAVE_EBPF_JIT
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select HAVE_FAST_GUP
+	select HAVE_GUP_FAST
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUNCTION_DESCRIPTORS	if PPC64_ELF_ABI_V1
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index be09c8836d56..3ee60ddef93e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -132,7 +132,7 @@ config RISCV
 	select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
 	select HAVE_EBPF_JIT if MMU
-	select HAVE_FAST_GUP if MMU
+	select HAVE_GUP_FAST if MMU
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_GCC_PLUGINS
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 8f01ada6845e..d9aed4c93ee6 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -174,7 +174,7 @@ config S390
 	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_EBPF_JIT if HAVE_MARCH_Z196_FEATURES
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
-	select HAVE_FAST_GUP
+	select HAVE_GUP_FAST
 	select HAVE_FENTRY
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_ARG_ACCESS_API
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2ad3e29f0ebe..7292542f75e8 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -38,7 +38,7 @@ config SUPERH
 	select HAVE_DEBUG_BUGVERBOSE
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FAST_GUP if MMU
+	select HAVE_GUP_FAST if MMU
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4474bf32d0a4..d95de5eab213 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -221,7 +221,7 @@ config X86
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_EISA
 	select HAVE_EXIT_THREAD
-	select HAVE_FAST_GUP
+	select HAVE_GUP_FAST
 	select HAVE_FENTRY			if X86_64 || DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_FUNCTION_GRAPH_RETVAL	if HAVE_FUNCTION_GRAPH_TRACER
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b7944a833668..9bf9324214fc 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -284,7 +284,7 @@ static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
 	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);
 
 	/* Paired with the memory barrier in try_grab_folio(). */
-	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
 		smp_mb();
 
 	if (unlikely(folio_maybe_dma_pinned(folio)))
@@ -295,7 +295,7 @@ static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
 	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
 	 * gup_must_unshare().
 	 */
-	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
 		smp_mb__after_atomic();
 	return 0;
 }
@@ -541,7 +541,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
 	 */
 
 	/* Paired with the memory barrier in try_grab_folio(). */
-	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
 		smp_mb();
 
 	if (unlikely(folio_maybe_dma_pinned(folio)))
@@ -552,7 +552,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
 	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
 	 * gup_must_unshare().
 	 */
-	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
 		smp_mb__after_atomic();
 	return 0;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 724e6d7e128f..c5a0dc1f135f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7539,7 +7539,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
 {
 	u64 size = 0;
 
-#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_HAVE_GUP_FAST
 	pgd_t *pgdp, pgd;
 	p4d_t *p4dp, p4d;
 	pud_t *pudp, pud;
@@ -7587,7 +7587,7 @@ again:
 	if (pte_present(pte))
 		size = pte_leaf_size(pte);
 	pte_unmap(ptep);
-#endif /* CONFIG_HAVE_FAST_GUP */
+#endif /* CONFIG_HAVE_GUP_FAST */
 
 	return size;
 }
diff --git a/mm/Kconfig b/mm/Kconfig
index f0ed3168db00..50df323eaece 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -473,7 +473,7 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 config HAVE_MEMBLOCK_PHYS_MAP
 	bool
 
-config HAVE_FAST_GUP
+config HAVE_GUP_FAST
 	depends on MMU
 	bool
 
diff --git a/mm/gup.c b/mm/gup.c
index 7609efc55f1c..8dcbeae714e2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -501,7 +501,7 @@ static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
 
 #ifdef CONFIG_MMU
 
-#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_FAST_GUP)
+#if defined(CONFIG_ARCH_HAS_HUGEPD) || defined(CONFIG_HAVE_GUP_FAST)
 static int record_subpages(struct page *page, unsigned long sz,
 			   unsigned long addr, unsigned long end,
 			   struct page **pages)
@@ -515,7 +515,7 @@ static int record_subpages(struct page *page, unsigned long sz,
 
 	return nr;
 }
-#endif	/* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_FAST_GUP */
+#endif	/* CONFIG_ARCH_HAS_HUGEPD || CONFIG_HAVE_GUP_FAST */
 
 #ifdef CONFIG_ARCH_HAS_HUGEPD
 static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
@@ -2782,7 +2782,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  *
  * This code is based heavily on the PowerPC implementation by Nick Piggin.
  */
-#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_HAVE_GUP_FAST
 
 /*
  * Used in the GUP-fast path to determine whether GUP is permitted to work on
@@ -3361,7 +3361,7 @@ static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
 		unsigned int flags, struct page **pages, int *nr)
 {
 }
-#endif /* CONFIG_HAVE_FAST_GUP */
+#endif /* CONFIG_HAVE_GUP_FAST */
 
 #ifndef gup_fast_permitted
 /*
@@ -3381,7 +3381,7 @@ static unsigned long gup_fast(unsigned long start, unsigned long end,
 	int nr_pinned = 0;
 	unsigned seq;
 
-	if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
+	if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
 	    !gup_fast_permitted(start, end))
 		return 0;
 
diff --git a/mm/internal.h b/mm/internal.h
index 640298a98a3c..8fd41f889a95 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1265,7 +1265,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
 	}
 
 	/* Paired with a memory barrier in folio_try_share_anon_rmap_*(). */
-	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
+	if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
 		smp_rmb();
 
 	/*
-- 
cgit v1.2.3


From 7edea4c6fdf23754c77582a0377791e1aa9d2700 Mon Sep 17 00:00:00 2001
From: Jinjiang Tu <tujinjiang@huawei.com>
Date: Tue, 2 Apr 2024 10:49:34 +0800
Subject: mm/ksm: remove redundant code in ksm_fork

Since commit 3c6f33b7273a ("mm/ksm: support fork/exec for prctl"), when a
child process is forked, the MMF_VM_MERGE_ANY flag will be inherited in
mm_init(). So, it's unnecessary to set the flag in ksm_fork().

Link: https://lkml.kernel.org/r/20240402024934.1093361-1-tujinjiang@huawei.com
Signed-off-by: Jinjiang Tu <tujinjiang@huawei.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Nanyong Sun <sunnanyong@huawei.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Stefan Roesch <shr@devkernel.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e2b1de3996a..358803cfd4d5 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -45,16 +45,8 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	int ret;
-
-	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) {
-		ret = __ksm_enter(mm);
-		if (ret)
-			return ret;
-	}
-
-	if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
-		set_bit(MMF_VM_MERGE_ANY, &mm->flags);
+	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+		return __ksm_enter(mm);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2542b1ac9a46ac58f9565de0048457956898d481 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 5 Apr 2024 16:32:25 +0100
Subject: mm: inline destroy_large_folio() into __folio_put_large()

destroy_large_folio() has only one caller, move its contents there.

Link: https://lkml.kernel.org/r/20240405153228.2563754-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  2 --
 mm/page_alloc.c    | 14 --------------
 mm/swap.c          | 13 ++++++++++---
 3 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4f4e460d7853..cc9410d772dd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1318,8 +1318,6 @@ void folio_copy(struct folio *dst, struct folio *src);
 
 unsigned long nr_free_buffer_pages(void);
 
-void destroy_large_folio(struct folio *folio);
-
 /* Returns the number of bytes in this potentially compound page. */
 static inline unsigned long page_size(struct page *page)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c01757251de2..22e8b9f1d710 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -565,20 +565,6 @@ void prep_compound_page(struct page *page, unsigned int order)
 	prep_compound_head(page, order);
 }
 
-void destroy_large_folio(struct folio *folio)
-{
-	if (folio_test_hugetlb(folio)) {
-		free_huge_folio(folio);
-		return;
-	}
-
-	if (folio_test_large_rmappable(folio))
-		folio_undo_large_rmappable(folio);
-
-	mem_cgroup_uncharge(folio);
-	free_unref_page(&folio->page, folio_order(folio));
-}
-
 static inline void set_buddy_order(struct page *page, unsigned int order)
 {
 	set_page_private(page, order);
diff --git a/mm/swap.c b/mm/swap.c
index d7db3cd4e80a..6e3bd03673ea 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -127,9 +127,16 @@ static void __folio_put_large(struct folio *folio)
 	 * (it's never listed to any LRU lists) and no memcg routines should
 	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
 	 */
-	if (!folio_test_hugetlb(folio))
-		page_cache_release(folio);
-	destroy_large_folio(folio);
+	if (folio_test_hugetlb(folio)) {
+		free_huge_folio(folio);
+		return;
+	}
+
+	page_cache_release(folio);
+	if (folio_test_large_rmappable(folio))
+		folio_undo_large_rmappable(folio);
+	mem_cgroup_uncharge(folio);
+	free_unref_page(&folio->page, folio_order(folio));
 }
 
 void __folio_put(struct folio *folio)
-- 
cgit v1.2.3


From 72801513b2bfb6bf571956a604d38f98ce9aacd9 Mon Sep 17 00:00:00 2001
From: Baolin Wang <baolin.wang@linux.alibaba.com>
Date: Fri, 5 Apr 2024 20:24:16 +0800
Subject: mm: set pageblock_order to HPAGE_PMD_ORDER in case with
 !CONFIG_HUGETLB_PAGE but THP enabled

As Vlastimil suggested in previous discussion[1], it doesn't make sense to
set pageblock_order as MAX_PAGE_ORDER when hugetlbfs is not enabled and
THP is enabled.  Instead, it should be set to HPAGE_PMD_ORDER.

[1] https://lore.kernel.org/all/76457ec5-d789-449b-b8ca-dcb6ceb12445@suse.cz/
Link: https://lkml.kernel.org/r/3d57d253070035bdc0f6d6e5681ce1ed0e1934f7.1712286863.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pageblock-flags.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 3f2409b968ec..547e82cdc89a 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -28,7 +28,7 @@ enum pageblock_bits {
 	NR_PAGEBLOCK_BITS
 };
 
-#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE)
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
@@ -45,7 +45,11 @@ extern unsigned int pageblock_order;
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
-#else /* CONFIG_HUGETLB_PAGE */
+#elif defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
+#define pageblock_order		min_t(unsigned int, HPAGE_PMD_ORDER, MAX_PAGE_ORDER)
+
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */
 #define pageblock_order		MAX_PAGE_ORDER
-- 
cgit v1.2.3


From 3429055f0451cd3a281f8ed6691335ead626b136 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 20 Mar 2024 11:18:49 +0100
Subject: mm: kmsan: implement kmsan_memmove()

Provide a hook that can be used by custom memcpy implementations to tell
KMSAN that the metadata needs to be copied.  Without that, false positive
reports are possible in the cases where KMSAN fails to intercept memory
initialization.

Link: https://lore.kernel.org/all/3b7dbd88-0861-4638-b2d2-911c97a4cadf@I-love.SAKURA.ne.jp/
Link: https://lkml.kernel.org/r/20240320101851.2589698-1-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kmsan-checks.h | 15 +++++++++++++++
 mm/kmsan/hooks.c             | 11 +++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include')

diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h
index c4cae333deec..e1082dc40abc 100644
--- a/include/linux/kmsan-checks.h
+++ b/include/linux/kmsan-checks.h
@@ -61,6 +61,17 @@ void kmsan_check_memory(const void *address, size_t size);
 void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
 			size_t left);
 
+/**
+ * kmsan_memmove() - Notify KMSAN about a data copy within kernel.
+ * @to:   destination address in the kernel.
+ * @from: source address in the kernel.
+ * @size: number of bytes to copy.
+ *
+ * Invoked after non-instrumented version (e.g. implemented using assembly
+ * code) of memmove()/memcpy() is called, in order to copy KMSAN's metadata.
+ */
+void kmsan_memmove(void *to, const void *from, size_t to_copy);
+
 #else
 
 static inline void kmsan_poison_memory(const void *address, size_t size,
@@ -78,6 +89,10 @@ static inline void kmsan_copy_to_user(void __user *to, const void *from,
 {
 }
 
+static inline void kmsan_memmove(void *to, const void *from, size_t to_copy)
+{
+}
+
 #endif
 
 #endif /* _LINUX_KMSAN_CHECKS_H */
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 0b09daa188ef..22e8657800ef 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -285,6 +285,17 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy,
 }
 EXPORT_SYMBOL(kmsan_copy_to_user);
 
+void kmsan_memmove(void *to, const void *from, size_t size)
+{
+	if (!kmsan_enabled || kmsan_in_runtime())
+		return;
+
+	kmsan_enter_runtime();
+	kmsan_internal_memmove_metadata(to, (void *)from, size);
+	kmsan_leave_runtime();
+}
+EXPORT_SYMBOL(kmsan_memmove);
+
 /* Helper function to check an URB. */
 void kmsan_handle_urb(const struct urb *urb, bool is_out)
 {
-- 
cgit v1.2.3


From 922621a6828430ea3119b869336157d253489334 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Wed, 20 Mar 2024 11:18:50 +0100
Subject: instrumented.h: add instrument_memcpy_before, instrument_memcpy_after

Bug detection tools based on compiler instrumentation may miss memory
accesses in custom memcpy implementations (such as copy_mc_to_kernel).
Provide instrumentation hooks that tell KASAN, KCSAN, and KMSAN about such
accesses.

Link: https://lore.kernel.org/all/3b7dbd88-0861-4638-b2d2-911c97a4cadf@I-love.SAKURA.ne.jp/
Link: https://lkml.kernel.org/r/20240320101851.2589698-2-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/instrumented.h | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'include')

diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h
index 1b608e00290a..711a1f0d1a73 100644
--- a/include/linux/instrumented.h
+++ b/include/linux/instrumented.h
@@ -147,6 +147,41 @@ instrument_copy_from_user_after(const void *to, const void __user *from,
 	kmsan_unpoison_memory(to, n - left);
 }
 
+/**
+ * instrument_memcpy_before - add instrumentation before non-instrumented memcpy
+ * @to: destination address
+ * @from: source address
+ * @n: number of bytes to copy
+ *
+ * Instrument memory accesses that happen in custom memcpy implementations. The
+ * instrumentation should be inserted before the memcpy call.
+ */
+static __always_inline void instrument_memcpy_before(void *to, const void *from,
+						     unsigned long n)
+{
+	kasan_check_write(to, n);
+	kasan_check_read(from, n);
+	kcsan_check_write(to, n);
+	kcsan_check_read(from, n);
+}
+
+/**
+ * instrument_memcpy_after - add instrumentation after non-instrumented memcpy
+ * @to: destination address
+ * @from: source address
+ * @n: number of bytes to copy
+ * @left: number of bytes not copied (if known)
+ *
+ * Instrument memory accesses that happen in custom memcpy implementations. The
+ * instrumentation should be inserted after the memcpy call.
+ */
+static __always_inline void instrument_memcpy_after(void *to, const void *from,
+						    unsigned long n,
+						    unsigned long left)
+{
+	kmsan_memmove(to, from, n - left);
+}
+
 /**
  * instrument_get_user() - add instrumentation to get_user()-like macros
  * @to: destination variable, may not be address-taken
-- 
cgit v1.2.3


From 58fbfecab965014b6e3cc956a76b4a96265a1add Mon Sep 17 00:00:00 2001
From: Paul Davey <paul.davey@alliedtelesis.co.nz>
Date: Tue, 23 Apr 2024 18:00:24 +1200
Subject: xfrm: Preserve vlan tags for transport mode software GRO

The software GRO path for esp transport mode uses skb_mac_header_rebuild
prior to re-injecting the packet via the xfrm_napi_dev.  This only
copies skb->mac_len bytes of header which may not be sufficient if the
packet contains 802.1Q tags or other VLAN tags.  Worse copying only the
initial header will leave a packet marked as being VLAN tagged but
without the corresponding tag leading to mangling when it is later
untagged.

The VLAN tags are important when receiving the decrypted esp transport
mode packet after GRO processing to ensure it is received on the correct
interface.

Therefore record the full mac header length in xfrm*_transport_input for
later use in corresponding xfrm*_transport_finish to copy the entire mac
header when rebuilding the mac header for GRO.  The skb->data pointer is
left pointing skb->mac_header bytes after the start of the mac header as
is expected by the network stack and network and transport header
offsets reset to this location.

Fixes: 7785bba299a8 ("esp: Add a software GRO codepath")
Signed-off-by: Paul Davey <paul.davey@alliedtelesis.co.nz>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/skbuff.h | 15 +++++++++++++++
 include/net/xfrm.h     |  3 +++
 net/ipv4/xfrm4_input.c |  6 +++++-
 net/ipv6/xfrm6_input.c |  6 +++++-
 net/xfrm/xfrm_input.c  |  8 ++++++++
 5 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9d24aec064e8..4ff48eda3f64 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3031,6 +3031,21 @@ static inline void skb_mac_header_rebuild(struct sk_buff *skb)
 	}
 }
 
+/* Move the full mac header up to current network_header.
+ * Leaves skb->data pointing at offset skb->mac_len into the mac_header.
+ * Must be provided the complete mac header length.
+ */
+static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len)
+{
+	if (skb_mac_header_was_set(skb)) {
+		const unsigned char *old_mac = skb_mac_header(skb);
+
+		skb_set_mac_header(skb, -full_mac_len);
+		memmove(skb_mac_header(skb), old_mac, full_mac_len);
+		__skb_push(skb, full_mac_len - skb->mac_len);
+	}
+}
+
 static inline int skb_checksum_start_offset(const struct sk_buff *skb)
 {
 	return skb->csum_start - skb_headroom(skb);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 57c743b7e4fe..cb4841a9fffd 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1049,6 +1049,9 @@ struct xfrm_offload {
 #define CRYPTO_INVALID_PACKET_SYNTAX		64
 #define CRYPTO_INVALID_PROTOCOL			128
 
+	/* Used to keep whole l2 header for transport mode GRO */
+	__u32			orig_mac_len;
+
 	__u8			proto;
 	__u8			inner_ipproto;
 };
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index dae35101d189..86382e08140e 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -63,7 +63,11 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
 	ip_send_check(iph);
 
 	if (xo && (xo->flags & XFRM_GRO)) {
-		skb_mac_header_rebuild(skb);
+		/* The full l2 header needs to be preserved so that re-injecting the packet at l2
+		 * works correctly in the presence of vlan tags.
+		 */
+		skb_mac_header_rebuild_full(skb, xo->orig_mac_len);
+		skb_reset_network_header(skb);
 		skb_reset_transport_header(skb);
 		return 0;
 	}
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index a17d783dc7c0..c6b8e132e10a 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -58,7 +58,11 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 	skb_postpush_rcsum(skb, skb_network_header(skb), nhlen);
 
 	if (xo && (xo->flags & XFRM_GRO)) {
-		skb_mac_header_rebuild(skb);
+		/* The full l2 header needs to be preserved so that re-injecting the packet at l2
+		 * works correctly in the presence of vlan tags.
+		 */
+		skb_mac_header_rebuild_full(skb, xo->orig_mac_len);
+		skb_reset_network_header(skb);
 		skb_reset_transport_header(skb);
 		return 0;
 	}
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 161f535c8b94..3a2982a72a6b 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -389,11 +389,15 @@ static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb)
  */
 static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 {
+	struct xfrm_offload *xo = xfrm_offload(skb);
 	int ihl = skb->data - skb_transport_header(skb);
 
 	if (skb->transport_header != skb->network_header) {
 		memmove(skb_transport_header(skb),
 			skb_network_header(skb), ihl);
+		if (xo)
+			xo->orig_mac_len =
+				skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0;
 		skb->network_header = skb->transport_header;
 	}
 	ip_hdr(skb)->tot_len = htons(skb->len + ihl);
@@ -404,11 +408,15 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_IPV6)
+	struct xfrm_offload *xo = xfrm_offload(skb);
 	int ihl = skb->data - skb_transport_header(skb);
 
 	if (skb->transport_header != skb->network_header) {
 		memmove(skb_transport_header(skb),
 			skb_network_header(skb), ihl);
+		if (xo)
+			xo->orig_mac_len =
+				skb_mac_header_was_set(skb) ? skb_mac_header_len(skb) : 0;
 		skb->network_header = skb->transport_header;
 	}
 	ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
-- 
cgit v1.2.3


From 63a7cd660246aa36af263b85c33ecc6601bf04be Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 10 Apr 2024 21:16:34 +0200
Subject: mmc: core: Add mmc_gpiod_set_cd_config() function

Some mmc host drivers may need to fixup a card-detection GPIO's config
to e.g. enable the GPIO controllers builtin pull-up resistor on devices
where the firmware description of the GPIO is broken (e.g. GpioInt with
PullNone instead of PullUp in ACPI DSDT).

Since this is the exception rather then the rule adding a config
parameter to mmc_gpiod_request_cd() seems undesirable, so instead
add a new mmc_gpiod_set_cd_config() function. This is simply a wrapper
to call gpiod_set_config() on the card-detect GPIO acquired through
mmc_gpiod_request_cd().

Reviewed-by: Andy Shevchenko <andy@kernel.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Acked-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20240410191639.526324-2-hdegoede@redhat.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/slot-gpio.c  | 20 ++++++++++++++++++++
 include/linux/mmc/slot-gpio.h |  1 +
 2 files changed, 21 insertions(+)

(limited to 'include')

diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c
index 39f45c2b6de8..8791656e9e20 100644
--- a/drivers/mmc/core/slot-gpio.c
+++ b/drivers/mmc/core/slot-gpio.c
@@ -221,6 +221,26 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id,
 }
 EXPORT_SYMBOL(mmc_gpiod_request_cd);
 
+/**
+ * mmc_gpiod_set_cd_config - set config for card-detection GPIO
+ * @host: mmc host
+ * @config: Generic pinconf config (from pinconf_to_config_packed())
+ *
+ * This can be used by mmc host drivers to fixup a card-detection GPIO's config
+ * (e.g. set PIN_CONFIG_BIAS_PULL_UP) after acquiring the GPIO descriptor
+ * through mmc_gpiod_request_cd().
+ *
+ * Returns:
+ * 0 on success, or a negative errno value on error.
+ */
+int mmc_gpiod_set_cd_config(struct mmc_host *host, unsigned long config)
+{
+	struct mmc_gpio *ctx = host->slot.handler_priv;
+
+	return gpiod_set_config(ctx->cd_gpio, config);
+}
+EXPORT_SYMBOL(mmc_gpiod_set_cd_config);
+
 bool mmc_can_gpio_cd(struct mmc_host *host)
 {
 	struct mmc_gpio *ctx = host->slot.handler_priv;
diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h
index 5d3d15e97868..66272fdce43d 100644
--- a/include/linux/mmc/slot-gpio.h
+++ b/include/linux/mmc/slot-gpio.h
@@ -21,6 +21,7 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id,
 			 unsigned int debounce);
 int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id,
 			 unsigned int idx, unsigned int debounce);
+int mmc_gpiod_set_cd_config(struct mmc_host *host, unsigned long config);
 void mmc_gpio_set_cd_isr(struct mmc_host *host,
 			 irqreturn_t (*isr)(int irq, void *dev_id));
 int mmc_gpio_set_cd_wake(struct mmc_host *host, bool on);
-- 
cgit v1.2.3


From 6faaea2c70a5fbb05e460473698848f449bdb9ee Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 10 Apr 2024 22:56:18 +0300
Subject: mmc: slot-gpio: Use irq_handler_t type

The irq_handler_t is already defined globally, let's use it
in slot-gpio code.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Alexander Stein <alexander.stein@ew.tq-group.com>
Link: https://lore.kernel.org/r/20240410195618.1632778-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/mmc/core/slot-gpio.c  | 5 ++---
 include/linux/mmc/slot-gpio.h | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/mmc/core/slot-gpio.c b/drivers/mmc/core/slot-gpio.c
index 8791656e9e20..12247219e1c2 100644
--- a/drivers/mmc/core/slot-gpio.c
+++ b/drivers/mmc/core/slot-gpio.c
@@ -19,7 +19,7 @@
 struct mmc_gpio {
 	struct gpio_desc *ro_gpio;
 	struct gpio_desc *cd_gpio;
-	irqreturn_t (*cd_gpio_isr)(int irq, void *dev_id);
+	irq_handler_t cd_gpio_isr;
 	char *ro_label;
 	char *cd_label;
 	u32 cd_debounce_delay_ms;
@@ -162,8 +162,7 @@ EXPORT_SYMBOL(mmc_gpio_set_cd_wake);
 /* Register an alternate interrupt service routine for
  * the card-detect GPIO.
  */
-void mmc_gpio_set_cd_isr(struct mmc_host *host,
-			 irqreturn_t (*isr)(int irq, void *dev_id))
+void mmc_gpio_set_cd_isr(struct mmc_host *host, irq_handler_t isr)
 {
 	struct mmc_gpio *ctx = host->slot.handler_priv;
 
diff --git a/include/linux/mmc/slot-gpio.h b/include/linux/mmc/slot-gpio.h
index 66272fdce43d..274a2767ea49 100644
--- a/include/linux/mmc/slot-gpio.h
+++ b/include/linux/mmc/slot-gpio.h
@@ -8,8 +8,8 @@
 #ifndef MMC_SLOT_GPIO_H
 #define MMC_SLOT_GPIO_H
 
+#include <linux/interrupt.h>
 #include <linux/types.h>
-#include <linux/irqreturn.h>
 
 struct mmc_host;
 
@@ -22,8 +22,7 @@ int mmc_gpiod_request_cd(struct mmc_host *host, const char *con_id,
 int mmc_gpiod_request_ro(struct mmc_host *host, const char *con_id,
 			 unsigned int idx, unsigned int debounce);
 int mmc_gpiod_set_cd_config(struct mmc_host *host, unsigned long config);
-void mmc_gpio_set_cd_isr(struct mmc_host *host,
-			 irqreturn_t (*isr)(int irq, void *dev_id));
+void mmc_gpio_set_cd_isr(struct mmc_host *host, irq_handler_t isr);
 int mmc_gpio_set_cd_wake(struct mmc_host *host, bool on);
 void mmc_gpiod_request_cd_irq(struct mmc_host *host);
 bool mmc_can_gpio_cd(struct mmc_host *host);
-- 
cgit v1.2.3


From a8625217a054ca058cf74742e58c2c88460eb202 Mon Sep 17 00:00:00 2001
From: Atish Patra <atishp@rivosinc.com>
Date: Sat, 20 Apr 2024 08:17:25 -0700
Subject: drivers/perf: riscv: Implement SBI PMU snapshot function

SBI v2.0 SBI introduced PMU snapshot feature which adds the following
features.

1. Read counter values directly from the shared memory instead of
csr read.
2. Start multiple counters with initial values with one SBI call.

These functionalities optimizes the number of traps to the higher
privilege mode. If the kernel is in VS mode while the hypervisor
deploy trap & emulate method, this would minimize all the hpmcounter
CSR read traps. If the kernel is running in S-mode, the benefits
reduced to CSR latency vs DRAM/cache latency as there is no trap
involved while accessing the hpmcounter CSRs.

In both modes, it does saves the number of ecalls while starting
multiple counter together with an initial values. This is a likely
scenario if multiple counters overflow at the same time.

Acked-by: Palmer Dabbelt <palmer@rivosinc.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Samuel Holland <samuel.holland@sifive.com>
Signed-off-by: Atish Patra <atishp@rivosinc.com>
Link: https://lore.kernel.org/r/20240420151741.962500-10-atishp@rivosinc.com
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 drivers/perf/riscv_pmu.c       |   3 +-
 drivers/perf/riscv_pmu_sbi.c   | 274 ++++++++++++++++++++++++++++++++++++++---
 include/linux/perf/riscv_pmu.h |   8 ++
 3 files changed, 264 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c
index b4efdddb2ad9..78c490e0505a 100644
--- a/drivers/perf/riscv_pmu.c
+++ b/drivers/perf/riscv_pmu.c
@@ -191,8 +191,6 @@ void riscv_pmu_stop(struct perf_event *event, int flags)
 	struct hw_perf_event *hwc = &event->hw;
 	struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu);
 
-	WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
-
 	if (!(hwc->state & PERF_HES_STOPPED)) {
 		if (rvpmu->ctr_stop) {
 			rvpmu->ctr_stop(event, 0);
@@ -408,6 +406,7 @@ struct riscv_pmu *riscv_pmu_alloc(void)
 		cpuc->n_events = 0;
 		for (i = 0; i < RISCV_MAX_COUNTERS; i++)
 			cpuc->events[i] = NULL;
+		cpuc->snapshot_addr = NULL;
 	}
 	pmu->pmu = (struct pmu) {
 		.event_init	= riscv_pmu_event_init,
diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c
index 4eacd89141a9..5d699b06dcb6 100644
--- a/drivers/perf/riscv_pmu_sbi.c
+++ b/drivers/perf/riscv_pmu_sbi.c
@@ -58,6 +58,9 @@ PMU_FORMAT_ATTR(event, "config:0-47");
 PMU_FORMAT_ATTR(firmware, "config:63");
 
 static bool sbi_v2_available;
+static DEFINE_STATIC_KEY_FALSE(sbi_pmu_snapshot_available);
+#define sbi_pmu_snapshot_available() \
+	static_branch_unlikely(&sbi_pmu_snapshot_available)
 
 static struct attribute *riscv_arch_formats_attr[] = {
 	&format_attr_event.attr,
@@ -508,14 +511,106 @@ static int pmu_sbi_event_map(struct perf_event *event, u64 *econfig)
 	return ret;
 }
 
+static void pmu_sbi_snapshot_free(struct riscv_pmu *pmu)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct cpu_hw_events *cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+
+		if (!cpu_hw_evt->snapshot_addr)
+			continue;
+
+		free_page((unsigned long)cpu_hw_evt->snapshot_addr);
+		cpu_hw_evt->snapshot_addr = NULL;
+		cpu_hw_evt->snapshot_addr_phys = 0;
+	}
+}
+
+static int pmu_sbi_snapshot_alloc(struct riscv_pmu *pmu)
+{
+	int cpu;
+	struct page *snapshot_page;
+
+	for_each_possible_cpu(cpu) {
+		struct cpu_hw_events *cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+
+		snapshot_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+		if (!snapshot_page) {
+			pmu_sbi_snapshot_free(pmu);
+			return -ENOMEM;
+		}
+		cpu_hw_evt->snapshot_addr = page_to_virt(snapshot_page);
+		cpu_hw_evt->snapshot_addr_phys = page_to_phys(snapshot_page);
+	}
+
+	return 0;
+}
+
+static int pmu_sbi_snapshot_disable(void)
+{
+	struct sbiret ret;
+
+	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM, SBI_SHMEM_DISABLE,
+			SBI_SHMEM_DISABLE, 0, 0, 0, 0);
+	if (ret.error) {
+		pr_warn("failed to disable snapshot shared memory\n");
+		return sbi_err_map_linux_errno(ret.error);
+	}
+
+	return 0;
+}
+
+static int pmu_sbi_snapshot_setup(struct riscv_pmu *pmu, int cpu)
+{
+	struct cpu_hw_events *cpu_hw_evt;
+	struct sbiret ret = {0};
+
+	cpu_hw_evt = per_cpu_ptr(pmu->hw_events, cpu);
+	if (!cpu_hw_evt->snapshot_addr_phys)
+		return -EINVAL;
+
+	if (cpu_hw_evt->snapshot_set_done)
+		return 0;
+
+	if (IS_ENABLED(CONFIG_32BIT))
+		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
+				cpu_hw_evt->snapshot_addr_phys,
+				(u64)(cpu_hw_evt->snapshot_addr_phys) >> 32, 0, 0, 0, 0);
+	else
+		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_SNAPSHOT_SET_SHMEM,
+				cpu_hw_evt->snapshot_addr_phys, 0, 0, 0, 0, 0);
+
+	/* Free up the snapshot area memory and fall back to SBI PMU calls without snapshot */
+	if (ret.error) {
+		if (ret.error != SBI_ERR_NOT_SUPPORTED)
+			pr_warn("pmu snapshot setup failed with error %ld\n", ret.error);
+		return sbi_err_map_linux_errno(ret.error);
+	}
+
+	memset(cpu_hw_evt->snapshot_cval_shcopy, 0, sizeof(u64) * RISCV_MAX_COUNTERS);
+	cpu_hw_evt->snapshot_set_done = true;
+
+	return 0;
+}
+
 static u64 pmu_sbi_ctr_read(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
 	struct sbiret ret;
 	u64 val = 0;
+	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
+	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
 	union sbi_pmu_ctr_info info = pmu_ctr_list[idx];
 
+	/* Read the value from the shared memory directly only if counter is stopped */
+	if (sbi_pmu_snapshot_available() && (hwc->state & PERF_HES_STOPPED)) {
+		val = sdata->ctr_values[idx];
+		return val;
+	}
+
 	if (pmu_sbi_is_fw_event(event)) {
 		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_FW_READ,
 				hwc->idx, 0, 0, 0, 0, 0);
@@ -565,6 +660,7 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival)
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
 
+	/* There is no benefit setting SNAPSHOT FLAG for a single counter */
 #if defined(CONFIG_32BIT)
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, hwc->idx,
 			1, flag, ival, ival >> 32, 0);
@@ -585,16 +681,36 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag)
 {
 	struct sbiret ret;
 	struct hw_perf_event *hwc = &event->hw;
+	struct riscv_pmu *pmu = to_riscv_pmu(event->pmu);
+	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
 
 	if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) &&
 	    (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		pmu_sbi_reset_scounteren((void *)event);
 
+	if (sbi_pmu_snapshot_available())
+		flag |= SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
+
 	ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0);
-	if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
-		flag != SBI_PMU_STOP_FLAG_RESET)
+	if (!ret.error && sbi_pmu_snapshot_available()) {
+		/*
+		 * The counter snapshot is based on the index base specified by hwc->idx.
+		 * The actual counter value is updated in shared memory at index 0 when counter
+		 * mask is 0x01. To ensure accurate counter values, it's necessary to transfer
+		 * the counter value to shared memory. However, if hwc->idx is zero, the counter
+		 * value is already correctly updated in shared memory, requiring no further
+		 * adjustment.
+		 */
+		if (hwc->idx > 0) {
+			sdata->ctr_values[hwc->idx] = sdata->ctr_values[0];
+			sdata->ctr_values[0] = 0;
+		}
+	} else if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) &&
+		flag != SBI_PMU_STOP_FLAG_RESET) {
 		pr_err("Stopping counter idx %d failed with error %d\n",
 			hwc->idx, sbi_err_map_linux_errno(ret.error));
+	}
 }
 
 static int pmu_sbi_find_num_ctrs(void)
@@ -652,12 +768,39 @@ static inline void pmu_sbi_stop_all(struct riscv_pmu *pmu)
 static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
 {
 	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
-	int i;
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
+	unsigned long flag = 0;
+	int i, idx;
+	struct sbiret ret;
+	u64 temp_ctr_overflow_mask = 0;
 
-	for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++)
+	if (sbi_pmu_snapshot_available())
+		flag = SBI_PMU_STOP_FLAG_TAKE_SNAPSHOT;
+
+	/* Reset the shadow copy to avoid save/restore any value from previous overflow */
+	memset(cpu_hw_evt->snapshot_cval_shcopy, 0, sizeof(u64) * RISCV_MAX_COUNTERS);
+
+	for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) {
 		/* No need to check the error here as we can't do anything about the error */
-		sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, i * BITS_PER_LONG,
-			  cpu_hw_evt->used_hw_ctrs[i], 0, 0, 0, 0);
+		ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, i * BITS_PER_LONG,
+				cpu_hw_evt->used_hw_ctrs[i], flag, 0, 0, 0);
+		if (!ret.error && sbi_pmu_snapshot_available()) {
+			/* Save the counter values to avoid clobbering */
+			for_each_set_bit(idx, &cpu_hw_evt->used_hw_ctrs[i], BITS_PER_LONG)
+				cpu_hw_evt->snapshot_cval_shcopy[i * BITS_PER_LONG + idx] =
+							sdata->ctr_values[idx];
+			/* Save the overflow mask to avoid clobbering */
+			temp_ctr_overflow_mask |= sdata->ctr_overflow_mask << (i * BITS_PER_LONG);
+		}
+	}
+
+	/* Restore the counter values to the shared memory for used hw counters */
+	if (sbi_pmu_snapshot_available()) {
+		for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS)
+			sdata->ctr_values[idx] = cpu_hw_evt->snapshot_cval_shcopy[idx];
+		if (temp_ctr_overflow_mask)
+			sdata->ctr_overflow_mask = temp_ctr_overflow_mask;
+	}
 }
 
 /*
@@ -666,11 +809,10 @@ static inline void pmu_sbi_stop_hw_ctrs(struct riscv_pmu *pmu)
  * while the overflowed counters need to be started with updated initialization
  * value.
  */
-static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
-					       unsigned long ctr_ovf_mask)
+static inline void pmu_sbi_start_ovf_ctrs_sbi(struct cpu_hw_events *cpu_hw_evt,
+					      u64 ctr_ovf_mask)
 {
 	int idx = 0, i;
-	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
 	struct perf_event *event;
 	unsigned long flag = SBI_PMU_START_FLAG_SET_INIT_VALUE;
 	unsigned long ctr_start_mask = 0;
@@ -706,6 +848,52 @@ static inline void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
 	}
 }
 
+static inline void pmu_sbi_start_ovf_ctrs_snapshot(struct cpu_hw_events *cpu_hw_evt,
+						   u64 ctr_ovf_mask)
+{
+	int i, idx = 0;
+	struct perf_event *event;
+	unsigned long flag = SBI_PMU_START_FLAG_INIT_SNAPSHOT;
+	u64 max_period, init_val = 0;
+	struct hw_perf_event *hwc;
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
+
+	for_each_set_bit(idx, cpu_hw_evt->used_hw_ctrs, RISCV_MAX_COUNTERS) {
+		if (ctr_ovf_mask & BIT(idx)) {
+			event = cpu_hw_evt->events[idx];
+			hwc = &event->hw;
+			max_period = riscv_pmu_ctr_get_width_mask(event);
+			init_val = local64_read(&hwc->prev_count) & max_period;
+			cpu_hw_evt->snapshot_cval_shcopy[idx] = init_val;
+		}
+		/*
+		 * We do not need to update the non-overflow counters the previous
+		 * value should have been there already.
+		 */
+	}
+
+	for (i = 0; i < BITS_TO_LONGS(RISCV_MAX_COUNTERS); i++) {
+		/* Restore the counter values to relative indices for used hw counters */
+		for_each_set_bit(idx, &cpu_hw_evt->used_hw_ctrs[i], BITS_PER_LONG)
+			sdata->ctr_values[idx] =
+					cpu_hw_evt->snapshot_cval_shcopy[idx + i * BITS_PER_LONG];
+		/* Start all the counters in a single shot */
+		sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_START, idx * BITS_PER_LONG,
+			  cpu_hw_evt->used_hw_ctrs[i], flag, 0, 0, 0);
+	}
+}
+
+static void pmu_sbi_start_overflow_mask(struct riscv_pmu *pmu,
+					u64 ctr_ovf_mask)
+{
+	struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events);
+
+	if (sbi_pmu_snapshot_available())
+		pmu_sbi_start_ovf_ctrs_snapshot(cpu_hw_evt, ctr_ovf_mask);
+	else
+		pmu_sbi_start_ovf_ctrs_sbi(cpu_hw_evt, ctr_ovf_mask);
+}
+
 static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 {
 	struct perf_sample_data data;
@@ -715,10 +903,11 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 	int lidx, hidx, fidx;
 	struct riscv_pmu *pmu;
 	struct perf_event *event;
-	unsigned long overflow;
-	unsigned long overflowed_ctrs = 0;
+	u64 overflow;
+	u64 overflowed_ctrs = 0;
 	struct cpu_hw_events *cpu_hw_evt = dev;
 	u64 start_clock = sched_clock();
+	struct riscv_pmu_snapshot_data *sdata = cpu_hw_evt->snapshot_addr;
 
 	if (WARN_ON_ONCE(!cpu_hw_evt))
 		return IRQ_NONE;
@@ -740,7 +929,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 	pmu_sbi_stop_hw_ctrs(pmu);
 
 	/* Overflow status register should only be read after counter are stopped */
-	ALT_SBI_PMU_OVERFLOW(overflow);
+	if (sbi_pmu_snapshot_available())
+		overflow = sdata->ctr_overflow_mask;
+	else
+		ALT_SBI_PMU_OVERFLOW(overflow);
 
 	/*
 	 * Overflow interrupt pending bit should only be cleared after stopping
@@ -766,9 +958,14 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 		if (!info || info->type != SBI_PMU_CTR_TYPE_HW)
 			continue;
 
-		/* compute hardware counter index */
-		hidx = info->csr - CSR_CYCLE;
-		/* check if the corresponding bit is set in sscountovf */
+		if (sbi_pmu_snapshot_available())
+			/* SBI implementation already updated the logical indicies */
+			hidx = lidx;
+		else
+			/* compute hardware counter index */
+			hidx = info->csr - CSR_CYCLE;
+
+		/* check if the corresponding bit is set in sscountovf or overflow mask in shmem */
 		if (!(overflow & BIT(hidx)))
 			continue;
 
@@ -778,7 +975,10 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 		 */
 		overflowed_ctrs |= BIT(lidx);
 		hw_evt = &event->hw;
+		/* Update the event states here so that we know the state while reading */
+		hw_evt->state |= PERF_HES_STOPPED;
 		riscv_pmu_event_update(event);
+		hw_evt->state |= PERF_HES_UPTODATE;
 		perf_sample_data_init(&data, 0, hw_evt->last_period);
 		if (riscv_pmu_event_set_period(event)) {
 			/*
@@ -791,6 +991,8 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 			 */
 			perf_event_overflow(event, &data, regs);
 		}
+		/* Reset the state as we are going to start the counter after the loop */
+		hw_evt->state = 0;
 	}
 
 	pmu_sbi_start_overflow_mask(pmu, overflowed_ctrs);
@@ -822,6 +1024,9 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
 		enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
 	}
 
+	if (sbi_pmu_snapshot_available())
+		return pmu_sbi_snapshot_setup(pmu, cpu);
+
 	return 0;
 }
 
@@ -834,6 +1039,9 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
 	/* Disable all counters access for user mode now */
 	csr_write(CSR_SCOUNTEREN, 0x0);
 
+	if (sbi_pmu_snapshot_available())
+		return pmu_sbi_snapshot_disable();
+
 	return 0;
 }
 
@@ -942,6 +1150,12 @@ static inline void riscv_pm_pmu_unregister(struct riscv_pmu *pmu) { }
 
 static void riscv_pmu_destroy(struct riscv_pmu *pmu)
 {
+	if (sbi_v2_available) {
+		if (sbi_pmu_snapshot_available()) {
+			pmu_sbi_snapshot_disable();
+			pmu_sbi_snapshot_free(pmu);
+		}
+	}
 	riscv_pm_pmu_unregister(pmu);
 	cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
 }
@@ -1109,10 +1323,6 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
 	pmu->event_unmapped = pmu_sbi_event_unmapped;
 	pmu->csr_index = pmu_sbi_csr_index;
 
-	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
-	if (ret)
-		return ret;
-
 	ret = riscv_pm_pmu_register(pmu);
 	if (ret)
 		goto out_unregister;
@@ -1121,8 +1331,34 @@ static int pmu_sbi_device_probe(struct platform_device *pdev)
 	if (ret)
 		goto out_unregister;
 
+	/* SBI PMU Snapsphot is only available in SBI v2.0 */
+	if (sbi_v2_available) {
+		ret = pmu_sbi_snapshot_alloc(pmu);
+		if (ret)
+			goto out_unregister;
+
+		ret = pmu_sbi_snapshot_setup(pmu, smp_processor_id());
+		if (ret) {
+			/* Snapshot is an optional feature. Continue if not available */
+			pmu_sbi_snapshot_free(pmu);
+		} else {
+			pr_info("SBI PMU snapshot detected\n");
+			/*
+			 * We enable it once here for the boot cpu. If snapshot shmem setup
+			 * fails during cpu hotplug process, it will fail to start the cpu
+			 * as we can not handle hetergenous PMUs with different snapshot
+			 * capability.
+			 */
+			static_branch_enable(&sbi_pmu_snapshot_available);
+		}
+	}
+
 	register_sysctl("kernel", sbi_pmu_sysctl_table);
 
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node);
+	if (ret)
+		goto out_unregister;
+
 	return 0;
 
 out_unregister:
diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h
index 43282e22ebe1..701974639ff2 100644
--- a/include/linux/perf/riscv_pmu.h
+++ b/include/linux/perf/riscv_pmu.h
@@ -39,6 +39,14 @@ struct cpu_hw_events {
 	DECLARE_BITMAP(used_hw_ctrs, RISCV_MAX_COUNTERS);
 	/* currently enabled firmware counters */
 	DECLARE_BITMAP(used_fw_ctrs, RISCV_MAX_COUNTERS);
+	/* The virtual address of the shared memory where counter snapshot will be taken */
+	void *snapshot_addr;
+	/* The physical address of the shared memory where counter snapshot will be taken */
+	phys_addr_t snapshot_addr_phys;
+	/* Boolean flag to indicate setup is already done */
+	bool snapshot_set_done;
+	/* A shadow copy of the counter values to avoid clobbering during multiple SBI calls */
+	u64 snapshot_cval_shcopy[RISCV_MAX_COUNTERS];
 };
 
 struct riscv_pmu {
-- 
cgit v1.2.3


From 55394d29c9e164f2e1991352f1dc8f973c4f1589 Mon Sep 17 00:00:00 2001
From: Dawid Osuchowski <linux@osuchow.ski>
Date: Fri, 26 Apr 2024 09:58:54 +0200
Subject: fs: Create anon_inode_getfile_fmode()

Creates an anon_inode_getfile_fmode() function that works similarly to
anon_inode_getfile() with the addition of being able to set the fmode
member.

Signed-off-by: Dawid Osuchowski <linux@osuchow.ski>
Link: https://lore.kernel.org/r/20240426075854.4723-1-linux@osuchow.ski
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/anon_inodes.c            | 33 +++++++++++++++++++++++++++++++++
 include/linux/anon_inodes.h |  5 +++++
 2 files changed, 38 insertions(+)

(limited to 'include')

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 0496cb5b6eab..42bd1cb7c9cd 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -148,6 +148,38 @@ struct file *anon_inode_getfile(const char *name,
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfile);
 
+/**
+ * anon_inode_getfile_fmode - creates a new file instance by hooking it up to an
+ *                      anonymous inode, and a dentry that describe the "class"
+ *                      of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ * @f_mode:  [in]    fmode
+ *
+ * Creates a new file by hooking it on a single inode. This is useful for files
+ * that do not need to have a full-fledged inode in order to operate correctly.
+ * All the files created with anon_inode_getfile() will share a single inode,
+ * hence saving memory and avoiding code duplication for the file/inode/dentry
+ * setup. Allows setting the fmode. Returns the newly created file* or an error
+ * pointer.
+ */
+struct file *anon_inode_getfile_fmode(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags, fmode_t f_mode)
+{
+	struct file *file;
+
+	file = __anon_inode_getfile(name, fops, priv, flags, NULL, false);
+	if (!IS_ERR(file))
+		file->f_mode |= f_mode;
+
+	return file;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode);
+
 /**
  * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
  *                             !S_PRIVATE anon inode rather than reuse the
@@ -271,6 +303,7 @@ int anon_inode_create_getfd(const char *name, const struct file_operations *fops
 	return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
 }
 
+
 static int __init anon_inode_init(void)
 {
 	anon_inode_mnt = kern_mount(&anon_inode_fs_type);
diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h
index 93a5f16d03f3..edef565c2a1a 100644
--- a/include/linux/anon_inodes.h
+++ b/include/linux/anon_inodes.h
@@ -9,12 +9,17 @@
 #ifndef _LINUX_ANON_INODES_H
 #define _LINUX_ANON_INODES_H
 
+#include <linux/types.h>
+
 struct file_operations;
 struct inode;
 
 struct file *anon_inode_getfile(const char *name,
 				const struct file_operations *fops,
 				void *priv, int flags);
+struct file *anon_inode_getfile_fmode(const char *name,
+				const struct file_operations *fops,
+				void *priv, int flags, fmode_t f_mode);
 struct file *anon_inode_create_getfile(const char *name,
 				       const struct file_operations *fops,
 				       void *priv, int flags,
-- 
cgit v1.2.3


From af0cb3fa3f9ed258d14abab0152e28a0f9593084 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 18 Apr 2024 15:50:11 +0200
Subject: net/sched: fix false lockdep warning on qdisc root lock

Xiumei and Christoph reported the following lockdep splat, complaining of
the qdisc root lock being taken twice:

 ============================================
 WARNING: possible recursive locking detected
 6.7.0-rc3+ #598 Not tainted
 --------------------------------------------
 swapper/2/0 is trying to acquire lock:
 ffff888177190110 (&sch->q.lock){+.-.}-{2:2}, at: __dev_queue_xmit+0x1560/0x2e70

 but task is already holding lock:
 ffff88811995a110 (&sch->q.lock){+.-.}-{2:2}, at: __dev_queue_xmit+0x1560/0x2e70

 other info that might help us debug this:
  Possible unsafe locking scenario:

        CPU0
        ----
   lock(&sch->q.lock);
   lock(&sch->q.lock);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 5 locks held by swapper/2/0:
  #0: ffff888135a09d98 ((&in_dev->mr_ifc_timer)){+.-.}-{0:0}, at: call_timer_fn+0x11a/0x510
  #1: ffffffffaaee5260 (rcu_read_lock){....}-{1:2}, at: ip_finish_output2+0x2c0/0x1ed0
  #2: ffffffffaaee5200 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x209/0x2e70
  #3: ffff88811995a110 (&sch->q.lock){+.-.}-{2:2}, at: __dev_queue_xmit+0x1560/0x2e70
  #4: ffffffffaaee5200 (rcu_read_lock_bh){....}-{1:2}, at: __dev_queue_xmit+0x209/0x2e70

 stack backtrace:
 CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.7.0-rc3+ #598
 Hardware name: Red Hat KVM, BIOS 1.13.0-2.module+el8.3.0+7353+9de0a3cc 04/01/2014
 Call Trace:
  <IRQ>
  dump_stack_lvl+0x4a/0x80
  __lock_acquire+0xfdd/0x3150
  lock_acquire+0x1ca/0x540
  _raw_spin_lock+0x34/0x80
  __dev_queue_xmit+0x1560/0x2e70
  tcf_mirred_act+0x82e/0x1260 [act_mirred]
  tcf_action_exec+0x161/0x480
  tcf_classify+0x689/0x1170
  prio_enqueue+0x316/0x660 [sch_prio]
  dev_qdisc_enqueue+0x46/0x220
  __dev_queue_xmit+0x1615/0x2e70
  ip_finish_output2+0x1218/0x1ed0
  __ip_finish_output+0x8b3/0x1350
  ip_output+0x163/0x4e0
  igmp_ifc_timer_expire+0x44b/0x930
  call_timer_fn+0x1a2/0x510
  run_timer_softirq+0x54d/0x11a0
  __do_softirq+0x1b3/0x88f
  irq_exit_rcu+0x18f/0x1e0
  sysvec_apic_timer_interrupt+0x6f/0x90
  </IRQ>

This happens when TC does a mirred egress redirect from the root qdisc of
device A to the root qdisc of device B. As long as these two locks aren't
protecting the same qdisc, they can be acquired in chain: add a per-qdisc
lockdep key to silence false warnings.
This dynamic key should safely replace the static key we have in sch_htb:
it was added to allow enqueueing to the device "direct qdisc" while still
holding the qdisc root lock.

v2: don't use static keys anymore in HTB direct qdiscs (thanks Eric Dumazet)

CC: Maxim Mikityanskiy <maxim@isovalent.com>
CC: Xiumei Mu <xmu@redhat.com>
Reported-by: Christoph Paasch <cpaasch@apple.com>
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/451
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Link: https://lore.kernel.org/r/7dc06d6158f72053cf877a82e2a7a5bd23692faa.1713448007.git.dcaratti@redhat.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/sch_generic.h |  1 +
 net/sched/sch_generic.c   |  3 +++
 net/sched/sch_htb.c       | 22 +++-------------------
 3 files changed, 7 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f561dfb79743..79edd5b5e3c9 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -128,6 +128,7 @@ struct Qdisc {
 
 	struct rcu_head		rcu;
 	netdevice_tracker	dev_tracker;
+	struct lock_class_key	root_lock_key;
 	/* private data */
 	long privdata[] ____cacheline_aligned;
 };
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 4a2c763e2d11..31dfd6c7405b 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -945,7 +945,9 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 	__skb_queue_head_init(&sch->gso_skb);
 	__skb_queue_head_init(&sch->skb_bad_txq);
 	gnet_stats_basic_sync_init(&sch->bstats);
+	lockdep_register_key(&sch->root_lock_key);
 	spin_lock_init(&sch->q.lock);
+	lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
 
 	if (ops->static_flags & TCQ_F_CPUSTATS) {
 		sch->cpu_bstats =
@@ -1068,6 +1070,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
 	if (ops->destroy)
 		ops->destroy(qdisc);
 
+	lockdep_unregister_key(&qdisc->root_lock_key);
 	module_put(ops->owner);
 	netdev_put(dev, &qdisc->dev_tracker);
 
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 93e6fb56f3b5..ff3de37874e4 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1039,13 +1039,6 @@ static void htb_work_func(struct work_struct *work)
 	rcu_read_unlock();
 }
 
-static void htb_set_lockdep_class_child(struct Qdisc *q)
-{
-	static struct lock_class_key child_key;
-
-	lockdep_set_class(qdisc_lock(q), &child_key);
-}
-
 static int htb_offload(struct net_device *dev, struct tc_htb_qopt_offload *opt)
 {
 	return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_HTB, opt);
@@ -1132,7 +1125,6 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt,
 			return -ENOMEM;
 		}
 
-		htb_set_lockdep_class_child(qdisc);
 		q->direct_qdiscs[ntx] = qdisc;
 		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
 	}
@@ -1468,7 +1460,6 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 	}
 
 	if (q->offload) {
-		htb_set_lockdep_class_child(new);
 		/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
 		qdisc_refcount_inc(new);
 		old_q = htb_graft_helper(dev_queue, new);
@@ -1733,11 +1724,8 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg,
 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
 					  cl->parent->common.classid,
 					  NULL);
-		if (q->offload) {
-			if (new_q)
-				htb_set_lockdep_class_child(new_q);
+		if (q->offload)
 			htb_parent_to_leaf_offload(sch, dev_queue, new_q);
-		}
 	}
 
 	sch_tree_lock(sch);
@@ -1947,13 +1935,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 		new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops,
 					  classid, NULL);
 		if (q->offload) {
-			if (new_q) {
-				htb_set_lockdep_class_child(new_q);
-				/* One ref for cl->leaf.q, the other for
-				 * dev_queue->qdisc.
-				 */
+			/* One ref for cl->leaf.q, the other for dev_queue->qdisc. */
+			if (new_q)
 				qdisc_refcount_inc(new_q);
-			}
 			old_q = htb_graft_helper(dev_queue, new_q);
 			/* No qdisc_put needed. */
 			WARN_ON(!(old_q->flags & TCQ_F_BUILTIN));
-- 
cgit v1.2.3


From 01474b70a779319db6d3d2d67a7232a7b4202029 Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 18 Apr 2024 11:24:45 -0400
Subject: crypto: ecdh - Initialize ctx->private_key in proper byte order

The private key in ctx->private_key is currently initialized in reverse
byte order in ecdh_set_secret and whenever the key is needed in proper
byte order the variable priv is introduced and the bytes from
ctx->private_key are copied into priv while being byte-swapped
(ecc_swap_digits). To get rid of the unnecessary byte swapping initialize
ctx->private_key in proper byte order and clean up all functions that were
previously using priv or were called with ctx->private_key:

- ecc_gen_privkey: Directly initialize the passed ctx->private_key with
  random bytes filling all the digits of the private key. Get rid of the
  priv variable. This function only has ecdh_set_secret as a caller to
  create NIST P192/256/384 private keys.

- crypto_ecdh_shared_secret: Called only from ecdh_compute_value with
  ctx->private_key. Get rid of the priv variable and work with the passed
  private_key directly.

- ecc_make_pub_key: Called only from ecdh_compute_value with
  ctx->private_key. Get rid of the priv variable and work with the passed
  private_key directly.

Cc: Salvatore Benedetto <salvatore.benedetto@intel.com>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Acked-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc.c                  | 29 ++++++++++-------------------
 crypto/ecdh.c                 |  8 +++-----
 include/crypto/internal/ecc.h |  3 ++-
 3 files changed, 15 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/crypto/ecc.c b/crypto/ecc.c
index 2e05387b9499..c1d2e884be1e 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -1497,10 +1497,10 @@ EXPORT_SYMBOL(ecc_is_key_valid);
  * This method generates a private key uniformly distributed in the range
  * [2, n-3].
  */
-int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey)
+int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits,
+		    u64 *private_key)
 {
 	const struct ecc_curve *curve = ecc_get_curve(curve_id);
-	u64 priv[ECC_MAX_DIGITS];
 	unsigned int nbytes = ndigits << ECC_DIGITS_TO_BYTES_SHIFT;
 	unsigned int nbits = vli_num_bits(curve->n, ndigits);
 	int err;
@@ -1509,7 +1509,7 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey)
 	 * Step 1 & 2: check that N is included in Table 1 of FIPS 186-5,
 	 * section 6.1.1.
 	 */
-	if (nbits < 224 || ndigits > ARRAY_SIZE(priv))
+	if (nbits < 224)
 		return -EINVAL;
 
 	/*
@@ -1527,17 +1527,16 @@ int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey)
 		return -EFAULT;
 
 	/* Step 3: obtain N returned_bits from the DRBG. */
-	err = crypto_rng_get_bytes(crypto_default_rng, (u8 *)priv, nbytes);
+	err = crypto_rng_get_bytes(crypto_default_rng,
+				   (u8 *)private_key, nbytes);
 	crypto_put_default_rng();
 	if (err)
 		return err;
 
 	/* Step 4: make sure the private key is in the valid range. */
-	if (__ecc_is_key_valid(curve, priv, ndigits))
+	if (__ecc_is_key_valid(curve, private_key, ndigits))
 		return -EINVAL;
 
-	ecc_swap_digits(priv, privkey, ndigits);
-
 	return 0;
 }
 EXPORT_SYMBOL(ecc_gen_privkey);
@@ -1547,23 +1546,20 @@ int ecc_make_pub_key(unsigned int curve_id, unsigned int ndigits,
 {
 	int ret = 0;
 	struct ecc_point *pk;
-	u64 priv[ECC_MAX_DIGITS];
 	const struct ecc_curve *curve = ecc_get_curve(curve_id);
 
-	if (!private_key || ndigits > ARRAY_SIZE(priv)) {
+	if (!private_key) {
 		ret = -EINVAL;
 		goto out;
 	}
 
-	ecc_swap_digits(private_key, priv, ndigits);
-
 	pk = ecc_alloc_point(ndigits);
 	if (!pk) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	ecc_point_mult(pk, &curve->g, priv, NULL, curve, ndigits);
+	ecc_point_mult(pk, &curve->g, private_key, NULL, curve, ndigits);
 
 	/* SP800-56A rev 3 5.6.2.1.3 key check */
 	if (ecc_is_pubkey_valid_full(curve, pk)) {
@@ -1647,13 +1643,11 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits,
 {
 	int ret = 0;
 	struct ecc_point *product, *pk;
-	u64 priv[ECC_MAX_DIGITS];
 	u64 rand_z[ECC_MAX_DIGITS];
 	unsigned int nbytes;
 	const struct ecc_curve *curve = ecc_get_curve(curve_id);
 
-	if (!private_key || !public_key ||
-	    ndigits > ARRAY_SIZE(priv) || ndigits > ARRAY_SIZE(rand_z)) {
+	if (!private_key || !public_key || ndigits > ARRAY_SIZE(rand_z)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1674,15 +1668,13 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits,
 	if (ret)
 		goto err_alloc_product;
 
-	ecc_swap_digits(private_key, priv, ndigits);
-
 	product = ecc_alloc_point(ndigits);
 	if (!product) {
 		ret = -ENOMEM;
 		goto err_alloc_product;
 	}
 
-	ecc_point_mult(product, pk, priv, rand_z, curve, ndigits);
+	ecc_point_mult(product, pk, private_key, rand_z, curve, ndigits);
 
 	if (ecc_point_is_zero(product)) {
 		ret = -EFAULT;
@@ -1692,7 +1684,6 @@ int crypto_ecdh_shared_secret(unsigned int curve_id, unsigned int ndigits,
 	ecc_swap_digits(product->x, secret, ndigits);
 
 err_validity:
-	memzero_explicit(priv, sizeof(priv));
 	memzero_explicit(rand_z, sizeof(rand_z));
 	ecc_free_point(product);
 err_alloc_product:
diff --git a/crypto/ecdh.c b/crypto/ecdh.c
index c02c9a2b9682..72cfd1590156 100644
--- a/crypto/ecdh.c
+++ b/crypto/ecdh.c
@@ -27,7 +27,6 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf,
 			   unsigned int len)
 {
 	struct ecdh_ctx *ctx = ecdh_get_ctx(tfm);
-	u64 priv[ECC_MAX_DIGITS];
 	struct ecdh params;
 	int ret = 0;
 
@@ -41,15 +40,14 @@ static int ecdh_set_secret(struct crypto_kpp *tfm, const void *buf,
 		return ecc_gen_privkey(ctx->curve_id, ctx->ndigits,
 				       ctx->private_key);
 
-	memcpy(ctx->private_key, params.key, params.key_size);
-	ecc_swap_digits(ctx->private_key, priv, ctx->ndigits);
+	ecc_digits_from_bytes(params.key, params.key_size,
+			      ctx->private_key, ctx->ndigits);
 
 	if (ecc_is_key_valid(ctx->curve_id, ctx->ndigits,
-			     priv, params.key_size) < 0) {
+			     ctx->private_key, params.key_size) < 0) {
 		memzero_explicit(ctx->private_key, params.key_size);
 		ret = -EINVAL;
 	}
-	memzero_explicit(priv, sizeof(priv));
 
 	return ret;
 }
diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h
index 4e2f5f938e91..7ca1f463d1ec 100644
--- a/include/crypto/internal/ecc.h
+++ b/include/crypto/internal/ecc.h
@@ -103,7 +103,8 @@ int ecc_is_key_valid(unsigned int curve_id, unsigned int ndigits,
  * Returns 0 if the private key was generated successfully, a negative value
  * if an error occurred.
  */
-int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits, u64 *privkey);
+int ecc_gen_privkey(unsigned int curve_id, unsigned int ndigits,
+		    u64 *private_key);
 
 /**
  * ecc_make_pub_key() - Compute an ECC public key
-- 
cgit v1.2.3


From d74169ceb0d2e32438946a2f1f9fc8c803304bd6 Mon Sep 17 00:00:00 2001
From: Dimitri Sivanich <sivanich@hpe.com>
Date: Wed, 24 Apr 2024 15:16:29 +0800
Subject: iommu/vt-d: Allocate DMAR fault interrupts locally

The Intel IOMMU code currently tries to allocate all DMAR fault interrupt
vectors on the boot cpu.  On large systems with high DMAR counts this
results in vector exhaustion, and most of the vectors are not initially
allocated socket local.

Instead, have a cpu on each node do the vector allocation for the DMARs on
that node.  The boot cpu still does the allocation for its node during its
boot sequence.

Signed-off-by: Dimitri Sivanich <sivanich@hpe.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/Zfydpp2Hm+as16TY@hpe.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/amd_iommu.h | 2 +-
 drivers/iommu/amd/init.c      | 2 +-
 drivers/iommu/intel/dmar.c    | 9 +++++++--
 drivers/iommu/irq_remapping.c | 5 ++++-
 drivers/iommu/irq_remapping.h | 2 +-
 include/linux/dmar.h          | 2 +-
 6 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index f482aab420f7..410c360e7e24 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -33,7 +33,7 @@ int amd_iommu_prepare(void);
 int amd_iommu_enable(void);
 void amd_iommu_disable(void);
 int amd_iommu_reenable(int mode);
-int amd_iommu_enable_faulting(void);
+int amd_iommu_enable_faulting(unsigned int cpu);
 extern int amd_iommu_guest_ir;
 extern enum io_pgtable_fmt amd_iommu_pgtable;
 extern int amd_iommu_gpt_level;
diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index ac6754a85f35..8085e13e0100 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -3392,7 +3392,7 @@ int amd_iommu_reenable(int mode)
 	return 0;
 }
 
-int __init amd_iommu_enable_faulting(void)
+int __init amd_iommu_enable_faulting(unsigned int cpu)
 {
 	/* We enable MSI later when PCI is initialized */
 	return 0;
diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
index 351be9455214..932e0c10c0fe 100644
--- a/drivers/iommu/intel/dmar.c
+++ b/drivers/iommu/intel/dmar.c
@@ -2121,7 +2121,7 @@ int dmar_set_interrupt(struct intel_iommu *iommu)
 	return ret;
 }
 
-int __init enable_drhd_fault_handling(void)
+int enable_drhd_fault_handling(unsigned int cpu)
 {
 	struct dmar_drhd_unit *drhd;
 	struct intel_iommu *iommu;
@@ -2131,7 +2131,12 @@ int __init enable_drhd_fault_handling(void)
 	 */
 	for_each_iommu(iommu, drhd) {
 		u32 fault_status;
-		int ret = dmar_set_interrupt(iommu);
+		int ret;
+
+		if (iommu->irq || iommu->node != cpu_to_node(cpu))
+			continue;
+
+		ret = dmar_set_interrupt(iommu);
 
 		if (ret) {
 			pr_err("DRHD %Lx: failed to enable fault, interrupt, ret %d\n",
diff --git a/drivers/iommu/irq_remapping.c b/drivers/iommu/irq_remapping.c
index ee59647c2050..2f7281ccc05f 100644
--- a/drivers/iommu/irq_remapping.c
+++ b/drivers/iommu/irq_remapping.c
@@ -151,7 +151,10 @@ int __init irq_remap_enable_fault_handling(void)
 	if (!remap_ops->enable_faulting)
 		return -ENODEV;
 
-	return remap_ops->enable_faulting();
+	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "dmar:enable_fault_handling",
+			  remap_ops->enable_faulting, NULL);
+
+	return remap_ops->enable_faulting(smp_processor_id());
 }
 
 void panic_if_irq_remap(const char *msg)
diff --git a/drivers/iommu/irq_remapping.h b/drivers/iommu/irq_remapping.h
index 8c89cb947cdb..0d6f140b5e01 100644
--- a/drivers/iommu/irq_remapping.h
+++ b/drivers/iommu/irq_remapping.h
@@ -41,7 +41,7 @@ struct irq_remap_ops {
 	int  (*reenable)(int);
 
 	/* Enable fault handling */
-	int  (*enable_faulting)(void);
+	int  (*enable_faulting)(unsigned int);
 };
 
 extern struct irq_remap_ops intel_irq_remap_ops;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e34b601b71fd..499bb2c63483 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -117,7 +117,7 @@ extern int dmar_remove_dev_scope(struct dmar_pci_notify_info *info,
 				 int count);
 /* Intel IOMMU detection */
 void detect_intel_iommu(void);
-extern int enable_drhd_fault_handling(void);
+extern int enable_drhd_fault_handling(unsigned int cpu);
 extern int dmar_device_add(acpi_handle handle);
 extern int dmar_device_remove(acpi_handle handle);
 
-- 
cgit v1.2.3


From 621b7e54f288c5e4e32d1dd81a926b8ecb547c60 Mon Sep 17 00:00:00 2001
From: Jingqi Liu <Jingqi.liu@intel.com>
Date: Wed, 24 Apr 2024 15:16:31 +0800
Subject: iommu/vt-d: Remove private data use in fault message

According to Intel VT-d specification revision 4.0, "Private Data"
field has been removed from Page Request/Response.

Since the private data field is not used in fault message, remove the
related definitions in page request descriptor and remove the related
code in page request/response handler, as Intel hasn't shipped any
products which support private data in the page request message.

Signed-off-by: Jingqi Liu <Jingqi.liu@intel.com>
Link: https://lore.kernel.org/r/20240308103811.76744-3-Jingqi.liu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/iommu.h |  1 -
 drivers/iommu/intel/svm.c   | 75 +++++++++------------------------------------
 include/linux/iommu.h       |  3 +-
 3 files changed, 16 insertions(+), 63 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 404d2476a877..9ee326f7bf62 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -455,7 +455,6 @@ enum {
 
 /* Page group response descriptor QW0 */
 #define QI_PGRP_PASID_P(p)	(((u64)(p)) << 4)
-#define QI_PGRP_PDP(p)		(((u64)(p)) << 5)
 #define QI_PGRP_RESP_CODE(res)	(((u64)(res)) << 12)
 #define QI_PGRP_DID(rid)	(((u64)(rid)) << 16)
 #define QI_PGRP_PASID(pasid)	(((u64)(pasid)) << 32)
diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index e014350db354..e05c6c4cb8c3 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -418,8 +418,7 @@ struct page_req_dsc {
 		struct {
 			u64 type:8;
 			u64 pasid_present:1;
-			u64 priv_data_present:1;
-			u64 rsvd:6;
+			u64 rsvd:7;
 			u64 rid:16;
 			u64 pasid:20;
 			u64 exe_req:1;
@@ -438,7 +437,8 @@ struct page_req_dsc {
 		};
 		u64 qw_1;
 	};
-	u64 priv_data[2];
+	u64 qw_2;
+	u64 qw_3;
 };
 
 static bool is_canonical_address(u64 addr)
@@ -572,18 +572,6 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
 		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
 		event.fault.prm.flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
 	}
-	if (desc->priv_data_present) {
-		/*
-		 * Set last page in group bit if private data is present,
-		 * page response is required as it does for LPIG.
-		 * iommu_report_device_fault() doesn't understand this vendor
-		 * specific requirement thus we set last_page as a workaround.
-		 */
-		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
-		event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
-		event.fault.prm.private_data[0] = desc->priv_data[0];
-		event.fault.prm.private_data[1] = desc->priv_data[1];
-	}
 
 	iommu_report_device_fault(dev, &event);
 }
@@ -591,39 +579,23 @@ static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev,
 static void handle_bad_prq_event(struct intel_iommu *iommu,
 				 struct page_req_dsc *req, int result)
 {
-	struct qi_desc desc;
+	struct qi_desc desc = { };
 
 	pr_err("%s: Invalid page request: %08llx %08llx\n",
 	       iommu->name, ((unsigned long long *)req)[0],
 	       ((unsigned long long *)req)[1]);
 
-	/*
-	 * Per VT-d spec. v3.0 ch7.7, system software must
-	 * respond with page group response if private data
-	 * is present (PDP) or last page in group (LPIG) bit
-	 * is set. This is an additional VT-d feature beyond
-	 * PCI ATS spec.
-	 */
-	if (!req->lpig && !req->priv_data_present)
+	if (!req->lpig)
 		return;
 
 	desc.qw0 = QI_PGRP_PASID(req->pasid) |
 			QI_PGRP_DID(req->rid) |
 			QI_PGRP_PASID_P(req->pasid_present) |
-			QI_PGRP_PDP(req->priv_data_present) |
 			QI_PGRP_RESP_CODE(result) |
 			QI_PGRP_RESP_TYPE;
 	desc.qw1 = QI_PGRP_IDX(req->prg_index) |
 			QI_PGRP_LPIG(req->lpig);
 
-	if (req->priv_data_present) {
-		desc.qw2 = req->priv_data[0];
-		desc.qw3 = req->priv_data[1];
-	} else {
-		desc.qw2 = 0;
-		desc.qw3 = 0;
-	}
-
 	qi_submit_sync(iommu, &desc, 1, 0);
 }
 
@@ -691,7 +663,7 @@ bad_req:
 
 		intel_svm_prq_report(iommu, dev, req);
 		trace_prq_report(iommu, dev, req->qw_0, req->qw_1,
-				 req->priv_data[0], req->priv_data[1],
+				 req->qw_2, req->qw_3,
 				 iommu->prq_seq_number++);
 		mutex_unlock(&iommu->iopf_lock);
 prq_advance:
@@ -730,7 +702,7 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt,
 	struct intel_iommu *iommu = info->iommu;
 	u8 bus = info->bus, devfn = info->devfn;
 	struct iommu_fault_page_request *prm;
-	bool private_present;
+	struct qi_desc desc;
 	bool pasid_present;
 	bool last_page;
 	u16 sid;
@@ -738,34 +710,17 @@ void intel_svm_page_response(struct device *dev, struct iopf_fault *evt,
 	prm = &evt->fault.prm;
 	sid = PCI_DEVID(bus, devfn);
 	pasid_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
-	private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA;
 	last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
 
-	/*
-	 * Per VT-d spec. v3.0 ch7.7, system software must respond
-	 * with page group response if private data is present (PDP)
-	 * or last page in group (LPIG) bit is set. This is an
-	 * additional VT-d requirement beyond PCI ATS spec.
-	 */
-	if (last_page || private_present) {
-		struct qi_desc desc;
-
-		desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
-				QI_PGRP_PASID_P(pasid_present) |
-				QI_PGRP_PDP(private_present) |
-				QI_PGRP_RESP_CODE(msg->code) |
-				QI_PGRP_RESP_TYPE;
-		desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
-		desc.qw2 = 0;
-		desc.qw3 = 0;
-
-		if (private_present) {
-			desc.qw2 = prm->private_data[0];
-			desc.qw3 = prm->private_data[1];
-		}
+	desc.qw0 = QI_PGRP_PASID(prm->pasid) | QI_PGRP_DID(sid) |
+			QI_PGRP_PASID_P(pasid_present) |
+			QI_PGRP_RESP_CODE(msg->code) |
+			QI_PGRP_RESP_TYPE;
+	desc.qw1 = QI_PGRP_IDX(prm->grpid) | QI_PGRP_LPIG(last_page);
+	desc.qw2 = 0;
+	desc.qw3 = 0;
 
-		qi_submit_sync(iommu, &desc, 1, 0);
-	}
+	qi_submit_sync(iommu, &desc, 1, 0);
 }
 
 static void intel_svm_domain_free(struct iommu_domain *domain)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2e925b5eba53..e6549bdfaed9 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -69,8 +69,7 @@ enum iommu_fault_type {
 struct iommu_fault_page_request {
 #define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID	(1 << 0)
 #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE	(1 << 1)
-#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA	(1 << 2)
-#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID	(1 << 3)
+#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID	(1 << 2)
 	u32	flags;
 	u32	pasid;
 	u32	grpid;
-- 
cgit v1.2.3


From 65442507026a79e2de8014880e6ba376b8b44584 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Wed, 24 Apr 2024 15:16:43 +0800
Subject: iommu: Add ops->domain_alloc_sva()

Make a new op that receives the device and the mm_struct that the SVA
domain should be created for. Unlike domain_alloc_paging() the dev
argument is never NULL here.

This allows drivers to fully initialize the SVA domain and allocate the
mmu_notifier during allocation. It allows the notifier lifetime to follow
the lifetime of the iommu_domain.

Since we have only one call site, upgrade the new op to return ERR_PTR
instead of NULL.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Tina Zhang <tina.zhang@intel.com>
Link: https://lore.kernel.org/r/20240311090843.133455-15-vasant.hegde@amd.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20240416080656.60968-12-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu-sva.c | 16 +++++++++++-----
 include/linux/iommu.h     |  3 +++
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 640acc804e8c..18a35e798b72 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -108,8 +108,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 
 	/* Allocate a new domain and set it on device pasid. */
 	domain = iommu_sva_domain_alloc(dev, mm);
-	if (!domain) {
-		ret = -ENOMEM;
+	if (IS_ERR(domain)) {
+		ret = PTR_ERR(domain);
 		goto out_free_handle;
 	}
 
@@ -283,9 +283,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 	const struct iommu_ops *ops = dev_iommu_ops(dev);
 	struct iommu_domain *domain;
 
-	domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
-	if (!domain)
-		return NULL;
+	if (ops->domain_alloc_sva) {
+		domain = ops->domain_alloc_sva(dev, mm);
+		if (IS_ERR(domain))
+			return domain;
+	} else {
+		domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
+		if (!domain)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	domain->type = IOMMU_DOMAIN_SVA;
 	mmgrab(mm);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index e6549bdfaed9..f4cdffa985e4 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -517,6 +517,7 @@ static inline int __iommu_copy_struct_from_user_array(
  *                     Upon failure, ERR_PTR must be returned.
  * @domain_alloc_paging: Allocate an iommu_domain that can be used for
  *                       UNMANAGED, DMA, and DMA_FQ domain types.
+ * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -557,6 +558,8 @@ struct iommu_ops {
 		struct device *dev, u32 flags, struct iommu_domain *parent,
 		const struct iommu_user_data *user_data);
 	struct iommu_domain *(*domain_alloc_paging)(struct device *dev);
+	struct iommu_domain *(*domain_alloc_sva)(struct device *dev,
+						 struct mm_struct *mm);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From 5055cccfc2d1cc1a7306f6bcdcd0ee9521d707f5 Mon Sep 17 00:00:00 2001
From: Lukasz Majewski <lukma@denx.de>
Date: Tue, 23 Apr 2024 14:49:04 +0200
Subject: net: hsr: Provide RedBox support (HSR-SAN)

Introduce RedBox support (HSR-SAN to be more precise) for HSR networks.
Following traffic reduction optimizations have been implemented:
- Do not send HSR supervisory frames to Port C (interlink)
- Do not forward to HSR ring frames addressed to Port C
- Do not forward to Port C frames from HSR ring
- Do not send duplicate HSR frame to HSR ring when destination is Port C

The corresponding patch to modify iptable2 sources has already been sent:
https://lore.kernel.org/netdev/20240308145729.490863-1-lukma@denx.de/T/

Testing procedure (veth and netns):
-----------------------------------
One shall run:
linux-vanila/tools/testing/selftests/net/hsr/hsr_redbox.sh
(Detailed description of the setup one can find in the test
script file).

Testing procedure (real hardware):
----------------------------------
The EVB-KSZ9477 has been used for testing on net-next branch
(SHA1: 5fc68320c1fb3c7d456ddcae0b4757326a043e6f).

Ports 4/5 were used for SW managed HSR (hsr1) as first hsr0 for ports 1/2
(with HW offloading for ksz9477) was created. Port 3 has been used as
interlink port (single USB-ETH dongle).

Configuration - RedBox (EVB-KSZ9477):
if link set lan1 down;ip link set lan2 down
ip link add name hsr0 type hsr slave1 lan1 slave2 lan2 supervision 45 version 1
ip link add name hsr1 type hsr slave1 lan4 slave2 lan5 interlink lan3 supervision 45 version 1
ip link set lan4 up;ip link set lan5 up
ip link set lan3 up
ip addr add 192.168.0.11/24 dev hsr1
ip link set hsr1 up

Configuration - DAN-H (EVB-KSZ9477):

ip link set lan1 down;ip link set lan2 down
ip link add name hsr0 type hsr slave1 lan1 slave2 lan2 supervision 45 version 1
ip link add name hsr1 type hsr slave1 lan4 slave2 lan5 supervision 45 version 1
ip link set lan4 up;ip link set lan5 up
ip addr add 192.168.0.12/24 dev hsr1
ip link set hsr1 up

This approach uses only SW based HSR devices (hsr1).

--------------          -----------------       ------------
DAN-H  Port5 | <------> | Port5         |       |
       Port4 | <------> | Port4   Port3 | <---> | PC
             |          | (RedBox)      |       | (USB-ETH)
EVB-KSZ9477  |          | EVB-KSZ9477   |       |
--------------          -----------------       ------------

Signed-off-by: Lukasz Majewski <lukma@denx.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/if_link.h |  1 +
 net/hsr/hsr_device.c         | 36 +++++++++++++++++--
 net/hsr/hsr_device.h         |  4 +--
 net/hsr/hsr_forward.c        | 85 +++++++++++++++++++++++++++++++++++++++-----
 net/hsr/hsr_framereg.c       | 52 +++++++++++++++++++++++++++
 net/hsr/hsr_framereg.h       |  4 +++
 net/hsr/hsr_main.h           |  7 ++++
 net/hsr/hsr_netlink.c        | 30 ++++++++++++++--
 net/hsr/hsr_slave.c          |  1 +
 9 files changed, 204 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index ffa637b38c93..e9f10860ec8e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1771,6 +1771,7 @@ enum {
 	IFLA_HSR_PROTOCOL,		/* Indicate different protocol than
 					 * HSR. For example PRP.
 					 */
+	IFLA_HSR_INTERLINK,		/* HSR interlink network device */
 	__IFLA_HSR_MAX,
 };
 
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index e9d45133d641..cd1e7c6d2fc0 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -146,6 +146,9 @@ static int hsr_dev_open(struct net_device *dev)
 		case HSR_PT_SLAVE_B:
 			designation = "Slave B";
 			break;
+		case HSR_PT_INTERLINK:
+			designation = "Interlink";
+			break;
 		default:
 			designation = "Unknown";
 		}
@@ -285,6 +288,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master,
 	struct hsr_priv *hsr = master->hsr;
 	__u8 type = HSR_TLV_LIFE_CHECK;
 	struct hsr_sup_payload *hsr_sp;
+	struct hsr_sup_tlv *hsr_stlv;
 	struct hsr_sup_tag *hsr_stag;
 	struct sk_buff *skb;
 
@@ -324,6 +328,16 @@ static void send_hsr_supervision_frame(struct hsr_port *master,
 	hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
 	ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr);
 
+	if (hsr->redbox) {
+		hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv));
+		hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC;
+		hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload);
+
+		/* Payload: MacAddressRedBox */
+		hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
+		ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox);
+	}
+
 	if (skb_put_padto(skb, ETH_ZLEN)) {
 		spin_unlock_bh(&hsr->seqnr_lock);
 		return;
@@ -405,6 +419,10 @@ void hsr_del_ports(struct hsr_priv *hsr)
 	if (port)
 		hsr_del_port(port);
 
+	port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+	if (port)
+		hsr_del_port(port);
+
 	port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
 	if (port)
 		hsr_del_port(port);
@@ -534,8 +552,8 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
 };
 
 int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
-		     unsigned char multicast_spec, u8 protocol_version,
-		     struct netlink_ext_ack *extack)
+		     struct net_device *interlink, unsigned char multicast_spec,
+		     u8 protocol_version, struct netlink_ext_ack *extack)
 {
 	bool unregister = false;
 	struct hsr_priv *hsr;
@@ -544,6 +562,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 	hsr = netdev_priv(hsr_dev);
 	INIT_LIST_HEAD(&hsr->ports);
 	INIT_LIST_HEAD(&hsr->node_db);
+	INIT_LIST_HEAD(&hsr->proxy_node_db);
 	spin_lock_init(&hsr->list_lock);
 
 	eth_hw_addr_set(hsr_dev, slave[0]->dev_addr);
@@ -569,6 +588,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 	/* Overflow soon to find bugs easier: */
 	hsr->sequence_nr = HSR_SEQNR_START;
 	hsr->sup_sequence_nr = HSR_SUP_SEQNR_START;
+	hsr->interlink_sequence_nr = HSR_SEQNR_START;
 
 	timer_setup(&hsr->announce_timer, hsr_announce, 0);
 	timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0);
@@ -604,6 +624,18 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
 	if (res)
 		goto err_unregister;
 
+	if (interlink) {
+		res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack);
+		if (res)
+			goto err_unregister;
+
+		hsr->redbox = true;
+		ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr);
+		timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0);
+		mod_timer(&hsr->prune_proxy_timer,
+			  jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
+	}
+
 	hsr_debugfs_init(hsr, hsr_dev);
 	mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));
 
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 9060c92168f9..655284095b78 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -16,8 +16,8 @@
 void hsr_del_ports(struct hsr_priv *hsr);
 void hsr_dev_setup(struct net_device *dev);
 int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
-		     unsigned char multicast_spec, u8 protocol_version,
-		     struct netlink_ext_ack *extack);
+		     struct net_device *interlink, unsigned char multicast_spec,
+		     u8 protocol_version, struct netlink_ext_ack *extack);
 void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
 int hsr_get_max_mtu(struct hsr_priv *hsr);
 #endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 5d68cb181695..05a61b8286ec 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -377,6 +377,15 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port,
 		 */
 		ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr);
 	}
+
+	/* When HSR node is used as RedBox - the frame received from HSR ring
+	 * requires source MAC address (SA) replacement to one which can be
+	 * recognized by SAN devices (otherwise, frames are dropped by switch)
+	 */
+	if (port->type == HSR_PT_INTERLINK)
+		ether_addr_copy(eth_hdr(skb)->h_source,
+				port->hsr->macaddress_redbox);
+
 	return dev_queue_xmit(skb);
 }
 
@@ -390,9 +399,57 @@ bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
 
 bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
 {
+	struct sk_buff *skb;
+
 	if (port->dev->features & NETIF_F_HW_HSR_FWD)
 		return prp_drop_frame(frame, port);
 
+	/* RedBox specific frames dropping policies
+	 *
+	 * Do not send HSR supervisory frames to SAN devices
+	 */
+	if (frame->is_supervision && port->type == HSR_PT_INTERLINK)
+		return true;
+
+	/* Do not forward to other HSR port (A or B) unicast frames which
+	 * are addressed to interlink port (and are in the ProxyNodeTable).
+	 */
+	skb = frame->skb_hsr;
+	if (skb && prp_drop_frame(frame, port) &&
+	    is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+	    hsr_is_node_in_db(&port->hsr->proxy_node_db,
+			      eth_hdr(skb)->h_dest)) {
+		return true;
+	}
+
+	/* Do not forward to port C (Interlink) frames from nodes A and B
+	 * if DA is in NodeTable.
+	 */
+	if ((frame->port_rcv->type == HSR_PT_SLAVE_A ||
+	     frame->port_rcv->type == HSR_PT_SLAVE_B) &&
+	    port->type == HSR_PT_INTERLINK) {
+		skb = frame->skb_hsr;
+		if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+		    hsr_is_node_in_db(&port->hsr->node_db,
+				      eth_hdr(skb)->h_dest)) {
+			return true;
+		}
+	}
+
+	/* Do not forward to port A and B unicast frames received on the
+	 * interlink port if it is addressed to one of nodes registered in
+	 * the ProxyNodeTable.
+	 */
+	if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) &&
+	    frame->port_rcv->type == HSR_PT_INTERLINK) {
+		skb = frame->skb_std;
+		if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+		    hsr_is_node_in_db(&port->hsr->proxy_node_db,
+				      eth_hdr(skb)->h_dest)) {
+			return true;
+		}
+	}
+
 	return false;
 }
 
@@ -448,13 +505,14 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 		}
 
 		/* Check if frame is to be dropped. Eg. for PRP no forward
-		 * between ports.
+		 * between ports, or sending HSR supervision to RedBox.
 		 */
 		if (hsr->proto_ops->drop_frame &&
 		    hsr->proto_ops->drop_frame(frame, port))
 			continue;
 
-		if (port->type != HSR_PT_MASTER)
+		if (port->type == HSR_PT_SLAVE_A ||
+		    port->type == HSR_PT_SLAVE_B)
 			skb = hsr->proto_ops->create_tagged_frame(frame, port);
 		else
 			skb = hsr->proto_ops->get_untagged_frame(frame, port);
@@ -469,7 +527,9 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
 			hsr_deliver_master(skb, port->dev, frame->node_src);
 		} else {
 			if (!hsr_xmit(skb, port, frame))
-				sent = true;
+				if (port->type == HSR_PT_SLAVE_A ||
+				    port->type == HSR_PT_SLAVE_B)
+					sent = true;
 		}
 	}
 }
@@ -503,10 +563,12 @@ static void handle_std_frame(struct sk_buff *skb,
 	frame->skb_prp = NULL;
 	frame->skb_std = skb;
 
-	if (port->type != HSR_PT_MASTER) {
+	if (port->type != HSR_PT_MASTER)
 		frame->is_from_san = true;
-	} else {
-		/* Sequence nr for the master node */
+
+	if (port->type == HSR_PT_MASTER ||
+	    port->type == HSR_PT_INTERLINK) {
+		/* Sequence nr for the master/interlink node */
 		lockdep_assert_held(&hsr->seqnr_lock);
 		frame->sequence_nr = hsr->sequence_nr;
 		hsr->sequence_nr++;
@@ -564,6 +626,7 @@ static int fill_frame_info(struct hsr_frame_info *frame,
 {
 	struct hsr_priv *hsr = port->hsr;
 	struct hsr_vlan_ethhdr *vlan_hdr;
+	struct list_head *n_db;
 	struct ethhdr *ethhdr;
 	__be16 proto;
 	int ret;
@@ -574,9 +637,13 @@ static int fill_frame_info(struct hsr_frame_info *frame,
 
 	memset(frame, 0, sizeof(*frame));
 	frame->is_supervision = is_supervision_frame(port->hsr, skb);
-	frame->node_src = hsr_get_node(port, &hsr->node_db, skb,
-				       frame->is_supervision,
-				       port->type);
+
+	n_db = &hsr->node_db;
+	if (port->type == HSR_PT_INTERLINK)
+		n_db = &hsr->proxy_node_db;
+
+	frame->node_src = hsr_get_node(port, n_db, skb,
+				       frame->is_supervision, port->type);
 	if (!frame->node_src)
 		return -1; /* Unknown node and !is_supervision, or no mem */
 
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 26329db09210..614df9649794 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -71,6 +71,14 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db,
 	return NULL;
 }
 
+/* Check if node for a given MAC address is already present in data base
+ */
+bool hsr_is_node_in_db(struct list_head *node_db,
+		       const unsigned char addr[ETH_ALEN])
+{
+	return !!find_node_by_addr_A(node_db, addr);
+}
+
 /* Helper for device init; the self_node is used in hsr_rcv() to recognize
  * frames from self that's been looped over the HSR ring.
  */
@@ -223,6 +231,15 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
 		}
 	}
 
+	/* Check if required node is not in proxy nodes table */
+	list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) {
+		if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
+			if (hsr->proto_ops->update_san_info)
+				hsr->proto_ops->update_san_info(node, is_sup);
+			return node;
+		}
+	}
+
 	/* Everyone may create a node entry, connected node to a HSR/PRP
 	 * device.
 	 */
@@ -418,6 +435,10 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
 
 	node_dst = find_node_by_addr_A(&port->hsr->node_db,
 				       eth_hdr(skb)->h_dest);
+	if (!node_dst && port->hsr->redbox)
+		node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db,
+					       eth_hdr(skb)->h_dest);
+
 	if (!node_dst) {
 		if (port->hsr->prot_version != PRP_V1 && net_ratelimit())
 			netdev_err(skb->dev, "%s: Unknown node\n", __func__);
@@ -561,6 +582,37 @@ void hsr_prune_nodes(struct timer_list *t)
 		  jiffies + msecs_to_jiffies(PRUNE_PERIOD));
 }
 
+void hsr_prune_proxy_nodes(struct timer_list *t)
+{
+	struct hsr_priv *hsr = from_timer(hsr, t, prune_proxy_timer);
+	unsigned long timestamp;
+	struct hsr_node *node;
+	struct hsr_node *tmp;
+
+	spin_lock_bh(&hsr->list_lock);
+	list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) {
+		timestamp = node->time_in[HSR_PT_INTERLINK];
+
+		/* Prune old entries */
+		if (time_is_before_jiffies(timestamp +
+				msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) {
+			hsr_nl_nodedown(hsr, node->macaddress_A);
+			if (!node->removed) {
+				list_del_rcu(&node->mac_list);
+				node->removed = true;
+				/* Note that we need to free this entry later: */
+				kfree_rcu(node, rcu_head);
+			}
+		}
+	}
+
+	spin_unlock_bh(&hsr->list_lock);
+
+	/* Restart timer */
+	mod_timer(&hsr->prune_proxy_timer,
+		  jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
+}
+
 void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
 			unsigned char addr[ETH_ALEN])
 {
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index b23556251d62..7619e31c1d2d 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -46,6 +46,7 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
 			   u16 sequence_nr);
 
 void hsr_prune_nodes(struct timer_list *t);
+void hsr_prune_proxy_nodes(struct timer_list *t);
 
 int hsr_create_self_node(struct hsr_priv *hsr,
 			 const unsigned char addr_a[ETH_ALEN],
@@ -67,6 +68,9 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port,
 			  struct hsr_node *node);
 void prp_update_san_info(struct hsr_node *node, bool is_sup);
 
+bool hsr_is_node_in_db(struct list_head *node_db,
+		       const unsigned char addr[ETH_ALEN]);
+
 struct hsr_node {
 	struct list_head	mac_list;
 	/* Protect R/W access to seq_out */
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 18e01791ad79..23850b16d1ea 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -21,6 +21,7 @@
  */
 #define HSR_LIFE_CHECK_INTERVAL		 2000 /* ms */
 #define HSR_NODE_FORGET_TIME		60000 /* ms */
+#define HSR_PROXY_NODE_FORGET_TIME	60000 /* ms */
 #define HSR_ANNOUNCE_INTERVAL		  100 /* ms */
 #define HSR_ENTRY_FORGET_TIME		  400 /* ms */
 
@@ -35,6 +36,7 @@
  * HSR_NODE_FORGET_TIME?
  */
 #define PRUNE_PERIOD			 3000 /* ms */
+#define PRUNE_PROXY_PERIOD		 3000 /* ms */
 #define HSR_TLV_EOT				   0  /* End of TLVs */
 #define HSR_TLV_ANNOUNCE		   22
 #define HSR_TLV_LIFE_CHECK		   23
@@ -192,11 +194,14 @@ struct hsr_priv {
 	struct rcu_head		rcu_head;
 	struct list_head	ports;
 	struct list_head	node_db;	/* Known HSR nodes */
+	struct list_head	proxy_node_db;	/* RedBox HSR proxy nodes */
 	struct hsr_self_node	__rcu *self_node;	/* MACs of slaves */
 	struct timer_list	announce_timer;	/* Supervision frame dispatch */
 	struct timer_list	prune_timer;
+	struct timer_list	prune_proxy_timer;
 	int announce_count;
 	u16 sequence_nr;
+	u16 interlink_sequence_nr; /* Interlink port seq_nr */
 	u16 sup_sequence_nr;	/* For HSRv1 separate seq_nr for supervision */
 	enum hsr_version prot_version;	/* Indicate if HSRv0, HSRv1 or PRPv1 */
 	spinlock_t seqnr_lock;	/* locking for sequence_nr */
@@ -209,6 +214,8 @@ struct hsr_priv {
 				 * of lan_id
 				 */
 	bool fwd_offloaded;	/* Forwarding offloaded to HW */
+	bool redbox;            /* Device supports HSR RedBox */
+	unsigned char		macaddress_redbox[ETH_ALEN];
 	unsigned char		sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16));
 				/* Align to u16 boundary to avoid unaligned access
 				 * in ether_addr_equal
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 78fe40eb9f01..898f18c6da53 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -23,6 +23,7 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
 	[IFLA_HSR_SUPERVISION_ADDR]	= { .len = ETH_ALEN },
 	[IFLA_HSR_SEQ_NR]		= { .type = NLA_U16 },
 	[IFLA_HSR_PROTOCOL]		= { .type = NLA_U8 },
+	[IFLA_HSR_INTERLINK]		= { .type = NLA_U32 },
 };
 
 /* Here, it seems a netdevice has already been allocated for us, and the
@@ -35,8 +36,8 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 	enum hsr_version proto_version;
 	unsigned char multicast_spec;
 	u8 proto = HSR_PROTOCOL_HSR;
-	struct net_device *link[2];
 
+	struct net_device *link[2], *interlink = NULL;
 	if (!data) {
 		NL_SET_ERR_MSG_MOD(extack, "No slave devices specified");
 		return -EINVAL;
@@ -67,6 +68,20 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 		return -EINVAL;
 	}
 
+	if (data[IFLA_HSR_INTERLINK])
+		interlink = __dev_get_by_index(src_net,
+					       nla_get_u32(data[IFLA_HSR_INTERLINK]));
+
+	if (interlink && interlink == link[0]) {
+		NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave1 are the same");
+		return -EINVAL;
+	}
+
+	if (interlink && interlink == link[1]) {
+		NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave2 are the same");
+		return -EINVAL;
+	}
+
 	if (!data[IFLA_HSR_MULTICAST_SPEC])
 		multicast_spec = 0;
 	else
@@ -96,10 +111,17 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev,
 		}
 	}
 
-	if (proto == HSR_PROTOCOL_PRP)
+	if (proto == HSR_PROTOCOL_PRP) {
 		proto_version = PRP_V1;
+		if (interlink) {
+			NL_SET_ERR_MSG_MOD(extack,
+					   "Interlink only works with HSR");
+			return -EINVAL;
+		}
+	}
 
-	return hsr_dev_finalize(dev, link, multicast_spec, proto_version, extack);
+	return hsr_dev_finalize(dev, link, interlink, multicast_spec,
+				proto_version, extack);
 }
 
 static void hsr_dellink(struct net_device *dev, struct list_head *head)
@@ -107,6 +129,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head)
 	struct hsr_priv *hsr = netdev_priv(dev);
 
 	del_timer_sync(&hsr->prune_timer);
+	del_timer_sync(&hsr->prune_proxy_timer);
 	del_timer_sync(&hsr->announce_timer);
 
 	hsr_debugfs_term(hsr);
@@ -114,6 +137,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head)
 
 	hsr_del_self_node(hsr);
 	hsr_del_nodes(&hsr->node_db);
+	hsr_del_nodes(&hsr->proxy_node_db);
 
 	unregister_netdevice_queue(dev, head);
 }
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 1b6457f357bd..af6cf64a00e0 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -55,6 +55,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
 	protocol = eth_hdr(skb)->h_proto;
 
 	if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
+	    port->type != HSR_PT_INTERLINK &&
 	    hsr->proto_ops->invalid_dan_ingress_frame &&
 	    hsr->proto_ops->invalid_dan_ingress_frame(protocol))
 		goto finish_pass;
-- 
cgit v1.2.3


From 91cfd679f9e8b9a7bf2f26adf66eff99dbe2026b Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 19 Apr 2024 17:54:42 +0100
Subject: ACPI/IORT: Handle memory address size limits as limits

Return the Root Complex/Named Component memory address size limit as an
inclusive limit value, rather than an exclusive size. This saves having
to fudge an off-by-one for the 64-bit case, and simplifies our caller.

Acked-by: Hanjun Guo <guohanjun@huawei.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/284ae9fbadb12f2e3b5a30cd4d037d0e6843a8f4.1713523152.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/acpi/arm64/dma.c  |  9 +++------
 drivers/acpi/arm64/iort.c | 20 ++++++++++----------
 include/linux/acpi_iort.h |  4 ++--
 3 files changed, 15 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
index 93d796531af3..b98a149f8d50 100644
--- a/drivers/acpi/arm64/dma.c
+++ b/drivers/acpi/arm64/dma.c
@@ -8,7 +8,6 @@ void acpi_arch_dma_setup(struct device *dev)
 {
 	int ret;
 	u64 end, mask;
-	u64 size = 0;
 	const struct bus_dma_region *map = NULL;
 
 	/*
@@ -23,9 +22,9 @@ void acpi_arch_dma_setup(struct device *dev)
 	}
 
 	if (dev->coherent_dma_mask)
-		size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
+		end = dev->coherent_dma_mask;
 	else
-		size = 1ULL << 32;
+		end = (1ULL << 32) - 1;
 
 	ret = acpi_dma_get_range(dev, &map);
 	if (!ret && map) {
@@ -36,18 +35,16 @@ void acpi_arch_dma_setup(struct device *dev)
 				end = r->dma_start + r->size - 1;
 		}
 
-		size = end + 1;
 		dev->dma_range_map = map;
 	}
 
 	if (ret == -ENODEV)
-		ret = iort_dma_get_ranges(dev, &size);
+		ret = iort_dma_get_ranges(dev, &end);
 	if (!ret) {
 		/*
 		 * Limit coherent and dma mask based on size retrieved from
 		 * firmware.
 		 */
-		end = size - 1;
 		mask = DMA_BIT_MASK(ilog2(end) + 1);
 		dev->bus_dma_limit = end;
 		dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 6496ff5a6ba2..c0b1c2c19444 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1367,7 +1367,7 @@ int iort_iommu_configure_id(struct device *dev, const u32 *input_id)
 { return -ENODEV; }
 #endif
 
-static int nc_dma_get_range(struct device *dev, u64 *size)
+static int nc_dma_get_range(struct device *dev, u64 *limit)
 {
 	struct acpi_iort_node *node;
 	struct acpi_iort_named_component *ncomp;
@@ -1384,13 +1384,13 @@ static int nc_dma_get_range(struct device *dev, u64 *size)
 		return -EINVAL;
 	}
 
-	*size = ncomp->memory_address_limit >= 64 ? U64_MAX :
-			1ULL<<ncomp->memory_address_limit;
+	*limit = ncomp->memory_address_limit >= 64 ? U64_MAX :
+			(1ULL << ncomp->memory_address_limit) - 1;
 
 	return 0;
 }
 
-static int rc_dma_get_range(struct device *dev, u64 *size)
+static int rc_dma_get_range(struct device *dev, u64 *limit)
 {
 	struct acpi_iort_node *node;
 	struct acpi_iort_root_complex *rc;
@@ -1408,8 +1408,8 @@ static int rc_dma_get_range(struct device *dev, u64 *size)
 		return -EINVAL;
 	}
 
-	*size = rc->memory_address_limit >= 64 ? U64_MAX :
-			1ULL<<rc->memory_address_limit;
+	*limit = rc->memory_address_limit >= 64 ? U64_MAX :
+			(1ULL << rc->memory_address_limit) - 1;
 
 	return 0;
 }
@@ -1417,16 +1417,16 @@ static int rc_dma_get_range(struct device *dev, u64 *size)
 /**
  * iort_dma_get_ranges() - Look up DMA addressing limit for the device
  * @dev: device to lookup
- * @size: DMA range size result pointer
+ * @limit: DMA limit result pointer
  *
  * Return: 0 on success, an error otherwise.
  */
-int iort_dma_get_ranges(struct device *dev, u64 *size)
+int iort_dma_get_ranges(struct device *dev, u64 *limit)
 {
 	if (dev_is_pci(dev))
-		return rc_dma_get_range(dev, size);
+		return rc_dma_get_range(dev, limit);
 	else
-		return nc_dma_get_range(dev, size);
+		return nc_dma_get_range(dev, limit);
 }
 
 static void __init acpi_iort_register_irq(int hwirq, const char *name,
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 1cb65592c95d..d4ed5622cf2b 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -39,7 +39,7 @@ void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode,
 void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode,
 		       struct list_head *head);
 /* IOMMU interface */
-int iort_dma_get_ranges(struct device *dev, u64 *size);
+int iort_dma_get_ranges(struct device *dev, u64 *limit);
 int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
 void iort_iommu_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
@@ -55,7 +55,7 @@ void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *hea
 static inline
 void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *head) { }
 /* IOMMU interface */
-static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
+static inline int iort_dma_get_ranges(struct device *dev, u64 *limit)
 { return -ENODEV; }
 static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 { return -ENODEV; }
-- 
cgit v1.2.3


From fece6530bf4b59b01a476a12851e07751e73d69f Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 19 Apr 2024 17:54:43 +0100
Subject: dma-mapping: Add helpers for dma_range_map bounds

Several places want to compute the lower and/or upper bounds of a
dma_range_map, so let's factor that out into reusable helpers.

Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com> # For arm64
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/45ec52f033ec4dfb364e23f48abaf787f612fa53.1713523152.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/loongarch/kernel/dma.c |  9 ++-------
 drivers/acpi/arm64/dma.c    |  8 +-------
 drivers/of/device.c         | 11 ++---------
 include/linux/dma-direct.h  | 18 ++++++++++++++++++
 4 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/arch/loongarch/kernel/dma.c b/arch/loongarch/kernel/dma.c
index 7a9c6a9dd2d0..429555fb4e13 100644
--- a/arch/loongarch/kernel/dma.c
+++ b/arch/loongarch/kernel/dma.c
@@ -8,17 +8,12 @@
 void acpi_arch_dma_setup(struct device *dev)
 {
 	int ret;
-	u64 mask, end = 0;
+	u64 mask, end;
 	const struct bus_dma_region *map = NULL;
 
 	ret = acpi_dma_get_range(dev, &map);
 	if (!ret && map) {
-		const struct bus_dma_region *r = map;
-
-		for (end = 0; r->size; r++) {
-			if (r->dma_start + r->size - 1 > end)
-				end = r->dma_start + r->size - 1;
-		}
+		end = dma_range_map_max(map);
 
 		mask = DMA_BIT_MASK(ilog2(end) + 1);
 		dev->bus_dma_limit = end;
diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
index b98a149f8d50..52b2abf88689 100644
--- a/drivers/acpi/arm64/dma.c
+++ b/drivers/acpi/arm64/dma.c
@@ -28,13 +28,7 @@ void acpi_arch_dma_setup(struct device *dev)
 
 	ret = acpi_dma_get_range(dev, &map);
 	if (!ret && map) {
-		const struct bus_dma_region *r = map;
-
-		for (end = 0; r->size; r++) {
-			if (r->dma_start + r->size - 1 > end)
-				end = r->dma_start + r->size - 1;
-		}
-
+		end = dma_range_map_max(map);
 		dev->dma_range_map = map;
 	}
 
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 841ccd3a19d1..9e7963972fa7 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -117,16 +117,9 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 		if (!force_dma)
 			return ret == -ENODEV ? 0 : ret;
 	} else {
-		const struct bus_dma_region *r = map;
-
 		/* Determine the overall bounds of all DMA regions */
-		for (dma_start = ~0; r->size; r++) {
-			/* Take lower and upper limits */
-			if (r->dma_start < dma_start)
-				dma_start = r->dma_start;
-			if (r->dma_start + r->size > end)
-				end = r->dma_start + r->size;
-		}
+		dma_start = dma_range_map_min(map);
+		end = dma_range_map_max(map);
 	}
 
 	/*
diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 3eb3589ff43e..edbe13d00776 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -54,6 +54,24 @@ static inline phys_addr_t translate_dma_to_phys(struct device *dev,
 	return (phys_addr_t)-1;
 }
 
+static inline dma_addr_t dma_range_map_min(const struct bus_dma_region *map)
+{
+	dma_addr_t ret = (dma_addr_t)U64_MAX;
+
+	for (; map->size; map++)
+		ret = min(ret, map->dma_start);
+	return ret;
+}
+
+static inline dma_addr_t dma_range_map_max(const struct bus_dma_region *map)
+{
+	dma_addr_t ret = 0;
+
+	for (; map->size; map++)
+		ret = max(ret, map->dma_start + map->size - 1);
+	return ret;
+}
+
 #ifdef CONFIG_ARCH_HAS_PHYS_TO_DMA
 #include <asm/dma-direct.h>
 #ifndef phys_to_dma_unencrypted
-- 
cgit v1.2.3


From b67483b3c44eaef2f771fa4c712e13f452675a67 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 19 Apr 2024 17:54:45 +0100
Subject: iommu/dma: Centralise iommu_setup_dma_ops()

It's somewhat hard to see, but arm64's arch_setup_dma_ops() should only
ever call iommu_setup_dma_ops() after a successful iommu_probe_device(),
which means there should be no harm in achieving the same order of
operations by running it off the back of iommu_probe_device() itself.
This then puts it in line with the x86 and s390 .probe_finalize bodges,
letting us pull it all into the main flow properly. As a bonus this lets
us fold in and de-scope the PCI workaround setup as well.

At this point we can also then pull the call up inside the group mutex,
and avoid having to think about whether iommu_group_store_type() could
theoretically race and free the domain if iommu_setup_dma_ops() ran just
*before* iommu_device_use_default_domain() claims it... Furthermore we
replace one .probe_finalize call completely, since the only remaining
implementations are now one which only needs to run once for the initial
boot-time probe, and two which themselves render that path unreachable.

This leaves us a big step closer to realistically being able to unpick
the variety of different things that iommu_setup_dma_ops() has been
muddling together, and further streamline iommu-dma into core API flows
in future.

Reviewed-by: Lu Baolu <baolu.lu@linux.intel.com> # For Intel IOMMU
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/bebea331c1d688b34d9862eefd5ede47503961b8.1713523152.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arm64/mm/dma-mapping.c  |  2 --
 drivers/iommu/amd/iommu.c    |  8 --------
 drivers/iommu/dma-iommu.c    | 18 ++++++------------
 drivers/iommu/dma-iommu.h    | 14 ++++++--------
 drivers/iommu/intel/iommu.c  |  7 -------
 drivers/iommu/iommu.c        | 20 +++++++-------------
 drivers/iommu/s390-iommu.c   |  6 ------
 drivers/iommu/virtio-iommu.c | 10 ----------
 include/linux/iommu.h        |  7 -------
 9 files changed, 19 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 61886e43e3a1..313d8938a2f0 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -58,8 +58,6 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		   ARCH_DMA_MINALIGN, cls);
 
 	dev->dma_coherent = coherent;
-	if (device_iommu_mapped(dev))
-		iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
 
 	xen_setup_dma_ops(dev);
 }
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index d35c1b8c8e65..085abf098fa9 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -2175,13 +2175,6 @@ static struct iommu_device *amd_iommu_probe_device(struct device *dev)
 	return iommu_dev;
 }
 
-static void amd_iommu_probe_finalize(struct device *dev)
-{
-	/* Domains are initialized for this device - have a look what we ended up with */
-	set_dma_ops(dev, NULL);
-	iommu_setup_dma_ops(dev, 0, U64_MAX);
-}
-
 static void amd_iommu_release_device(struct device *dev)
 {
 	struct amd_iommu *iommu;
@@ -2784,7 +2777,6 @@ const struct iommu_ops amd_iommu_ops = {
 	.domain_alloc_user = amd_iommu_domain_alloc_user,
 	.probe_device = amd_iommu_probe_device,
 	.release_device = amd_iommu_release_device,
-	.probe_finalize = amd_iommu_probe_finalize,
 	.device_group = amd_iommu_device_group,
 	.get_resv_regions = amd_iommu_get_resv_regions,
 	.is_attach_deferred = amd_iommu_is_attach_deferred,
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c827ed690f1f..bc1490bd4f7a 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1741,25 +1741,20 @@ static const struct dma_map_ops iommu_dma_ops = {
 	.max_mapping_size       = iommu_dma_max_mapping_size,
 };
 
-/*
- * The IOMMU core code allocates the default DMA domain, which the underlying
- * IOMMU driver needs to support via the dma-iommu layer.
- */
-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
+void iommu_setup_dma_ops(struct device *dev)
 {
 	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
 
-	if (!domain)
-		goto out_err;
+	if (dev_is_pci(dev))
+		dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac;
 
-	/*
-	 * The IOMMU core code allocates the default DMA domain, which the
-	 * underlying IOMMU driver needs to support via the dma-iommu layer.
-	 */
 	if (iommu_is_dma_domain(domain)) {
 		if (iommu_dma_init_domain(domain, dev))
 			goto out_err;
 		dev->dma_ops = &iommu_dma_ops;
+	} else if (dev->dma_ops == &iommu_dma_ops) {
+		/* Clean up if we've switched *from* a DMA domain */
+		dev->dma_ops = NULL;
 	}
 
 	return;
@@ -1767,7 +1762,6 @@ out_err:
 	 pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
 		 dev_name(dev));
 }
-EXPORT_SYMBOL_GPL(iommu_setup_dma_ops);
 
 static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
 		phys_addr_t msi_addr, struct iommu_domain *domain)
diff --git a/drivers/iommu/dma-iommu.h b/drivers/iommu/dma-iommu.h
index c829f1f82a99..c12d63457c76 100644
--- a/drivers/iommu/dma-iommu.h
+++ b/drivers/iommu/dma-iommu.h
@@ -9,6 +9,8 @@
 
 #ifdef CONFIG_IOMMU_DMA
 
+void iommu_setup_dma_ops(struct device *dev);
+
 int iommu_get_dma_cookie(struct iommu_domain *domain);
 void iommu_put_dma_cookie(struct iommu_domain *domain);
 
@@ -17,13 +19,13 @@ int iommu_dma_init_fq(struct iommu_domain *domain);
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list);
 
 extern bool iommu_dma_forcedac;
-static inline void iommu_dma_set_pci_32bit_workaround(struct device *dev)
-{
-	dev->iommu->pci_32bit_workaround = !iommu_dma_forcedac;
-}
 
 #else /* CONFIG_IOMMU_DMA */
 
+static inline void iommu_setup_dma_ops(struct device *dev)
+{
+}
+
 static inline int iommu_dma_init_fq(struct iommu_domain *domain)
 {
 	return -EINVAL;
@@ -42,9 +44,5 @@ static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_he
 {
 }
 
-static inline void iommu_dma_set_pci_32bit_workaround(struct device *dev)
-{
-}
-
 #endif	/* CONFIG_IOMMU_DMA */
 #endif	/* __DMA_IOMMU_H */
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 45c75a8a0ef5..64adfdadeb49 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4349,12 +4349,6 @@ static void intel_iommu_release_device(struct device *dev)
 	set_dma_ops(dev, NULL);
 }
 
-static void intel_iommu_probe_finalize(struct device *dev)
-{
-	set_dma_ops(dev, NULL);
-	iommu_setup_dma_ops(dev, 0, U64_MAX);
-}
-
 static void intel_iommu_get_resv_regions(struct device *device,
 					 struct list_head *head)
 {
@@ -4834,7 +4828,6 @@ const struct iommu_ops intel_iommu_ops = {
 	.domain_alloc		= intel_iommu_domain_alloc,
 	.domain_alloc_user	= intel_iommu_domain_alloc_user,
 	.probe_device		= intel_iommu_probe_device,
-	.probe_finalize		= intel_iommu_probe_finalize,
 	.release_device		= intel_iommu_release_device,
 	.get_resv_regions	= intel_iommu_get_resv_regions,
 	.device_group		= intel_iommu_device_group,
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 3183b0ed4cdb..9df7cc75c1bc 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -581,10 +581,11 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list
 		if (list_empty(&group->entry))
 			list_add_tail(&group->entry, group_list);
 	}
-	mutex_unlock(&group->mutex);
 
-	if (dev_is_pci(dev))
-		iommu_dma_set_pci_32bit_workaround(dev);
+	if (group->default_domain)
+		iommu_setup_dma_ops(dev);
+
+	mutex_unlock(&group->mutex);
 
 	return 0;
 
@@ -1828,6 +1829,8 @@ int bus_iommu_probe(const struct bus_type *bus)
 			mutex_unlock(&group->mutex);
 			return ret;
 		}
+		for_each_group_device(group, gdev)
+			iommu_setup_dma_ops(gdev->dev);
 		mutex_unlock(&group->mutex);
 
 		/*
@@ -3066,18 +3069,9 @@ static ssize_t iommu_group_store_type(struct iommu_group *group,
 	if (ret)
 		goto out_unlock;
 
-	/*
-	 * Release the mutex here because ops->probe_finalize() call-back of
-	 * some vendor IOMMU drivers calls arm_iommu_attach_device() which
-	 * in-turn might call back into IOMMU core code, where it tries to take
-	 * group->mutex, resulting in a deadlock.
-	 */
-	mutex_unlock(&group->mutex);
-
 	/* Make sure dma_ops is appropriatley set */
 	for_each_group_device(group, gdev)
-		iommu_group_do_probe_finalize(gdev->dev);
-	return count;
+		iommu_setup_dma_ops(gdev->dev);
 
 out_unlock:
 	mutex_unlock(&group->mutex);
diff --git a/drivers/iommu/s390-iommu.c b/drivers/iommu/s390-iommu.c
index 9a5196f523de..d8eaa7ea380b 100644
--- a/drivers/iommu/s390-iommu.c
+++ b/drivers/iommu/s390-iommu.c
@@ -695,11 +695,6 @@ static size_t s390_iommu_unmap_pages(struct iommu_domain *domain,
 	return size;
 }
 
-static void s390_iommu_probe_finalize(struct device *dev)
-{
-	iommu_setup_dma_ops(dev, 0, U64_MAX);
-}
-
 struct zpci_iommu_ctrs *zpci_get_iommu_ctrs(struct zpci_dev *zdev)
 {
 	if (!zdev || !zdev->s390_domain)
@@ -785,7 +780,6 @@ static const struct iommu_ops s390_iommu_ops = {
 	.capable = s390_iommu_capable,
 	.domain_alloc_paging = s390_domain_alloc_paging,
 	.probe_device = s390_iommu_probe_device,
-	.probe_finalize = s390_iommu_probe_finalize,
 	.release_device = s390_iommu_release_device,
 	.device_group = generic_device_group,
 	.pgsize_bitmap = SZ_4K,
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index 04048f64a2c0..8e776f6c6e35 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -1025,15 +1025,6 @@ err_free_dev:
 	return ERR_PTR(ret);
 }
 
-static void viommu_probe_finalize(struct device *dev)
-{
-#ifndef CONFIG_ARCH_HAS_SETUP_DMA_OPS
-	/* First clear the DMA ops in case we're switching from a DMA domain */
-	set_dma_ops(dev, NULL);
-	iommu_setup_dma_ops(dev, 0, U64_MAX);
-#endif
-}
-
 static void viommu_release_device(struct device *dev)
 {
 	struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
@@ -1073,7 +1064,6 @@ static struct iommu_ops viommu_ops = {
 	.capable		= viommu_capable,
 	.domain_alloc		= viommu_domain_alloc,
 	.probe_device		= viommu_probe_device,
-	.probe_finalize		= viommu_probe_finalize,
 	.release_device		= viommu_release_device,
 	.device_group		= viommu_device_group,
 	.get_resv_regions	= viommu_get_resv_regions,
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 40dd439307e8..86f0bff3cc95 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -1446,9 +1446,6 @@ static inline void iommu_debugfs_setup(void) {}
 #ifdef CONFIG_IOMMU_DMA
 #include <linux/msi.h>
 
-/* Setup call for arch DMA mapping code */
-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
-
 int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
 
 int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr);
@@ -1459,10 +1456,6 @@ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg);
 struct msi_desc;
 struct msi_msg;
 
-static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
-{
-}
-
 static inline int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From f091e93306e0429ebb7589b9874590b6a9705e64 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 19 Apr 2024 17:54:46 +0100
Subject: dma-mapping: Simplify arch_setup_dma_ops()

The dma_base, size and iommu arguments are only used by ARM, and can
now easily be deduced from the device itself, so there's no need to pass
them through the callchain as well.

Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Michael Kelley <mhklinux@outlook.com> # For Hyper-V
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Tested-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/5291c2326eab405b1aa7693aa964e8d3cb7193de.1713523152.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 arch/arc/mm/dma.c               |  3 +--
 arch/arm/mm/dma-mapping-nommu.c |  3 +--
 arch/arm/mm/dma-mapping.c       | 16 +++++++++-------
 arch/arm64/mm/dma-mapping.c     |  3 +--
 arch/mips/mm/dma-noncoherent.c  |  3 +--
 arch/riscv/mm/dma-noncoherent.c |  3 +--
 drivers/acpi/scan.c             |  7 +------
 drivers/hv/hv_common.c          |  6 +-----
 drivers/of/device.c             |  4 +---
 include/linux/dma-map-ops.h     |  6 ++----
 10 files changed, 19 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index 197707bc7658..6b85e94f3275 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -90,8 +90,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 /*
  * Plug in direct dma map ops.
  */
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 	/*
 	 * IOC hardware snoops all DMA traffic keeping the caches consistent
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
index b94850b57995..97db5397c320 100644
--- a/arch/arm/mm/dma-mapping-nommu.c
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -33,8 +33,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 	}
 }
 
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 	if (IS_ENABLED(CONFIG_CPU_V7M)) {
 		/*
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index f68db05eba29..5adf1769eee4 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1709,11 +1709,15 @@ void arm_iommu_detach_device(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(arm_iommu_detach_device);
 
-static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    bool coherent)
+static void arm_setup_iommu_dma_ops(struct device *dev)
 {
 	struct dma_iommu_mapping *mapping;
+	u64 dma_base = 0, size = 1ULL << 32;
 
+	if (dev->dma_range_map) {
+		dma_base = dma_range_map_min(dev->dma_range_map);
+		size = dma_range_map_max(dev->dma_range_map) - dma_base;
+	}
 	mapping = arm_iommu_create_mapping(dev->bus, dma_base, size);
 	if (IS_ERR(mapping)) {
 		pr_warn("Failed to create %llu-byte IOMMU mapping for device %s\n",
@@ -1744,8 +1748,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev)
 
 #else
 
-static void arm_setup_iommu_dma_ops(struct device *dev, u64 dma_base, u64 size,
-				    bool coherent)
+static void arm_setup_iommu_dma_ops(struct device *dev)
 {
 }
 
@@ -1753,8 +1756,7 @@ static void arm_teardown_iommu_dma_ops(struct device *dev) { }
 
 #endif	/* CONFIG_ARM_DMA_USE_IOMMU */
 
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 	/*
 	 * Due to legacy code that sets the ->dma_coherent flag from a bus
@@ -1774,7 +1776,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 		return;
 
 	if (device_iommu_mapped(dev))
-		arm_setup_iommu_dma_ops(dev, dma_base, size, coherent);
+		arm_setup_iommu_dma_ops(dev);
 
 	xen_setup_dma_ops(dev);
 	dev->archdata.dma_ops_setup = true;
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 313d8938a2f0..0b320a25a471 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -46,8 +46,7 @@ void arch_teardown_dma_ops(struct device *dev)
 }
 #endif
 
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 	int cls = cache_line_size_of_cpu();
 
diff --git a/arch/mips/mm/dma-noncoherent.c b/arch/mips/mm/dma-noncoherent.c
index 0f3cec663a12..ab4f2a75a7d0 100644
--- a/arch/mips/mm/dma-noncoherent.c
+++ b/arch/mips/mm/dma-noncoherent.c
@@ -137,8 +137,7 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
 #endif
 
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-		bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 	dev->dma_coherent = coherent;
 }
diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c
index 843107f834b2..cb89d7e0ba88 100644
--- a/arch/riscv/mm/dma-noncoherent.c
+++ b/arch/riscv/mm/dma-noncoherent.c
@@ -128,8 +128,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
 	ALT_CMO_OP(FLUSH, flush_addr, size, riscv_cbom_block_size);
 }
 
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-			bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 	WARN_TAINT(!coherent && riscv_cbom_block_size > ARCH_DMA_MINALIGN,
 		   TAINT_CPU_OUT_OF_SPEC,
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 7c157bf92695..b1a88992c1a9 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1675,12 +1675,7 @@ int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
 	if (ret == -EPROBE_DEFER)
 		return -EPROBE_DEFER;
 
-	/*
-	 * Historically this routine doesn't fail driver probing due to errors
-	 * in acpi_iommu_configure_id()
-	 */
-
-	arch_setup_dma_ops(dev, 0, U64_MAX, attr == DEV_DMA_COHERENT);
+	arch_setup_dma_ops(dev, attr == DEV_DMA_COHERENT);
 
 	return 0;
 }
diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c
index dde3f9b6871a..9c452bfbd571 100644
--- a/drivers/hv/hv_common.c
+++ b/drivers/hv/hv_common.c
@@ -561,11 +561,7 @@ EXPORT_SYMBOL_GPL(hv_query_ext_cap);
 
 void hv_setup_dma_ops(struct device *dev, bool coherent)
 {
-	/*
-	 * Hyper-V does not offer a vIOMMU in the guest
-	 * VM, so pass 0/NULL for the IOMMU settings
-	 */
-	arch_setup_dma_ops(dev, 0, 0, coherent);
+	arch_setup_dma_ops(dev, coherent);
 }
 EXPORT_SYMBOL_GPL(hv_setup_dma_ops);
 
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 9e7963972fa7..312c63361211 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -95,7 +95,6 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 {
 	const struct bus_dma_region *map = NULL;
 	struct device_node *bus_np;
-	u64 dma_start = 0;
 	u64 mask, end = 0;
 	bool coherent;
 	int iommu_ret;
@@ -118,7 +117,6 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 			return ret == -ENODEV ? 0 : ret;
 	} else {
 		/* Determine the overall bounds of all DMA regions */
-		dma_start = dma_range_map_min(map);
 		end = dma_range_map_max(map);
 	}
 
@@ -175,7 +173,7 @@ int of_dma_configure_id(struct device *dev, struct device_node *np,
 	} else
 		dev_dbg(dev, "device is behind an iommu\n");
 
-	arch_setup_dma_ops(dev, dma_start, end - dma_start + 1, coherent);
+	arch_setup_dma_ops(dev, coherent);
 
 	if (iommu_ret)
 		of_dma_set_restricted_buffer(dev, np);
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f04209..ed89e1ce0114 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -426,11 +426,9 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg,
 #endif
 
 #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
-		bool coherent);
+void arch_setup_dma_ops(struct device *dev, bool coherent);
 #else
-static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base,
-		u64 size, bool coherent)
+static inline void arch_setup_dma_ops(struct device *dev, bool coherent)
 {
 }
 #endif /* CONFIG_ARCH_HAS_SETUP_DMA_OPS */
-- 
cgit v1.2.3


From 80af5a45202422db957549a241e00bf4d4e0ce89 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Thu, 18 Apr 2024 10:33:59 +0000
Subject: iommu: Add ops->domain_alloc_sva()

Make a new op that receives the device and the mm_struct that the SVA
domain should be created for. Unlike domain_alloc_paging() the dev
argument is never NULL here.

This allows drivers to fully initialize the SVA domain and allocate the
mmu_notifier during allocation. It allows the notifier lifetime to follow
the lifetime of the iommu_domain.

Since we have only one call site, upgrade the new op to return ERR_PTR
instead of NULL.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
[Removed smmu3 related changes - Vasant]
Signed-off-by: Vasant Hegde <vasant.hegde@amd.com>
Reviewed-by: Tina Zhang <tina.zhang@intel.com>
Link: https://lore.kernel.org/r/20240418103400.6229-15-vasant.hegde@amd.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/iommu-sva.c | 16 +++++++++++-----
 include/linux/iommu.h     |  3 +++
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c
index 640acc804e8c..18a35e798b72 100644
--- a/drivers/iommu/iommu-sva.c
+++ b/drivers/iommu/iommu-sva.c
@@ -108,8 +108,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm
 
 	/* Allocate a new domain and set it on device pasid. */
 	domain = iommu_sva_domain_alloc(dev, mm);
-	if (!domain) {
-		ret = -ENOMEM;
+	if (IS_ERR(domain)) {
+		ret = PTR_ERR(domain);
 		goto out_free_handle;
 	}
 
@@ -283,9 +283,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev,
 	const struct iommu_ops *ops = dev_iommu_ops(dev);
 	struct iommu_domain *domain;
 
-	domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
-	if (!domain)
-		return NULL;
+	if (ops->domain_alloc_sva) {
+		domain = ops->domain_alloc_sva(dev, mm);
+		if (IS_ERR(domain))
+			return domain;
+	} else {
+		domain = ops->domain_alloc(IOMMU_DOMAIN_SVA);
+		if (!domain)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	domain->type = IOMMU_DOMAIN_SVA;
 	mmgrab(mm);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2e925b5eba53..8aabe83af8f2 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -518,6 +518,7 @@ static inline int __iommu_copy_struct_from_user_array(
  *                     Upon failure, ERR_PTR must be returned.
  * @domain_alloc_paging: Allocate an iommu_domain that can be used for
  *                       UNMANAGED, DMA, and DMA_FQ domain types.
+ * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing.
  * @probe_device: Add device to iommu driver handling
  * @release_device: Remove device from iommu driver handling
  * @probe_finalize: Do final setup work after the device is added to an IOMMU
@@ -558,6 +559,8 @@ struct iommu_ops {
 		struct device *dev, u32 flags, struct iommu_domain *parent,
 		const struct iommu_user_data *user_data);
 	struct iommu_domain *(*domain_alloc_paging)(struct device *dev);
+	struct iommu_domain *(*domain_alloc_sva)(struct device *dev,
+						 struct mm_struct *mm);
 
 	struct iommu_device *(*probe_device)(struct device *dev);
 	void (*release_device)(struct device *dev);
-- 
cgit v1.2.3


From aa7b148855e8265c4f72ae0305842efdda72680b Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 13 Mar 2024 14:03:19 +0000
Subject: media: v4l: Set sub-device's owner field to the caller's module

Set a sub-device's owner field to the caller's module, provided as an
argument to the function. v4l2_device_register_subdev() becomes a macro
passing THIS_MODULE to the __v4l2_device_register_subdev() function.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-async.c  |  2 +-
 drivers/media/v4l2-core/v4l2-device.c | 12 +++++++-----
 drivers/media/v4l2-core/v4l2-i2c.c    |  2 +-
 drivers/media/v4l2-core/v4l2-spi.c    |  2 +-
 include/media/v4l2-device.h           |  7 +++++--
 5 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-async.c b/drivers/media/v4l2-core/v4l2-async.c
index 4bb073587817..b28f6811a5f4 100644
--- a/drivers/media/v4l2-core/v4l2-async.c
+++ b/drivers/media/v4l2-core/v4l2-async.c
@@ -341,7 +341,7 @@ static int v4l2_async_match_notify(struct v4l2_async_notifier *notifier,
 	int ret;
 
 	if (list_empty(&sd->asc_list)) {
-		ret = v4l2_device_register_subdev(v4l2_dev, sd);
+		ret = __v4l2_device_register_subdev(v4l2_dev, sd, sd->owner);
 		if (ret < 0)
 			return ret;
 		registered = true;
diff --git a/drivers/media/v4l2-core/v4l2-device.c b/drivers/media/v4l2-core/v4l2-device.c
index d2e58ae91f9b..5e537454f5cd 100644
--- a/drivers/media/v4l2-core/v4l2-device.c
+++ b/drivers/media/v4l2-core/v4l2-device.c
@@ -108,8 +108,8 @@ void v4l2_device_unregister(struct v4l2_device *v4l2_dev)
 }
 EXPORT_SYMBOL_GPL(v4l2_device_unregister);
 
-int v4l2_device_register_subdev(struct v4l2_device *v4l2_dev,
-				struct v4l2_subdev *sd)
+int __v4l2_device_register_subdev(struct v4l2_device *v4l2_dev,
+				  struct v4l2_subdev *sd, struct module *module)
 {
 	int err;
 
@@ -125,9 +125,9 @@ int v4l2_device_register_subdev(struct v4l2_device *v4l2_dev,
 	 * try_module_get() such sub-device owners.
 	 */
 	sd->owner_v4l2_dev = v4l2_dev->dev && v4l2_dev->dev->driver &&
-		sd->owner == v4l2_dev->dev->driver->owner;
+		module == v4l2_dev->dev->driver->owner;
 
-	if (!sd->owner_v4l2_dev && !try_module_get(sd->owner))
+	if (!sd->owner_v4l2_dev && !try_module_get(module))
 		return -ENODEV;
 
 	sd->v4l2_dev = v4l2_dev;
@@ -152,6 +152,8 @@ int v4l2_device_register_subdev(struct v4l2_device *v4l2_dev,
 			goto error_unregister;
 	}
 
+	sd->owner = module;
+
 	spin_lock(&v4l2_dev->lock);
 	list_add_tail(&sd->list, &v4l2_dev->subdevs);
 	spin_unlock(&v4l2_dev->lock);
@@ -168,7 +170,7 @@ error_module:
 	sd->v4l2_dev = NULL;
 	return err;
 }
-EXPORT_SYMBOL_GPL(v4l2_device_register_subdev);
+EXPORT_SYMBOL_GPL(__v4l2_device_register_subdev);
 
 static void v4l2_subdev_release(struct v4l2_subdev *sd)
 {
diff --git a/drivers/media/v4l2-core/v4l2-i2c.c b/drivers/media/v4l2-core/v4l2-i2c.c
index b4acca75644b..586c46544255 100644
--- a/drivers/media/v4l2-core/v4l2-i2c.c
+++ b/drivers/media/v4l2-core/v4l2-i2c.c
@@ -100,7 +100,7 @@ struct v4l2_subdev
 	 * Register with the v4l2_device which increases the module's
 	 * use count as well.
 	 */
-	if (v4l2_device_register_subdev(v4l2_dev, sd))
+	if (__v4l2_device_register_subdev(v4l2_dev, sd, sd->owner))
 		sd = NULL;
 	/* Decrease the module use count to match the first try_module_get. */
 	module_put(client->dev.driver->owner);
diff --git a/drivers/media/v4l2-core/v4l2-spi.c b/drivers/media/v4l2-core/v4l2-spi.c
index a7092c3930d6..1baf8e63f19e 100644
--- a/drivers/media/v4l2-core/v4l2-spi.c
+++ b/drivers/media/v4l2-core/v4l2-spi.c
@@ -59,7 +59,7 @@ struct v4l2_subdev *v4l2_spi_new_subdev(struct v4l2_device *v4l2_dev,
 	 * Register with the v4l2_device which increases the module's
 	 * use count as well.
 	 */
-	if (v4l2_device_register_subdev(v4l2_dev, sd))
+	if (__v4l2_device_register_subdev(v4l2_dev, sd, sd->owner))
 		sd = NULL;
 
 	/* Decrease the module use count to match the first try_module_get. */
diff --git a/include/media/v4l2-device.h b/include/media/v4l2-device.h
index f6f111fae33c..dd897a362f36 100644
--- a/include/media/v4l2-device.h
+++ b/include/media/v4l2-device.h
@@ -156,8 +156,11 @@ void v4l2_device_unregister(struct v4l2_device *v4l2_dev);
  * An error is returned if the module is no longer loaded on any attempts
  * to register it.
  */
-int __must_check v4l2_device_register_subdev(struct v4l2_device *v4l2_dev,
-					     struct v4l2_subdev *sd);
+#define v4l2_device_register_subdev(v4l2_dev, sd) \
+	__v4l2_device_register_subdev(v4l2_dev, sd, THIS_MODULE)
+int __must_check __v4l2_device_register_subdev(struct v4l2_device *v4l2_dev,
+					       struct v4l2_subdev *sd,
+					       struct module *module);
 
 /**
  * v4l2_device_unregister_subdev - Unregisters a subdev with a v4l2 device.
-- 
cgit v1.2.3


From 8a718752f5c339137c5b05e54f116cd26d5a4143 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 13 Mar 2024 13:57:50 +0000
Subject: media: v4l: async: Set owner for async sub-devices

Set the owner field of the async sub-devices by making
v4l2_async_register_subdev() a macro and obtaining THIS_MODULE that way.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/v4l2-core/v4l2-async.c | 8 ++++++--
 include/media/v4l2-async.h           | 4 +++-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-async.c b/drivers/media/v4l2-core/v4l2-async.c
index b28f6811a5f4..3a22da443a22 100644
--- a/drivers/media/v4l2-core/v4l2-async.c
+++ b/drivers/media/v4l2-core/v4l2-async.c
@@ -783,7 +783,7 @@ v4l2_async_connection_unique(struct v4l2_subdev *sd)
 }
 EXPORT_SYMBOL_GPL(v4l2_async_connection_unique);
 
-int v4l2_async_register_subdev(struct v4l2_subdev *sd)
+int __v4l2_async_register_subdev(struct v4l2_subdev *sd, struct module *module)
 {
 	struct v4l2_async_notifier *subdev_notifier;
 	struct v4l2_async_notifier *notifier;
@@ -807,6 +807,8 @@ int v4l2_async_register_subdev(struct v4l2_subdev *sd)
 		return -EINVAL;
 	}
 
+	sd->owner = module;
+
 	mutex_lock(&list_lock);
 
 	list_for_each_entry(notifier, &notifier_list, notifier_entry) {
@@ -849,9 +851,11 @@ err_unbind:
 
 	mutex_unlock(&list_lock);
 
+	sd->owner = NULL;
+
 	return ret;
 }
-EXPORT_SYMBOL(v4l2_async_register_subdev);
+EXPORT_SYMBOL(__v4l2_async_register_subdev);
 
 void v4l2_async_unregister_subdev(struct v4l2_subdev *sd)
 {
diff --git a/include/media/v4l2-async.h b/include/media/v4l2-async.h
index 9bd326d31181..f26c323e9c96 100644
--- a/include/media/v4l2-async.h
+++ b/include/media/v4l2-async.h
@@ -310,7 +310,9 @@ void v4l2_async_nf_cleanup(struct v4l2_async_notifier *notifier);
  *
  * @sd: pointer to &struct v4l2_subdev
  */
-int v4l2_async_register_subdev(struct v4l2_subdev *sd);
+#define v4l2_async_register_subdev(sd) \
+	__v4l2_async_register_subdev(sd, THIS_MODULE)
+int __v4l2_async_register_subdev(struct v4l2_subdev *sd, struct module *module);
 
 /**
  * v4l2_async_register_subdev_sensor - registers a sensor sub-device to the
-- 
cgit v1.2.3


From 96a8326d69fffd7e8c2eb737dc060d8009a33b39 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@nvidia.com>
Date: Wed, 24 Apr 2024 12:40:45 +0200
Subject: virtio: add debugfs infrastructure to allow to debug virtio features

Currently there is no way for user to set what features the driver
should obey or not, it is hard wired in the code.

In order to be able to debug the device behavior in case some feature is
disabled, introduce a debugfs infrastructure with couple of files
allowing user to see what features the device advertises and
to set filter for features used by driver.

Example:
$cat /sys/bus/virtio/devices/virtio0/features
1110010111111111111101010000110010000000100000000000000000000000
$ echo "5" >/sys/kernel/debug/virtio/virtio0/filter_feature_add
$ cat /sys/kernel/debug/virtio/virtio0/filter_features
5
$ echo "virtio0" > /sys/bus/virtio/drivers/virtio_net/unbind
$ echo "virtio0" > /sys/bus/virtio/drivers/virtio_net/bind
$ cat /sys/bus/virtio/devices/virtio0/features
1110000111111111111101010000110010000000100000000000000000000000

Note that sysfs "features" now already exists, this patch does not
touch it.

Signed-off-by: Jiri Pirko <jiri@nvidia.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/virtio/Kconfig        |  10 ++++
 drivers/virtio/Makefile       |   1 +
 drivers/virtio/virtio.c       |   8 +++
 drivers/virtio/virtio_debug.c | 114 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio.h        |  35 +++++++++++++
 5 files changed, 168 insertions(+)
 create mode 100644 drivers/virtio/virtio_debug.c

(limited to 'include')

diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index c17193544268..6284538a8184 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -178,4 +178,14 @@ config VIRTIO_DMA_SHARED_BUFFER
 	 This option adds a flavor of dma buffers that are backed by
 	 virtio resources.
 
+config VIRTIO_DEBUG
+	bool "Debug facilities"
+	depends on VIRTIO
+	help
+	  Enable this to expose debug facilities over debugfs.
+	  This allows to debug features, to see what features the device
+	  advertises and to set filter for features used by driver.
+
+	  If unsure, say N.
+
 endif # VIRTIO_MENU
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 73ace62af440..58b2b0489fc9 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_VIRTIO_INPUT) += virtio_input.o
 obj-$(CONFIG_VIRTIO_VDPA) += virtio_vdpa.o
 obj-$(CONFIG_VIRTIO_MEM) += virtio_mem.o
 obj-$(CONFIG_VIRTIO_DMA_SHARED_BUFFER) += virtio_dma_buf.o
+obj-$(CONFIG_VIRTIO_DEBUG) += virtio_debug.o
diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
index 9510c551dce8..b968b2aa5f4d 100644
--- a/drivers/virtio/virtio.c
+++ b/drivers/virtio/virtio.c
@@ -274,6 +274,9 @@ static int virtio_dev_probe(struct device *_d)
 	else
 		dev->features = driver_features_legacy & device_features;
 
+	/* When debugging, user may filter some features by hand. */
+	virtio_debug_device_filter_features(dev);
+
 	/* Transport features always preserved to pass to finalize_features. */
 	for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++)
 		if (device_features & (1ULL << i))
@@ -465,6 +468,8 @@ int register_virtio_device(struct virtio_device *dev)
 	/* Acknowledge that we've seen the device. */
 	virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE);
 
+	virtio_debug_device_init(dev);
+
 	/*
 	 * device_add() causes the bus infrastructure to look for a matching
 	 * driver.
@@ -496,6 +501,7 @@ void unregister_virtio_device(struct virtio_device *dev)
 	int index = dev->index; /* save for after device release */
 
 	device_unregister(&dev->dev);
+	virtio_debug_device_exit(dev);
 	ida_free(&virtio_index_ida, index);
 }
 EXPORT_SYMBOL_GPL(unregister_virtio_device);
@@ -590,11 +596,13 @@ static int virtio_init(void)
 {
 	if (bus_register(&virtio_bus) != 0)
 		panic("virtio bus registration failed");
+	virtio_debug_init();
 	return 0;
 }
 
 static void __exit virtio_exit(void)
 {
+	virtio_debug_exit();
 	bus_unregister(&virtio_bus);
 	ida_destroy(&virtio_index_ida);
 }
diff --git a/drivers/virtio/virtio_debug.c b/drivers/virtio/virtio_debug.c
new file mode 100644
index 000000000000..95c8fc7705bb
--- /dev/null
+++ b/drivers/virtio/virtio_debug.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/debugfs.h>
+
+static struct dentry *virtio_debugfs_dir;
+
+static int virtio_debug_device_features_show(struct seq_file *s, void *data)
+{
+	struct virtio_device *dev = s->private;
+	u64 device_features;
+	unsigned int i;
+
+	device_features = dev->config->get_features(dev);
+	for (i = 0; i < BITS_PER_LONG_LONG; i++) {
+		if (device_features & (1ULL << i))
+			seq_printf(s, "%u\n", i);
+	}
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(virtio_debug_device_features);
+
+static int virtio_debug_filter_features_show(struct seq_file *s, void *data)
+{
+	struct virtio_device *dev = s->private;
+	unsigned int i;
+
+	for (i = 0; i < BITS_PER_LONG_LONG; i++) {
+		if (dev->debugfs_filter_features & (1ULL << i))
+			seq_printf(s, "%u\n", i);
+	}
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(virtio_debug_filter_features);
+
+static int virtio_debug_filter_features_clear(void *data, u64 val)
+{
+	struct virtio_device *dev = data;
+
+	if (val == 1)
+		dev->debugfs_filter_features = 0;
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(virtio_debug_filter_features_clear_fops, NULL,
+			 virtio_debug_filter_features_clear, "%llu\n");
+
+static int virtio_debug_filter_feature_add(void *data, u64 val)
+{
+	struct virtio_device *dev = data;
+
+	if (val >= BITS_PER_LONG_LONG)
+		return -EINVAL;
+	dev->debugfs_filter_features |= BIT_ULL_MASK(val);
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(virtio_debug_filter_feature_add_fops, NULL,
+			 virtio_debug_filter_feature_add, "%llu\n");
+
+static int virtio_debug_filter_feature_del(void *data, u64 val)
+{
+	struct virtio_device *dev = data;
+
+	if (val >= BITS_PER_LONG_LONG)
+		return -EINVAL;
+	dev->debugfs_filter_features &= ~BIT_ULL_MASK(val);
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(virtio_debug_filter_feature_del_fops, NULL,
+			 virtio_debug_filter_feature_del, "%llu\n");
+
+void virtio_debug_device_init(struct virtio_device *dev)
+{
+	dev->debugfs_dir = debugfs_create_dir(dev_name(&dev->dev),
+					      virtio_debugfs_dir);
+	debugfs_create_file("device_features", 0400, dev->debugfs_dir, dev,
+			    &virtio_debug_device_features_fops);
+	debugfs_create_file("filter_features", 0400, dev->debugfs_dir, dev,
+			    &virtio_debug_filter_features_fops);
+	debugfs_create_file("filter_features_clear", 0200, dev->debugfs_dir, dev,
+			    &virtio_debug_filter_features_clear_fops);
+	debugfs_create_file("filter_feature_add", 0200, dev->debugfs_dir, dev,
+			    &virtio_debug_filter_feature_add_fops);
+	debugfs_create_file("filter_feature_del", 0200, dev->debugfs_dir, dev,
+			    &virtio_debug_filter_feature_del_fops);
+}
+EXPORT_SYMBOL_GPL(virtio_debug_device_init);
+
+void virtio_debug_device_filter_features(struct virtio_device *dev)
+{
+	dev->features &= ~dev->debugfs_filter_features;
+}
+EXPORT_SYMBOL_GPL(virtio_debug_device_filter_features);
+
+void virtio_debug_device_exit(struct virtio_device *dev)
+{
+	debugfs_remove_recursive(dev->debugfs_dir);
+}
+EXPORT_SYMBOL_GPL(virtio_debug_device_exit);
+
+void virtio_debug_init(void)
+{
+	virtio_debugfs_dir = debugfs_create_dir("virtio", NULL);
+}
+EXPORT_SYMBOL_GPL(virtio_debug_init);
+
+void virtio_debug_exit(void)
+{
+	debugfs_remove_recursive(virtio_debugfs_dir);
+}
+EXPORT_SYMBOL_GPL(virtio_debug_exit);
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 26c4325aa373..96fea920873b 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -126,6 +126,8 @@ struct virtio_admin_cmd {
  * @vqs: the list of virtqueues for this device.
  * @features: the features supported by both driver and device.
  * @priv: private pointer for the driver's use.
+ * @debugfs_dir: debugfs directory entry.
+ * @debugfs_filter_features: features to be filtered set by debugfs.
  */
 struct virtio_device {
 	int index;
@@ -141,6 +143,10 @@ struct virtio_device {
 	struct list_head vqs;
 	u64 features;
 	void *priv;
+#ifdef CONFIG_VIRTIO_DEBUG
+	struct dentry *debugfs_dir;
+	u64 debugfs_filter_features;
+#endif
 };
 
 #define dev_to_virtio(_dev)	container_of_const(_dev, struct virtio_device, dev)
@@ -237,4 +243,33 @@ void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t a
 void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr,
 						unsigned long offset, size_t size,
 						enum dma_data_direction dir);
+
+#ifdef CONFIG_VIRTIO_DEBUG
+void virtio_debug_device_init(struct virtio_device *dev);
+void virtio_debug_device_exit(struct virtio_device *dev);
+void virtio_debug_device_filter_features(struct virtio_device *dev);
+void virtio_debug_init(void);
+void virtio_debug_exit(void);
+#else
+static inline void virtio_debug_device_init(struct virtio_device *dev)
+{
+}
+
+static inline void virtio_debug_device_exit(struct virtio_device *dev)
+{
+}
+
+static inline void virtio_debug_device_filter_features(struct virtio_device *dev)
+{
+}
+
+static inline void virtio_debug_init(void)
+{
+}
+
+static inline void virtio_debug_exit(void)
+{
+}
+#endif
+
 #endif /* _LINUX_VIRTIO_H */
-- 
cgit v1.2.3


From 5cb2cb3cb20cb2618c877a50db3c013449cc4e75 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 25 Apr 2024 11:13:34 +0800
Subject: net: introduce rstreason to detect why the RST is sent

Add a new standalone file for the easy future extension to support
both active reset and passive reset in the TCP/DCCP/MPTCP protocols.

This patch only does the preparations for reset reason mechanism,
nothing else changes.

The reset reasons are divided into three parts:
1) reuse drop reasons for passive reset in TCP
2) our own independent reasons which aren't relying on other reasons at all
3) reuse MP_TCPRST option for MPTCP

The benefits of a standalone reset reason are listed here:
1) it can cover more than one case, such as reset reasons in MPTCP,
active reset reasons.
2) people can easily/fastly understand and maintain this mechanism.
3) we get unified format of output with prefix stripped.
4) more new reset reasons are on the way
...

I will implement the basic codes of active/passive reset reason in
those three protocols, which are not complete for this moment. For
passive reset part in TCP, I only introduce the NO_SOCKET common case
which could be set as an example.

After this series applied, it will have the ability to open a new
gate to let other people contribute more reasons into it :)

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/rstreason.h | 106 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 include/net/rstreason.h

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
new file mode 100644
index 000000000000..bc53b5a24505
--- /dev/null
+++ b/include/net/rstreason.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _LINUX_RSTREASON_H
+#define _LINUX_RSTREASON_H
+#include <net/dropreason-core.h>
+#include <uapi/linux/mptcp.h>
+
+#define DEFINE_RST_REASON(FN, FNe)	\
+	FN(NOT_SPECIFIED)		\
+	FN(NO_SOCKET)			\
+	FN(MPTCP_RST_EUNSPEC)		\
+	FN(MPTCP_RST_EMPTCP)		\
+	FN(MPTCP_RST_ERESOURCE)		\
+	FN(MPTCP_RST_EPROHIBIT)		\
+	FN(MPTCP_RST_EWQ2BIG)		\
+	FN(MPTCP_RST_EBADPERF)		\
+	FN(MPTCP_RST_EMIDDLEBOX)	\
+	FN(ERROR)			\
+	FNe(MAX)
+
+/**
+ * enum sk_rst_reason - the reasons of socket reset
+ *
+ * The reasons of sk reset, which are used in DCCP/TCP/MPTCP protocols.
+ *
+ * There are three parts in order:
+ * 1) skb drop reasons: relying on drop reasons for such as passive reset
+ * 2) independent reset reasons: such as active reset reasons
+ * 3) reset reasons in MPTCP: only for MPTCP use
+ */
+enum sk_rst_reason {
+	/* Refer to include/net/dropreason-core.h
+	 * Rely on skb drop reasons because it indicates exactly why RST
+	 * could happen.
+	 */
+	/** @SK_RST_REASON_NOT_SPECIFIED: reset reason is not specified */
+	SK_RST_REASON_NOT_SPECIFIED,
+	/** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */
+	SK_RST_REASON_NO_SOCKET,
+
+	/* Copy from include/uapi/linux/mptcp.h.
+	 * These reset fields will not be changed since they adhere to
+	 * RFC 8684. So do not touch them. I'm going to list each definition
+	 * of them respectively.
+	 */
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_EUNSPEC: Unspecified error.
+	 * This is the default error; it implies that the subflow is no
+	 * longer available. The presence of this option shows that the
+	 * RST was generated by an MPTCP-aware device.
+	 */
+	SK_RST_REASON_MPTCP_RST_EUNSPEC,
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_EMPTCP: MPTCP-specific error.
+	 * An error has been detected in the processing of MPTCP options.
+	 * This is the usual reason code to return in the cases where a RST
+	 * is being sent to close a subflow because of an invalid response.
+	 */
+	SK_RST_REASON_MPTCP_RST_EMPTCP,
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_ERESOURCE: Lack of resources.
+	 * This code indicates that the sending host does not have enough
+	 * resources to support the terminated subflow.
+	 */
+	SK_RST_REASON_MPTCP_RST_ERESOURCE,
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_EPROHIBIT: Administratively prohibited.
+	 * This code indicates that the requested subflow is prohibited by
+	 * the policies of the sending host.
+	 */
+	SK_RST_REASON_MPTCP_RST_EPROHIBIT,
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_EWQ2BIG: Too much outstanding data.
+	 * This code indicates that there is an excessive amount of data
+	 * that needs to be transmitted over the terminated subflow while
+	 * having already been acknowledged over one or more other subflows.
+	 * This may occur if a path has been unavailable for a short period
+	 * and it is more efficient to reset and start again than it is to
+	 * retransmit the queued data.
+	 */
+	SK_RST_REASON_MPTCP_RST_EWQ2BIG,
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_EBADPERF: Unacceptable performance.
+	 * This code indicates that the performance of this subflow was
+	 * too low compared to the other subflows of this Multipath TCP
+	 * connection.
+	 */
+	SK_RST_REASON_MPTCP_RST_EBADPERF,
+	/**
+	 * @SK_RST_REASON_MPTCP_RST_EMIDDLEBOX: Middlebox interference.
+	 * Middlebox interference has been detected over this subflow,
+	 * making MPTCP signaling invalid. For example, this may be sent
+	 * if the checksum does not validate.
+	 */
+	SK_RST_REASON_MPTCP_RST_EMIDDLEBOX,
+
+	/** @SK_RST_REASON_ERROR: unexpected error happens */
+	SK_RST_REASON_ERROR,
+
+	/**
+	 * @SK_RST_REASON_MAX: Maximum of socket reset reasons.
+	 * It shouldn't be used as a real 'reason'.
+	 */
+	SK_RST_REASON_MAX,
+};
+#endif
-- 
cgit v1.2.3


From 6be49deaa09576c141002a2e6f816a1709bc2c86 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 25 Apr 2024 11:13:35 +0800
Subject: rstreason: prepare for passive reset

Adjust the parameter and support passing reason of reset which
is for now NOT_SPECIFIED. No functional changes.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/request_sock.h |  4 +++-
 net/dccp/ipv4.c            | 10 ++++++----
 net/dccp/ipv6.c            | 10 ++++++----
 net/dccp/minisocks.c       |  3 ++-
 net/ipv4/tcp_ipv4.c        | 12 +++++++-----
 net/ipv4/tcp_minisocks.c   |  3 ++-
 net/ipv6/tcp_ipv6.c        | 15 +++++++++------
 net/mptcp/subflow.c        |  8 +++++---
 8 files changed, 40 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 004e651e6067..bdc737832da6 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -18,6 +18,7 @@
 #include <linux/refcount.h>
 
 #include <net/sock.h>
+#include <net/rstreason.h>
 
 struct request_sock;
 struct sk_buff;
@@ -34,7 +35,8 @@ struct request_sock_ops {
 	void		(*send_ack)(const struct sock *sk, struct sk_buff *skb,
 				    struct request_sock *req);
 	void		(*send_reset)(const struct sock *sk,
-				      struct sk_buff *skb);
+				      struct sk_buff *skb,
+				      enum sk_rst_reason reason);
 	void		(*destructor)(struct request_sock *req);
 	void		(*syn_ack_timeout)(const struct request_sock *req);
 };
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 9fc9cea4c251..ff41bd6f99c3 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -24,6 +24,7 @@
 #include <net/xfrm.h>
 #include <net/secure_seq.h>
 #include <net/netns/generic.h>
+#include <net/rstreason.h>
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -521,7 +522,8 @@ out:
 	return err;
 }
 
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
+				   enum sk_rst_reason reason)
 {
 	int err;
 	const struct iphdr *rxiph;
@@ -706,7 +708,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 reset:
-	dccp_v4_ctl_send_reset(sk, skb);
+	dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 	kfree_skb(skb);
 	return 0;
 }
@@ -869,7 +871,7 @@ lookup:
 		if (nsk == sk) {
 			reqsk_put(req);
 		} else if (dccp_child_process(sk, nsk, skb)) {
-			dccp_v4_ctl_send_reset(sk, skb);
+			dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 			goto discard_and_relse;
 		} else {
 			sock_put(sk);
@@ -909,7 +911,7 @@ no_dccp_socket:
 	if (dh->dccph_type != DCCP_PKT_RESET) {
 		DCCP_SKB_CB(skb)->dccpd_reset_code =
 					DCCP_RESET_CODE_NO_CONNECTION;
-		dccp_v4_ctl_send_reset(sk, skb);
+		dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 	}
 
 discard_it:
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c8ca703dc331..85f4b8fdbe5e 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -29,6 +29,7 @@
 #include <net/secure_seq.h>
 #include <net/netns/generic.h>
 #include <net/sock.h>
+#include <net/rstreason.h>
 
 #include "dccp.h"
 #include "ipv6.h"
@@ -256,7 +257,8 @@ static void dccp_v6_reqsk_destructor(struct request_sock *req)
 	kfree_skb(inet_rsk(req)->pktopts);
 }
 
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
+static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb,
+				   enum sk_rst_reason reason)
 {
 	const struct ipv6hdr *rxip6h;
 	struct sk_buff *skb;
@@ -656,7 +658,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 reset:
-	dccp_v6_ctl_send_reset(sk, skb);
+	dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
 	if (opt_skb != NULL)
 		__kfree_skb(opt_skb);
@@ -762,7 +764,7 @@ lookup:
 		if (nsk == sk) {
 			reqsk_put(req);
 		} else if (dccp_child_process(sk, nsk, skb)) {
-			dccp_v6_ctl_send_reset(sk, skb);
+			dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 			goto discard_and_relse;
 		} else {
 			sock_put(sk);
@@ -801,7 +803,7 @@ no_dccp_socket:
 	if (dh->dccph_type != DCCP_PKT_RESET) {
 		DCCP_SKB_CB(skb)->dccpd_reset_code =
 					DCCP_RESET_CODE_NO_CONNECTION;
-		dccp_v6_ctl_send_reset(sk, skb);
+		dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 	}
 
 discard_it:
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 64d805b27add..251a57cf5822 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -15,6 +15,7 @@
 #include <net/sock.h>
 #include <net/xfrm.h>
 #include <net/inet_timewait_sock.h>
+#include <net/rstreason.h>
 
 #include "ackvec.h"
 #include "ccid.h"
@@ -202,7 +203,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
 drop:
 	if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
-		req->rsk_ops->send_reset(sk, skb);
+		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 
 	inet_csk_reqsk_queue_drop(sk, req);
 out:
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e06f0cd04f7e..202f6dfa958b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -70,6 +70,7 @@
 #include <net/xfrm.h>
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
+#include <net/rstreason.h>
 
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -723,7 +724,8 @@ out:
  *	Exception: precedence violation. We do not implement it in any case.
  */
 
-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
+			      enum sk_rst_reason reason)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct {
@@ -1934,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 reset:
-	tcp_v4_send_reset(rsk, skb);
+	tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
 	kfree_skb_reason(skb, reason);
 	/* Be careful here. If this function gets more complicated and
@@ -2285,7 +2287,7 @@ lookup:
 		} else {
 			drop_reason = tcp_child_process(sk, nsk, skb);
 			if (drop_reason) {
-				tcp_v4_send_reset(nsk, skb);
+				tcp_v4_send_reset(nsk, skb, SK_RST_REASON_NOT_SPECIFIED);
 				goto discard_and_relse;
 			}
 			sock_put(sk);
@@ -2364,7 +2366,7 @@ csum_error:
 bad_packet:
 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
 	} else {
-		tcp_v4_send_reset(NULL, skb);
+		tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
 	}
 
 discard_it:
@@ -2416,7 +2418,7 @@ do_time_wait:
 		tcp_v4_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v4_send_reset(sk, skb);
+		tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
 	case TCP_TW_SUCCESS:;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 146c061145b4..7d543569a180 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -22,6 +22,7 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/busy_poll.h>
+#include <net/rstreason.h>
 
 static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 {
@@ -878,7 +879,7 @@ embryonic_reset:
 		 * avoid becoming vulnerable to outside attack aiming at
 		 * resetting legit local connections.
 		 */
-		req->rsk_ops->send_reset(sk, skb);
+		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 	} else if (fastopen) { /* received a valid RST pkt */
 		reqsk_fastopen_remove(sk, req, true);
 		tcp_reset(sk, skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bb7c3caf4f85..017f6293b5f4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -60,6 +60,7 @@
 #include <net/secure_seq.h>
 #include <net/hotdata.h>
 #include <net/busy_poll.h>
+#include <net/rstreason.h>
 
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -69,7 +70,8 @@
 
 #include <trace/events/tcp.h>
 
-static void	tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
+			      enum sk_rst_reason reason);
 static void	tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
 				      struct request_sock *req);
 
@@ -1008,7 +1010,8 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	kfree_skb(buff);
 }
 
-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
+			      enum sk_rst_reason reason)
 {
 	const struct tcphdr *th = tcp_hdr(skb);
 	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
@@ -1677,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 reset:
-	tcp_v6_send_reset(sk, skb);
+	tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 discard:
 	if (opt_skb)
 		__kfree_skb(opt_skb);
@@ -1862,7 +1865,7 @@ lookup:
 		} else {
 			drop_reason = tcp_child_process(sk, nsk, skb);
 			if (drop_reason) {
-				tcp_v6_send_reset(nsk, skb);
+				tcp_v6_send_reset(nsk, skb, SK_RST_REASON_NOT_SPECIFIED);
 				goto discard_and_relse;
 			}
 			sock_put(sk);
@@ -1939,7 +1942,7 @@ csum_error:
 bad_packet:
 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
 	} else {
-		tcp_v6_send_reset(NULL, skb);
+		tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
 	}
 
 discard_it:
@@ -1995,7 +1998,7 @@ do_time_wait:
 		tcp_v6_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v6_send_reset(sk, skb);
+		tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
 	case TCP_TW_SUCCESS:
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b94d1dca1094..32fe2ef36d56 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -20,6 +20,8 @@
 #include <net/transp_v6.h>
 #endif
 #include <net/mptcp.h>
+#include <net/rstreason.h>
+
 #include "protocol.h"
 #include "mib.h"
 
@@ -308,7 +310,7 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
 
 	dst_release(dst);
 	if (!req->syncookie)
-		tcp_request_sock_ops.send_reset(sk, skb);
+		tcp_request_sock_ops.send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 	return NULL;
 }
 
@@ -376,7 +378,7 @@ static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
 
 	dst_release(dst);
 	if (!req->syncookie)
-		tcp6_request_sock_ops.send_reset(sk, skb);
+		tcp6_request_sock_ops.send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 	return NULL;
 }
 #endif
@@ -911,7 +913,7 @@ dispose_child:
 	tcp_rsk(req)->drop_req = true;
 	inet_csk_prepare_for_destroy_sock(child);
 	tcp_done(child);
-	req->rsk_ops->send_reset(sk, skb);
+	req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
 
 	/* The last child reference will be released by the caller */
 	return child;
-- 
cgit v1.2.3


From 5691276b39daf90294c6a81fb6d62d667f634c92 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 25 Apr 2024 11:13:36 +0800
Subject: rstreason: prepare for active reset

Like what we did to passive reset:
only passing possible reset reason in each active reset path.

No functional changes.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp.h     |  3 ++-
 net/ipv4/tcp.c        | 15 ++++++++++-----
 net/ipv4/tcp_output.c |  3 ++-
 net/ipv4/tcp_timer.c  |  9 ++++++---
 net/mptcp/protocol.c  |  4 +++-
 net/mptcp/subflow.c   |  5 +++--
 6 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index ffc9371fe9de..a9eb21251195 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -670,7 +670,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
 void tcp_send_probe0(struct sock *);
 int tcp_write_wakeup(struct sock *, int mib);
 void tcp_send_fin(struct sock *sk);
-void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+			   enum sk_rst_reason reason);
 int tcp_send_synack(struct sock *);
 void tcp_push_one(struct sock *, unsigned int mss_now);
 void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f23b97777ea5..4ec0f4feee00 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include <net/xfrm.h>
 #include <net/ip.h>
 #include <net/sock.h>
+#include <net/rstreason.h>
 
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
@@ -2811,7 +2812,8 @@ void __tcp_close(struct sock *sk, long timeout)
 		/* Unread data was tossed, zap the connection. */
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
 		tcp_set_state(sk, TCP_CLOSE);
-		tcp_send_active_reset(sk, sk->sk_allocation);
+		tcp_send_active_reset(sk, sk->sk_allocation,
+				      SK_RST_REASON_NOT_SPECIFIED);
 	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
 		/* Check zero linger _after_ checking for unread data. */
 		sk->sk_prot->disconnect(sk, 0);
@@ -2885,7 +2887,8 @@ adjudge_to_death:
 		struct tcp_sock *tp = tcp_sk(sk);
 		if (READ_ONCE(tp->linger2) < 0) {
 			tcp_set_state(sk, TCP_CLOSE);
-			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_send_active_reset(sk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 			__NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_TCPABORTONLINGER);
 		} else {
@@ -2903,7 +2906,8 @@ adjudge_to_death:
 	if (sk->sk_state != TCP_CLOSE) {
 		if (tcp_check_oom(sk, 0)) {
 			tcp_set_state(sk, TCP_CLOSE);
-			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_send_active_reset(sk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 			__NET_INC_STATS(sock_net(sk),
 					LINUX_MIB_TCPABORTONMEMORY);
 		} else if (!check_net(sock_net(sk))) {
@@ -3007,7 +3011,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 		/* The last check adjusts for discrepancy of Linux wrt. RFC
 		 * states
 		 */
-		tcp_send_active_reset(sk, gfp_any());
+		tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_NOT_SPECIFIED);
 		WRITE_ONCE(sk->sk_err, ECONNRESET);
 	} else if (old_state == TCP_SYN_SENT)
 		WRITE_ONCE(sk->sk_err, ECONNRESET);
@@ -4564,7 +4568,8 @@ int tcp_abort(struct sock *sk, int err)
 		smp_wmb();
 		sk_error_report(sk);
 		if (tcp_need_reset(sk->sk_state))
-			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_send_active_reset(sk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 		tcp_done(sk);
 	}
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ce59e4499b66..41c352bf3394 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3615,7 +3615,8 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by RFC 2525, section 2.17.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+			   enum sk_rst_reason reason)
 {
 	struct sk_buff *skb;
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 976db57b95d4..83fe7f62f7f1 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <net/tcp.h>
+#include <net/rstreason.h>
 
 static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
 {
@@ -127,7 +128,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 		    (!tp->snd_wnd && !tp->packets_out))
 			do_reset = true;
 		if (do_reset)
-			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_send_active_reset(sk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 		tcp_done(sk);
 		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
 		return 1;
@@ -768,7 +770,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
 				goto out;
 			}
 		}
-		tcp_send_active_reset(sk, GFP_ATOMIC);
+		tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
 		goto death;
 	}
 
@@ -795,7 +797,8 @@ static void tcp_keepalive_timer (struct timer_list *t)
 		    icsk->icsk_probes_out > 0) ||
 		    (user_timeout == 0 &&
 		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
-			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_send_active_reset(sk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 			tcp_write_err(sk);
 			goto out;
 		}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f8bc34f0d973..065967086492 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,6 +21,7 @@
 #endif
 #include <net/mptcp.h>
 #include <net/xfrm.h>
+#include <net/rstreason.h>
 #include <asm/ioctls.h>
 #include "protocol.h"
 #include "mib.h"
@@ -2569,7 +2570,8 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 
 		slow = lock_sock_fast(tcp_sk);
 		if (tcp_sk->sk_state != TCP_CLOSE) {
-			tcp_send_active_reset(tcp_sk, GFP_ATOMIC);
+			tcp_send_active_reset(tcp_sk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 			tcp_set_state(tcp_sk, TCP_CLOSE);
 		}
 		unlock_sock_fast(tcp_sk, slow);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 32fe2ef36d56..ac867d277860 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -414,7 +414,7 @@ void mptcp_subflow_reset(struct sock *ssk)
 	/* must hold: tcp_done() could drop last reference on parent */
 	sock_hold(sk);
 
-	tcp_send_active_reset(ssk, GFP_ATOMIC);
+	tcp_send_active_reset(ssk, GFP_ATOMIC, SK_RST_REASON_NOT_SPECIFIED);
 	tcp_done(ssk);
 	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
 		mptcp_schedule_work(sk);
@@ -1350,7 +1350,8 @@ reset:
 			tcp_set_state(ssk, TCP_CLOSE);
 			while ((skb = skb_peek(&ssk->sk_receive_queue)))
 				sk_eat_skb(ssk, skb);
-			tcp_send_active_reset(ssk, GFP_ATOMIC);
+			tcp_send_active_reset(ssk, GFP_ATOMIC,
+					      SK_RST_REASON_NOT_SPECIFIED);
 			WRITE_ONCE(subflow->data_avail, false);
 			return false;
 		}
-- 
cgit v1.2.3


From 120391ef9ca8fe8f82ea3f2961ad802043468226 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 25 Apr 2024 11:13:37 +0800
Subject: tcp: support rstreason for passive reset

Reuse the dropreason logic to show the exact reason of tcp reset,
so we can finally display the corresponding item in enum sk_reset_reason
instead of reinventing new reset reasons. This patch replaces all
the prior NOT_SPECIFIED reasons.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/rstreason.h | 15 +++++++++++++++
 net/ipv4/tcp_ipv4.c     | 11 +++++++----
 net/ipv6/tcp_ipv6.c     | 11 +++++++----
 3 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index bc53b5a24505..df3b6ac0c9b3 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -103,4 +103,19 @@ enum sk_rst_reason {
 	 */
 	SK_RST_REASON_MAX,
 };
+
+/* Convert skb drop reasons to enum sk_rst_reason type */
+static inline enum sk_rst_reason
+sk_rst_convert_drop_reason(enum skb_drop_reason reason)
+{
+	switch (reason) {
+	case SKB_DROP_REASON_NOT_SPECIFIED:
+		return SK_RST_REASON_NOT_SPECIFIED;
+	case SKB_DROP_REASON_NO_SOCKET:
+		return SK_RST_REASON_NO_SOCKET;
+	default:
+		/* If we don't have our own corresponding reason */
+		return SK_RST_REASON_NOT_SPECIFIED;
+	}
+}
 #endif
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 202f6dfa958b..621879ba916d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1936,7 +1936,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 reset:
-	tcp_v4_send_reset(rsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+	tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
 discard:
 	kfree_skb_reason(skb, reason);
 	/* Be careful here. If this function gets more complicated and
@@ -2287,7 +2287,10 @@ lookup:
 		} else {
 			drop_reason = tcp_child_process(sk, nsk, skb);
 			if (drop_reason) {
-				tcp_v4_send_reset(nsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+				enum sk_rst_reason rst_reason;
+
+				rst_reason = sk_rst_convert_drop_reason(drop_reason);
+				tcp_v4_send_reset(nsk, skb, rst_reason);
 				goto discard_and_relse;
 			}
 			sock_put(sk);
@@ -2366,7 +2369,7 @@ csum_error:
 bad_packet:
 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
 	} else {
-		tcp_v4_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+		tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
 	}
 
 discard_it:
@@ -2418,7 +2421,7 @@ do_time_wait:
 		tcp_v4_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v4_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+		tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason));
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
 	case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 017f6293b5f4..317d7a6e6b01 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1680,7 +1680,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	return 0;
 
 reset:
-	tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+	tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
 discard:
 	if (opt_skb)
 		__kfree_skb(opt_skb);
@@ -1865,7 +1865,10 @@ lookup:
 		} else {
 			drop_reason = tcp_child_process(sk, nsk, skb);
 			if (drop_reason) {
-				tcp_v6_send_reset(nsk, skb, SK_RST_REASON_NOT_SPECIFIED);
+				enum sk_rst_reason rst_reason;
+
+				rst_reason = sk_rst_convert_drop_reason(drop_reason);
+				tcp_v6_send_reset(nsk, skb, rst_reason);
 				goto discard_and_relse;
 			}
 			sock_put(sk);
@@ -1942,7 +1945,7 @@ csum_error:
 bad_packet:
 		__TCP_INC_STATS(net, TCP_MIB_INERRS);
 	} else {
-		tcp_v6_send_reset(NULL, skb, SK_RST_REASON_NOT_SPECIFIED);
+		tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
 	}
 
 discard_it:
@@ -1998,7 +2001,7 @@ do_time_wait:
 		tcp_v6_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v6_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+		tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason));
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
 	case TCP_TW_SUCCESS:
-- 
cgit v1.2.3


From b533fb9cf4f7c6ca2aa255a5a1fdcde49fff2b24 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Thu, 25 Apr 2024 11:13:40 +0800
Subject: rstreason: make it work in trace world

At last, we should let it work by introducing this reset reason in
trace world.

One of the possible expected outputs is:
... tcp_send_reset: skbaddr=xxx skaddr=xxx src=xxx dest=xxx
state=TCP_ESTABLISHED reason=NOT_SPECIFIED

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/trace/events/tcp.h | 26 ++++++++++++++++++++++----
 net/ipv4/tcp_ipv4.c        |  2 +-
 net/ipv4/tcp_output.c      |  2 +-
 net/ipv6/tcp_ipv6.c        |  2 +-
 4 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 5c04a61a11c2..49b5ee091cf6 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -11,6 +11,7 @@
 #include <net/ipv6.h>
 #include <net/tcp.h>
 #include <linux/sock_diag.h>
+#include <net/rstreason.h>
 
 /*
  * tcp event with arguments sk and skb
@@ -74,20 +75,32 @@ DEFINE_EVENT(tcp_event_sk_skb, tcp_retransmit_skb,
 	TP_ARGS(sk, skb)
 );
 
+#undef FN
+#define FN(reason)	TRACE_DEFINE_ENUM(SK_RST_REASON_##reason);
+DEFINE_RST_REASON(FN, FN)
+
+#undef FN
+#undef FNe
+#define FN(reason)	{ SK_RST_REASON_##reason, #reason },
+#define FNe(reason)	{ SK_RST_REASON_##reason, #reason }
+
 /*
  * skb of trace_tcp_send_reset is the skb that caused RST. In case of
  * active reset, skb should be NULL
  */
 TRACE_EVENT(tcp_send_reset,
 
-	TP_PROTO(const struct sock *sk, const struct sk_buff *skb),
+	TP_PROTO(const struct sock *sk,
+		 const struct sk_buff *skb,
+		 const enum sk_rst_reason reason),
 
-	TP_ARGS(sk, skb),
+	TP_ARGS(sk, skb, reason),
 
 	TP_STRUCT__entry(
 		__field(const void *, skbaddr)
 		__field(const void *, skaddr)
 		__field(int, state)
+		__field(enum sk_rst_reason, reason)
 		__array(__u8, saddr, sizeof(struct sockaddr_in6))
 		__array(__u8, daddr, sizeof(struct sockaddr_in6))
 	),
@@ -113,14 +126,19 @@ TRACE_EVENT(tcp_send_reset,
 			 */
 			TP_STORE_ADDR_PORTS_SKB(skb, th, entry->daddr, entry->saddr);
 		}
+		__entry->reason = reason;
 	),
 
-	TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s",
+	TP_printk("skbaddr=%p skaddr=%p src=%pISpc dest=%pISpc state=%s reason=%s",
 		  __entry->skbaddr, __entry->skaddr,
 		  __entry->saddr, __entry->daddr,
-		  __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN")
+		  __entry->state ? show_tcp_state_name(__entry->state) : "UNKNOWN",
+		  __print_symbolic(__entry->reason, DEFINE_RST_REASON(FN, FNe)))
 );
 
+#undef FN
+#undef FNe
+
 /*
  * tcp event with arguments sk
  *
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 621879ba916d..0427deca3e0e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -871,7 +871,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
 	if (sk)
 		arg.bound_dev_if = sk->sk_bound_dev_if;
 
-	trace_tcp_send_reset(sk, skb);
+	trace_tcp_send_reset(sk, skb, reason);
 
 	BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
 		     offsetof(struct inet_timewait_sock, tw_bound_dev_if));
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 41c352bf3394..ece4726c8751 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3641,7 +3641,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority,
 	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
 	 * skb here is different to the troublesome skb, so use NULL
 	 */
-	trace_tcp_send_reset(sk, NULL);
+	trace_tcp_send_reset(sk, NULL, SK_RST_REASON_NOT_SPECIFIED);
 }
 
 /* Send a crossed SYN-ACK during socket establishment.
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 317d7a6e6b01..77958adf2e16 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1133,7 +1133,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
 			label = ip6_flowlabel(ipv6h);
 	}
 
-	trace_tcp_send_reset(sk, skb);
+	trace_tcp_send_reset(sk, skb, reason);
 
 	tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
 			     ipv6_get_dsfield(ipv6h), label, priority, txhash,
-- 
cgit v1.2.3


From 66e13b615a0ce76b785d780ecc9776ba71983629 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Wed, 24 Apr 2024 10:02:08 +0000
Subject: bpf: verifier: prevent userspace memory access

With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To
thwart invalid memory accesses, the JITs add an exception table entry
for all such accesses. But in case the src_reg + offset is a userspace
address, the BPF program might read that memory if the user has
mapped it.

Make the verifier add guard instructions around such memory accesses and
skip the load if the address falls into the userspace region.

The JITs need to implement bpf_arch_uaddress_limit() to define where
the userspace addresses end for that architecture or TASK_SIZE is taken
as default.

The implementation is as follows:

REG_AX =  SRC_REG
if(offset)
	REG_AX += offset;
REG_AX >>= 32;
if (REG_AX <= (uaddress_limit >> 32))
	DST_REG = 0;
else
	DST_REG = *(size *)(SRC_REG + offset);

Comparing just the upper 32 bits of the load address with the upper
32 bits of uaddress_limit implies that the values are being aligned down
to a 4GB boundary before comparison.

The above means that all loads with address <= uaddress_limit + 4GB are
skipped. This is acceptable because there is a large hole (much larger
than 4GB) between userspace and kernel space memory, therefore a
correctly functioning BPF program should not access this 4GB memory
above the userspace.

Let's analyze what this patch does to the following fentry program
dereferencing an untrusted pointer:

  SEC("fentry/tcp_v4_connect")
  int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
  {
                *(volatile long *)sk;
                return 0;
  }

    BPF Program before              |           BPF Program after
    ------------------              |           -----------------

  0: (79) r1 = *(u64 *)(r1 +0)          0: (79) r1 = *(u64 *)(r1 +0)
  -----------------------------------------------------------------------
  1: (79) r1 = *(u64 *)(r1 +0) --\      1: (bf) r11 = r1
  ----------------------------\   \     2: (77) r11 >>= 32
  2: (b7) r0 = 0               \   \    3: (b5) if r11 <= 0x8000 goto pc+2
  3: (95) exit                  \   \-> 4: (79) r1 = *(u64 *)(r1 +0)
                                 \      5: (05) goto pc+1
                                  \     6: (b7) r1 = 0
                                   \--------------------------------------
                                        7: (b7) r0 = 0
                                        8: (95) exit

As you can see from above, in the best case (off=0), 5 extra instructions
are emitted.

Now, we analyze the same program after it has gone through the JITs of
ARM64 and RISC-V architectures. We follow the single load instruction
that has the untrusted pointer and see what instrumentation has been
added around it.

                                x86-64 JIT
                                ==========
     JIT's Instrumentation
          (upstream)
     ---------------------

   0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   mov    0x0(%rdi),%rdi
  ---------------------------------
   f:   movabs $0x800000000000,%r11
  19:   cmp    %r11,%rdi
  1c:   jb     0x000000000000002a
  1e:   mov    %rdi,%r11
  21:   add    $0x0,%r11
  28:   jae    0x000000000000002e
  2a:   xor    %edi,%edi
  2c:   jmp    0x0000000000000032
  2e:   mov    0x0(%rdi),%rdi
  ---------------------------------
  32:   xor    %eax,%eax
  34:   leave
  35:   ret

The x86-64 JIT already emits some instructions to protect against user
memory access. This patch doesn't make any changes for the x86-64 JIT.

                                  ARM64 JIT
                                  =========

        No Intrumentation                       Verifier's Instrumentation
           (upstream)                                  (This patch)
        -----------------                       --------------------------

   0:   add     x9, x30, #0x0                0:   add     x9, x30, #0x0
   4:   nop                                  4:   nop
   8:   paciasp                              8:   paciasp
   c:   stp     x29, x30, [sp, #-16]!        c:   stp     x29, x30, [sp, #-16]!
  10:   mov     x29, sp                     10:   mov     x29, sp
  14:   stp     x19, x20, [sp, #-16]!       14:   stp     x19, x20, [sp, #-16]!
  18:   stp     x21, x22, [sp, #-16]!       18:   stp     x21, x22, [sp, #-16]!
  1c:   stp     x25, x26, [sp, #-16]!       1c:   stp     x25, x26, [sp, #-16]!
  20:   stp     x27, x28, [sp, #-16]!       20:   stp     x27, x28, [sp, #-16]!
  24:   mov     x25, sp                     24:   mov     x25, sp
  28:   mov     x26, #0x0                   28:   mov     x26, #0x0
  2c:   sub     x27, x25, #0x0              2c:   sub     x27, x25, #0x0
  30:   sub     sp, sp, #0x0                30:   sub     sp, sp, #0x0
  34:   ldr     x0, [x0]                    34:   ldr     x0, [x0]
--------------------------------------------------------------------------------
  38:   ldr     x0, [x0] ----------\        38:   add     x9, x0, #0x0
-----------------------------------\\       3c:   lsr     x9, x9, #32
  3c:   mov     x7, #0x0            \\      40:   cmp     x9, #0x10, lsl #12
  40:   mov     sp, sp               \\     44:   b.ls    0x0000000000000050
  44:   ldp     x27, x28, [sp], #16   \\--> 48:   ldr     x0, [x0]
  48:   ldp     x25, x26, [sp], #16    \    4c:   b       0x0000000000000054
  4c:   ldp     x21, x22, [sp], #16     \   50:   mov     x0, #0x0
  50:   ldp     x19, x20, [sp], #16      \---------------------------------------
  54:   ldp     x29, x30, [sp], #16         54:   mov     x7, #0x0
  58:   add     x0, x7, #0x0                58:   mov     sp, sp
  5c:   autiasp                             5c:   ldp     x27, x28, [sp], #16
  60:   ret                                 60:   ldp     x25, x26, [sp], #16
  64:   nop                                 64:   ldp     x21, x22, [sp], #16
  68:   ldr     x10, 0x0000000000000070     68:   ldp     x19, x20, [sp], #16
  6c:   br      x10                         6c:   ldp     x29, x30, [sp], #16
                                            70:   add     x0, x7, #0x0
                                            74:   autiasp
                                            78:   ret
                                            7c:   nop
                                            80:   ldr     x10, 0x0000000000000088
                                            84:   br      x10

There are 6 extra instructions added in ARM64 in the best case. This will
become 7 in the worst case (off != 0).

                           RISC-V JIT (RISCV_ISA_C Disabled)
                           ==========

        No Intrumentation           Verifier's Instrumentation
           (upstream)                      (This patch)
        -----------------           --------------------------

   0:   nop                            0:   nop
   4:   nop                            4:   nop
   8:   li      a6, 33                 8:   li      a6, 33
   c:   addi    sp, sp, -16            c:   addi    sp, sp, -16
  10:   sd      s0, 8(sp)             10:   sd      s0, 8(sp)
  14:   addi    s0, sp, 16            14:   addi    s0, sp, 16
  18:   ld      a0, 0(a0)             18:   ld      a0, 0(a0)
---------------------------------------------------------------
  1c:   ld      a0, 0(a0) --\         1c:   mv      t0, a0
--------------------------\  \        20:   srli    t0, t0, 32
  20:   li      a5, 0      \  \       24:   lui     t1, 4096
  24:   ld      s0, 8(sp)   \  \      28:   sext.w  t1, t1
  28:   addi    sp, sp, 16   \  \     2c:   bgeu    t1, t0, 12
  2c:   sext.w  a0, a5        \  \--> 30:   ld      a0, 0(a0)
  30:   ret                    \      34:   j       8
                                \     38:   li      a0, 0
                                 \------------------------------
                                      3c:   li      a5, 0
                                      40:   ld      s0, 8(sp)
                                      44:   addi    sp, sp, 16
                                      48:   sext.w  a0, a5
                                      4c:   ret

There are 7 extra instructions added in RISC-V.

Fixes: 800834285361 ("bpf, arm64: Add BPF exception tables")
Reported-by: Breno Leitao <leitao@debian.org>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
Link: https://lore.kernel.org/r/20240424100210.11982-2-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/x86/net/bpf_jit_comp.c |  6 ++++++
 include/linux/filter.h      |  1 +
 kernel/bpf/core.c           |  9 +++++++++
 kernel/bpf/verifier.c       | 30 ++++++++++++++++++++++++++++++
 4 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index df5fac428408..b520b66ad14b 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -3473,3 +3473,9 @@ bool bpf_jit_supports_ptr_xchg(void)
 {
 	return true;
 }
+
+/* x86-64 JIT emits its own code to filter user addresses so return 0 here */
+u64 bpf_arch_uaddress_limit(void)
+{
+	return 0;
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index c99bc3df2d28..219ee7a76874 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -963,6 +963,7 @@ bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
+u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);
 
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 696bc55de8e8..1ea5ce5bb599 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2942,6 +2942,15 @@ bool __weak bpf_jit_supports_arena(void)
 	return false;
 }
 
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE)
+	return TASK_SIZE;
+#else
+	return 0;
+#endif
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aa24beb65393..cb7ad1f795e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19675,6 +19675,36 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Make it impossible to de-reference a userspace address */
+		if (BPF_CLASS(insn->code) == BPF_LDX &&
+		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+			struct bpf_insn *patch = &insn_buf[0];
+			u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+			if (!uaddress_limit)
+				goto next_insn;
+
+			*patch++ = BPF_MOV64_REG(BPF_REG_AX, insn->src_reg);
+			if (insn->off)
+				*patch++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off);
+			*patch++ = BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32);
+			*patch++ = BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2);
+			*patch++ = *insn;
+			*patch++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
+			*patch++ = BPF_MOV64_IMM(insn->dst_reg, 0);
+
+			cnt = patch - insn_buf;
+			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta    += cnt - 1;
+			env->prog = prog = new_prog;
+			insn      = new_prog->insnsi + i + delta;
+			goto next_insn;
+		}
+
 		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
 		if (BPF_CLASS(insn->code) == BPF_LD &&
 		    (BPF_MODE(insn->code) == BPF_ABS ||
-- 
cgit v1.2.3


From b37ef7210e51b1e996ca03b03227d93f7470784b Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Date: Thu, 25 Apr 2024 16:07:51 +0800
Subject: cpufreq: amd-pstate: Document *_limit_* fields in struct amd_cpudata

The four fields of struct cpudata namely min_limit_perf,
max_limit_perf, min_limit_freq, max_limit_freq introduced in the
commit febab20caeba("cpufreq/amd-pstate: Fix scaling_min_freq and
scaling_max_freq update") are currently undocumented

Add comments describing these fields

Acked-by: Huang Rui <ray.huang@amd.com>
Fixes: febab20caeba("cpufreq/amd-pstate: Fix scaling_min_freq and scaling_max_freq update")
Reviewed-by: Li Meng <li.meng@amd.com>
Tested-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/amd-pstate.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index d21838835abd..83fb3fc647fc 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -49,6 +49,10 @@ struct amd_aperf_mperf {
  * @lowest_perf: the absolute lowest performance level of the processor
  * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher
  * 		  priority.
+ * @min_limit_perf: Cached value of the performance corresponding to policy->min
+ * @max_limit_perf: Cached value of the performance corresponding to policy->max
+ * @min_limit_freq: Cached value of policy->min
+ * @max_limit_freq: Cached value of policy->max
  * @max_freq: the frequency that mapped to highest_perf
  * @min_freq: the frequency that mapped to lowest_perf
  * @nominal_freq: the frequency that mapped to nominal_perf
-- 
cgit v1.2.3


From 4fcfd1954ad305e331b6b4b62de2874fbae61394 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <gautham.shenoy@amd.com>
Date: Thu, 25 Apr 2024 16:07:52 +0800
Subject: cpufreq: amd-pstate: Document the units for freq variables in
 amd_cpudata

The min_limit_freq, max_limit_freq, min_freq, max_freq, nominal_freq
and the lowest_nominal_freq members of struct cpudata store the
frequency value in khz to be consistent with the cpufreq
core. Update the comment to document this.

Reviewed-by: Li Meng <li.meng@amd.com>
Tested-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Signed-off-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
Acked-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/amd-pstate.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index 83fb3fc647fc..ec0b0fa3e9bb 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -51,15 +51,15 @@ struct amd_aperf_mperf {
  * 		  priority.
  * @min_limit_perf: Cached value of the performance corresponding to policy->min
  * @max_limit_perf: Cached value of the performance corresponding to policy->max
- * @min_limit_freq: Cached value of policy->min
- * @max_limit_freq: Cached value of policy->max
- * @max_freq: the frequency that mapped to highest_perf
- * @min_freq: the frequency that mapped to lowest_perf
- * @nominal_freq: the frequency that mapped to nominal_perf
- * @lowest_nonlinear_freq: the frequency that mapped to lowest_nonlinear_perf
+ * @min_limit_freq: Cached value of policy->min (in khz)
+ * @max_limit_freq: Cached value of policy->max (in khz)
+ * @max_freq: the frequency (in khz) that mapped to highest_perf
+ * @min_freq: the frequency (in khz) that mapped to lowest_perf
+ * @nominal_freq: the frequency (in khz) that mapped to nominal_perf
+ * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf
  * @cur: Difference of Aperf/Mperf/tsc count between last and current sample
  * @prev: Last Aperf/Mperf/tsc count value read from register
- * @freq: current cpu frequency value
+ * @freq: current cpu frequency value (in khz)
  * @boost_supported: check whether the Processor or SBIOS supports boost mode
  * @hw_prefcore: check whether HW supports preferred core featue.
  * 		  Only when hw_prefcore and early prefcore param are true,
-- 
cgit v1.2.3


From eb8b6c36820214df96e7e86d8614d93f6b028f28 Mon Sep 17 00:00:00 2001
From: Perry Yuan <perry.yuan@amd.com>
Date: Thu, 25 Apr 2024 16:07:58 +0800
Subject: cpufreq: amd-pstate: Add quirk for the pstate CPPC capabilities
 missing

Add quirks table to get CPPC capabilities issue fixed by providing
correct perf or frequency values while driver loading.

If CPPC capabilities are not defined in the ACPI tables or wrongly
defined by platform firmware, it needs to use quick to get those
issues fixed with correct workaround values to make pstate driver
can be loaded even though there are CPPC capabilities errors.

The workaround will match the broken BIOS which lack of CPPC capabilities
nominal_freq and lowest_freq definition in the ACPI table.

$ cat /sys/devices/system/cpu/cpu0/acpi_cppc/lowest_freq
0
$ cat /sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq
0

Acked-by: Huang Rui <ray.huang@amd.com>
Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Tested-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Signed-off-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/amd-pstate.c | 53 ++++++++++++++++++++++++++++++++++++++++++--
 include/linux/amd-pstate.h   |  6 +++++
 2 files changed, 57 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 510b5aec42ea..83a29b257794 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -67,6 +67,7 @@ static struct cpufreq_driver amd_pstate_epp_driver;
 static int cppc_state = AMD_PSTATE_UNDEFINED;
 static bool cppc_enabled;
 static bool amd_pstate_prefcore = true;
+static struct quirk_entry *quirks;
 
 /*
  * AMD Energy Preference Performance (EPP)
@@ -111,6 +112,41 @@ static unsigned int epp_values[] = {
 
 typedef int (*cppc_mode_transition_fn)(int);
 
+static struct quirk_entry quirk_amd_7k62 = {
+	.nominal_freq = 2600,
+	.lowest_freq = 550,
+};
+
+static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi)
+{
+	/**
+	 * match the broken bios for family 17h processor support CPPC V2
+	 * broken BIOS lack of nominal_freq and lowest_freq capabilities
+	 * definition in ACPI tables
+	 */
+	if (boot_cpu_has(X86_FEATURE_ZEN2)) {
+		quirks = dmi->driver_data;
+		pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident);
+		return 1;
+	}
+
+	return 0;
+}
+
+static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = {
+	{
+		.callback = dmi_matched_7k62_bios_bug,
+		.ident = "AMD EPYC 7K62",
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VERSION, "5.14"),
+			DMI_MATCH(DMI_BIOS_RELEASE, "12/12/2019"),
+		},
+		.driver_data = &quirk_amd_7k62,
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table);
+
 static inline int get_mode_idx_from_str(const char *str, size_t size)
 {
 	int i;
@@ -812,8 +848,16 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
 	if (ret)
 		return ret;
 
-	min_freq = cppc_perf.lowest_freq * 1000;
-	nominal_freq = cppc_perf.nominal_freq;
+	if (quirks && quirks->lowest_freq)
+		min_freq = quirks->lowest_freq * 1000;
+	else
+		min_freq = cppc_perf.lowest_freq * 1000;
+
+	if (quirks && quirks->nominal_freq)
+		nominal_freq = quirks->nominal_freq ;
+	else
+		nominal_freq = cppc_perf.nominal_freq;
+
 	nominal_perf = READ_ONCE(cpudata->nominal_perf);
 
 	highest_perf = READ_ONCE(cpudata->highest_perf);
@@ -1662,6 +1706,11 @@ static int __init amd_pstate_init(void)
 	if (cpufreq_get_current_driver())
 		return -EEXIST;
 
+	quirks = NULL;
+
+	/* check if this machine need CPPC quirks */
+	dmi_check_system(amd_pstate_quirks_table);
+
 	switch (cppc_state) {
 	case AMD_PSTATE_UNDEFINED:
 		/* Disable on the following configs by default:
diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h
index ec0b0fa3e9bb..d58fc022ec46 100644
--- a/include/linux/amd-pstate.h
+++ b/include/linux/amd-pstate.h
@@ -128,4 +128,10 @@ static const char * const amd_pstate_mode_string[] = {
 	[AMD_PSTATE_GUIDED]      = "guided",
 	NULL,
 };
+
+struct quirk_entry {
+	u32 nominal_freq;
+	u32 lowest_freq;
+};
+
 #endif /* _LINUX_AMD_PSTATE_H */
-- 
cgit v1.2.3


From 05947224ff469bf17b3791fd009bc27ce5151997 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Wed, 14 Feb 2024 10:33:28 +0100
Subject: pwm: Ensure that pwm_chips are allocated using pwmchip_alloc()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Memory holding a struct device must not be freed before the reference
count drops to zero. So a struct pwm_chip must not live in memory
freed by a driver on unbind. All in-tree drivers were fixed accordingly,
but as out-of-tree drivers, that were not adapted, still compile fine,
catch these in pwmchip_add().

Link: https://lore.kernel.org/r/35f5b229c98f78b2f6ce2397259a4a936be477c0.1707900770.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/core.c  | 10 ++++++++++
 include/linux/pwm.h |  2 ++
 2 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 403525cc1783..d3f475eca97f 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -481,6 +481,7 @@ struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t
 
 	chip->dev = parent;
 	chip->npwm = npwm;
+	chip->uses_pwmchip_alloc = true;
 
 	pwmchip_set_drvdata(chip, pwmchip_priv(chip));
 
@@ -561,6 +562,15 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 	if (!chip || !pwmchip_parent(chip) || !chip->ops || !chip->npwm)
 		return -EINVAL;
 
+	/*
+	 * a struct pwm_chip must be allocated using (devm_)pwmchip_alloc,
+	 * otherwise the embedded struct device might disappear too early
+	 * resulting in memory corruption.
+	 * Catch drivers that were not converted appropriately.
+	 */
+	if (!chip->uses_pwmchip_alloc)
+		return -EINVAL;
+
 	if (!pwm_ops_check(chip))
 		return -EINVAL;
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 4a6568dfdf3f..94a642a88817 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -272,6 +272,7 @@ struct pwm_ops {
  * @npwm: number of PWMs controlled by this chip
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @atomic: can the driver's ->apply() be called in atomic context
+ * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip
  * @driver_data: Private pointer for driver specific info
  * @pwms: array of PWM devices allocated by the framework
  */
@@ -287,6 +288,7 @@ struct pwm_chip {
 	bool atomic;
 
 	/* only used internally by the PWM framework */
+	bool uses_pwmchip_alloc;
 	void *driver_data;
 	struct pwm_device *pwms;
 };
-- 
cgit v1.2.3


From e9cc807f87ffd1ccc919731e8f624982935af3e0 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 17 Mar 2024 11:40:34 +0100
Subject: pwm: Move contents of sysfs.c into core.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the upcoming restructuring having all in a single file simplifies
things a bit. The relevant and somewhat visible changes are:

 - Some dropped prototypes from include/linux/pwm.h that were only
   necessary that core.c has a declaration of the symbols defined in
   sysfs.c. The respective functions are static now.

 - The pwm class now also exists if CONFIG_SYSFS isn't enabled. Having
   CONFIG_SYSFS is not very relevant today, but even without it the
   class and device stuff still provides lifetime tracking.

 - Both files had an initcall, these are merged into a single one now.
   Instead of a big #ifdef block for CONFIG_DEBUG_FS, a single
   if (IS_ENABLED(CONFIG_DEBUG_FS)) is used now. This increases compile
   coverage a bit and is a tad nicer on the eyes.

Link: https://lore.kernel.org/r/9e2d39a5280d7dda5bfc6682a8aef510148635b2.1710670958.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/Kconfig  |   4 -
 drivers/pwm/Makefile |   1 -
 drivers/pwm/core.c   | 540 +++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/pwm/sysfs.c  | 550 ---------------------------------------------------
 include/linux/pwm.h  |  13 --
 5 files changed, 534 insertions(+), 574 deletions(-)
 delete mode 100644 drivers/pwm/sysfs.c

(limited to 'include')

diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 4b956d661755..1dd7921194f5 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -29,10 +29,6 @@ menuconfig PWM
 
 if PWM
 
-config PWM_SYSFS
-	bool
-	default y if SYSFS
-
 config PWM_DEBUG
 	bool "PWM lowlevel drivers additional checks and debug messages"
 	depends on DEBUG_KERNEL
diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile
index c5ec9e168ee7..90913519f11a 100644
--- a/drivers/pwm/Makefile
+++ b/drivers/pwm/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_PWM)		+= core.o
-obj-$(CONFIG_PWM_SYSFS)		+= sysfs.o
 obj-$(CONFIG_PWM_AB8500)	+= pwm-ab8500.o
 obj-$(CONFIG_PWM_APPLE)		+= pwm-apple.o
 obj-$(CONFIG_PWM_ATMEL)		+= pwm-atmel.o
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index d3f475eca97f..faa0f885e057 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -454,6 +454,535 @@ of_pwm_single_xlate(struct pwm_chip *chip, const struct of_phandle_args *args)
 }
 EXPORT_SYMBOL_GPL(of_pwm_single_xlate);
 
+struct pwm_export {
+	struct device pwm_dev;
+	struct pwm_device *pwm;
+	struct mutex lock;
+	struct pwm_state suspend;
+};
+
+static inline struct pwm_chip *pwmchip_from_dev(struct device *pwmchip_dev)
+{
+	return dev_get_drvdata(pwmchip_dev);
+}
+
+static inline struct pwm_export *pwmexport_from_dev(struct device *pwm_dev)
+{
+	return container_of(pwm_dev, struct pwm_export, pwm_dev);
+}
+
+static inline struct pwm_device *pwm_from_dev(struct device *pwm_dev)
+{
+	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
+
+	return export->pwm;
+}
+
+static ssize_t period_show(struct device *pwm_dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return sysfs_emit(buf, "%llu\n", state.period);
+}
+
+static ssize_t period_store(struct device *pwm_dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t size)
+{
+	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
+	struct pwm_device *pwm = export->pwm;
+	struct pwm_state state;
+	u64 val;
+	int ret;
+
+	ret = kstrtou64(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	mutex_lock(&export->lock);
+	pwm_get_state(pwm, &state);
+	state.period = val;
+	ret = pwm_apply_might_sleep(pwm, &state);
+	mutex_unlock(&export->lock);
+
+	return ret ? : size;
+}
+
+static ssize_t duty_cycle_show(struct device *pwm_dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return sysfs_emit(buf, "%llu\n", state.duty_cycle);
+}
+
+static ssize_t duty_cycle_store(struct device *pwm_dev,
+				struct device_attribute *attr,
+				const char *buf, size_t size)
+{
+	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
+	struct pwm_device *pwm = export->pwm;
+	struct pwm_state state;
+	u64 val;
+	int ret;
+
+	ret = kstrtou64(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	mutex_lock(&export->lock);
+	pwm_get_state(pwm, &state);
+	state.duty_cycle = val;
+	ret = pwm_apply_might_sleep(pwm, &state);
+	mutex_unlock(&export->lock);
+
+	return ret ? : size;
+}
+
+static ssize_t enable_show(struct device *pwm_dev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	return sysfs_emit(buf, "%d\n", state.enabled);
+}
+
+static ssize_t enable_store(struct device *pwm_dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t size)
+{
+	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
+	struct pwm_device *pwm = export->pwm;
+	struct pwm_state state;
+	int val, ret;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	mutex_lock(&export->lock);
+
+	pwm_get_state(pwm, &state);
+
+	switch (val) {
+	case 0:
+		state.enabled = false;
+		break;
+	case 1:
+		state.enabled = true;
+		break;
+	default:
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	ret = pwm_apply_might_sleep(pwm, &state);
+
+unlock:
+	mutex_unlock(&export->lock);
+	return ret ? : size;
+}
+
+static ssize_t polarity_show(struct device *pwm_dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
+	const char *polarity = "unknown";
+	struct pwm_state state;
+
+	pwm_get_state(pwm, &state);
+
+	switch (state.polarity) {
+	case PWM_POLARITY_NORMAL:
+		polarity = "normal";
+		break;
+
+	case PWM_POLARITY_INVERSED:
+		polarity = "inversed";
+		break;
+	}
+
+	return sysfs_emit(buf, "%s\n", polarity);
+}
+
+static ssize_t polarity_store(struct device *pwm_dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t size)
+{
+	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
+	struct pwm_device *pwm = export->pwm;
+	enum pwm_polarity polarity;
+	struct pwm_state state;
+	int ret;
+
+	if (sysfs_streq(buf, "normal"))
+		polarity = PWM_POLARITY_NORMAL;
+	else if (sysfs_streq(buf, "inversed"))
+		polarity = PWM_POLARITY_INVERSED;
+	else
+		return -EINVAL;
+
+	mutex_lock(&export->lock);
+	pwm_get_state(pwm, &state);
+	state.polarity = polarity;
+	ret = pwm_apply_might_sleep(pwm, &state);
+	mutex_unlock(&export->lock);
+
+	return ret ? : size;
+}
+
+static ssize_t capture_show(struct device *pwm_dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	struct pwm_device *pwm = pwm_from_dev(pwm_dev);
+	struct pwm_capture result;
+	int ret;
+
+	ret = pwm_capture(pwm, &result, jiffies_to_msecs(HZ));
+	if (ret)
+		return ret;
+
+	return sysfs_emit(buf, "%u %u\n", result.period, result.duty_cycle);
+}
+
+static DEVICE_ATTR_RW(period);
+static DEVICE_ATTR_RW(duty_cycle);
+static DEVICE_ATTR_RW(enable);
+static DEVICE_ATTR_RW(polarity);
+static DEVICE_ATTR_RO(capture);
+
+static struct attribute *pwm_attrs[] = {
+	&dev_attr_period.attr,
+	&dev_attr_duty_cycle.attr,
+	&dev_attr_enable.attr,
+	&dev_attr_polarity.attr,
+	&dev_attr_capture.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(pwm);
+
+static void pwm_export_release(struct device *pwm_dev)
+{
+	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
+
+	kfree(export);
+}
+
+static int pwm_export_child(struct device *pwmchip_dev, struct pwm_device *pwm)
+{
+	struct pwm_export *export;
+	char *pwm_prop[2];
+	int ret;
+
+	if (test_and_set_bit(PWMF_EXPORTED, &pwm->flags))
+		return -EBUSY;
+
+	export = kzalloc(sizeof(*export), GFP_KERNEL);
+	if (!export) {
+		clear_bit(PWMF_EXPORTED, &pwm->flags);
+		return -ENOMEM;
+	}
+
+	export->pwm = pwm;
+	mutex_init(&export->lock);
+
+	export->pwm_dev.release = pwm_export_release;
+	export->pwm_dev.parent = pwmchip_dev;
+	export->pwm_dev.devt = MKDEV(0, 0);
+	export->pwm_dev.groups = pwm_groups;
+	dev_set_name(&export->pwm_dev, "pwm%u", pwm->hwpwm);
+
+	ret = device_register(&export->pwm_dev);
+	if (ret) {
+		clear_bit(PWMF_EXPORTED, &pwm->flags);
+		put_device(&export->pwm_dev);
+		export = NULL;
+		return ret;
+	}
+	pwm_prop[0] = kasprintf(GFP_KERNEL, "EXPORT=pwm%u", pwm->hwpwm);
+	pwm_prop[1] = NULL;
+	kobject_uevent_env(&pwmchip_dev->kobj, KOBJ_CHANGE, pwm_prop);
+	kfree(pwm_prop[0]);
+
+	return 0;
+}
+
+static int pwm_unexport_match(struct device *pwm_dev, void *data)
+{
+	return pwm_from_dev(pwm_dev) == data;
+}
+
+static int pwm_unexport_child(struct device *pwmchip_dev, struct pwm_device *pwm)
+{
+	struct device *pwm_dev;
+	char *pwm_prop[2];
+
+	if (!test_and_clear_bit(PWMF_EXPORTED, &pwm->flags))
+		return -ENODEV;
+
+	pwm_dev = device_find_child(pwmchip_dev, pwm, pwm_unexport_match);
+	if (!pwm_dev)
+		return -ENODEV;
+
+	pwm_prop[0] = kasprintf(GFP_KERNEL, "UNEXPORT=pwm%u", pwm->hwpwm);
+	pwm_prop[1] = NULL;
+	kobject_uevent_env(&pwmchip_dev->kobj, KOBJ_CHANGE, pwm_prop);
+	kfree(pwm_prop[0]);
+
+	/* for device_find_child() */
+	put_device(pwm_dev);
+	device_unregister(pwm_dev);
+	pwm_put(pwm);
+
+	return 0;
+}
+
+static ssize_t export_store(struct device *pwmchip_dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t len)
+{
+	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+	struct pwm_device *pwm;
+	unsigned int hwpwm;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &hwpwm);
+	if (ret < 0)
+		return ret;
+
+	if (hwpwm >= chip->npwm)
+		return -ENODEV;
+
+	pwm = pwm_request_from_chip(chip, hwpwm, "sysfs");
+	if (IS_ERR(pwm))
+		return PTR_ERR(pwm);
+
+	ret = pwm_export_child(pwmchip_dev, pwm);
+	if (ret < 0)
+		pwm_put(pwm);
+
+	return ret ? : len;
+}
+static DEVICE_ATTR_WO(export);
+
+static ssize_t unexport_store(struct device *pwmchip_dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+	unsigned int hwpwm;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &hwpwm);
+	if (ret < 0)
+		return ret;
+
+	if (hwpwm >= chip->npwm)
+		return -ENODEV;
+
+	ret = pwm_unexport_child(pwmchip_dev, &chip->pwms[hwpwm]);
+
+	return ret ? : len;
+}
+static DEVICE_ATTR_WO(unexport);
+
+static ssize_t npwm_show(struct device *pwmchip_dev, struct device_attribute *attr,
+			 char *buf)
+{
+	const struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+
+	return sysfs_emit(buf, "%u\n", chip->npwm);
+}
+static DEVICE_ATTR_RO(npwm);
+
+static struct attribute *pwm_chip_attrs[] = {
+	&dev_attr_export.attr,
+	&dev_attr_unexport.attr,
+	&dev_attr_npwm.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(pwm_chip);
+
+/* takes export->lock on success */
+static struct pwm_export *pwm_class_get_state(struct device *pwmchip_dev,
+					      struct pwm_device *pwm,
+					      struct pwm_state *state)
+{
+	struct device *pwm_dev;
+	struct pwm_export *export;
+
+	if (!test_bit(PWMF_EXPORTED, &pwm->flags))
+		return NULL;
+
+	pwm_dev = device_find_child(pwmchip_dev, pwm, pwm_unexport_match);
+	if (!pwm_dev)
+		return NULL;
+
+	export = pwmexport_from_dev(pwm_dev);
+	put_device(pwm_dev);	/* for device_find_child() */
+
+	mutex_lock(&export->lock);
+	pwm_get_state(pwm, state);
+
+	return export;
+}
+
+static int pwm_class_apply_state(struct pwm_export *export,
+				 struct pwm_device *pwm,
+				 struct pwm_state *state)
+{
+	int ret = pwm_apply_might_sleep(pwm, state);
+
+	/* release lock taken in pwm_class_get_state */
+	mutex_unlock(&export->lock);
+
+	return ret;
+}
+
+static int pwm_class_resume_npwm(struct device *pwmchip_dev, unsigned int npwm)
+{
+	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < npwm; i++) {
+		struct pwm_device *pwm = &chip->pwms[i];
+		struct pwm_state state;
+		struct pwm_export *export;
+
+		export = pwm_class_get_state(pwmchip_dev, pwm, &state);
+		if (!export)
+			continue;
+
+		/* If pwmchip was not enabled before suspend, do nothing. */
+		if (!export->suspend.enabled) {
+			/* release lock taken in pwm_class_get_state */
+			mutex_unlock(&export->lock);
+			continue;
+		}
+
+		state.enabled = export->suspend.enabled;
+		ret = pwm_class_apply_state(export, pwm, &state);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+static int pwm_class_suspend(struct device *pwmchip_dev)
+{
+	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < chip->npwm; i++) {
+		struct pwm_device *pwm = &chip->pwms[i];
+		struct pwm_state state;
+		struct pwm_export *export;
+
+		export = pwm_class_get_state(pwmchip_dev, pwm, &state);
+		if (!export)
+			continue;
+
+		/*
+		 * If pwmchip was not enabled before suspend, save
+		 * state for resume time and do nothing else.
+		 */
+		export->suspend = state;
+		if (!state.enabled) {
+			/* release lock taken in pwm_class_get_state */
+			mutex_unlock(&export->lock);
+			continue;
+		}
+
+		state.enabled = false;
+		ret = pwm_class_apply_state(export, pwm, &state);
+		if (ret < 0) {
+			/*
+			 * roll back the PWM devices that were disabled by
+			 * this suspend function.
+			 */
+			pwm_class_resume_npwm(pwmchip_dev, i);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static int pwm_class_resume(struct device *pwmchip_dev)
+{
+	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+
+	return pwm_class_resume_npwm(pwmchip_dev, chip->npwm);
+}
+
+static DEFINE_SIMPLE_DEV_PM_OPS(pwm_class_pm_ops, pwm_class_suspend, pwm_class_resume);
+
+static struct class pwm_class = {
+	.name = "pwm",
+	.dev_groups = pwm_chip_groups,
+	.pm = pm_sleep_ptr(&pwm_class_pm_ops),
+};
+
+static int pwmchip_sysfs_match(struct device *pwmchip_dev, const void *data)
+{
+	return pwmchip_from_dev(pwmchip_dev) == data;
+}
+
+static void pwmchip_sysfs_export(struct pwm_chip *chip)
+{
+	struct device *pwmchip_dev;
+
+	/*
+	 * If device_create() fails the pwm_chip is still usable by
+	 * the kernel it's just not exported.
+	 */
+	pwmchip_dev = device_create(&pwm_class, pwmchip_parent(chip), MKDEV(0, 0), chip,
+			       "pwmchip%d", chip->id);
+	if (IS_ERR(pwmchip_dev)) {
+		dev_warn(pwmchip_parent(chip),
+			 "device_create failed for pwm_chip sysfs export\n");
+	}
+}
+
+static void pwmchip_sysfs_unexport(struct pwm_chip *chip)
+{
+	struct device *pwmchip_dev;
+	unsigned int i;
+
+	pwmchip_dev = class_find_device(&pwm_class, NULL, chip,
+					pwmchip_sysfs_match);
+	if (!pwmchip_dev)
+		return;
+
+	for (i = 0; i < chip->npwm; i++) {
+		struct pwm_device *pwm = &chip->pwms[i];
+
+		if (test_bit(PWMF_EXPORTED, &pwm->flags))
+			pwm_unexport_child(pwmchip_dev, pwm);
+	}
+
+	put_device(pwmchip_dev);
+	device_unregister(pwmchip_dev);
+}
+
 #define PWMCHIP_ALIGN ARCH_DMA_MINALIGN
 
 static void *pwmchip_priv(struct pwm_chip *chip)
@@ -1086,7 +1615,6 @@ struct pwm_device *devm_fwnode_pwm_get(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_fwnode_pwm_get);
 
-#ifdef CONFIG_DEBUG_FS
 static void pwm_dbg_show(struct pwm_chip *chip, struct seq_file *s)
 {
 	unsigned int i;
@@ -1171,11 +1699,11 @@ static const struct seq_operations pwm_debugfs_sops = {
 
 DEFINE_SEQ_ATTRIBUTE(pwm_debugfs);
 
-static int __init pwm_debugfs_init(void)
+static int __init pwm_init(void)
 {
-	debugfs_create_file("pwm", 0444, NULL, NULL, &pwm_debugfs_fops);
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_create_file("pwm", 0444, NULL, NULL, &pwm_debugfs_fops);
 
-	return 0;
+	return class_register(&pwm_class);
 }
-subsys_initcall(pwm_debugfs_init);
-#endif /* CONFIG_DEBUG_FS */
+subsys_initcall(pwm_init);
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
deleted file mode 100644
index 1a1b1f6dd98f..000000000000
--- a/drivers/pwm/sysfs.c
+++ /dev/null
@@ -1,550 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * A simple sysfs interface for the generic PWM framework
- *
- * Copyright (C) 2013 H Hartley Sweeten <hsweeten@visionengravers.com>
- *
- * Based on previous work by Lars Poeschel <poeschel@lemonage.de>
- */
-
-#include <linux/device.h>
-#include <linux/mutex.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/kdev_t.h>
-#include <linux/pwm.h>
-
-struct pwm_export {
-	struct device pwm_dev;
-	struct pwm_device *pwm;
-	struct mutex lock;
-	struct pwm_state suspend;
-};
-
-static inline struct pwm_chip *pwmchip_from_dev(struct device *pwmchip_dev)
-{
-	return dev_get_drvdata(pwmchip_dev);
-}
-
-static inline struct pwm_export *pwmexport_from_dev(struct device *pwm_dev)
-{
-	return container_of(pwm_dev, struct pwm_export, pwm_dev);
-}
-
-static inline struct pwm_device *pwm_from_dev(struct device *pwm_dev)
-{
-	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
-
-	return export->pwm;
-}
-
-static ssize_t period_show(struct device *pwm_dev,
-			   struct device_attribute *attr,
-			   char *buf)
-{
-	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
-	struct pwm_state state;
-
-	pwm_get_state(pwm, &state);
-
-	return sysfs_emit(buf, "%llu\n", state.period);
-}
-
-static ssize_t period_store(struct device *pwm_dev,
-			    struct device_attribute *attr,
-			    const char *buf, size_t size)
-{
-	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
-	struct pwm_device *pwm = export->pwm;
-	struct pwm_state state;
-	u64 val;
-	int ret;
-
-	ret = kstrtou64(buf, 0, &val);
-	if (ret)
-		return ret;
-
-	mutex_lock(&export->lock);
-	pwm_get_state(pwm, &state);
-	state.period = val;
-	ret = pwm_apply_might_sleep(pwm, &state);
-	mutex_unlock(&export->lock);
-
-	return ret ? : size;
-}
-
-static ssize_t duty_cycle_show(struct device *pwm_dev,
-			       struct device_attribute *attr,
-			       char *buf)
-{
-	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
-	struct pwm_state state;
-
-	pwm_get_state(pwm, &state);
-
-	return sysfs_emit(buf, "%llu\n", state.duty_cycle);
-}
-
-static ssize_t duty_cycle_store(struct device *pwm_dev,
-				struct device_attribute *attr,
-				const char *buf, size_t size)
-{
-	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
-	struct pwm_device *pwm = export->pwm;
-	struct pwm_state state;
-	u64 val;
-	int ret;
-
-	ret = kstrtou64(buf, 0, &val);
-	if (ret)
-		return ret;
-
-	mutex_lock(&export->lock);
-	pwm_get_state(pwm, &state);
-	state.duty_cycle = val;
-	ret = pwm_apply_might_sleep(pwm, &state);
-	mutex_unlock(&export->lock);
-
-	return ret ? : size;
-}
-
-static ssize_t enable_show(struct device *pwm_dev,
-			   struct device_attribute *attr,
-			   char *buf)
-{
-	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
-	struct pwm_state state;
-
-	pwm_get_state(pwm, &state);
-
-	return sysfs_emit(buf, "%d\n", state.enabled);
-}
-
-static ssize_t enable_store(struct device *pwm_dev,
-			    struct device_attribute *attr,
-			    const char *buf, size_t size)
-{
-	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
-	struct pwm_device *pwm = export->pwm;
-	struct pwm_state state;
-	int val, ret;
-
-	ret = kstrtoint(buf, 0, &val);
-	if (ret)
-		return ret;
-
-	mutex_lock(&export->lock);
-
-	pwm_get_state(pwm, &state);
-
-	switch (val) {
-	case 0:
-		state.enabled = false;
-		break;
-	case 1:
-		state.enabled = true;
-		break;
-	default:
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	ret = pwm_apply_might_sleep(pwm, &state);
-
-unlock:
-	mutex_unlock(&export->lock);
-	return ret ? : size;
-}
-
-static ssize_t polarity_show(struct device *pwm_dev,
-			     struct device_attribute *attr,
-			     char *buf)
-{
-	const struct pwm_device *pwm = pwm_from_dev(pwm_dev);
-	const char *polarity = "unknown";
-	struct pwm_state state;
-
-	pwm_get_state(pwm, &state);
-
-	switch (state.polarity) {
-	case PWM_POLARITY_NORMAL:
-		polarity = "normal";
-		break;
-
-	case PWM_POLARITY_INVERSED:
-		polarity = "inversed";
-		break;
-	}
-
-	return sysfs_emit(buf, "%s\n", polarity);
-}
-
-static ssize_t polarity_store(struct device *pwm_dev,
-			      struct device_attribute *attr,
-			      const char *buf, size_t size)
-{
-	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
-	struct pwm_device *pwm = export->pwm;
-	enum pwm_polarity polarity;
-	struct pwm_state state;
-	int ret;
-
-	if (sysfs_streq(buf, "normal"))
-		polarity = PWM_POLARITY_NORMAL;
-	else if (sysfs_streq(buf, "inversed"))
-		polarity = PWM_POLARITY_INVERSED;
-	else
-		return -EINVAL;
-
-	mutex_lock(&export->lock);
-	pwm_get_state(pwm, &state);
-	state.polarity = polarity;
-	ret = pwm_apply_might_sleep(pwm, &state);
-	mutex_unlock(&export->lock);
-
-	return ret ? : size;
-}
-
-static ssize_t capture_show(struct device *pwm_dev,
-			    struct device_attribute *attr,
-			    char *buf)
-{
-	struct pwm_device *pwm = pwm_from_dev(pwm_dev);
-	struct pwm_capture result;
-	int ret;
-
-	ret = pwm_capture(pwm, &result, jiffies_to_msecs(HZ));
-	if (ret)
-		return ret;
-
-	return sysfs_emit(buf, "%u %u\n", result.period, result.duty_cycle);
-}
-
-static DEVICE_ATTR_RW(period);
-static DEVICE_ATTR_RW(duty_cycle);
-static DEVICE_ATTR_RW(enable);
-static DEVICE_ATTR_RW(polarity);
-static DEVICE_ATTR_RO(capture);
-
-static struct attribute *pwm_attrs[] = {
-	&dev_attr_period.attr,
-	&dev_attr_duty_cycle.attr,
-	&dev_attr_enable.attr,
-	&dev_attr_polarity.attr,
-	&dev_attr_capture.attr,
-	NULL
-};
-ATTRIBUTE_GROUPS(pwm);
-
-static void pwm_export_release(struct device *pwm_dev)
-{
-	struct pwm_export *export = pwmexport_from_dev(pwm_dev);
-
-	kfree(export);
-}
-
-static int pwm_export_child(struct device *pwmchip_dev, struct pwm_device *pwm)
-{
-	struct pwm_export *export;
-	char *pwm_prop[2];
-	int ret;
-
-	if (test_and_set_bit(PWMF_EXPORTED, &pwm->flags))
-		return -EBUSY;
-
-	export = kzalloc(sizeof(*export), GFP_KERNEL);
-	if (!export) {
-		clear_bit(PWMF_EXPORTED, &pwm->flags);
-		return -ENOMEM;
-	}
-
-	export->pwm = pwm;
-	mutex_init(&export->lock);
-
-	export->pwm_dev.release = pwm_export_release;
-	export->pwm_dev.parent = pwmchip_dev;
-	export->pwm_dev.devt = MKDEV(0, 0);
-	export->pwm_dev.groups = pwm_groups;
-	dev_set_name(&export->pwm_dev, "pwm%u", pwm->hwpwm);
-
-	ret = device_register(&export->pwm_dev);
-	if (ret) {
-		clear_bit(PWMF_EXPORTED, &pwm->flags);
-		put_device(&export->pwm_dev);
-		export = NULL;
-		return ret;
-	}
-	pwm_prop[0] = kasprintf(GFP_KERNEL, "EXPORT=pwm%u", pwm->hwpwm);
-	pwm_prop[1] = NULL;
-	kobject_uevent_env(&pwmchip_dev->kobj, KOBJ_CHANGE, pwm_prop);
-	kfree(pwm_prop[0]);
-
-	return 0;
-}
-
-static int pwm_unexport_match(struct device *pwm_dev, void *data)
-{
-	return pwm_from_dev(pwm_dev) == data;
-}
-
-static int pwm_unexport_child(struct device *pwmchip_dev, struct pwm_device *pwm)
-{
-	struct device *pwm_dev;
-	char *pwm_prop[2];
-
-	if (!test_and_clear_bit(PWMF_EXPORTED, &pwm->flags))
-		return -ENODEV;
-
-	pwm_dev = device_find_child(pwmchip_dev, pwm, pwm_unexport_match);
-	if (!pwm_dev)
-		return -ENODEV;
-
-	pwm_prop[0] = kasprintf(GFP_KERNEL, "UNEXPORT=pwm%u", pwm->hwpwm);
-	pwm_prop[1] = NULL;
-	kobject_uevent_env(&pwmchip_dev->kobj, KOBJ_CHANGE, pwm_prop);
-	kfree(pwm_prop[0]);
-
-	/* for device_find_child() */
-	put_device(pwm_dev);
-	device_unregister(pwm_dev);
-	pwm_put(pwm);
-
-	return 0;
-}
-
-static ssize_t export_store(struct device *pwmchip_dev,
-			    struct device_attribute *attr,
-			    const char *buf, size_t len)
-{
-	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
-	struct pwm_device *pwm;
-	unsigned int hwpwm;
-	int ret;
-
-	ret = kstrtouint(buf, 0, &hwpwm);
-	if (ret < 0)
-		return ret;
-
-	if (hwpwm >= chip->npwm)
-		return -ENODEV;
-
-	pwm = pwm_request_from_chip(chip, hwpwm, "sysfs");
-	if (IS_ERR(pwm))
-		return PTR_ERR(pwm);
-
-	ret = pwm_export_child(pwmchip_dev, pwm);
-	if (ret < 0)
-		pwm_put(pwm);
-
-	return ret ? : len;
-}
-static DEVICE_ATTR_WO(export);
-
-static ssize_t unexport_store(struct device *pwmchip_dev,
-			      struct device_attribute *attr,
-			      const char *buf, size_t len)
-{
-	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
-	unsigned int hwpwm;
-	int ret;
-
-	ret = kstrtouint(buf, 0, &hwpwm);
-	if (ret < 0)
-		return ret;
-
-	if (hwpwm >= chip->npwm)
-		return -ENODEV;
-
-	ret = pwm_unexport_child(pwmchip_dev, &chip->pwms[hwpwm]);
-
-	return ret ? : len;
-}
-static DEVICE_ATTR_WO(unexport);
-
-static ssize_t npwm_show(struct device *pwmchip_dev, struct device_attribute *attr,
-			 char *buf)
-{
-	const struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
-
-	return sysfs_emit(buf, "%u\n", chip->npwm);
-}
-static DEVICE_ATTR_RO(npwm);
-
-static struct attribute *pwm_chip_attrs[] = {
-	&dev_attr_export.attr,
-	&dev_attr_unexport.attr,
-	&dev_attr_npwm.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(pwm_chip);
-
-/* takes export->lock on success */
-static struct pwm_export *pwm_class_get_state(struct device *pwmchip_dev,
-					      struct pwm_device *pwm,
-					      struct pwm_state *state)
-{
-	struct device *pwm_dev;
-	struct pwm_export *export;
-
-	if (!test_bit(PWMF_EXPORTED, &pwm->flags))
-		return NULL;
-
-	pwm_dev = device_find_child(pwmchip_dev, pwm, pwm_unexport_match);
-	if (!pwm_dev)
-		return NULL;
-
-	export = pwmexport_from_dev(pwm_dev);
-	put_device(pwm_dev);	/* for device_find_child() */
-
-	mutex_lock(&export->lock);
-	pwm_get_state(pwm, state);
-
-	return export;
-}
-
-static int pwm_class_apply_state(struct pwm_export *export,
-				 struct pwm_device *pwm,
-				 struct pwm_state *state)
-{
-	int ret = pwm_apply_might_sleep(pwm, state);
-
-	/* release lock taken in pwm_class_get_state */
-	mutex_unlock(&export->lock);
-
-	return ret;
-}
-
-static int pwm_class_resume_npwm(struct device *pwmchip_dev, unsigned int npwm)
-{
-	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
-	unsigned int i;
-	int ret = 0;
-
-	for (i = 0; i < npwm; i++) {
-		struct pwm_device *pwm = &chip->pwms[i];
-		struct pwm_state state;
-		struct pwm_export *export;
-
-		export = pwm_class_get_state(pwmchip_dev, pwm, &state);
-		if (!export)
-			continue;
-
-		/* If pwmchip was not enabled before suspend, do nothing. */
-		if (!export->suspend.enabled) {
-			/* release lock taken in pwm_class_get_state */
-			mutex_unlock(&export->lock);
-			continue;
-		}
-
-		state.enabled = export->suspend.enabled;
-		ret = pwm_class_apply_state(export, pwm, &state);
-		if (ret < 0)
-			break;
-	}
-
-	return ret;
-}
-
-static int pwm_class_suspend(struct device *pwmchip_dev)
-{
-	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
-	unsigned int i;
-	int ret = 0;
-
-	for (i = 0; i < chip->npwm; i++) {
-		struct pwm_device *pwm = &chip->pwms[i];
-		struct pwm_state state;
-		struct pwm_export *export;
-
-		export = pwm_class_get_state(pwmchip_dev, pwm, &state);
-		if (!export)
-			continue;
-
-		/*
-		 * If pwmchip was not enabled before suspend, save
-		 * state for resume time and do nothing else.
-		 */
-		export->suspend = state;
-		if (!state.enabled) {
-			/* release lock taken in pwm_class_get_state */
-			mutex_unlock(&export->lock);
-			continue;
-		}
-
-		state.enabled = false;
-		ret = pwm_class_apply_state(export, pwm, &state);
-		if (ret < 0) {
-			/*
-			 * roll back the PWM devices that were disabled by
-			 * this suspend function.
-			 */
-			pwm_class_resume_npwm(pwmchip_dev, i);
-			break;
-		}
-	}
-
-	return ret;
-}
-
-static int pwm_class_resume(struct device *pwmchip_dev)
-{
-	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
-
-	return pwm_class_resume_npwm(pwmchip_dev, chip->npwm);
-}
-
-static DEFINE_SIMPLE_DEV_PM_OPS(pwm_class_pm_ops, pwm_class_suspend, pwm_class_resume);
-
-static struct class pwm_class = {
-	.name = "pwm",
-	.dev_groups = pwm_chip_groups,
-	.pm = pm_sleep_ptr(&pwm_class_pm_ops),
-};
-
-static int pwmchip_sysfs_match(struct device *pwmchip_dev, const void *data)
-{
-	return pwmchip_from_dev(pwmchip_dev) == data;
-}
-
-void pwmchip_sysfs_export(struct pwm_chip *chip)
-{
-	struct device *pwmchip_dev;
-
-	/*
-	 * If device_create() fails the pwm_chip is still usable by
-	 * the kernel it's just not exported.
-	 */
-	pwmchip_dev = device_create(&pwm_class, pwmchip_parent(chip), MKDEV(0, 0), chip,
-			       "pwmchip%d", chip->id);
-	if (IS_ERR(pwmchip_dev)) {
-		dev_warn(pwmchip_parent(chip),
-			 "device_create failed for pwm_chip sysfs export\n");
-	}
-}
-
-void pwmchip_sysfs_unexport(struct pwm_chip *chip)
-{
-	struct device *pwmchip_dev;
-	unsigned int i;
-
-	pwmchip_dev = class_find_device(&pwm_class, NULL, chip,
-				   pwmchip_sysfs_match);
-	if (!pwmchip_dev)
-		return;
-
-	for (i = 0; i < chip->npwm; i++) {
-		struct pwm_device *pwm = &chip->pwms[i];
-
-		if (test_bit(PWMF_EXPORTED, &pwm->flags))
-			pwm_unexport_child(pwmchip_dev, pwm);
-	}
-
-	put_device(pwmchip_dev);
-	device_unregister(pwmchip_dev);
-}
-
-static int __init pwm_sysfs_init(void)
-{
-	return class_register(&pwm_class);
-}
-subsys_initcall(pwm_sysfs_init);
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 94a642a88817..17e45d8413ed 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -630,17 +630,4 @@ static inline void pwm_remove_table(struct pwm_lookup *table, size_t num)
 }
 #endif
 
-#ifdef CONFIG_PWM_SYSFS
-void pwmchip_sysfs_export(struct pwm_chip *chip);
-void pwmchip_sysfs_unexport(struct pwm_chip *chip);
-#else
-static inline void pwmchip_sysfs_export(struct pwm_chip *chip)
-{
-}
-
-static inline void pwmchip_sysfs_unexport(struct pwm_chip *chip)
-{
-}
-#endif /* CONFIG_PWM_SYSFS */
-
 #endif /* __LINUX_PWM_H */
-- 
cgit v1.2.3


From ee37bf50749f06b29394d7ba8a85b47f023b61e2 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 17 Mar 2024 11:40:35 +0100
Subject: pwm: Ensure a struct pwm has the same lifetime as its pwm_chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's required to not free the memory underlying a requested PWM
while a consumer still has a reference to it. While currently a pwm_chip
doesn't live long enough in all cases, linking the struct pwm to the
pwm_chip results in the right lifetime as soon as the pwmchip is living
long enough. This happens with the following commits.

Note this is a breaking change for all pwm drivers that don't use
pwmchip_alloc().

Reviewed-by: Gustavo A. R. Silva <gustavoars@kernel.org> # for struct_size() and __counted_by()
Link: https://lore.kernel.org/r/7e9e958841f049026c0023b309cc9deecf0ab61d.1710670958.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/core.c  | 27 ++++++++++-----------------
 include/linux/pwm.h |  2 +-
 2 files changed, 11 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index faa0f885e057..19386452e6c8 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -987,7 +987,7 @@ static void pwmchip_sysfs_unexport(struct pwm_chip *chip)
 
 static void *pwmchip_priv(struct pwm_chip *chip)
 {
-	return (void *)chip + ALIGN(sizeof(*chip), PWMCHIP_ALIGN);
+	return (void *)chip + ALIGN(struct_size(chip, pwms, chip->npwm), PWMCHIP_ALIGN);
 }
 
 /* This is the counterpart to pwmchip_alloc() */
@@ -1001,8 +1001,10 @@ struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t
 {
 	struct pwm_chip *chip;
 	size_t alloc_size;
+	unsigned int i;
 
-	alloc_size = size_add(ALIGN(sizeof(*chip), PWMCHIP_ALIGN), sizeof_priv);
+	alloc_size = size_add(ALIGN(struct_size(chip, pwms, npwm), PWMCHIP_ALIGN),
+			      sizeof_priv);
 
 	chip = kzalloc(alloc_size, GFP_KERNEL);
 	if (!chip)
@@ -1014,6 +1016,12 @@ struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t
 
 	pwmchip_set_drvdata(chip, pwmchip_priv(chip));
 
+	for (i = 0; i < chip->npwm; i++) {
+		struct pwm_device *pwm = &chip->pwms[i];
+		pwm->chip = chip;
+		pwm->hwpwm = i;
+	}
+
 	return chip;
 }
 EXPORT_SYMBOL_GPL(pwmchip_alloc);
@@ -1085,7 +1093,6 @@ static bool pwm_ops_check(const struct pwm_chip *chip)
  */
 int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 {
-	unsigned int i;
 	int ret;
 
 	if (!chip || !pwmchip_parent(chip) || !chip->ops || !chip->npwm)
@@ -1105,28 +1112,16 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 
 	chip->owner = owner;
 
-	chip->pwms = kcalloc(chip->npwm, sizeof(*chip->pwms), GFP_KERNEL);
-	if (!chip->pwms)
-		return -ENOMEM;
-
 	mutex_lock(&pwm_lock);
 
 	ret = idr_alloc(&pwm_chips, chip, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		mutex_unlock(&pwm_lock);
-		kfree(chip->pwms);
 		return ret;
 	}
 
 	chip->id = ret;
 
-	for (i = 0; i < chip->npwm; i++) {
-		struct pwm_device *pwm = &chip->pwms[i];
-
-		pwm->chip = chip;
-		pwm->hwpwm = i;
-	}
-
 	mutex_unlock(&pwm_lock);
 
 	if (IS_ENABLED(CONFIG_OF))
@@ -1156,8 +1151,6 @@ void pwmchip_remove(struct pwm_chip *chip)
 	idr_remove(&pwm_chips, chip->id);
 
 	mutex_unlock(&pwm_lock);
-
-	kfree(chip->pwms);
 }
 EXPORT_SYMBOL_GPL(pwmchip_remove);
 
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 17e45d8413ed..78b9061572ff 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -290,7 +290,7 @@ struct pwm_chip {
 	/* only used internally by the PWM framework */
 	bool uses_pwmchip_alloc;
 	void *driver_data;
-	struct pwm_device *pwms;
+	struct pwm_device pwms[] __counted_by(npwm);
 };
 
 static inline struct device *pwmchip_parent(const struct pwm_chip *chip)
-- 
cgit v1.2.3


From 4c56b1434b814899c42a9d9f43d8265371282cd0 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 17 Mar 2024 11:40:36 +0100
Subject: pwm: Add a struct device to struct pwm_chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This replaces the formerly dynamically allocated struct device. This
allows to additionally use it to track the lifetime of the struct
pwm_chip. Otherwise the new struct device provides the same sysfs API as
was provided by the dynamic device before.

Link: https://lore.kernel.org/r/35c65ea7f6de789a568ff39d7b6b4ce80de4b7dc.1710670958.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 drivers/pwm/core.c  | 93 +++++++++++++++++++++++++++++------------------------
 include/linux/pwm.h |  5 +--
 2 files changed, 54 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 19386452e6c8..32871687c0a4 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -343,9 +343,16 @@ static int pwm_device_request(struct pwm_device *pwm, const char *label)
 	if (!try_module_get(chip->owner))
 		return -ENODEV;
 
+	if (!get_device(&chip->dev)) {
+		err = -ENODEV;
+		goto err_get_device;
+	}
+
 	if (ops->request) {
 		err = ops->request(chip, pwm);
 		if (err) {
+			put_device(&chip->dev);
+err_get_device:
 			module_put(chip->owner);
 			return err;
 		}
@@ -463,7 +470,7 @@ struct pwm_export {
 
 static inline struct pwm_chip *pwmchip_from_dev(struct device *pwmchip_dev)
 {
-	return dev_get_drvdata(pwmchip_dev);
+	return container_of(pwmchip_dev, struct pwm_chip, dev);
 }
 
 static inline struct pwm_export *pwmexport_from_dev(struct device *pwm_dev)
@@ -941,46 +948,16 @@ static struct class pwm_class = {
 	.pm = pm_sleep_ptr(&pwm_class_pm_ops),
 };
 
-static int pwmchip_sysfs_match(struct device *pwmchip_dev, const void *data)
-{
-	return pwmchip_from_dev(pwmchip_dev) == data;
-}
-
-static void pwmchip_sysfs_export(struct pwm_chip *chip)
-{
-	struct device *pwmchip_dev;
-
-	/*
-	 * If device_create() fails the pwm_chip is still usable by
-	 * the kernel it's just not exported.
-	 */
-	pwmchip_dev = device_create(&pwm_class, pwmchip_parent(chip), MKDEV(0, 0), chip,
-			       "pwmchip%d", chip->id);
-	if (IS_ERR(pwmchip_dev)) {
-		dev_warn(pwmchip_parent(chip),
-			 "device_create failed for pwm_chip sysfs export\n");
-	}
-}
-
 static void pwmchip_sysfs_unexport(struct pwm_chip *chip)
 {
-	struct device *pwmchip_dev;
 	unsigned int i;
 
-	pwmchip_dev = class_find_device(&pwm_class, NULL, chip,
-					pwmchip_sysfs_match);
-	if (!pwmchip_dev)
-		return;
-
 	for (i = 0; i < chip->npwm; i++) {
 		struct pwm_device *pwm = &chip->pwms[i];
 
 		if (test_bit(PWMF_EXPORTED, &pwm->flags))
-			pwm_unexport_child(pwmchip_dev, pwm);
+			pwm_unexport_child(&chip->dev, pwm);
 	}
-
-	put_device(pwmchip_dev);
-	device_unregister(pwmchip_dev);
 }
 
 #define PWMCHIP_ALIGN ARCH_DMA_MINALIGN
@@ -993,13 +970,21 @@ static void *pwmchip_priv(struct pwm_chip *chip)
 /* This is the counterpart to pwmchip_alloc() */
 void pwmchip_put(struct pwm_chip *chip)
 {
-	kfree(chip);
+	put_device(&chip->dev);
 }
 EXPORT_SYMBOL_GPL(pwmchip_put);
 
+static void pwmchip_release(struct device *pwmchip_dev)
+{
+	struct pwm_chip *chip = pwmchip_from_dev(pwmchip_dev);
+
+	kfree(chip);
+}
+
 struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t sizeof_priv)
 {
 	struct pwm_chip *chip;
+	struct device *pwmchip_dev;
 	size_t alloc_size;
 	unsigned int i;
 
@@ -1010,10 +995,15 @@ struct pwm_chip *pwmchip_alloc(struct device *parent, unsigned int npwm, size_t
 	if (!chip)
 		return ERR_PTR(-ENOMEM);
 
-	chip->dev = parent;
 	chip->npwm = npwm;
 	chip->uses_pwmchip_alloc = true;
 
+	pwmchip_dev = &chip->dev;
+	device_initialize(pwmchip_dev);
+	pwmchip_dev->class = &pwm_class;
+	pwmchip_dev->parent = parent;
+	pwmchip_dev->release = pwmchip_release;
+
 	pwmchip_set_drvdata(chip, pwmchip_priv(chip));
 
 	for (i = 0; i < chip->npwm; i++) {
@@ -1115,21 +1105,34 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner)
 	mutex_lock(&pwm_lock);
 
 	ret = idr_alloc(&pwm_chips, chip, 0, 0, GFP_KERNEL);
-	if (ret < 0) {
-		mutex_unlock(&pwm_lock);
-		return ret;
-	}
+	if (ret < 0)
+		goto err_idr_alloc;
 
 	chip->id = ret;
 
-	mutex_unlock(&pwm_lock);
+	dev_set_name(&chip->dev, "pwmchip%u", chip->id);
 
 	if (IS_ENABLED(CONFIG_OF))
 		of_pwmchip_add(chip);
 
-	pwmchip_sysfs_export(chip);
+	ret = device_add(&chip->dev);
+	if (ret)
+		goto err_device_add;
+
+	mutex_unlock(&pwm_lock);
 
 	return 0;
+
+err_device_add:
+	if (IS_ENABLED(CONFIG_OF))
+		of_pwmchip_remove(chip);
+
+	idr_remove(&pwm_chips, chip->id);
+err_idr_alloc:
+
+	mutex_unlock(&pwm_lock);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__pwmchip_add);
 
@@ -1151,6 +1154,8 @@ void pwmchip_remove(struct pwm_chip *chip)
 	idr_remove(&pwm_chips, chip->id);
 
 	mutex_unlock(&pwm_lock);
+
+	device_del(&chip->dev);
 }
 EXPORT_SYMBOL_GPL(pwmchip_remove);
 
@@ -1520,6 +1525,8 @@ EXPORT_SYMBOL_GPL(pwm_get);
  */
 void pwm_put(struct pwm_device *pwm)
 {
+	struct pwm_chip *chip = pwm->chip;
+
 	if (!pwm)
 		return;
 
@@ -1530,12 +1537,14 @@ void pwm_put(struct pwm_device *pwm)
 		goto out;
 	}
 
-	if (pwm->chip->ops->free)
+	if (chip->ops->free)
 		pwm->chip->ops->free(pwm->chip, pwm);
 
 	pwm->label = NULL;
 
-	module_put(pwm->chip->owner);
+	put_device(&chip->dev);
+
+	module_put(chip->owner);
 out:
 	mutex_unlock(&pwm_lock);
 }
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 78b9061572ff..495e23761f34 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_PWM_H
 #define __LINUX_PWM_H
 
+#include <linux/device.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
 #include <linux/of.h>
@@ -277,7 +278,7 @@ struct pwm_ops {
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
-	struct device *dev;
+	struct device dev;
 	const struct pwm_ops *ops;
 	struct module *owner;
 	unsigned int id;
@@ -295,7 +296,7 @@ struct pwm_chip {
 
 static inline struct device *pwmchip_parent(const struct pwm_chip *chip)
 {
-	return chip->dev;
+	return chip->dev.parent;
 }
 
 static inline void *pwmchip_get_drvdata(struct pwm_chip *chip)
-- 
cgit v1.2.3


From f848337cd801c7106a4ec0d61765771dab2a5909 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sun, 28 Apr 2024 11:37:13 +0200
Subject: ALSA: emu10k1: move the whole GPIO event handling to the workqueue

The actual event processing was already done by workqueue items. We can
move the event dispatching there as well, rather than doing it already
in the interrupt handler callback.

This change has a rather profound "side effect" on the reliability of
the FPGA programming: once we enter programming mode, we must not issue
any snd_emu1010_fpga_{read,write}() calls until we're done, as these
would badly mess up the programming protocol. But exactly that would
happen when trying to program the dock, as that triggers GPIO interrupts
as a side effect. This is mitigated by deferring the actual interrupt
handling, as workqueue items are not re-entrant.

To avoid scheduling the dispatcher on non-events, we now explicitly
ignore GPIO IRQs triggered by "uninteresting" pins, which happens a lot
as a side effect of calling snd_emu1010_fpga_{read,write}().

Fixes: fbb64eedf5a3 ("ALSA: emu10k1: make E-MU dock monitoring interrupt-driven")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218584
Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240428093716.3198666-4-oswald.buddenhagen@gmx.de>
---
 include/sound/emu10k1.h          |  3 +--
 sound/pci/emu10k1/emu10k1.c      |  3 +--
 sound/pci/emu10k1/emu10k1_main.c | 56 ++++++++++++++++++++--------------------
 3 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 1af9e6819392..9cc10fab01a8 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1684,8 +1684,7 @@ struct snd_emu1010 {
 	unsigned int clock_fallback;
 	unsigned int optical_in; /* 0:SPDIF, 1:ADAT */
 	unsigned int optical_out; /* 0:SPDIF, 1:ADAT */
-	struct work_struct firmware_work;
-	struct work_struct clock_work;
+	struct work_struct work;
 };
 
 struct snd_emu10k1 {
diff --git a/sound/pci/emu10k1/emu10k1.c b/sound/pci/emu10k1/emu10k1.c
index fe72e7d77241..dadeda7758ce 100644
--- a/sound/pci/emu10k1/emu10k1.c
+++ b/sound/pci/emu10k1/emu10k1.c
@@ -189,8 +189,7 @@ static int snd_emu10k1_suspend(struct device *dev)
 
 	emu->suspend = 1;
 
-	cancel_work_sync(&emu->emu1010.firmware_work);
-	cancel_work_sync(&emu->emu1010.clock_work);
+	cancel_work_sync(&emu->emu1010.work);
 
 	snd_ac97_suspend(emu->ac97);
 
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index 6265fc9ae260..86eaf5963502 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -765,19 +765,10 @@ static void snd_emu1010_load_dock_firmware(struct snd_emu10k1 *emu)
 	msleep(10);
 }
 
-static void emu1010_firmware_work(struct work_struct *work)
+static void emu1010_dock_event(struct snd_emu10k1 *emu)
 {
-	struct snd_emu10k1 *emu;
 	u32 reg;
 
-	emu = container_of(work, struct snd_emu10k1,
-			   emu1010.firmware_work);
-	if (emu->card->shutdown)
-		return;
-#ifdef CONFIG_PM_SLEEP
-	if (emu->suspend)
-		return;
-#endif
 	snd_emu1010_fpga_read(emu, EMU_HANA_OPTION_CARDS, &reg); /* OPTIONS: Which cards are attached to the EMU */
 	if (reg & EMU_HANA_OPTION_DOCK_OFFLINE) {
 		/* Audio Dock attached */
@@ -792,20 +783,10 @@ static void emu1010_firmware_work(struct work_struct *work)
 	}
 }
 
-static void emu1010_clock_work(struct work_struct *work)
+static void emu1010_clock_event(struct snd_emu10k1 *emu)
 {
-	struct snd_emu10k1 *emu;
 	struct snd_ctl_elem_id id;
 
-	emu = container_of(work, struct snd_emu10k1,
-			   emu1010.clock_work);
-	if (emu->card->shutdown)
-		return;
-#ifdef CONFIG_PM_SLEEP
-	if (emu->suspend)
-		return;
-#endif
-
 	spin_lock_irq(&emu->reg_lock);
 	// This is the only thing that can actually happen.
 	emu->emu1010.clock_source = emu->emu1010.clock_fallback;
@@ -816,19 +797,40 @@ static void emu1010_clock_work(struct work_struct *work)
 	snd_ctl_notify(emu->card, SNDRV_CTL_EVENT_MASK_VALUE, &id);
 }
 
-static void emu1010_interrupt(struct snd_emu10k1 *emu)
+static void emu1010_work(struct work_struct *work)
 {
+	struct snd_emu10k1 *emu;
 	u32 sts;
 
+	emu = container_of(work, struct snd_emu10k1, emu1010.work);
+	if (emu->card->shutdown)
+		return;
+#ifdef CONFIG_PM_SLEEP
+	if (emu->suspend)
+		return;
+#endif
+
 	snd_emu1010_fpga_read(emu, EMU_HANA_IRQ_STATUS, &sts);
 
 	// The distinction of the IRQ status bits is unreliable,
 	// so we dispatch later based on option card status.
 	if (sts & (EMU_HANA_IRQ_DOCK | EMU_HANA_IRQ_DOCK_LOST))
-		schedule_work(&emu->emu1010.firmware_work);
+		emu1010_dock_event(emu);
 
 	if (sts & EMU_HANA_IRQ_WCLK_CHANGED)
-		schedule_work(&emu->emu1010.clock_work);
+		emu1010_clock_event(emu);
+}
+
+static void emu1010_interrupt(struct snd_emu10k1 *emu)
+{
+	// We get an interrupt on each GPIO input pin change, but we
+	// care only about the ones triggered by the dedicated pin.
+	u16 sts = inw(emu->port + A_GPIO);
+	u16 bit = emu->card_capabilities->ca0108_chip ? 0x2000 : 0x8000;
+	if (!(sts & bit))
+		return;
+
+	schedule_work(&emu->emu1010.work);
 }
 
 /*
@@ -969,8 +971,7 @@ static void snd_emu10k1_free(struct snd_card *card)
 		/* Disable 48Volt power to Audio Dock */
 		snd_emu1010_fpga_write(emu, EMU_HANA_DOCK_PWR, 0);
 	}
-	cancel_work_sync(&emu->emu1010.firmware_work);
-	cancel_work_sync(&emu->emu1010.clock_work);
+	cancel_work_sync(&emu->emu1010.work);
 	release_firmware(emu->firmware);
 	release_firmware(emu->dock_fw);
 	snd_util_memhdr_free(emu->memhdr);
@@ -1549,8 +1550,7 @@ int snd_emu10k1_create(struct snd_card *card,
 	emu->irq = -1;
 	emu->synth = NULL;
 	emu->get_synth_voice = NULL;
-	INIT_WORK(&emu->emu1010.firmware_work, emu1010_firmware_work);
-	INIT_WORK(&emu->emu1010.clock_work, emu1010_clock_work);
+	INIT_WORK(&emu->emu1010.work, emu1010_work);
 	/* read revision & serial */
 	emu->revision = pci->revision;
 	pci_read_config_dword(pci, PCI_SUBSYSTEM_VENDOR_ID, &emu->serial);
-- 
cgit v1.2.3


From 2d3f4810886eb7c319cec41b6d725d2953bfa88a Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sun, 28 Apr 2024 11:37:14 +0200
Subject: ALSA: emu10k1: use mutex for E-MU FPGA access locking

The FPGA access through the GPIO port does not interfere with other
sound processor register access, so there is no need to subject it to
emu_lock. And after moving all FPGA access out of the interrupt handler,
it does not need to be IRQ-safe, either.

What's more, attaching the dock causes a firmware upload, which takes
several seconds. We really don't want to disable IRQs for this long, and
even less also have someone else spin with IRQs disabled waiting for us.

Therefore, use a mutex for FPGA access locking.

This makes the code somewhat more noisy, as we need to wrap bigger
sections into the mutex, as it needs to enclose the spinlocks.

The latter has the "side effect" of fixing dock FPGA programming in a
corner case: a really badly timed mixer access right between entering
FPGA programming mode and uploading the netlist would mess up the
protocol.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240428093716.3198666-5-oswald.buddenhagen@gmx.de>
---
 include/sound/emu10k1.h          |  4 ++++
 sound/pci/emu10k1/emu10k1_main.c | 19 ++++++++++++----
 sound/pci/emu10k1/emumixer.c     | 18 ++++++++++-----
 sound/pci/emu10k1/emuproc.c      |  9 ++++++++
 sound/pci/emu10k1/io.c           | 48 +++++++++++++++++-----------------------
 5 files changed, 61 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 9cc10fab01a8..234b5baea69c 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1685,6 +1685,7 @@ struct snd_emu1010 {
 	unsigned int optical_in; /* 0:SPDIF, 1:ADAT */
 	unsigned int optical_out; /* 0:SPDIF, 1:ADAT */
 	struct work_struct work;
+	struct mutex lock;
 };
 
 struct snd_emu10k1 {
@@ -1833,6 +1834,9 @@ unsigned int snd_emu10k1_ptr20_read(struct snd_emu10k1 * emu, unsigned int reg,
 void snd_emu10k1_ptr20_write(struct snd_emu10k1 *emu, unsigned int reg, unsigned int chn, unsigned int data);
 int snd_emu10k1_spi_write(struct snd_emu10k1 * emu, unsigned int data);
 int snd_emu10k1_i2c_write(struct snd_emu10k1 *emu, u32 reg, u32 value);
+static inline void snd_emu1010_fpga_lock(struct snd_emu10k1 *emu) { mutex_lock(&emu->emu1010.lock); };
+static inline void snd_emu1010_fpga_unlock(struct snd_emu10k1 *emu) { mutex_unlock(&emu->emu1010.lock); };
+void snd_emu1010_fpga_write_lock(struct snd_emu10k1 *emu, u32 reg, u32 value);
 void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value);
 void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value);
 void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 src);
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index 86eaf5963502..1a2905e8672b 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -810,6 +810,8 @@ static void emu1010_work(struct work_struct *work)
 		return;
 #endif
 
+	snd_emu1010_fpga_lock(emu);
+
 	snd_emu1010_fpga_read(emu, EMU_HANA_IRQ_STATUS, &sts);
 
 	// The distinction of the IRQ status bits is unreliable,
@@ -819,6 +821,8 @@ static void emu1010_work(struct work_struct *work)
 
 	if (sts & EMU_HANA_IRQ_WCLK_CHANGED)
 		emu1010_clock_event(emu);
+
+	snd_emu1010_fpga_unlock(emu);
 }
 
 static void emu1010_interrupt(struct snd_emu10k1 *emu)
@@ -852,6 +856,8 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
 	 * Proper init follows in snd_emu10k1_init(). */
 	outl(HCFG_LOCKSOUNDCACHE | HCFG_LOCKTANKCACHE_MASK, emu->port + HCFG);
 
+	snd_emu1010_fpga_lock(emu);
+
 	/* Disable 48Volt power to Audio Dock */
 	snd_emu1010_fpga_write(emu, EMU_HANA_DOCK_PWR, 0);
 
@@ -877,7 +883,7 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
 	err = snd_emu1010_load_firmware(emu, 0, &emu->firmware);
 	if (err < 0) {
 		dev_info(emu->card->dev, "emu1010: Loading Firmware failed\n");
-		return err;
+		goto fail;
 	}
 
 	/* ID, should read & 0x7f = 0x55 when FPGA programmed. */
@@ -887,7 +893,8 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
 		dev_info(emu->card->dev,
 			 "emu1010: Loading Hana Firmware file failed, reg = 0x%x\n",
 			 reg);
-		return -ENODEV;
+		err = -ENODEV;
+		goto fail;
 	}
 
 	dev_info(emu->card->dev, "emu1010: Hana Firmware loaded\n");
@@ -947,7 +954,9 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
 	// so it is safe to simply enable the outputs.
 	snd_emu1010_fpga_write(emu, EMU_HANA_UNMUTE, EMU_UNMUTE);
 
-	return 0;
+fail:
+	snd_emu1010_fpga_unlock(emu);
+	return err;
 }
 /*
  *  Create the EMU10K1 instance
@@ -969,9 +978,10 @@ static void snd_emu10k1_free(struct snd_card *card)
 	}
 	if (emu->card_capabilities->emu_model == EMU_MODEL_EMU1010) {
 		/* Disable 48Volt power to Audio Dock */
-		snd_emu1010_fpga_write(emu, EMU_HANA_DOCK_PWR, 0);
+		snd_emu1010_fpga_write_lock(emu, EMU_HANA_DOCK_PWR, 0);
 	}
 	cancel_work_sync(&emu->emu1010.work);
+	mutex_destroy(&emu->emu1010.lock);
 	release_firmware(emu->firmware);
 	release_firmware(emu->dock_fw);
 	snd_util_memhdr_free(emu->memhdr);
@@ -1551,6 +1561,7 @@ int snd_emu10k1_create(struct snd_card *card,
 	emu->synth = NULL;
 	emu->get_synth_voice = NULL;
 	INIT_WORK(&emu->emu1010.work, emu1010_work);
+	mutex_init(&emu->emu1010.lock);
 	/* read revision & serial */
 	emu->revision = pci->revision;
 	pci_read_config_dword(pci, PCI_SUBSYSTEM_VENDOR_ID, &emu->serial);
diff --git a/sound/pci/emu10k1/emumixer.c b/sound/pci/emu10k1/emumixer.c
index 0a32ea53d8c6..05b98d9b547b 100644
--- a/sound/pci/emu10k1/emumixer.c
+++ b/sound/pci/emu10k1/emumixer.c
@@ -661,7 +661,9 @@ static int snd_emu1010_output_source_put(struct snd_kcontrol *kcontrol,
 	change = (emu->emu1010.output_source[channel] != val);
 	if (change) {
 		emu->emu1010.output_source[channel] = val;
+		snd_emu1010_fpga_lock(emu);
 		snd_emu1010_output_source_apply(emu, channel, val);
+		snd_emu1010_fpga_unlock(emu);
 	}
 	return change;
 }
@@ -705,7 +707,9 @@ static int snd_emu1010_input_source_put(struct snd_kcontrol *kcontrol,
 	change = (emu->emu1010.input_source[channel] != val);
 	if (change) {
 		emu->emu1010.input_source[channel] = val;
+		snd_emu1010_fpga_lock(emu);
 		snd_emu1010_input_source_apply(emu, channel, val);
+		snd_emu1010_fpga_unlock(emu);
 	}
 	return change;
 }
@@ -774,7 +778,7 @@ static int snd_emu1010_adc_pads_put(struct snd_kcontrol *kcontrol, struct snd_ct
 		cache = cache & ~mask;
 	change = (cache != emu->emu1010.adc_pads);
 	if (change) {
-		snd_emu1010_fpga_write(emu, EMU_HANA_ADC_PADS, cache );
+		snd_emu1010_fpga_write_lock(emu, EMU_HANA_ADC_PADS, cache );
 	        emu->emu1010.adc_pads = cache;
 	}
 
@@ -832,7 +836,7 @@ static int snd_emu1010_dac_pads_put(struct snd_kcontrol *kcontrol, struct snd_ct
 		cache = cache & ~mask;
 	change = (cache != emu->emu1010.dac_pads);
 	if (change) {
-		snd_emu1010_fpga_write(emu, EMU_HANA_DAC_PADS, cache );
+		snd_emu1010_fpga_write_lock(emu, EMU_HANA_DAC_PADS, cache );
 	        emu->emu1010.dac_pads = cache;
 	}
 
@@ -980,6 +984,7 @@ static int snd_emu1010_clock_source_put(struct snd_kcontrol *kcontrol,
 	val = ucontrol->value.enumerated.item[0] ;
 	if (val >= emu_ci->num)
 		return -EINVAL;
+	snd_emu1010_fpga_lock(emu);
 	spin_lock_irq(&emu->reg_lock);
 	change = (emu->emu1010.clock_source != val);
 	if (change) {
@@ -996,6 +1001,7 @@ static int snd_emu1010_clock_source_put(struct snd_kcontrol *kcontrol,
 	} else {
 		spin_unlock_irq(&emu->reg_lock);
 	}
+	snd_emu1010_fpga_unlock(emu);
 	return change;
 }
 
@@ -1041,7 +1047,7 @@ static int snd_emu1010_clock_fallback_put(struct snd_kcontrol *kcontrol,
 	change = (emu->emu1010.clock_fallback != val);
 	if (change) {
 		emu->emu1010.clock_fallback = val;
-		snd_emu1010_fpga_write(emu, EMU_HANA_DEFCLOCK, 1 - val);
+		snd_emu1010_fpga_write_lock(emu, EMU_HANA_DEFCLOCK, 1 - val);
 	}
 	return change;
 }
@@ -1093,7 +1099,7 @@ static int snd_emu1010_optical_out_put(struct snd_kcontrol *kcontrol,
 		emu->emu1010.optical_out = val;
 		tmp = (emu->emu1010.optical_in ? EMU_HANA_OPTICAL_IN_ADAT : EMU_HANA_OPTICAL_IN_SPDIF) |
 			(emu->emu1010.optical_out ? EMU_HANA_OPTICAL_OUT_ADAT : EMU_HANA_OPTICAL_OUT_SPDIF);
-		snd_emu1010_fpga_write(emu, EMU_HANA_OPTICAL_TYPE, tmp);
+		snd_emu1010_fpga_write_lock(emu, EMU_HANA_OPTICAL_TYPE, tmp);
 	}
 	return change;
 }
@@ -1144,7 +1150,7 @@ static int snd_emu1010_optical_in_put(struct snd_kcontrol *kcontrol,
 		emu->emu1010.optical_in = val;
 		tmp = (emu->emu1010.optical_in ? EMU_HANA_OPTICAL_IN_ADAT : EMU_HANA_OPTICAL_IN_SPDIF) |
 			(emu->emu1010.optical_out ? EMU_HANA_OPTICAL_OUT_ADAT : EMU_HANA_OPTICAL_OUT_SPDIF);
-		snd_emu1010_fpga_write(emu, EMU_HANA_OPTICAL_TYPE, tmp);
+		snd_emu1010_fpga_write_lock(emu, EMU_HANA_OPTICAL_TYPE, tmp);
 	}
 	return change;
 }
@@ -2323,7 +2329,9 @@ int snd_emu10k1_mixer(struct snd_emu10k1 *emu,
 		for (i = 0; i < emu_ri->n_outs; i++)
 			emu->emu1010.output_source[i] =
 				emu1010_map_source(emu_ri, emu_ri->out_dflts[i]);
+		snd_emu1010_fpga_lock(emu);
 		snd_emu1010_apply_sources(emu);
+		snd_emu1010_fpga_unlock(emu);
 
 		kctl = emu->ctl_clock_source = snd_ctl_new1(&snd_emu1010_clock_source, emu);
 		err = snd_ctl_add(card, kctl);
diff --git a/sound/pci/emu10k1/emuproc.c b/sound/pci/emu10k1/emuproc.c
index 2f80fd91017c..737c28d31b41 100644
--- a/sound/pci/emu10k1/emuproc.c
+++ b/sound/pci/emu10k1/emuproc.c
@@ -165,6 +165,8 @@ static void snd_emu10k1_proc_spdif_read(struct snd_info_entry *entry,
 	u32 value2;
 
 	if (emu->card_capabilities->emu_model) {
+		snd_emu1010_fpga_lock(emu);
+
 		// This represents the S/PDIF lock status on 0404b, which is
 		// kinda weird and unhelpful, because monitoring it via IRQ is
 		// impractical (one gets an IRQ flood as long as it is desynced).
@@ -197,6 +199,8 @@ static void snd_emu10k1_proc_spdif_read(struct snd_info_entry *entry,
 			snd_iprintf(buffer, "\nS/PDIF mode: %s%s\n",
 				    value & EMU_HANA_SPDIF_MODE_RX_PRO ? "professional" : "consumer",
 				    value & EMU_HANA_SPDIF_MODE_RX_NOCOPY ? ", no copy" : "");
+
+		snd_emu1010_fpga_unlock(emu);
 	} else {
 		snd_emu10k1_proc_spdif_status(emu, buffer, "CD-ROM S/PDIF In", CDCS, CDSRCS);
 		snd_emu10k1_proc_spdif_status(emu, buffer, "Optical or Coax S/PDIF In", GPSCS, GPSRCS);
@@ -458,6 +462,9 @@ static void snd_emu_proc_emu1010_reg_read(struct snd_info_entry *entry,
 	struct snd_emu10k1 *emu = entry->private_data;
 	u32 value;
 	int i;
+
+	snd_emu1010_fpga_lock(emu);
+
 	snd_iprintf(buffer, "EMU1010 Registers:\n\n");
 
 	for(i = 0; i < 0x40; i+=1) {
@@ -496,6 +503,8 @@ static void snd_emu_proc_emu1010_reg_read(struct snd_info_entry *entry,
 			snd_emu_proc_emu1010_link_read(emu, buffer, 0x701);
 		}
 	}
+
+	snd_emu1010_fpga_unlock(emu);
 }
 
 static void snd_emu_proc_io_reg_read(struct snd_info_entry *entry,
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index 74df2330015f..f3260a81e47b 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -289,20 +289,28 @@ static void snd_emu1010_fpga_write_locked(struct snd_emu10k1 *emu, u32 reg, u32
 
 void snd_emu1010_fpga_write(struct snd_emu10k1 *emu, u32 reg, u32 value)
 {
-	unsigned long flags;
+	if (snd_BUG_ON(!mutex_is_locked(&emu->emu1010.lock)))
+		return;
+	snd_emu1010_fpga_write_locked(emu, reg, value);
+}
 
-	spin_lock_irqsave(&emu->emu_lock, flags);
+void snd_emu1010_fpga_write_lock(struct snd_emu10k1 *emu, u32 reg, u32 value)
+{
+	snd_emu1010_fpga_lock(emu);
 	snd_emu1010_fpga_write_locked(emu, reg, value);
-	spin_unlock_irqrestore(&emu->emu_lock, flags);
+	snd_emu1010_fpga_unlock(emu);
 }
 
-static void snd_emu1010_fpga_read_locked(struct snd_emu10k1 *emu, u32 reg, u32 *value)
+void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value)
 {
 	// The higest input pin is used as the designated interrupt trigger,
 	// so it needs to be masked out.
 	// But note that any other input pin change will also cause an IRQ,
 	// so using this function often causes an IRQ as a side effect.
 	u32 mask = emu->card_capabilities->ca0108_chip ? 0x1f : 0x7f;
+
+	if (snd_BUG_ON(!mutex_is_locked(&emu->emu1010.lock)))
+		return;
 	if (snd_BUG_ON(reg > 0x3f))
 		return;
 	reg += 0x40; /* 0x40 upwards are registers. */
@@ -313,47 +321,31 @@ static void snd_emu1010_fpga_read_locked(struct snd_emu10k1 *emu, u32 reg, u32 *
 	*value = ((inw(emu->port + A_GPIO) >> 8) & mask);
 }
 
-void snd_emu1010_fpga_read(struct snd_emu10k1 *emu, u32 reg, u32 *value)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&emu->emu_lock, flags);
-	snd_emu1010_fpga_read_locked(emu, reg, value);
-	spin_unlock_irqrestore(&emu->emu_lock, flags);
-}
-
 /* Each Destination has one and only one Source,
  * but one Source can feed any number of Destinations simultaneously.
  */
 void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 src)
 {
-	unsigned long flags;
-
 	if (snd_BUG_ON(dst & ~0x71f))
 		return;
 	if (snd_BUG_ON(src & ~0x71f))
 		return;
-	spin_lock_irqsave(&emu->emu_lock, flags);
-	snd_emu1010_fpga_write_locked(emu, EMU_HANA_DESTHI, dst >> 8);
-	snd_emu1010_fpga_write_locked(emu, EMU_HANA_DESTLO, dst & 0x1f);
-	snd_emu1010_fpga_write_locked(emu, EMU_HANA_SRCHI, src >> 8);
-	snd_emu1010_fpga_write_locked(emu, EMU_HANA_SRCLO, src & 0x1f);
-	spin_unlock_irqrestore(&emu->emu_lock, flags);
+	snd_emu1010_fpga_write(emu, EMU_HANA_DESTHI, dst >> 8);
+	snd_emu1010_fpga_write(emu, EMU_HANA_DESTLO, dst & 0x1f);
+	snd_emu1010_fpga_write(emu, EMU_HANA_SRCHI, src >> 8);
+	snd_emu1010_fpga_write(emu, EMU_HANA_SRCLO, src & 0x1f);
 }
 
 u32 snd_emu1010_fpga_link_dst_src_read(struct snd_emu10k1 *emu, u32 dst)
 {
-	unsigned long flags;
 	u32 hi, lo;
 
 	if (snd_BUG_ON(dst & ~0x71f))
 		return 0;
-	spin_lock_irqsave(&emu->emu_lock, flags);
-	snd_emu1010_fpga_write_locked(emu, EMU_HANA_DESTHI, dst >> 8);
-	snd_emu1010_fpga_write_locked(emu, EMU_HANA_DESTLO, dst & 0x1f);
-	snd_emu1010_fpga_read_locked(emu, EMU_HANA_SRCHI, &hi);
-	snd_emu1010_fpga_read_locked(emu, EMU_HANA_SRCLO, &lo);
-	spin_unlock_irqrestore(&emu->emu_lock, flags);
+	snd_emu1010_fpga_write(emu, EMU_HANA_DESTHI, dst >> 8);
+	snd_emu1010_fpga_write(emu, EMU_HANA_DESTLO, dst & 0x1f);
+	snd_emu1010_fpga_read(emu, EMU_HANA_SRCHI, &hi);
+	snd_emu1010_fpga_read(emu, EMU_HANA_SRCLO, &lo);
 	return (hi << 8) | lo;
 }
 
-- 
cgit v1.2.3


From 4c0c36863cc560d24ad482b3ba3bccd62bb101d5 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sun, 28 Apr 2024 11:37:16 +0200
Subject: ALSA: emu10k1: move snd_emu1010_load_firmware_entry() to io.c

It is a low-level I/O access function, so io.c is the natural place for
it.

While we're moving the code, reduce the scope of some variables, use
compound assignment operators, and add/adjust some comments.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240428093717.3198716-4-oswald.buddenhagen@gmx.de>
---
 include/sound/emu10k1.h          |  1 +
 sound/pci/emu10k1/emu10k1_main.c | 41 ----------------------------------
 sound/pci/emu10k1/io.c           | 48 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 234b5baea69c..2856f4717c93 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1843,6 +1843,7 @@ void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 s
 u32 snd_emu1010_fpga_link_dst_src_read(struct snd_emu10k1 *emu, u32 dst);
 int snd_emu1010_get_raw_rate(struct snd_emu10k1 *emu, u8 src);
 void snd_emu1010_update_clock(struct snd_emu10k1 *emu);
+void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu, const struct firmware *fw_entry);
 unsigned int snd_emu10k1_efx_read(struct snd_emu10k1 *emu, unsigned int pc);
 void snd_emu10k1_intr_enable(struct snd_emu10k1 *emu, unsigned int intrenb);
 void snd_emu10k1_intr_disable(struct snd_emu10k1 *emu, unsigned int intrenb);
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index ec010971a220..c8aa4143ac4f 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -652,47 +652,6 @@ static int snd_emu10k1_cardbus_init(struct snd_emu10k1 *emu)
 	return 0;
 }
 
-static void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
-				     const struct firmware *fw_entry)
-{
-	int n, i;
-	u16 reg;
-	u8 value;
-	__always_unused u16 write_post;
-
-	/* The FPGA is a Xilinx Spartan IIE XC2S50E */
-	/* On E-MU 0404b it is a Xilinx Spartan III XC3S50 */
-	/* GPIO7 -> FPGA PGMN
-	 * GPIO6 -> FPGA CCLK
-	 * GPIO5 -> FPGA DIN
-	 * FPGA CONFIG OFF -> FPGA PGMN
-	 */
-	spin_lock_irq(&emu->emu_lock);
-	outw(0x00, emu->port + A_GPIO); /* Set PGMN low for 100uS. */
-	write_post = inw(emu->port + A_GPIO);
-	udelay(100);
-	outw(0x80, emu->port + A_GPIO); /* Leave bit 7 set during netlist setup. */
-	write_post = inw(emu->port + A_GPIO);
-	udelay(100); /* Allow FPGA memory to clean */
-	for (n = 0; n < fw_entry->size; n++) {
-		value = fw_entry->data[n];
-		for (i = 0; i < 8; i++) {
-			reg = 0x80;
-			if (value & 0x1)
-				reg = reg | 0x20;
-			value = value >> 1;
-			outw(reg, emu->port + A_GPIO);
-			write_post = inw(emu->port + A_GPIO);
-			outw(reg | 0x40, emu->port + A_GPIO);
-			write_post = inw(emu->port + A_GPIO);
-		}
-	}
-	/* After programming, set GPIO bit 4 high again. */
-	outw(0x10, emu->port + A_GPIO);
-	write_post = inw(emu->port + A_GPIO);
-	spin_unlock_irq(&emu->emu_lock);
-}
-
 /* firmware file names, per model, init-fw and dock-fw (optional) */
 static const char * const firmware_names[5][2] = {
 	[EMU_MODEL_EMU1010] = {
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index f4a1c2d4b078..fafa299efa5c 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -422,6 +422,54 @@ void snd_emu1010_update_clock(struct snd_emu10k1 *emu)
 	snd_emu1010_fpga_write(emu, EMU_HANA_DOCK_LEDS_2, leds);
 }
 
+void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
+				     const struct firmware *fw_entry)
+{
+	__always_unused u16 write_post;
+
+	// On E-MU 1010 rev1 the FPGA is a Xilinx Spartan IIE XC2S50E.
+	// On E-MU 0404b it is a Xilinx Spartan III XC3S50.
+	// The wiring is as follows:
+	// GPO7 -> FPGA input & 1K resistor -> FPGA /PGMN <- FPGA output
+	//   In normal operation, the active low reset line is held up by
+	//   an FPGA output, while the GPO pin performs its duty as control
+	//   register access strobe signal. Writing the respective bit to
+	//   EMU_HANA_FPGA_CONFIG puts the FPGA output into high-Z mode, at
+	//   which point the GPO pin can control the reset line through the
+	//   resistor.
+	// GPO6 -> FPGA CCLK & FPGA input
+	// GPO5 -> FPGA DIN (dual function)
+
+	// Assert reset line for 100uS
+	outw(0x00, emu->port + A_GPIO);
+	write_post = inw(emu->port + A_GPIO);
+	udelay(100);
+	outw(0x80, emu->port + A_GPIO);
+	write_post = inw(emu->port + A_GPIO);
+	udelay(100);  // Allow FPGA memory to clean
+
+	// Upload the netlist. Keep reset line high!
+	for (int n = 0; n < fw_entry->size; n++) {
+		u8 value = fw_entry->data[n];
+		for (int i = 0; i < 8; i++) {
+			u16 reg = 0x80;
+			if (value & 1)
+				reg |= 0x20;
+			value >>= 1;
+			outw(reg, emu->port + A_GPIO);
+			write_post = inw(emu->port + A_GPIO);
+			outw(reg | 0x40, emu->port + A_GPIO);
+			write_post = inw(emu->port + A_GPIO);
+		}
+	}
+
+	// After programming, set GPIO bit 4 high again.
+	// This appears to be a config word that the rev1 Hana
+	// firmware reads; weird things happen without this.
+	outw(0x10, emu->port + A_GPIO);
+	write_post = inw(emu->port + A_GPIO);
+}
+
 void snd_emu10k1_intr_enable(struct snd_emu10k1 *emu, unsigned int intrenb)
 {
 	unsigned long flags;
-- 
cgit v1.2.3


From 1aa41272efff530ccf6cd8455133ada1d09474a2 Mon Sep 17 00:00:00 2001
From: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Date: Sun, 28 Apr 2024 11:37:17 +0200
Subject: ALSA: emu10k1: move code for entering E-MU card FPGA programming mode

... into snd_emu1010_load_firmware_entry(). This makes it clearer that
these steps belong together tightly, as implied by prior commits.

Signed-off-by: Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
Message-ID: <20240428093717.3198716-5-oswald.buddenhagen@gmx.de>
---
 include/sound/emu10k1.h          | 2 +-
 sound/pci/emu10k1/emu10k1_main.c | 7 +------
 sound/pci/emu10k1/io.c           | 7 ++++++-
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/sound/emu10k1.h b/include/sound/emu10k1.h
index 2856f4717c93..b83862259eec 100644
--- a/include/sound/emu10k1.h
+++ b/include/sound/emu10k1.h
@@ -1843,7 +1843,7 @@ void snd_emu1010_fpga_link_dst_src_write(struct snd_emu10k1 *emu, u32 dst, u32 s
 u32 snd_emu1010_fpga_link_dst_src_read(struct snd_emu10k1 *emu, u32 dst);
 int snd_emu1010_get_raw_rate(struct snd_emu10k1 *emu, u8 src);
 void snd_emu1010_update_clock(struct snd_emu10k1 *emu);
-void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu, const struct firmware *fw_entry);
+void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu, int dock, const struct firmware *fw_entry);
 unsigned int snd_emu10k1_efx_read(struct snd_emu10k1 *emu, unsigned int pc);
 void snd_emu10k1_intr_enable(struct snd_emu10k1 *emu, unsigned int intrenb);
 void snd_emu10k1_intr_disable(struct snd_emu10k1 *emu, unsigned int intrenb);
diff --git a/sound/pci/emu10k1/emu10k1_main.c b/sound/pci/emu10k1/emu10k1_main.c
index c8aa4143ac4f..5b8a5ba825bd 100644
--- a/sound/pci/emu10k1/emu10k1_main.c
+++ b/sound/pci/emu10k1/emu10k1_main.c
@@ -683,7 +683,7 @@ static int snd_emu1010_load_firmware(struct snd_emu10k1 *emu, int dock,
 			return err;
 	}
 
-	snd_emu1010_load_firmware_entry(emu, *fw);
+	snd_emu1010_load_firmware_entry(emu, dock, *fw);
 	return 0;
 }
 
@@ -699,9 +699,6 @@ static void snd_emu1010_load_dock_firmware(struct snd_emu10k1 *emu)
 	msleep(200);
 
 	dev_info(emu->card->dev, "emu1010: Loading Audio Dock Firmware\n");
-	/* Return to Audio Dock programming mode */
-	snd_emu1010_fpga_write(emu, EMU_HANA_FPGA_CONFIG,
-			       EMU_HANA_FPGA_CONFIG_AUDIODOCK);
 	err = snd_emu1010_load_firmware(emu, 1, &emu->dock_fw);
 	if (err < 0)
 		return;
@@ -820,8 +817,6 @@ static int snd_emu10k1_emu1010_init(struct snd_emu10k1 *emu)
 	snd_emu1010_fpga_lock(emu);
 
 	dev_info(emu->card->dev, "emu1010: Loading Hana Firmware\n");
-	snd_emu1010_fpga_write(emu, EMU_HANA_FPGA_CONFIG,
-			       EMU_HANA_FPGA_CONFIG_HANA);
 	err = snd_emu1010_load_firmware(emu, 0, &emu->firmware);
 	if (err < 0) {
 		dev_info(emu->card->dev, "emu1010: Loading Firmware failed\n");
diff --git a/sound/pci/emu10k1/io.c b/sound/pci/emu10k1/io.c
index fafa299efa5c..b60ab5671e00 100644
--- a/sound/pci/emu10k1/io.c
+++ b/sound/pci/emu10k1/io.c
@@ -422,7 +422,7 @@ void snd_emu1010_update_clock(struct snd_emu10k1 *emu)
 	snd_emu1010_fpga_write(emu, EMU_HANA_DOCK_LEDS_2, leds);
 }
 
-void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
+void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu, int dock,
 				     const struct firmware *fw_entry)
 {
 	__always_unused u16 write_post;
@@ -440,6 +440,11 @@ void snd_emu1010_load_firmware_entry(struct snd_emu10k1 *emu,
 	// GPO6 -> FPGA CCLK & FPGA input
 	// GPO5 -> FPGA DIN (dual function)
 
+	// If the FPGA is already programmed, return it to programming mode
+	snd_emu1010_fpga_write(emu, EMU_HANA_FPGA_CONFIG,
+			       dock ? EMU_HANA_FPGA_CONFIG_AUDIODOCK :
+				      EMU_HANA_FPGA_CONFIG_HANA);
+
 	// Assert reset line for 100uS
 	outw(0x00, emu->port + A_GPIO);
 	write_post = inw(emu->port + A_GPIO);
-- 
cgit v1.2.3


From 670e98a34a9e44cd384bafbda681c8c8e072b714 Mon Sep 17 00:00:00 2001
From: Kai-Heng Feng <kai.heng.feng@canonical.com>
Date: Wed, 20 Mar 2024 16:43:16 +0800
Subject: ACPI: IPMI: Add helper to wait for when SMI is selected

On Dell servers, many APCI methods of acpi_power_meter module evaluate
variables inside IPMI region, so the region handler needs to be
installed. In addition to that, the handler needs to be fully
functional, and that depends on SMI being selected.

So add a helper to let acpi_power_meter know when the handler is
installed and ready to be used.

Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Link: https://lore.kernel.org/r/20240320084317.366853-1-kai.heng.feng@canonical.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/acpi/acpi_ipmi.c | 23 ++++++++++++++++++++++-
 include/acpi/acpi_bus.h  |  5 +++++
 2 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/acpi/acpi_ipmi.c b/drivers/acpi/acpi_ipmi.c
index 0555f68c2dfd..5fba4dab5d08 100644
--- a/drivers/acpi/acpi_ipmi.c
+++ b/drivers/acpi/acpi_ipmi.c
@@ -22,6 +22,8 @@ MODULE_LICENSE("GPL");
 /* the IPMI timeout is 5s */
 #define IPMI_TIMEOUT			(5000)
 #define ACPI_IPMI_MAX_MSG_LENGTH	64
+/* 2s should be suffient for SMI being selected */
+#define ACPI_IPMI_SMI_SELECTION_TIMEOUT	(2 * HZ)
 
 struct acpi_ipmi_device {
 	/* the device list attached to driver_data.ipmi_devices */
@@ -54,6 +56,7 @@ struct ipmi_driver_data {
 	 * to this selected global IPMI system interface.
 	 */
 	struct acpi_ipmi_device *selected_smi;
+	struct completion smi_selection_done;
 };
 
 struct acpi_ipmi_msg {
@@ -463,8 +466,10 @@ static void ipmi_register_bmc(int iface, struct device *dev)
 		if (temp->handle == handle)
 			goto err_lock;
 	}
-	if (!driver_data.selected_smi)
+	if (!driver_data.selected_smi) {
 		driver_data.selected_smi = ipmi_device;
+		complete(&driver_data.smi_selection_done);
+	}
 	list_add_tail(&ipmi_device->head, &driver_data.ipmi_devices);
 	mutex_unlock(&driver_data.ipmi_lock);
 
@@ -578,6 +583,20 @@ out_msg:
 	return status;
 }
 
+int acpi_wait_for_acpi_ipmi(void)
+{
+	long ret;
+
+	ret = wait_for_completion_interruptible_timeout(&driver_data.smi_selection_done,
+							ACPI_IPMI_SMI_SELECTION_TIMEOUT);
+
+	if (ret <= 0)
+		return -ETIMEDOUT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(acpi_wait_for_acpi_ipmi);
+
 static int __init acpi_ipmi_init(void)
 {
 	int result;
@@ -586,6 +605,8 @@ static int __init acpi_ipmi_init(void)
 	if (acpi_disabled)
 		return 0;
 
+	init_completion(&driver_data.smi_selection_done);
+
 	status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT,
 						    ACPI_ADR_SPACE_IPMI,
 						    &acpi_ipmi_space_handler,
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 5de954e2b18a..5a69cbd58c5e 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -976,11 +976,16 @@ static inline void acpi_put_acpi_dev(struct acpi_device *adev)
 {
 	acpi_dev_put(adev);
 }
+
+int acpi_wait_for_acpi_ipmi(void);
+
 #else	/* CONFIG_ACPI */
 
 static inline int register_acpi_bus_type(void *bus) { return 0; }
 static inline int unregister_acpi_bus_type(void *bus) { return 0; }
 
+static inline int acpi_wait_for_acpi_ipmi(void) { return 0; }
+
 #endif				/* CONFIG_ACPI */
 
 #endif /*__ACPI_BUS_H__*/
-- 
cgit v1.2.3


From b6d2e438e16c7d4dbde08cfb2b95b0f3f325ba40 Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Fri, 26 Apr 2024 12:15:13 +0200
Subject: xfrm: Correct spelling mistake in xfrm.h comment

A spelling error was found in the comment section of
include/uapi/linux/xfrm.h. Since this header file is copied to many
userspace programs and undergoes Debian spellcheck, it's preferable to
fix it in upstream rather than downstream having exceptions.

This commit fixes the spelling mistake.

Fixes: df71837d5024 ("[LSM-IPSec]: Security association restriction.")
Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/uapi/linux/xfrm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 6a77328be114..594b66e16395 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -228,7 +228,7 @@ enum {
 #define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)
 
 /*
- * Generic LSM security context for comunicating to user space
+ * Generic LSM security context for communicating to user space
  * NOTE: Same format as sadb_x_sec_ctx
  */
 struct xfrm_user_sec_ctx {
-- 
cgit v1.2.3


From 15b429f4e047dc4f55bc38bc8e2557a812a7d822 Mon Sep 17 00:00:00 2001
From: Jithu Joseph <jithu.joseph@intel.com>
Date: Fri, 12 Apr 2024 10:23:48 -0700
Subject: platform/x86/intel/ifs: trace: display batch num in hex

In Field Scan test image files are named in ff-mm-ss-<batch02x>.scan
format. Current trace output, prints the batch number in decimal format.

Make it easier to correlate the trace line to a test image file by
showing the batch number also in hex format.

Add 0x prefix to all fields in the trace line to make the type explicit.

Signed-off-by: Jithu Joseph <jithu.joseph@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Ashok Raj <ashok.raj@intel.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Link: https://lore.kernel.org/r/20240412172349.544064-3-jithu.joseph@intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/trace/events/intel_ifs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/intel_ifs.h b/include/trace/events/intel_ifs.h
index 8ce2de120f2d..0d88ebf2c980 100644
--- a/include/trace/events/intel_ifs.h
+++ b/include/trace/events/intel_ifs.h
@@ -28,7 +28,7 @@ TRACE_EVENT(ifs_status,
 		__entry->status	= status;
 	),
 
-	TP_printk("batch: %.2d, start: %.4x, stop: %.4x, status: %.16llx",
+	TP_printk("batch: 0x%.2x, start: 0x%.4x, stop: 0x%.4x, status: 0x%.16llx",
 		__entry->batch,
 		__entry->start,
 		__entry->stop,
-- 
cgit v1.2.3


From b5327b9a300e2ecec1372ed375615f39e3df0542 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 26 Apr 2024 10:47:22 +0000
Subject: ipv6: use call_rcu_hurry() in fib6_info_release()

This is a followup of commit c4e86b4363ac ("net: add two more
call_rcu_hurry()")

fib6_info_destroy_rcu() is calling nexthop_put() or fib6_nh_release()

We must not delay it too much or risk unregister_netdevice/ref_tracker
traces because references to netdev are not released in time.

This should speedup device/netns dismantles when CONFIG_RCU_LAZY=y

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ip6_fib.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 323c94f1845b..7834f7f29d3c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -338,7 +338,7 @@ static inline void fib6_info_release(struct fib6_info *f6i)
 {
 	if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) {
 		DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link));
-		call_rcu(&f6i->rcu, fib6_info_destroy_rcu);
+		call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu);
 	}
 }
 
-- 
cgit v1.2.3


From 1192534407d0f8ab9a0503052777260ae74968c3 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 23 Apr 2024 13:46:12 -0700
Subject: platform/x86/intel/tpmi: Align comments in kernel-doc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Align comments in kernel-doc for the struct intel_tpmi_plat_info.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20240423204619.3946901-4-srinivas.pandruvada@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/intel_tpmi.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index a3529b962be6..685a41dddf82 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -27,9 +27,9 @@ enum intel_tpmi_id {
 
 /**
  * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
- * @package_id:	CPU Package id
- * @bus_number:	PCI bus number
- * @device_number: PCI device number
+ * @package_id:      CPU Package id
+ * @bus_number:      PCI bus number
+ * @device_number:   PCI device number
  * @function_number: PCI function number
  *
  * Structure to store platform data for a TPMI device instance. This
-- 
cgit v1.2.3


From c8405cc815151a8b2fa6f7510ede8256228e45da Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 23 Apr 2024 13:46:13 -0700
Subject: platform/x86/intel/tpmi: Add additional TPMI header fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TPMI information header added additional fields in version 2. Some of the
reserved fields in version 1 are used to define new fields.

Parse new fields and export as part of platform data. These fields include:
- PCI segment ID
- Partition ID of the package: If a package is represented by more than
  one PCI device, then partition ID along with cdie_mask, describes the
  scope. For example to update get/set properties for a compute die, one
  of the PCI MMIO region is selected from the partition ID.
- cdie_mask: Mask of all compute dies in this partition.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Zhang Rui <rui.zhang@intel.com>
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Link: https://lore.kernel.org/r/20240423204619.3946901-5-srinivas.pandruvada@linux.intel.com
Reviewed-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 drivers/platform/x86/intel/tpmi.c | 15 ++++++++++++++-
 include/linux/intel_tpmi.h        |  6 ++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/platform/x86/intel/tpmi.c b/drivers/platform/x86/intel/tpmi.c
index a5bcb77bcb35..6c0cbccd80bb 100644
--- a/drivers/platform/x86/intel/tpmi.c
+++ b/drivers/platform/x86/intel/tpmi.c
@@ -128,6 +128,9 @@ struct intel_tpmi_info {
  * @dev:	PCI device number
  * @bus:	PCI bus number
  * @pkg:	CPU Package id
+ * @segment:	PCI segment id
+ * @partition:	Package Partition id
+ * @cdie_mask:	Bitmap of compute dies in the current partition
  * @reserved:	Reserved for future use
  * @lock:	When set to 1 the register is locked and becomes read-only
  *		until next reset. Not for use by the OS driver.
@@ -139,7 +142,10 @@ struct tpmi_info_header {
 	u64 dev:5;
 	u64 bus:8;
 	u64 pkg:8;
-	u64 reserved:39;
+	u64 segment:8;
+	u64 partition:2;
+	u64 cdie_mask:16;
+	u64 reserved:13;
 	u64 lock:1;
 } __packed;
 
@@ -667,6 +673,7 @@ static int tpmi_create_devices(struct intel_tpmi_info *tpmi_info)
 
 #define TPMI_INFO_BUS_INFO_OFFSET	0x08
 #define TPMI_INFO_MAJOR_VERSION		0x00
+#define TPMI_INFO_MINOR_VERSION		0x02
 
 static int tpmi_process_info(struct intel_tpmi_info *tpmi_info,
 			     struct intel_tpmi_pm_feature *pfs)
@@ -693,6 +700,12 @@ static int tpmi_process_info(struct intel_tpmi_info *tpmi_info,
 	tpmi_info->plat_info.device_number = header.dev;
 	tpmi_info->plat_info.function_number = header.fn;
 
+	if (TPMI_MINOR_VERSION(feature_header) >= TPMI_INFO_MINOR_VERSION) {
+		tpmi_info->plat_info.cdie_mask = header.cdie_mask;
+		tpmi_info->plat_info.partition = header.partition;
+		tpmi_info->plat_info.segment = header.segment;
+	}
+
 error_info_header:
 	iounmap(info_mem);
 
diff --git a/include/linux/intel_tpmi.h b/include/linux/intel_tpmi.h
index 685a41dddf82..1e880cb0f454 100644
--- a/include/linux/intel_tpmi.h
+++ b/include/linux/intel_tpmi.h
@@ -27,7 +27,10 @@ enum intel_tpmi_id {
 
 /**
  * struct intel_tpmi_plat_info - Platform information for a TPMI device instance
+ * @cdie_mask:       Mask of all compute dies in the partition
  * @package_id:      CPU Package id
+ * @partition:       Package partition id when multiple VSEC PCI devices per package
+ * @segment:         PCI segment ID
  * @bus_number:      PCI bus number
  * @device_number:   PCI device number
  * @function_number: PCI function number
@@ -36,7 +39,10 @@ enum intel_tpmi_id {
  * struct is used to return data via tpmi_get_platform_data().
  */
 struct intel_tpmi_plat_info {
+	u16 cdie_mask;
 	u8 package_id;
+	u8 partition;
+	u8 segment;
 	u8 bus_number;
 	u8 device_number;
 	u8 function_number;
-- 
cgit v1.2.3


From e8dfd42c17faf183415323db1ef0c977be0d6489 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 26 Apr 2024 15:19:52 +0000
Subject: ipv6: introduce dst_rt6_info() helper

Instead of (struct rt6_info *)dst casts, we can use :

 #define dst_rt6_info(_ptr) \
         container_of_const(_ptr, struct rt6_info, dst)

Some places needed missing const qualifiers :

ip6_confirm_neigh(), ipv6_anycast_destination(),
ipv6_unicast_destination(), has_gateway()

v2: added missing parts (David Ahern)

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/infiniband/core/addr.c                     |  6 ++---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    |  2 +-
 drivers/net/vrf.c                                  |  2 +-
 drivers/net/vxlan/vxlan_core.c                     |  2 +-
 drivers/s390/net/qeth_core.h                       |  4 ++--
 include/net/ip6_fib.h                              |  6 +++--
 include/net/ip6_route.h                            | 11 ++++-----
 net/bluetooth/6lowpan.c                            |  2 +-
 net/core/dst_cache.c                               |  2 +-
 net/core/filter.c                                  |  2 +-
 net/ipv4/ip_tunnel.c                               |  2 +-
 net/ipv6/icmp.c                                    |  8 +++----
 net/ipv6/ila/ila_lwt.c                             |  4 ++--
 net/ipv6/ip6_output.c                              | 18 +++++++-------
 net/ipv6/ip6mr.c                                   |  2 +-
 net/ipv6/ndisc.c                                   |  2 +-
 net/ipv6/ping.c                                    |  2 +-
 net/ipv6/raw.c                                     |  4 ++--
 net/ipv6/route.c                                   | 28 +++++++++++-----------
 net/ipv6/tcp_ipv6.c                                |  4 +---
 net/ipv6/udp.c                                     | 11 ++++-----
 net/ipv6/xfrm6_policy.c                            |  2 +-
 net/l2tp/l2tp_ip6.c                                |  2 +-
 net/mpls/mpls_iptunnel.c                           |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c                    | 14 +++++------
 net/netfilter/nf_flow_table_core.c                 |  8 ++-----
 net/netfilter/nf_flow_table_ip.c                   |  4 ++--
 net/netfilter/nft_rt.c                             |  2 +-
 net/sctp/ipv6.c                                    |  2 +-
 net/xfrm/xfrm_policy.c                             |  3 +--
 30 files changed, 77 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index f253295795f0..f20dfe70fa0e 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -348,15 +348,15 @@ static int dst_fetch_ha(const struct dst_entry *dst,
 
 static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
 {
-	struct rtable *rt;
-	struct rt6_info *rt6;
+	const struct rtable *rt;
+	const struct rt6_info *rt6;
 
 	if (family == AF_INET) {
 		rt = container_of(dst, struct rtable, dst);
 		return rt->rt_uses_gateway;
 	}
 
-	rt6 = container_of(dst, struct rt6_info, dst);
+	rt6 = dst_rt6_info(dst);
 	return rt6->rt6i_flags & RTF_GATEWAY;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 3de69b2eb5c4..4b5fd71c897d 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -540,7 +540,7 @@ mlxsw_sp_span_gretap6_route(const struct net_device *to_dev,
 	if (!dst || dst->error)
 		goto out;
 
-	rt6 = container_of(dst, struct rt6_info, dst);
+	rt6 = dst_rt6_info(dst);
 
 	dev = dst->dev;
 	*saddrp = fl6.saddr;
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 66f8542f3b18..784b9b2d275e 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -653,7 +653,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk,
 	skb->dev = dev;
 
 	rcu_read_lock();
-	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
+	nexthop = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr);
 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 	if (unlikely(!neigh))
 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 4bbfb516ac05..a9a85d0f5a5d 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2513,7 +2513,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 		}
 
 		if (!info) {
-			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
+			u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags;
 
 			err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6,
 						    dst_port, ifindex, vni,
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 613eab729704..5f17a2a5d0e3 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -956,7 +956,7 @@ static inline struct dst_entry *qeth_dst_check_rcu(struct sk_buff *skb,
 	struct dst_entry *dst = skb_dst(skb);
 	struct rt6_info *rt;
 
-	rt = (struct rt6_info *) dst;
+	rt = dst_rt6_info(dst);
 	if (dst) {
 		if (proto == htons(ETH_P_IPV6))
 			dst = dst_check(dst, rt6_get_cookie(rt));
@@ -978,7 +978,7 @@ static inline __be32 qeth_next_hop_v4_rcu(struct sk_buff *skb,
 static inline struct in6_addr *qeth_next_hop_v6_rcu(struct sk_buff *skb,
 						    struct dst_entry *dst)
 {
-	struct rt6_info *rt = (struct rt6_info *) dst;
+	struct rt6_info *rt = dst_rt6_info(dst);
 
 	if (rt && !ipv6_addr_any(&rt->rt6i_gateway))
 		return &rt->rt6i_gateway;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 7834f7f29d3c..6cb867ce4878 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -234,9 +234,11 @@ struct fib6_result {
 	for (rt = (w)->leaf; rt;					\
 	     rt = rcu_dereference_protected(rt->fib6_next, 1))
 
-static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
+#define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst)
+
+static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst)
 {
-	return ((struct rt6_info *)dst)->rt6i_idev;
+	return dst_rt6_info(dst)->rt6i_idev;
 }
 
 static inline bool fib6_requires_src(const struct fib6_info *rt)
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index a30c6aa9e5cf..a18ed24fed94 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -210,12 +210,11 @@ void rt6_uncached_list_del(struct rt6_info *rt);
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
 	const struct dst_entry *dst = skb_dst(skb);
-	const struct rt6_info *rt6 = NULL;
 
 	if (dst)
-		rt6 = container_of(dst, struct rt6_info, dst);
+		return dst_rt6_info(dst);
 
-	return rt6;
+	return NULL;
 }
 
 /*
@@ -227,7 +226,7 @@ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 
-	np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst);
+	np->dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
 	sk_setup_caps(sk, dst);
 	np->daddr_cache = daddr;
 #ifdef CONFIG_IPV6_SUBTREES
@@ -240,7 +239,7 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
 
 static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
 {
-	struct rt6_info *rt = (struct rt6_info *) skb_dst(skb);
+	const struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
 
 	return rt->rt6i_flags & RTF_LOCAL;
 }
@@ -248,7 +247,7 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
 static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
 					    const struct in6_addr *daddr)
 {
-	struct rt6_info *rt = (struct rt6_info *)dst;
+	const struct rt6_info *rt = dst_rt6_info(dst);
 
 	return rt->rt6i_flags & RTF_ANYCAST ||
 		(rt->rt6i_dst.plen < 127 &&
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 27520a8a486f..50cfec8ccac4 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -133,7 +133,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
 						  struct in6_addr *daddr,
 						  struct sk_buff *skb)
 {
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
 	int count = atomic_read(&dev->peer_count);
 	const struct in6_addr *nexthop;
 	struct lowpan_peer *peer;
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 0ccfd5fa5cb9..b17171345d64 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -112,7 +112,7 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
 
 	idst = this_cpu_ptr(dst_cache->cache);
 	dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
-				  rt6_get_cookie((struct rt6_info *)dst));
+				  rt6_get_cookie(dst_rt6_info(dst)));
 	idst->in6_saddr = *saddr;
 }
 EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
diff --git a/net/core/filter.c b/net/core/filter.c
index 294670d3850d..5662464e1abd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2215,7 +2215,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 	rcu_read_lock();
 	if (!nh) {
 		dst = skb_dst(skb);
-		nexthop = rt6_nexthop(container_of(dst, struct rt6_info, dst),
+		nexthop = rt6_nexthop(dst_rt6_info(dst),
 				      &ipv6_hdr(skb)->daddr);
 	} else {
 		nexthop = &nh->ipv6_nh;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 177f40c3a8e8..ba46cf7612f4 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -543,7 +543,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
 		struct rt6_info *rt6;
 		__be32 daddr;
 
-		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
+		rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
 					   NULL;
 		daddr = md ? dst : tunnel->parms.iph.daddr;
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 1635da07285f..d285c1f6f1a6 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -212,7 +212,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
 	} else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
 		res = true;
 	} else {
-		struct rt6_info *rt = (struct rt6_info *)dst;
+		struct rt6_info *rt = dst_rt6_info(dst);
 		int tmo = net->ipv6.sysctl.icmpv6_time;
 		struct inet_peer *peer;
 
@@ -241,7 +241,7 @@ static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
 
 	dst = ip6_route_output(net, sk, fl6);
 	if (!dst->error) {
-		struct rt6_info *rt = (struct rt6_info *)dst;
+		struct rt6_info *rt = dst_rt6_info(dst);
 		struct in6_addr prefsrc;
 
 		rt6_get_prefsrc(rt, &prefsrc);
@@ -616,7 +616,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 	if (ip6_append_data(sk, icmpv6_getfrag, &msg,
 			    len + sizeof(struct icmp6hdr),
 			    sizeof(struct icmp6hdr),
-			    &ipc6, &fl6, (struct rt6_info *)dst,
+			    &ipc6, &fl6, dst_rt6_info(dst),
 			    MSG_DONTWAIT)) {
 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
 		ip6_flush_pending_frames(sk);
@@ -803,7 +803,7 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
 	if (ip6_append_data(sk, icmpv6_getfrag, &msg,
 			    skb->len + sizeof(struct icmp6hdr),
 			    sizeof(struct icmp6hdr), &ipc6, &fl6,
-			    (struct rt6_info *)dst, MSG_DONTWAIT)) {
+			    dst_rt6_info(dst), MSG_DONTWAIT)) {
 		__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
 		ip6_flush_pending_frames(sk);
 	} else {
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 8c1ce78956ba..0601bad79822 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -38,7 +38,7 @@ static inline struct ila_params *ila_params_lwtunnel(
 static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
-	struct rt6_info *rt = (struct rt6_info *)orig_dst;
+	struct rt6_info *rt = dst_rt6_info(orig_dst);
 	struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
 	struct dst_entry *dst;
 	int err = -EINVAL;
@@ -70,7 +70,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 		memset(&fl6, 0, sizeof(fl6));
 		fl6.flowi6_oif = orig_dst->dev->ifindex;
 		fl6.flowi6_iif = LOOPBACK_IFINDEX;
-		fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst,
+		fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst),
 					 &ip6h->daddr);
 
 		dst = ip6_route_output(net, NULL, &fl6);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b9dd3a66e423..8f906e9fbc38 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -120,7 +120,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
 	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 
 	rcu_read_lock();
-	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
+	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 
 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
@@ -599,7 +599,7 @@ int ip6_forward(struct sk_buff *skb)
 		 *	send a redirect.
 		 */
 
-		rt = (struct rt6_info *) dst;
+		rt = dst_rt6_info(dst);
 		if (rt->rt6i_flags & RTF_GATEWAY)
 			target = &rt->rt6i_gateway;
 		else
@@ -856,7 +856,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
 {
 	struct sk_buff *frag;
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 				inet6_sk(skb->sk) : NULL;
 	bool mono_delivery_time = skb->mono_delivery_time;
@@ -1063,7 +1063,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 		return NULL;
 	}
 
-	rt = (struct rt6_info *)dst;
+	rt = dst_rt6_info(dst);
 	/* Yes, checking route validity in not connected
 	 * case is not very simple. Take into account,
 	 * that we do not support routing by source, TOS,
@@ -1118,7 +1118,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 		struct rt6_info *rt;
 
 		*dst = ip6_route_output(net, sk, fl6);
-		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
+		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
 
 		rcu_read_lock();
 		from = rt ? rcu_dereference(rt->from) : NULL;
@@ -1159,7 +1159,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 	 * dst entry and replace it instead with the
 	 * dst entry of the nexthop router
 	 */
-	rt = (struct rt6_info *) *dst;
+	rt = dst_rt6_info(*dst);
 	rcu_read_lock();
 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
 				      rt6_nexthop(rt, &fl6->daddr));
@@ -1423,7 +1423,7 @@ static int __ip6_append_data(struct sock *sk,
 	int offset = 0;
 	bool zc = false;
 	u32 tskey = 0;
-	struct rt6_info *rt = (struct rt6_info *)cork->dst;
+	struct rt6_info *rt = dst_rt6_info(cork->dst);
 	bool paged, hold_tskey, extra_uref = false;
 	struct ipv6_txoptions *opt = v6_cork->opt;
 	int csummode = CHECKSUM_NONE;
@@ -1877,7 +1877,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
 	struct net *net = sock_net(sk);
 	struct ipv6hdr *hdr;
 	struct ipv6_txoptions *opt = v6_cork->opt;
-	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
+	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
 	struct flowi6 *fl6 = &cork->fl.u.ip6;
 	unsigned char proto = fl6->flowi6_proto;
 
@@ -1949,7 +1949,7 @@ out:
 int ip6_send_skb(struct sk_buff *skb)
 {
 	struct net *net = sock_net(skb->sk);
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
 	int err;
 
 	err = ip6_local_out(net, skb->sk, skb);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index cb0ee81a068a..dd342e6ecf3f 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -2273,7 +2273,7 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
 	int err;
 	struct mr_table *mrt;
 	struct mfc6_cache *cache;
-	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
 
 	mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
 	if (!mrt)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index ae134634c323..d914b23256ce 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1722,7 +1722,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
 	if (IS_ERR(dst))
 		return;
 
-	rt = (struct rt6_info *) dst;
+	rt = dst_rt6_info(dst);
 
 	if (rt->rt6i_flags & RTF_GATEWAY) {
 		ND_PRINTK(2, warn,
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index ef2059c88955..88b3fcacd4f9 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -154,7 +154,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
 	if (IS_ERR(dst))
 		return PTR_ERR(dst);
-	rt = (struct rt6_info *) dst;
+	rt = dst_rt6_info(dst);
 
 	if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
 		fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 0d896ca7b589..2eedf255600b 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -598,7 +598,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
 	struct ipv6hdr *iph;
 	struct sk_buff *skb;
 	int err;
-	struct rt6_info *rt = (struct rt6_info *)*dstp;
+	struct rt6_info *rt = dst_rt6_info(*dstp);
 	int hlen = LL_RESERVED_SPACE(rt->dst.dev);
 	int tlen = rt->dst.dev->needed_tailroom;
 
@@ -917,7 +917,7 @@ back_from_confirm:
 		ipc6.opt = opt;
 		lock_sock(sk);
 		err = ip6_append_data(sk, raw6_getfrag, &rfv,
-			len, 0, &ipc6, &fl6, (struct rt6_info *)dst,
+			len, 0, &ipc6, &fl6, dst_rt6_info(dst),
 			msg->msg_flags);
 
 		if (err)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1f4b935a0e57..3e0b2cb20fd2 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -226,7 +226,7 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 					      struct sk_buff *skb,
 					      const void *daddr)
 {
-	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
+	const struct rt6_info *rt = dst_rt6_info(dst);
 
 	return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
 				dst->dev, skb, daddr);
@@ -234,8 +234,8 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
 
 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 {
+	const struct rt6_info *rt = dst_rt6_info(dst);
 	struct net_device *dev = dst->dev;
-	struct rt6_info *rt = (struct rt6_info *)dst;
 
 	daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
 	if (!daddr)
@@ -354,7 +354,7 @@ EXPORT_SYMBOL(ip6_dst_alloc);
 
 static void ip6_dst_destroy(struct dst_entry *dst)
 {
-	struct rt6_info *rt = (struct rt6_info *)dst;
+	struct rt6_info *rt = dst_rt6_info(dst);
 	struct fib6_info *from;
 	struct inet6_dev *idev;
 
@@ -373,7 +373,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
 {
-	struct rt6_info *rt = (struct rt6_info *)dst;
+	struct rt6_info *rt = dst_rt6_info(dst);
 	struct inet6_dev *idev = rt->rt6i_idev;
 
 	if (idev && idev->dev != blackhole_netdev) {
@@ -1288,7 +1288,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
 
 	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
 	if (dst->error == 0)
-		return (struct rt6_info *) dst;
+		return dst_rt6_info(dst);
 
 	dst_release(dst);
 
@@ -2647,7 +2647,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net,
 
 	rcu_read_lock();
 	dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
-	rt6 = (struct rt6_info *)dst;
+	rt6 = dst_rt6_info(dst);
 	/* For dst cached in uncached_list, refcnt is already taken. */
 	if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
 		dst = &net->ipv6.ip6_null_entry->dst;
@@ -2661,7 +2661,7 @@ EXPORT_SYMBOL_GPL(ip6_route_output_flags);
 
 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 {
-	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
+	struct rt6_info *rt, *ort = dst_rt6_info(dst_orig);
 	struct net_device *loopback_dev = net->loopback_dev;
 	struct dst_entry *new = NULL;
 
@@ -2744,7 +2744,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
 	struct fib6_info *from;
 	struct rt6_info *rt;
 
-	rt = container_of(dst, struct rt6_info, dst);
+	rt = dst_rt6_info(dst);
 
 	if (rt->sernum)
 		return rt6_is_valid(rt) ? dst : NULL;
@@ -2772,7 +2772,7 @@ EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
 
 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
 {
-	struct rt6_info *rt = (struct rt6_info *) dst;
+	struct rt6_info *rt = dst_rt6_info(dst);
 
 	if (rt) {
 		if (rt->rt6i_flags & RTF_CACHE) {
@@ -2796,7 +2796,7 @@ static void ip6_link_failure(struct sk_buff *skb)
 
 	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
 
-	rt = (struct rt6_info *) skb_dst(skb);
+	rt = dst_rt6_info(skb_dst(skb));
 	if (rt) {
 		rcu_read_lock();
 		if (rt->rt6i_flags & RTF_CACHE) {
@@ -2852,7 +2852,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
 				 bool confirm_neigh)
 {
 	const struct in6_addr *daddr, *saddr;
-	struct rt6_info *rt6 = (struct rt6_info *)dst;
+	struct rt6_info *rt6 = dst_rt6_info(dst);
 
 	/* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
 	 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
@@ -4174,7 +4174,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
 		}
 	}
 
-	rt = (struct rt6_info *) dst;
+	rt = dst_rt6_info(dst);
 	if (rt->rt6i_flags & RTF_REJECT) {
 		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
 		return;
@@ -5608,7 +5608,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			 int iif, int type, u32 portid, u32 seq,
 			 unsigned int flags)
 {
-	struct rt6_info *rt6 = (struct rt6_info *)dst;
+	struct rt6_info *rt6 = dst_rt6_info(dst);
 	struct rt6key *rt6_dst, *rt6_src;
 	u32 *pmetrics, table, rt6_flags;
 	unsigned char nh_flags = 0;
@@ -6111,7 +6111,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	}
 
 
-	rt = container_of(dst, struct rt6_info, dst);
+	rt = dst_rt6_info(dst);
 	if (rt->dst.error) {
 		err = rt->dst.error;
 		ip6_rt_put(rt);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 77958adf2e16..37201c4fb393 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -97,11 +97,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 
 	if (dst && dst_hold_safe(dst)) {
-		const struct rt6_info *rt = (const struct rt6_info *)dst;
-
 		rcu_assign_pointer(sk->sk_rx_dst, dst);
 		sk->sk_rx_dst_ifindex = skb->skb_iif;
-		sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
+		sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
 	}
 }
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 085ee236d9a1..674eadfae569 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -910,11 +910,8 @@ start_lookup:
 
 static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
 {
-	if (udp_sk_rx_dst_set(sk, dst)) {
-		const struct rt6_info *rt = (const struct rt6_info *)dst;
-
-		sk->sk_rx_dst_cookie = rt6_get_cookie(rt);
-	}
+	if (udp_sk_rx_dst_set(sk, dst))
+		sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
 }
 
 /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
@@ -1585,7 +1582,7 @@ back_from_confirm:
 
 		skb = ip6_make_skb(sk, getfrag, msg, ulen,
 				   sizeof(struct udphdr), &ipc6,
-				   (struct rt6_info *)dst,
+				   dst_rt6_info(dst),
 				   msg->msg_flags, &cork);
 		err = PTR_ERR(skb);
 		if (!IS_ERR_OR_NULL(skb))
@@ -1612,7 +1609,7 @@ do_append_data:
 		ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk);
 	up->len += ulen;
 	err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
-			      &ipc6, fl6, (struct rt6_info *)dst,
+			      &ipc6, fl6, dst_rt6_info(dst),
 			      corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
 	if (err)
 		udp_v6_flush_pending_frames(sk);
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 4891012b692f..7924e08ee142 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -70,7 +70,7 @@ static int xfrm6_get_saddr(struct net *net, int oif,
 static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 			  const struct flowi *fl)
 {
-	struct rt6_info *rt = (struct rt6_info *)xdst->route;
+	struct rt6_info *rt = dst_rt6_info(xdst->route);
 
 	xdst->u.dst.dev = dev;
 	netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 7bf14cf9ffaa..8780ec64f376 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -630,7 +630,7 @@ back_from_confirm:
 	ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
 	err = ip6_append_data(sk, ip_generic_getfrag, msg,
 			      ulen, transhdrlen, &ipc6,
-			      &fl6, (struct rt6_info *)dst,
+			      &fl6, dst_rt6_info(dst),
 			      msg->msg_flags);
 	if (err)
 		ip6_flush_pending_frames(sk);
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 8fc790f2a01b..606349c8df0e 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -90,7 +90,7 @@ static int mpls_xmit(struct sk_buff *skb)
 			ttl = net->mpls.default_ttl;
 		else
 			ttl = ipv6_hdr(skb)->hop_limit;
-		rt6 = (struct rt6_info *)dst;
+		rt6 = dst_rt6_info(dst);
 	} else {
 		goto drop;
 	}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 39b5fd6bbf65..6e8b9d100ad2 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -180,7 +180,7 @@ static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
 			(!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
 			(addr_type & IPV6_ADDR_LOOPBACK);
 		old_rt_is_local = __ip_vs_is_local_route6(
-			(struct rt6_info *)skb_dst(skb));
+			dst_rt6_info(skb_dst(skb)));
 	} else
 #endif
 	{
@@ -481,7 +481,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 	if (dest) {
 		dest_dst = __ip_vs_dst_check(dest);
 		if (likely(dest_dst))
-			rt = (struct rt6_info *) dest_dst->dst_cache;
+			rt = dst_rt6_info(dest_dst->dst_cache);
 		else {
 			u32 cookie;
 
@@ -501,7 +501,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 				ip_vs_dest_dst_free(dest_dst);
 				goto err_unreach;
 			}
-			rt = (struct rt6_info *) dst;
+			rt = dst_rt6_info(dst);
 			cookie = rt6_get_cookie(rt);
 			__ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
 			spin_unlock_bh(&dest->dst_lock);
@@ -517,7 +517,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 					      rt_mode);
 		if (!dst)
 			goto err_unreach;
-		rt = (struct rt6_info *) dst;
+		rt = dst_rt6_info(dst);
 	}
 
 	local = __ip_vs_is_local_route6(rt);
@@ -862,7 +862,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 				      IP_VS_RT_MODE_RDR);
 	if (local < 0)
 		goto tx_error;
-	rt = (struct rt6_info *) skb_dst(skb);
+	rt = dst_rt6_info(skb_dst(skb));
 	/*
 	 * Avoid duplicate tuple in reply direction for NAT traffic
 	 * to local address when connection is sync-ed
@@ -1288,7 +1288,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 	if (local)
 		return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
 
-	rt = (struct rt6_info *) skb_dst(skb);
+	rt = dst_rt6_info(skb_dst(skb));
 	tdev = rt->dst.dev;
 
 	/*
@@ -1590,7 +1590,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
 				      &cp->daddr.in6, NULL, ipvsh, 0, rt_mode);
 	if (local < 0)
 		goto tx_error;
-	rt = (struct rt6_info *) skb_dst(skb);
+	rt = dst_rt6_info(skb_dst(skb));
 	/*
 	 * Avoid duplicate tuple in reply direction for NAT traffic
 	 * to local address when connection is sync-ed
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index a0571339239c..5c1ff07eaee0 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -77,12 +77,8 @@ EXPORT_SYMBOL_GPL(flow_offload_alloc);
 
 static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
 {
-	const struct rt6_info *rt;
-
-	if (flow_tuple->l3proto == NFPROTO_IPV6) {
-		rt = (const struct rt6_info *)flow_tuple->dst_cache;
-		return rt6_get_cookie(rt);
-	}
+	if (flow_tuple->l3proto == NFPROTO_IPV6)
+		return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
 
 	return 0;
 }
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 5383bed3d3e0..100887beed31 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -729,7 +729,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 		return NF_ACCEPT;
 
 	if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
-		rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+		rt = dst_rt6_info(tuplehash->tuple.dst_cache);
 		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
 		IP6CB(skb)->iif = skb->dev->ifindex;
 		IP6CB(skb)->flags = IP6SKB_FORWARDED;
@@ -741,7 +741,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 
 	switch (tuplehash->tuple.xmit_type) {
 	case FLOW_OFFLOAD_XMIT_NEIGH:
-		rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+		rt = dst_rt6_info(tuplehash->tuple.dst_cache);
 		outdev = rt->dst.dev;
 		skb->dev = outdev;
 		nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 24d977138572..2434c624aafd 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -80,7 +80,7 @@ void nft_rt_get_eval(const struct nft_expr *expr,
 		if (nft_pf(pkt) != NFPROTO_IPV6)
 			goto err;
 
-		memcpy(dest, rt6_nexthop((struct rt6_info *)dst,
+		memcpy(dest, rt6_nexthop(dst_rt6_info(dst),
 					 &ipv6_hdr(skb)->daddr),
 		       sizeof(struct in6_addr));
 		break;
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 24368f755ab1..f7b809c0d142 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -415,7 +415,7 @@ out:
 	if (!IS_ERR_OR_NULL(dst)) {
 		struct rt6_info *rt;
 
-		rt = (struct rt6_info *)dst;
+		rt = dst_rt6_info(dst);
 		t->dst_cookie = rt6_get_cookie(rt);
 		pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
 			 &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 6affe5cd85d8..1a41650d9d7b 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2598,8 +2598,7 @@ static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst,
 			   int nfheader_len)
 {
 	if (dst->ops->family == AF_INET6) {
-		struct rt6_info *rt = (struct rt6_info *)dst;
-		path->path_cookie = rt6_get_cookie(rt);
+		path->path_cookie = rt6_get_cookie(dst_rt6_info(dst));
 		path->u.rt6.rt6i_nfheader_len = nfheader_len;
 	}
 }
-- 
cgit v1.2.3


From c580efb94abed05f99bb80245abe96cf5d49128c Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Tue, 2 Apr 2024 02:37:25 +0300
Subject: media: v4l2-subdev: Clearly document that the crop API won't be
 extended

The V4L2 subdev crop API has been marked as obsolete, deprecated by the
selection API. Despite this, it has recently been extended with streams
support. In hindsight this was a mistake. Make sure it doesn't happen
again by clearly documenting that no new extensions will be accepted.

Suggested-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 Documentation/userspace-api/media/v4l/vidioc-subdev-g-crop.rst | 6 +++---
 include/uapi/linux/v4l2-subdev.h                               | 4 ++++
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/vidioc-subdev-g-crop.rst b/Documentation/userspace-api/media/v4l/vidioc-subdev-g-crop.rst
index 92d933631fda..88a748103a71 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-subdev-g-crop.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-subdev-g-crop.rst
@@ -37,9 +37,9 @@ Description
 
 .. note::
 
-    This is an :ref:`obsolete` interface and may be removed
-    in the future. It is superseded by
-    :ref:`the selection API <VIDIOC_SUBDEV_G_SELECTION>`.
+    This is an :ref:`obsolete` interface and may be removed in the future. It is
+    superseded by :ref:`the selection API <VIDIOC_SUBDEV_G_SELECTION>`. No new
+    extensions to the :c:type:`v4l2_subdev_crop` structure will be accepted.
 
 To retrieve the current crop rectangle applications set the ``pad``
 field of a struct :c:type:`v4l2_subdev_crop` to the
diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h
index 7048c51581c6..81a24bd38003 100644
--- a/include/uapi/linux/v4l2-subdev.h
+++ b/include/uapi/linux/v4l2-subdev.h
@@ -50,6 +50,10 @@ struct v4l2_subdev_format {
  * @rect: pad crop rectangle boundaries
  * @stream: stream number, defined in subdev routing
  * @reserved: drivers and applications must zero this array
+ *
+ * The subdev crop API is an obsolete interface and may be removed in the
+ * future. It is superseded by the selection API. No new extensions to this
+ * structure will be accepted.
  */
 struct v4l2_subdev_crop {
 	__u32 which;
-- 
cgit v1.2.3


From f28bdda2f87aa0c299ad3ddef3e1fa5134ae8e47 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 25 Apr 2023 16:50:25 +0300
Subject: media: uapi: Add generic serial metadata mbus formats

Add generic serial metadata mbus formats. These formats describe data
width and packing but not the content itself. The reason for specifying
such formats is that the formats as such are fairly device specific but
they are still handled by CSI-2 receiver drivers that should not be aware
of device specific formats. What makes generic metadata formats possible
is that these formats are parsed by software only, after capturing the
data to system memory.

Also add a definition for "Data Unit" to cover what is essentially a pixel
but is not image data.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 Documentation/userspace-api/media/glossary.rst     |   7 +
 .../userspace-api/media/v4l/subdev-formats.rst     | 254 +++++++++++++++++++++
 include/uapi/linux/media-bus-format.h              |   9 +
 3 files changed, 270 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/glossary.rst b/Documentation/userspace-api/media/glossary.rst
index 5759f4c78398..55a9c37130ba 100644
--- a/Documentation/userspace-api/media/glossary.rst
+++ b/Documentation/userspace-api/media/glossary.rst
@@ -25,6 +25,13 @@ Glossary
 
 	See :ref:`cec`.
 
+    Data Unit
+
+	Unit of data transported by a bus. On parallel buses, the data unit
+	consists of one or more related samples while on serial buses the data
+	unit is logical. If the data unit is image data, it may also be called a
+	pixel.
+
     Device Driver
 	Part of the Linux Kernel that implements support for a hardware
 	component.
diff --git a/Documentation/userspace-api/media/v4l/subdev-formats.rst b/Documentation/userspace-api/media/v4l/subdev-formats.rst
index eb3cd20b0cf2..c2cebf61d528 100644
--- a/Documentation/userspace-api/media/v4l/subdev-formats.rst
+++ b/Documentation/userspace-api/media/v4l/subdev-formats.rst
@@ -8306,3 +8306,257 @@ The following table lists the existing metadata formats.
 	both sides of the link and the bus format is a fixed
 	metadata format that is not configurable from userspace.
 	Width and height will be set to 0 for this format.
+
+Generic Serial Metadata Formats
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Generic serial metadata formats are used on serial buses where the actual data
+content is more or less device specific but the data is transmitted and received
+by multiple devices that do not process the data in any way, simply writing
+it to system memory for processing in software at the end of the pipeline.
+
+"b" in an array cell signifies a byte of data, followed by the number of the bit
+and finally the bit number in subscript. "x" indicates a padding bit.
+
+.. _media-bus-format-generic-meta:
+
+.. cssclass: longtable
+
+.. flat-table:: Generic Serial Metadata Formats
+    :header-rows:  2
+    :stub-columns: 0
+
+    * - Identifier
+      - Code
+      -
+      - :cspan:`23` Data organization within bus :term:`Data Unit`
+    * -
+      -
+      - Bit
+      - 23
+      - 22
+      - 21
+      - 20
+      - 19
+      - 18
+      - 17
+      - 16
+      - 15
+      - 14
+      - 13
+      - 12
+      - 11
+      - 10
+      - 9
+      - 8
+      - 7
+      - 6
+      - 5
+      - 4
+      - 3
+      - 2
+      - 1
+      - 0
+    * .. _MEDIA-BUS-FMT-META-8:
+
+      - MEDIA_BUS_FMT_META_8
+      - 0x8001
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+    * .. _MEDIA-BUS-FMT-META-10:
+
+      - MEDIA_BUS_FMT_META_10
+      - 0x8002
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+      - x
+      - x
+    * .. _MEDIA-BUS-FMT-META-12:
+
+      - MEDIA_BUS_FMT_META_12
+      - 0x8003
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+      - x
+      - x
+      - x
+      - x
+    * .. _MEDIA-BUS-FMT-META-14:
+
+      - MEDIA_BUS_FMT_META_14
+      - 0x8004
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+    * .. _MEDIA-BUS-FMT-META-16:
+
+      - MEDIA_BUS_FMT_META_16
+      - 0x8005
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+    * .. _MEDIA-BUS-FMT-META-20:
+
+      - MEDIA_BUS_FMT_META_20
+      - 0x8006
+      -
+      -
+      -
+      -
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+    * .. _MEDIA-BUS-FMT-META-24:
+
+      - MEDIA_BUS_FMT_META_24
+      - 0x8007
+      -
+      - b0\ :sub:`7`
+      - b0\ :sub:`6`
+      - b0\ :sub:`5`
+      - b0\ :sub:`4`
+      - b0\ :sub:`3`
+      - b0\ :sub:`2`
+      - b0\ :sub:`1`
+      - b0\ :sub:`0`
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
+      - x
diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h
index f05f747e444d..d4c1d991014b 100644
--- a/include/uapi/linux/media-bus-format.h
+++ b/include/uapi/linux/media-bus-format.h
@@ -174,4 +174,13 @@
  */
 #define MEDIA_BUS_FMT_METADATA_FIXED		0x7001
 
+/* Generic line based metadata formats for serial buses. Next is 0x8008. */
+#define MEDIA_BUS_FMT_META_8			0x8001
+#define MEDIA_BUS_FMT_META_10			0x8002
+#define MEDIA_BUS_FMT_META_12			0x8003
+#define MEDIA_BUS_FMT_META_14			0x8004
+#define MEDIA_BUS_FMT_META_16			0x8005
+#define MEDIA_BUS_FMT_META_20			0x8006
+#define MEDIA_BUS_FMT_META_24			0x8007
+
 #endif /* __LINUX_MEDIA_BUS_FORMAT_H */
-- 
cgit v1.2.3


From ac5214a47336b8a8f7e54f406ee4f7eed9223599 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 1 Aug 2023 16:15:36 +0300
Subject: media: uapi: Document which mbus format fields are valid for metadata

Now that metadata mbus formats have been added, it is necessary to define
which fields in struct v4l2_mbus_format are applicable to them (not many).

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Tomi Valkeinen <tomi.valkeinen@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 .../userspace-api/media/v4l/subdev-formats.rst         | 15 ++++++++-------
 include/uapi/linux/v4l2-mediabus.h                     | 18 ++++++++++++------
 2 files changed, 20 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/subdev-formats.rst b/Documentation/userspace-api/media/v4l/subdev-formats.rst
index c2cebf61d528..d2a6cd2e1eb2 100644
--- a/Documentation/userspace-api/media/v4l/subdev-formats.rst
+++ b/Documentation/userspace-api/media/v4l/subdev-formats.rst
@@ -33,7 +33,7 @@ Media Bus Formats
     * - __u32
       - ``field``
       - Field order, from enum :c:type:`v4l2_field`. See
-	:ref:`field-order` for details.
+	:ref:`field-order` for details. Zero for metadata mbus codes.
     * - __u32
       - ``colorspace``
       - Image colorspace, from enum :c:type:`v4l2_colorspace`.
@@ -45,7 +45,7 @@ Media Bus Formats
 	conversion is supported by setting the flag
 	V4L2_SUBDEV_MBUS_CODE_CSC_COLORSPACE in the corresponding struct
 	:c:type:`v4l2_subdev_mbus_code_enum` during enumeration.
-	See :ref:`v4l2-subdev-mbus-code-flags`.
+	See :ref:`v4l2-subdev-mbus-code-flags`. Zero for metadata mbus codes.
     * - union {
       - (anonymous)
     * - __u16
@@ -61,7 +61,7 @@ Media Bus Formats
 	that ycbcr_enc conversion is supported by setting the flag
 	V4L2_SUBDEV_MBUS_CODE_CSC_YCBCR_ENC in the corresponding struct
 	:c:type:`v4l2_subdev_mbus_code_enum` during enumeration.
-	See :ref:`v4l2-subdev-mbus-code-flags`.
+	See :ref:`v4l2-subdev-mbus-code-flags`. Zero for metadata mbus codes.
     * - __u16
       - ``hsv_enc``
       - HSV encoding, from enum :c:type:`v4l2_hsv_encoding`.
@@ -75,7 +75,7 @@ Media Bus Formats
 	that hsv_enc conversion is supported by setting the flag
 	V4L2_SUBDEV_MBUS_CODE_CSC_HSV_ENC in the corresponding struct
 	:c:type:`v4l2_subdev_mbus_code_enum` during enumeration.
-	See :ref:`v4l2-subdev-mbus-code-flags`
+	See :ref:`v4l2-subdev-mbus-code-flags`. Zero for metadata mbus codes.
     * - }
       -
     * - __u16
@@ -90,8 +90,8 @@ Media Bus Formats
 	The driver indicates that quantization conversion is supported by
 	setting the flag V4L2_SUBDEV_MBUS_CODE_CSC_QUANTIZATION in the
 	corresponding struct :c:type:`v4l2_subdev_mbus_code_enum`
-	during enumeration. See :ref:`v4l2-subdev-mbus-code-flags`.
-
+	during enumeration. See :ref:`v4l2-subdev-mbus-code-flags`. Zero for
+	metadata mbus codes.
     * - __u16
       - ``xfer_func``
       - Transfer function, from enum :c:type:`v4l2_xfer_func`.
@@ -104,7 +104,8 @@ Media Bus Formats
 	The driver indicates that the transfer function conversion is supported by
 	setting the flag V4L2_SUBDEV_MBUS_CODE_CSC_XFER_FUNC in the
 	corresponding struct :c:type:`v4l2_subdev_mbus_code_enum`
-	during enumeration. See :ref:`v4l2-subdev-mbus-code-flags`.
+	during enumeration. See :ref:`v4l2-subdev-mbus-code-flags`. Zero for
+	metadata mbus codes.
     * - __u16
       - ``flags``
       - flags See:  :ref:v4l2-mbus-framefmt-flags
diff --git a/include/uapi/linux/v4l2-mediabus.h b/include/uapi/linux/v4l2-mediabus.h
index 6b07b73473b5..946520bc49f1 100644
--- a/include/uapi/linux/v4l2-mediabus.h
+++ b/include/uapi/linux/v4l2-mediabus.h
@@ -19,12 +19,18 @@
  * @width:	image width
  * @height:	image height
  * @code:	data format code (from enum v4l2_mbus_pixelcode)
- * @field:	used interlacing type (from enum v4l2_field)
- * @colorspace:	colorspace of the data (from enum v4l2_colorspace)
- * @ycbcr_enc:	YCbCr encoding of the data (from enum v4l2_ycbcr_encoding)
- * @hsv_enc:	HSV encoding of the data (from enum v4l2_hsv_encoding)
- * @quantization: quantization of the data (from enum v4l2_quantization)
- * @xfer_func:  transfer function of the data (from enum v4l2_xfer_func)
+ * @field:	used interlacing type (from enum v4l2_field), zero for metadata
+ *		mbus codes
+ * @colorspace:	colorspace of the data (from enum v4l2_colorspace), zero on
+ *		metadata mbus codes
+ * @ycbcr_enc:	YCbCr encoding of the data (from enum v4l2_ycbcr_encoding), zero
+ *		for metadata mbus codes
+ * @hsv_enc:	HSV encoding of the data (from enum v4l2_hsv_encoding), zero for
+ *		metadata mbus codes
+ * @quantization: quantization of the data (from enum v4l2_quantization), zero
+ *		for metadata mbus codes
+ * @xfer_func:  transfer function of the data (from enum v4l2_xfer_func), zero
+ *		for metadata mbus codes
  * @flags:	flags (V4L2_MBUS_FRAMEFMT_*)
  * @reserved:  reserved bytes that can be later used
  */
-- 
cgit v1.2.3


From 1d92152339587174363c3c9f7561de95646d5625 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 26 Apr 2023 13:28:48 +0300
Subject: media: uapi: v4l: Add generic 8-bit metadata format definitions

Generic 8-bit metadata formats define the in-memory data layout but not
the format of the data itself. The reasoning for having such formats is to
allow CSI-2 receiver drivers to receive and DMA drivers to write the data
to memory without knowing a large number of device-specific formats.

These formats may be used only in conjunction with a Media Controller
pipeline where the internal pad of the source sub-device defines the
specific format of the data (using an mbus code).

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 .../userspace-api/media/v4l/dev-subdev.rst         |   2 +
 .../userspace-api/media/v4l/meta-formats.rst       |   3 +-
 .../userspace-api/media/v4l/metafmt-generic.rst    | 340 +++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c               |   7 +
 include/uapi/linux/videodev2.h                     |   8 +
 5 files changed, 359 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/userspace-api/media/v4l/metafmt-generic.rst

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/dev-subdev.rst b/Documentation/userspace-api/media/v4l/dev-subdev.rst
index 43988516acdd..f375b820ab68 100644
--- a/Documentation/userspace-api/media/v4l/dev-subdev.rst
+++ b/Documentation/userspace-api/media/v4l/dev-subdev.rst
@@ -506,6 +506,8 @@ source pads.
 
     subdev-formats
 
+.. _subdev-routing:
+
 Streams, multiplexed media pads and internal routing
 ----------------------------------------------------
 
diff --git a/Documentation/userspace-api/media/v4l/meta-formats.rst b/Documentation/userspace-api/media/v4l/meta-formats.rst
index 0bb61fc5bc00..c23aac823d2c 100644
--- a/Documentation/userspace-api/media/v4l/meta-formats.rst
+++ b/Documentation/userspace-api/media/v4l/meta-formats.rst
@@ -13,9 +13,10 @@ These formats are used for the :ref:`metadata` interface only.
     :maxdepth: 1
 
     metafmt-d4xx
+    metafmt-generic
     metafmt-intel-ipu3
     metafmt-rkisp1
     metafmt-uvc
+    metafmt-vivid
     metafmt-vsp1-hgo
     metafmt-vsp1-hgt
-    metafmt-vivid
diff --git a/Documentation/userspace-api/media/v4l/metafmt-generic.rst b/Documentation/userspace-api/media/v4l/metafmt-generic.rst
new file mode 100644
index 000000000000..78ab56b21682
--- /dev/null
+++ b/Documentation/userspace-api/media/v4l/metafmt-generic.rst
@@ -0,0 +1,340 @@
+.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.1-no-invariants-or-later
+
+********************************************************************************************************************************************************************************************************************************************************************************
+V4L2_META_FMT_GENERIC_8 ('MET8'), V4L2_META_FMT_GENERIC_CSI2_10 ('MC1A'), V4L2_META_FMT_GENERIC_CSI2_12 ('MC1C'), V4L2_META_FMT_GENERIC_CSI2_14 ('MC1E'), V4L2_META_FMT_GENERIC_CSI2_16 ('MC1G'), V4L2_META_FMT_GENERIC_CSI2_20 ('MC1K'), V4L2_META_FMT_GENERIC_CSI2_24 ('MC1O')
+********************************************************************************************************************************************************************************************************************************************************************************
+
+
+Generic line-based metadata formats
+
+
+Description
+===========
+
+These generic line-based metadata formats define the memory layout of the data
+without defining the format or meaning of the metadata itself.
+
+.. _v4l2-meta-fmt-generic-8:
+
+V4L2_META_FMT_GENERIC_8
+-----------------------
+
+The V4L2_META_FMT_GENERIC_8 format is a plain 8-bit metadata format. This format
+is used on CSI-2 for 8 bits per :term:`Data Unit`.
+
+Additionally it is used for 16 bits per Data Unit when two bytes of metadata are
+packed into one 16-bit Data Unit. Otherwise the 16 bits per pixel dataformat is
+:ref:`V4L2_META_FMT_GENERIC_CSI2_16 <v4l2-meta-fmt-generic-csi2-16>`.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_8.**
+Each cell is one byte. "M" denotes a byte of metadata.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - M\ :sub:`10`
+      - M\ :sub:`20`
+      - M\ :sub:`30`
+    * - start + 4:
+      - M\ :sub:`01`
+      - M\ :sub:`11`
+      - M\ :sub:`21`
+      - M\ :sub:`31`
+
+.. _v4l2-meta-fmt-generic-csi2-10:
+
+V4L2_META_FMT_GENERIC_CSI2_10
+-----------------------------
+
+V4L2_META_FMT_GENERIC_CSI2_10 contains 8-bit generic metadata packed in 10-bit
+Data Units, with one padding byte after every four bytes of metadata. This
+format is typically used by CSI-2 receivers with a source that transmits
+MEDIA_BUS_FMT_META_10 and the CSI-2 receiver writes the received data to memory
+as-is.
+
+The packing of the data follows the MIPI CSI-2 specification and the padding of
+the data is defined in the MIPI CCS specification.
+
+This format is also used in conjunction with 20 bits per :term:`Data Unit`
+formats that pack two bytes of metadata into one Data Unit. Otherwise the
+20 bits per pixel dataformat is :ref:`V4L2_META_FMT_GENERIC_CSI2_20
+<v4l2-meta-fmt-generic-csi2-20>`.
+
+This format is little endian.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_CSI2_10.**
+Each cell is one byte. "M" denotes a byte of metadata and "x" a byte of padding.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{.8cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - M\ :sub:`10`
+      - M\ :sub:`20`
+      - M\ :sub:`30`
+      - x
+    * - start + 5:
+      - M\ :sub:`01`
+      - M\ :sub:`11`
+      - M\ :sub:`21`
+      - M\ :sub:`31`
+      - x
+
+.. _v4l2-meta-fmt-generic-csi2-12:
+
+V4L2_META_FMT_GENERIC_CSI2_12
+-----------------------------
+
+V4L2_META_FMT_GENERIC_CSI2_12 contains 8-bit generic metadata packed in 12-bit
+Data Units, with one padding byte after every two bytes of metadata. This format
+is typically used by CSI-2 receivers with a source that transmits
+MEDIA_BUS_FMT_META_12 and the CSI-2 receiver writes the received data to memory
+as-is.
+
+The packing of the data follows the MIPI CSI-2 specification and the padding of
+the data is defined in the MIPI CCS specification.
+
+This format is also used in conjunction with 24 bits per :term:`Data Unit`
+formats that pack two bytes of metadata into one Data Unit. Otherwise the
+24 bits per pixel dataformat is :ref:`V4L2_META_FMT_GENERIC_CSI2_24
+<v4l2-meta-fmt-generic-csi2-24>`.
+
+This format is little endian.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_CSI2_12.**
+Each cell is one byte. "M" denotes a byte of metadata and "x" a byte of padding.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{.8cm}|p{.8cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - M\ :sub:`10`
+      - x
+      - M\ :sub:`20`
+      - M\ :sub:`30`
+      - x
+    * - start + 6:
+      - M\ :sub:`01`
+      - M\ :sub:`11`
+      - x
+      - M\ :sub:`21`
+      - M\ :sub:`31`
+      - x
+
+.. _v4l2-meta-fmt-generic-csi2-14:
+
+V4L2_META_FMT_GENERIC_CSI2_14
+-----------------------------
+
+V4L2_META_FMT_GENERIC_CSI2_14 contains 8-bit generic metadata packed in 14-bit
+Data Units, with three padding bytes after every four bytes of metadata. This
+format is typically used by CSI-2 receivers with a source that transmits
+MEDIA_BUS_FMT_META_14 and the CSI-2 receiver writes the received data to memory
+as-is.
+
+The packing of the data follows the MIPI CSI-2 specification and the padding of
+the data is defined in the MIPI CCS specification.
+
+This format is little endian.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_CSI2_14.**
+Each cell is one byte. "M" denotes a byte of metadata and "x" a byte of padding.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{1.2cm}|p{.8cm}|p{.8cm}|p{.8cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - M\ :sub:`10`
+      - M\ :sub:`20`
+      - M\ :sub:`30`
+      - x
+      - x
+      - x
+    * - start + 7:
+      - M\ :sub:`01`
+      - M\ :sub:`11`
+      - M\ :sub:`21`
+      - M\ :sub:`31`
+      - x
+      - x
+      - x
+
+.. _v4l2-meta-fmt-generic-csi2-16:
+
+V4L2_META_FMT_GENERIC_CSI2_16
+-----------------------------
+
+V4L2_META_FMT_GENERIC_CSI2_16 contains 8-bit generic metadata packed in 16-bit
+Data Units, with one padding byte after every byte of metadata. This format is
+typically used by CSI-2 receivers with a source that transmits
+MEDIA_BUS_FMT_META_16 and the CSI-2 receiver writes the received data to memory
+as-is.
+
+The packing of the data follows the MIPI CSI-2 specification and the padding of
+the data is defined in the MIPI CCS specification.
+
+Some devices support more efficient packing of metadata in conjunction with
+16-bit image data. In that case the dataformat is
+:ref:`V4L2_META_FMT_GENERIC_8 <v4l2-meta-fmt-generic-8>`.
+
+This format is little endian.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_CSI2_16.**
+Each cell is one byte. "M" denotes a byte of metadata and "x" a byte of padding.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{1.2cm}|p{.8cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - x
+      - M\ :sub:`10`
+      - x
+      - M\ :sub:`20`
+      - x
+      - M\ :sub:`30`
+      - x
+    * - start + 8:
+      - M\ :sub:`01`
+      - x
+      - M\ :sub:`11`
+      - x
+      - M\ :sub:`21`
+      - x
+      - M\ :sub:`31`
+      - x
+
+.. _v4l2-meta-fmt-generic-csi2-20:
+
+V4L2_META_FMT_GENERIC_CSI2_20
+-----------------------------
+
+V4L2_META_FMT_GENERIC_CSI2_20 contains 8-bit generic metadata packed in 20-bit
+Data Units, with alternating one or two padding bytes after every byte of
+metadata. This format is typically used by CSI-2 receivers with a source that
+transmits MEDIA_BUS_FMT_META_20 and the CSI-2 receiver writes the received data
+to memory as-is.
+
+The packing of the data follows the MIPI CSI-2 specification and the padding of
+the data is defined in the MIPI CCS specification.
+
+Some devices support more efficient packing of metadata in conjunction with
+16-bit image data. In that case the dataformat is
+:ref:`V4L2_META_FMT_GENERIC_CSI2_10 <v4l2-meta-fmt-generic-csi2-10>`.
+
+This format is little endian.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_CSI2_20.**
+Each cell is one byte. "M" denotes a byte of metadata and "x" a byte of padding.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{.8cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8 8 8 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - x
+      - M\ :sub:`10`
+      - x
+      - x
+      - M\ :sub:`20`
+      - x
+      - M\ :sub:`30`
+      - x
+      - x
+    * - start + 10:
+      - M\ :sub:`01`
+      - x
+      - M\ :sub:`11`
+      - x
+      - x
+      - M\ :sub:`21`
+      - x
+      - M\ :sub:`31`
+      - x
+      - x
+
+.. _v4l2-meta-fmt-generic-csi2-24:
+
+V4L2_META_FMT_GENERIC_CSI2_24
+-----------------------------
+
+V4L2_META_FMT_GENERIC_CSI2_24 contains 8-bit generic metadata packed in 24-bit
+Data Units, with two padding bytes after every byte of metadata. This format is
+typically used by CSI-2 receivers with a source that transmits
+MEDIA_BUS_FMT_META_24 and the CSI-2 receiver writes the received data to memory
+as-is.
+
+The packing of the data follows the MIPI CSI-2 specification and the padding of
+the data is defined in the MIPI CCS specification.
+
+Some devices support more efficient packing of metadata in conjunction with
+16-bit image data. In that case the dataformat is
+:ref:`V4L2_META_FMT_GENERIC_CSI2_12 <v4l2-meta-fmt-generic-csi2-12>`.
+
+This format is little endian.
+
+**Byte Order Of V4L2_META_FMT_GENERIC_CSI2_24.**
+Each cell is one byte. "M" denotes a byte of metadata and "x" a byte of padding.
+
+.. tabularcolumns:: |p{2.4cm}|p{1.2cm}|p{.8cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{.8cm}|p{1.2cm}|p{.8cm}|p{.8cm}|
+
+.. flat-table:: Sample 4x2 Metadata Frame
+    :header-rows:  0
+    :stub-columns: 0
+    :widths: 12 8 8 8 8 8 8 8 8 8 8 8 8
+
+    * - start + 0:
+      - M\ :sub:`00`
+      - x
+      - x
+      - M\ :sub:`10`
+      - x
+      - x
+      - M\ :sub:`20`
+      - x
+      - x
+      - M\ :sub:`30`
+      - x
+      - x
+    * - start + 12:
+      - M\ :sub:`01`
+      - x
+      - x
+      - M\ :sub:`11`
+      - x
+      - x
+      - M\ :sub:`21`
+      - x
+      - x
+      - M\ :sub:`31`
+      - x
+      - x
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 5aeff5519407..ae2dca7f2817 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1460,6 +1460,13 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y210:		descr = "10-bit YUYV Packed"; break;
 	case V4L2_PIX_FMT_Y212:		descr = "12-bit YUYV Packed"; break;
 	case V4L2_PIX_FMT_Y216:		descr = "16-bit YUYV Packed"; break;
+	case V4L2_META_FMT_GENERIC_8:	descr = "8-bit Generic Metadata"; break;
+	case V4L2_META_FMT_GENERIC_CSI2_10:	descr = "8-bit Generic Meta, 10b CSI-2"; break;
+	case V4L2_META_FMT_GENERIC_CSI2_12:	descr = "8-bit Generic Meta, 12b CSI-2"; break;
+	case V4L2_META_FMT_GENERIC_CSI2_14:	descr = "8-bit Generic Meta, 14b CSI-2"; break;
+	case V4L2_META_FMT_GENERIC_CSI2_16:	descr = "8-bit Generic Meta, 16b CSI-2"; break;
+	case V4L2_META_FMT_GENERIC_CSI2_20:	descr = "8-bit Generic Meta, 20b CSI-2"; break;
+	case V4L2_META_FMT_GENERIC_CSI2_24:	descr = "8-bit Generic Meta, 24b CSI-2"; break;
 
 	default:
 		/* Compressed formats */
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index bf12860d570a..0f44b8486ea8 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -839,6 +839,14 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_RK_ISP1_PARAMS	v4l2_fourcc('R', 'K', '1', 'P') /* Rockchip ISP1 3A Parameters */
 #define V4L2_META_FMT_RK_ISP1_STAT_3A	v4l2_fourcc('R', 'K', '1', 'S') /* Rockchip ISP1 3A Statistics */
 
+#define V4L2_META_FMT_GENERIC_8		v4l2_fourcc('M', 'E', 'T', '8') /* Generic 8-bit metadata */
+#define V4L2_META_FMT_GENERIC_CSI2_10	v4l2_fourcc('M', 'C', '1', 'A') /* 10-bit CSI-2 packed 8-bit metadata */
+#define V4L2_META_FMT_GENERIC_CSI2_12	v4l2_fourcc('M', 'C', '1', 'C') /* 12-bit CSI-2 packed 8-bit metadata */
+#define V4L2_META_FMT_GENERIC_CSI2_14	v4l2_fourcc('M', 'C', '1', 'E') /* 14-bit CSI-2 packed 8-bit metadata */
+#define V4L2_META_FMT_GENERIC_CSI2_16	v4l2_fourcc('M', 'C', '1', 'G') /* 16-bit CSI-2 packed 8-bit metadata */
+#define V4L2_META_FMT_GENERIC_CSI2_20	v4l2_fourcc('M', 'C', '1', 'K') /* 20-bit CSI-2 packed 8-bit metadata */
+#define V4L2_META_FMT_GENERIC_CSI2_24	v4l2_fourcc('M', 'C', '1', 'O') /* 24-bit CSI-2 packed 8-bit metadata */
+
 /* priv field value to indicates that subsequent fields are valid. */
 #define V4L2_PIX_FMT_PRIV_MAGIC		0xfeedcafe
 
-- 
cgit v1.2.3


From 89345c2a6ff9c48c5f1ea336e66e46dfc38a467a Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 21 Feb 2023 19:37:29 +0200
Subject: media: v4l: Support line-based metadata capture

Many camera sensors, among other devices, transmit embedded data and image
data for each CSI-2 frame. This embedded data typically contains register
configuration of the sensor that has been used to capture the image data
of the same frame.

The embedded data is received by the CSI-2 receiver and has the same
properties as the image data, including that it is line based: it has
width, height and bytesperline (stride).

Add these fields to struct v4l2_meta_format and document them.

Also add V4L2_FMT_FLAG_META_LINE_BASED to tell a given format is
line-based i.e. these fields of struct v4l2_meta_format are valid for it.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 Documentation/userspace-api/media/v4l/dev-meta.rst  | 21 +++++++++++++++++++++
 .../userspace-api/media/v4l/vidioc-enum-fmt.rst     |  7 +++++++
 .../userspace-api/media/videodev2.h.rst.exceptions  |  1 +
 drivers/media/v4l2-core/v4l2-ioctl.c                |  5 +++--
 include/uapi/linux/videodev2.h                      | 10 ++++++++++
 5 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/dev-meta.rst b/Documentation/userspace-api/media/v4l/dev-meta.rst
index 0e7e1ee1471a..5eee9ab60395 100644
--- a/Documentation/userspace-api/media/v4l/dev-meta.rst
+++ b/Documentation/userspace-api/media/v4l/dev-meta.rst
@@ -47,6 +47,12 @@ member of the ``fmt`` union as needed per the desired operation. Both drivers
 and applications must set the remainder of the :c:type:`v4l2_format` structure
 to 0.
 
+Devices that capture metadata by line have the struct v4l2_fmtdesc
+``V4L2_FMT_FLAG_META_LINE_BASED`` flag set for :c:func:`VIDIOC_ENUM_FMT`. Such
+devices can typically also :ref:`capture image data <capture>`. This primarily
+involves devices that receive the data from a different devices such as a camera
+sensor.
+
 .. c:type:: v4l2_meta_format
 
 .. tabularcolumns:: |p{1.4cm}|p{2.4cm}|p{13.5cm}|
@@ -65,3 +71,18 @@ to 0.
       - ``buffersize``
       - Maximum buffer size in bytes required for data. The value is set by the
         driver.
+    * - __u32
+      - ``width``
+      - Width of a line of metadata in Data Units. Valid when
+	:c:type`v4l2_fmtdesc` flag ``V4L2_FMT_FLAG_META_LINE_BASED`` is set,
+	otherwise zero. See :c:func:`VIDIOC_ENUM_FMT`.
+    * - __u32
+      - ``height``
+      - Number of rows of metadata. Valid when :c:type`v4l2_fmtdesc` flag
+	``V4L2_FMT_FLAG_META_LINE_BASED`` is set, otherwise zero. See
+	:c:func:`VIDIOC_ENUM_FMT`.
+    * - __u32
+      - ``bytesperline``
+      - Offset in bytes between the beginning of two consecutive lines. Valid
+	when :c:type`v4l2_fmtdesc` flag ``V4L2_FMT_FLAG_META_LINE_BASED`` is
+	set, otherwise zero. See :c:func:`VIDIOC_ENUM_FMT`.
diff --git a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst
index 000c154b0f98..3adb3d205531 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-enum-fmt.rst
@@ -227,6 +227,13 @@ the ``mbus_code`` field is handled differently:
 	The application can ask to configure the quantization of the capture
 	device when calling the :ref:`VIDIOC_S_FMT <VIDIOC_G_FMT>` ioctl with
 	:ref:`V4L2_PIX_FMT_FLAG_SET_CSC <v4l2-pix-fmt-flag-set-csc>` set.
+    * - ``V4L2_FMT_FLAG_META_LINE_BASED``
+      - 0x0200
+      - The metadata format is line-based. In this case the ``width``,
+	``height`` and ``bytesperline`` fields of :c:type:`v4l2_meta_format` are
+	valid. The buffer consists of ``height`` lines, each having ``width``
+	Data Units of data and the offset (in bytes) between the beginning of
+	each two consecutive lines is ``bytesperline``.
 
 Return Value
 ============
diff --git a/Documentation/userspace-api/media/videodev2.h.rst.exceptions b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
index 3e58aac4ef0b..bdc628e8c1d6 100644
--- a/Documentation/userspace-api/media/videodev2.h.rst.exceptions
+++ b/Documentation/userspace-api/media/videodev2.h.rst.exceptions
@@ -215,6 +215,7 @@ replace define V4L2_FMT_FLAG_CSC_XFER_FUNC fmtdesc-flags
 replace define V4L2_FMT_FLAG_CSC_YCBCR_ENC fmtdesc-flags
 replace define V4L2_FMT_FLAG_CSC_HSV_ENC fmtdesc-flags
 replace define V4L2_FMT_FLAG_CSC_QUANTIZATION fmtdesc-flags
+replace define V4L2_FMT_FLAG_META_LINE_BASED fmtdesc-flags
 
 # V4L2 timecode types
 replace define V4L2_TC_TYPE_24FPS timecode-type
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index ae2dca7f2817..2cfc9106857a 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -343,8 +343,9 @@ static void v4l_print_format(const void *arg, bool write_only)
 	case V4L2_BUF_TYPE_META_OUTPUT:
 		meta = &p->fmt.meta;
 		pixelformat = meta->dataformat;
-		pr_cont(", dataformat=%p4cc, buffersize=%u\n",
-			&pixelformat, meta->buffersize);
+		pr_cont(", dataformat=%p4cc, buffersize=%u, width=%u, height=%u, bytesperline=%u\n",
+			&pixelformat, meta->buffersize, meta->width,
+			meta->height, meta->bytesperline);
 		break;
 	}
 }
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 0f44b8486ea8..647ce3419bd8 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -877,6 +877,7 @@ struct v4l2_fmtdesc {
 #define V4L2_FMT_FLAG_CSC_YCBCR_ENC		0x0080
 #define V4L2_FMT_FLAG_CSC_HSV_ENC		V4L2_FMT_FLAG_CSC_YCBCR_ENC
 #define V4L2_FMT_FLAG_CSC_QUANTIZATION		0x0100
+#define V4L2_FMT_FLAG_META_LINE_BASED		0x0200
 
 	/* Frame Size and frame rate enumeration */
 /*
@@ -2424,10 +2425,19 @@ struct v4l2_sdr_format {
  * struct v4l2_meta_format - metadata format definition
  * @dataformat:		little endian four character code (fourcc)
  * @buffersize:		maximum size in bytes required for data
+ * @width:		number of data units of data per line (valid for line
+ *			based formats only, see format documentation)
+ * @height:		number of lines of data per buffer (valid for line based
+ *			formats only)
+ * @bytesperline:	offset between the beginnings of two adjacent lines in
+ *			bytes (valid for line based formats only)
  */
 struct v4l2_meta_format {
 	__u32				dataformat;
 	__u32				buffersize;
+	__u32				width;
+	__u32				height;
+	__u32				bytesperline;
 } __attribute__ ((packed));
 
 /**
-- 
cgit v1.2.3


From 21828609f0a652d825f73dca621f783a2f225762 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 23 Apr 2024 10:39:47 +0300
Subject: media: v4l: Set line based metadata flag in V4L2 core

Set (and unset) the V4L2_FMT_FLAG_META_LINE_BASED flag in struct
v4l2_fmtdesc based on the format after returning the driver callback for
enumerating formats. This way the drivers don't need to care about the
flag.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-ioctl.c | 16 ++++++++++++++++
 include/uapi/linux/videodev2.h       |  4 ++++
 2 files changed, 20 insertions(+)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 2cfc9106857a..5e928e2a2beb 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1538,6 +1538,22 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 		}
 	}
 
+	if (fmt->type == V4L2_BUF_TYPE_META_CAPTURE) {
+		switch (fmt->pixelformat) {
+		case V4L2_META_FMT_GENERIC_8:
+		case V4L2_META_FMT_GENERIC_CSI2_10:
+		case V4L2_META_FMT_GENERIC_CSI2_12:
+		case V4L2_META_FMT_GENERIC_CSI2_14:
+		case V4L2_META_FMT_GENERIC_CSI2_16:
+		case V4L2_META_FMT_GENERIC_CSI2_20:
+		case V4L2_META_FMT_GENERIC_CSI2_24:
+			fmt->flags |= V4L2_FMT_FLAG_META_LINE_BASED;
+			break;
+		default:
+			fmt->flags &= ~V4L2_FMT_FLAG_META_LINE_BASED;
+		}
+	}
+
 	if (descr)
 		WARN_ON(strscpy(fmt->description, descr, sz) < 0);
 	fmt->flags |= flags;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 647ce3419bd8..d944db7ae1ed 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -839,6 +839,10 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_RK_ISP1_PARAMS	v4l2_fourcc('R', 'K', '1', 'P') /* Rockchip ISP1 3A Parameters */
 #define V4L2_META_FMT_RK_ISP1_STAT_3A	v4l2_fourcc('R', 'K', '1', 'S') /* Rockchip ISP1 3A Statistics */
 
+/*
+ * Line-based metadata formats. Remember to update v4l_fill_fmtdesc() when
+ * adding new ones!
+ */
 #define V4L2_META_FMT_GENERIC_8		v4l2_fourcc('M', 'E', 'T', '8') /* Generic 8-bit metadata */
 #define V4L2_META_FMT_GENERIC_CSI2_10	v4l2_fourcc('M', 'C', '1', 'A') /* 10-bit CSI-2 packed 8-bit metadata */
 #define V4L2_META_FMT_GENERIC_CSI2_12	v4l2_fourcc('M', 'C', '1', 'C') /* 12-bit CSI-2 packed 8-bit metadata */
-- 
cgit v1.2.3


From 72364b91ce022ded4aa541c62f51c10a65fb3498 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Wed, 6 Sep 2023 10:56:14 +0300
Subject: media: v4l: subdev: Add a function to lock two sub-device states, use
 it

Add two new functions, v4l2_subdev_lock_states() and
v4l2_subdev_unclock_states(), to acquire and release the state of two
sub-devices. They differ from calling v4l2_subdev_{un,}lock_state() so
that if the two states share the same lock, the lock is acquired only
once.

Also use the new functions in v4l2_subdev_link_validate().

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Julien Massot <julien.massot@collabora.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 drivers/media/v4l2-core/v4l2-subdev.c | 12 ++++-------
 include/media/v4l2-subdev.h           | 40 +++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 2d67ec54569a..779583447eac 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -1437,17 +1437,13 @@ int v4l2_subdev_link_validate(struct media_link *link)
 
 	states_locked = sink_state && source_state;
 
-	if (states_locked) {
-		v4l2_subdev_lock_state(sink_state);
-		v4l2_subdev_lock_state(source_state);
-	}
+	if (states_locked)
+		v4l2_subdev_lock_states(sink_state, source_state);
 
 	ret = v4l2_subdev_link_validate_locked(link, states_locked);
 
-	if (states_locked) {
-		v4l2_subdev_unlock_state(sink_state);
-		v4l2_subdev_unlock_state(source_state);
-	}
+	if (states_locked)
+		v4l2_subdev_unlock_states(sink_state, source_state);
 
 	return ret;
 }
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index 1af16b16f0bf..e22c50ce7e05 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -1724,6 +1724,46 @@ static inline void v4l2_subdev_unlock_state(struct v4l2_subdev_state *state)
 	mutex_unlock(state->lock);
 }
 
+/**
+ * v4l2_subdev_lock_states - Lock two sub-device states
+ * @state1: One subdevice state
+ * @state2: The other subdevice state
+ *
+ * Locks the state of two sub-devices.
+ *
+ * The states must be unlocked with v4l2_subdev_unlock_states() after use.
+ *
+ * This differs from calling v4l2_subdev_lock_state() on both states so that if
+ * the states share the same lock, the lock is acquired only once (so no
+ * deadlock occurs). The caller is responsible for ensuring the locks will
+ * always be acquired in the same order.
+ */
+static inline void v4l2_subdev_lock_states(struct v4l2_subdev_state *state1,
+					   struct v4l2_subdev_state *state2)
+{
+	mutex_lock(state1->lock);
+	if (state1->lock != state2->lock)
+		mutex_lock(state2->lock);
+}
+
+/**
+ * v4l2_subdev_unlock_states() - Unlock two sub-device states
+ * @state1: One subdevice state
+ * @state2: The other subdevice state
+ *
+ * Unlocks the state of two sub-devices.
+ *
+ * This differs from calling v4l2_subdev_unlock_state() on both states so that
+ * if the states share the same lock, the lock is released only once.
+ */
+static inline void v4l2_subdev_unlock_states(struct v4l2_subdev_state *state1,
+					     struct v4l2_subdev_state *state2)
+{
+	mutex_unlock(state1->lock);
+	if (state1->lock != state2->lock)
+		mutex_unlock(state2->lock);
+}
+
 /**
  * v4l2_subdev_get_unlocked_active_state() - Checks that the active subdev state
  *					     is unlocked and returns it
-- 
cgit v1.2.3


From 83a22a07cd9d51b7ffd18a8904e1857061eda28f Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Thu, 31 Aug 2023 14:56:28 +0300
Subject: media: v4l: subdev: Add len_routes field to struct
 v4l2_subdev_routing

The len_routes field is used to tell the size of the routes array in
struct v4l2_subdev_routing. This way the number of routes returned from
S_ROUTING IOCTL may be larger than the number of routes provided, in case
there are more routes returned by the driver.

Note that this uAPI is still disabled in the code, so this change can
safely be done. Anyone who manually patched the code to enable this uAPI
must update their code.

The patch also increases the number of reserved fields in struct
v4l2_subdev_routing.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 .../media/v4l/vidioc-subdev-g-routing.rst          | 46 +++++++++++++++-------
 drivers/media/v4l2-core/v4l2-ioctl.c               |  4 +-
 drivers/media/v4l2-core/v4l2-subdev.c              | 12 +++---
 include/media/v4l2-subdev.h                        |  2 +
 include/uapi/linux/v4l2-subdev.h                   | 10 +++--
 5 files changed, 49 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/vidioc-subdev-g-routing.rst b/Documentation/userspace-api/media/v4l/vidioc-subdev-g-routing.rst
index 26b5004bfe6d..cbd9370006b6 100644
--- a/Documentation/userspace-api/media/v4l/vidioc-subdev-g-routing.rst
+++ b/Documentation/userspace-api/media/v4l/vidioc-subdev-g-routing.rst
@@ -43,23 +43,38 @@ The routing configuration determines the flows of data inside an entity.
 Drivers report their current routing tables using the
 ``VIDIOC_SUBDEV_G_ROUTING`` ioctl and application may enable or disable routes
 with the ``VIDIOC_SUBDEV_S_ROUTING`` ioctl, by adding or removing routes and
-setting or clearing flags of the  ``flags`` field of a
-struct :c:type:`v4l2_subdev_route`.
+setting or clearing flags of the ``flags`` field of a struct
+:c:type:`v4l2_subdev_route`.
 
-All stream configurations are reset when ``VIDIOC_SUBDEV_S_ROUTING`` is called. This
-means that the userspace must reconfigure all streams after calling the ioctl
-with e.g. ``VIDIOC_SUBDEV_S_FMT``.
+All stream configurations are reset when ``VIDIOC_SUBDEV_S_ROUTING`` is called.
+This means that the userspace must reconfigure all stream formats and selections
+after calling the ioctl with e.g. ``VIDIOC_SUBDEV_S_FMT``.
 
 Only subdevices which have both sink and source pads can support routing.
 
-When inspecting routes through ``VIDIOC_SUBDEV_G_ROUTING`` and the application
-provided ``num_routes`` is not big enough to contain all the available routes
-the subdevice exposes, drivers return the ENOSPC error code and adjust the
-value of the ``num_routes`` field. Application should then reserve enough memory
-for all the route entries and call ``VIDIOC_SUBDEV_G_ROUTING`` again.
-
-On a successful ``VIDIOC_SUBDEV_G_ROUTING`` call the driver updates the
-``num_routes`` field to reflect the actual number of routes returned.
+The ``len_routes`` field indicates the number of routes that can fit in the
+``routes`` array allocated by userspace. It is set by applications for both
+ioctls to indicate how many routes the kernel can return, and is never modified
+by the kernel.
+
+The ``num_routes`` field indicates the number of routes in the routing
+table. For ``VIDIOC_SUBDEV_S_ROUTING``, it is set by userspace to the number of
+routes that the application stored in the ``routes`` array. For both ioctls, it
+is returned by the kernel and indicates how many routes are stored in the
+subdevice routing table. This may be smaller or larger than the value of
+``num_routes`` set by the application for ``VIDIOC_SUBDEV_S_ROUTING``, as
+drivers may adjust the requested routing table.
+
+The kernel can return a ``num_routes`` value larger than ``len_routes`` from
+both ioctls. This indicates thare are more routes in the routing table than fits
+the ``routes`` array. In this case, the ``routes`` array is filled by the kernel
+with the first ``len_routes`` entries of the subdevice routing table. This is
+not considered to be an error, and the ioctl call succeeds. If the applications
+wants to retrieve the missing routes, it can issue a new
+``VIDIOC_SUBDEV_G_ROUTING`` call with a large enough ``routes`` array.
+
+``VIDIOC_SUBDEV_S_ROUTING`` may return more routes than the user provided in
+``num_routes`` field due to e.g. hardware properties.
 
 .. tabularcolumns:: |p{4.4cm}|p{4.4cm}|p{8.7cm}|
 
@@ -74,6 +89,9 @@ On a successful ``VIDIOC_SUBDEV_G_ROUTING`` call the driver updates the
       - ``which``
       - Routing table to be accessed, from enum
         :ref:`v4l2_subdev_format_whence <v4l2-subdev-format-whence>`.
+    * - __u32
+      - ``len_routes``
+      - The length of the array (as in memory reserved for the array)
     * - struct :c:type:`v4l2_subdev_route`
       - ``routes[]``
       - Array of struct :c:type:`v4l2_subdev_route` entries
@@ -81,7 +99,7 @@ On a successful ``VIDIOC_SUBDEV_G_ROUTING`` call the driver updates the
       - ``num_routes``
       - Number of entries of the routes array
     * - __u32
-      - ``reserved``\ [5]
+      - ``reserved``\ [11]
       - Reserved for future extensions. Applications and drivers must set
 	the array to zero.
 
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 0260acef97d2..aab671fae45b 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -3201,13 +3201,13 @@ static int check_array_args(unsigned int cmd, void *parg, size_t *array_size,
 	case VIDIOC_SUBDEV_S_ROUTING: {
 		struct v4l2_subdev_routing *routing = parg;
 
-		if (routing->num_routes > 256)
+		if (routing->len_routes > 256)
 			return -E2BIG;
 
 		*user_ptr = u64_to_user_ptr(routing->routes);
 		*kernel_ptr = (void **)&routing->routes;
 		*array_size = sizeof(struct v4l2_subdev_route)
-			    * routing->num_routes;
+			    * routing->len_routes;
 		ret = 1;
 		break;
 	}
diff --git a/drivers/media/v4l2-core/v4l2-subdev.c b/drivers/media/v4l2-core/v4l2-subdev.c
index 779583447eac..b565f202df67 100644
--- a/drivers/media/v4l2-core/v4l2-subdev.c
+++ b/drivers/media/v4l2-core/v4l2-subdev.c
@@ -960,14 +960,10 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 
 		krouting = &state->routing;
 
-		if (routing->num_routes < krouting->num_routes) {
-			routing->num_routes = krouting->num_routes;
-			return -ENOSPC;
-		}
-
 		memcpy((struct v4l2_subdev_route *)(uintptr_t)routing->routes,
 		       krouting->routes,
-		       krouting->num_routes * sizeof(*krouting->routes));
+		       min(krouting->num_routes, routing->len_routes) *
+		       sizeof(*krouting->routes));
 		routing->num_routes = krouting->num_routes;
 
 		return 0;
@@ -989,6 +985,9 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		if (routing->which != V4L2_SUBDEV_FORMAT_TRY && ro_subdev)
 			return -EPERM;
 
+		if (routing->num_routes > routing->len_routes)
+			return -EINVAL;
+
 		memset(routing->reserved, 0, sizeof(routing->reserved));
 
 		for (i = 0; i < routing->num_routes; ++i) {
@@ -1015,6 +1014,7 @@ static long subdev_do_ioctl(struct file *file, unsigned int cmd, void *arg,
 		}
 
 		krouting.num_routes = routing->num_routes;
+		krouting.len_routes = routing->len_routes;
 		krouting.routes = routes;
 
 		return v4l2_subdev_call(sd, pad, set_routing, state,
diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h
index e22c50ce7e05..e30c463d90e5 100644
--- a/include/media/v4l2-subdev.h
+++ b/include/media/v4l2-subdev.h
@@ -714,12 +714,14 @@ struct v4l2_subdev_stream_configs {
 /**
  * struct v4l2_subdev_krouting - subdev routing table
  *
+ * @len_routes: length of routes array, in routes
  * @num_routes: number of routes
  * @routes: &struct v4l2_subdev_route
  *
  * This structure contains the routing table for a subdev.
  */
 struct v4l2_subdev_krouting {
+	unsigned int len_routes;
 	unsigned int num_routes;
 	struct v4l2_subdev_route *routes;
 };
diff --git a/include/uapi/linux/v4l2-subdev.h b/include/uapi/linux/v4l2-subdev.h
index 81a24bd38003..2347e266cf75 100644
--- a/include/uapi/linux/v4l2-subdev.h
+++ b/include/uapi/linux/v4l2-subdev.h
@@ -228,15 +228,19 @@ struct v4l2_subdev_route {
  * struct v4l2_subdev_routing - Subdev routing information
  *
  * @which: configuration type (from enum v4l2_subdev_format_whence)
- * @num_routes: the total number of routes in the routes array
+ * @len_routes: the length of the routes array, in routes; set by the user, not
+ *		modified by the kernel
  * @routes: pointer to the routes array
+ * @num_routes: the total number of routes, possibly more than fits in the
+ *		routes array
  * @reserved: drivers and applications must zero this array
  */
 struct v4l2_subdev_routing {
 	__u32 which;
-	__u32 num_routes;
+	__u32 len_routes;
 	__u64 routes;
-	__u32 reserved[6];
+	__u32 num_routes;
+	__u32 reserved[11];
 };
 
 /*
-- 
cgit v1.2.3


From 1810477630404ed798f97e4babee4d6dd07a6095 Mon Sep 17 00:00:00 2001
From: Jean-Michel Hautbois <jeanmichel.hautbois@ideasonboard.com>
Date: Wed, 24 Apr 2024 18:35:37 +0300
Subject: media: v4l: Add V4L2-PIX-FMT-Y12P format

This is a packed grey-scale image format with a depth of 12 bits per
pixel. Two consecutive pixels are packed into 3 bytes. The first 2 bytes
contain the 8 high order bits of the pixels, and the 3rd byte contains
the 4 least significants bits of each pixel, in the same order.

Add the entry in userspace API, and document it.

Signed-off-by: Jean-Michel Hautbois <jeanmichel.hautbois@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst | 11 +++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c                      |  1 +
 include/uapi/linux/videodev2.h                            |  1 +
 3 files changed, 13 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
index cf8e4dfbfbd4..a650ceda7cdb 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
@@ -114,6 +114,17 @@ are often referred to as greyscale formats.
       - ...
       - ...
 
+    * .. _V4L2-PIX-FMT-Y12P:
+
+      - ``V4L2_PIX_FMT_Y12P``
+      - 'Y12P'
+
+      - Y'\ :sub:`0`\ [11:4]
+      - Y'\ :sub:`1`\ [11:4]
+      - Y'\ :sub:`1`\ [3:0] Y'\ :sub:`0`\ [3:0]
+      - ...
+      - ...
+
     * .. _V4L2-PIX-FMT-Y14:
 
       - ``V4L2_PIX_FMT_Y14``
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index aab671fae45b..94fe2a1d13cd 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1321,6 +1321,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y10BPACK:	descr = "10-bit Greyscale (Packed)"; break;
 	case V4L2_PIX_FMT_Y10P:		descr = "10-bit Greyscale (MIPI Packed)"; break;
 	case V4L2_PIX_FMT_IPU3_Y10:	descr = "10-bit greyscale (IPU3 Packed)"; break;
+	case V4L2_PIX_FMT_Y12P:		descr = "12-bit Greyscale (MIPI Packed)"; break;
 	case V4L2_PIX_FMT_Y8I:		descr = "Interleaved 8-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y12I:		descr = "Interleaved 12-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Z16:		descr = "16-bit Depth"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index d944db7ae1ed..760fb8ed7f69 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -599,6 +599,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_Y10BPACK    v4l2_fourcc('Y', '1', '0', 'B') /* 10  Greyscale bit-packed */
 #define V4L2_PIX_FMT_Y10P    v4l2_fourcc('Y', '1', '0', 'P') /* 10  Greyscale, MIPI RAW10 packed */
 #define V4L2_PIX_FMT_IPU3_Y10		v4l2_fourcc('i', 'p', '3', 'y') /* IPU3 packed 10-bit greyscale */
+#define V4L2_PIX_FMT_Y12P    v4l2_fourcc('Y', '1', '2', 'P') /* 12  Greyscale, MIPI RAW12 packed */
 
 /* Palette formats */
 #define V4L2_PIX_FMT_PAL8    v4l2_fourcc('P', 'A', 'L', '8') /*  8  8-bit palette */
-- 
cgit v1.2.3


From adb1d4655e53829658f42f13bd81657528a7a60d Mon Sep 17 00:00:00 2001
From: Jean-Michel Hautbois <jeanmichel.hautbois@ideasonboard.com>
Date: Wed, 24 Apr 2024 18:35:38 +0300
Subject: media: v4l: Add V4L2-PIX-FMT-Y14P format

This is a packed grey-scale image format with a depth of 14 bits per
pixel. Every four consecutive samples are packed into seven bytes. Each
of the first four bytes contain the eight high order bits of the pixels,
and the three following bytes contains the six least significants bits
of each pixel, in the same order.

As the other formats only needed 5 bytes before, append two bytes in the
documentation array.

Signed-off-by: Jean-Michel Hautbois <jeanmichel.hautbois@ideasonboard.com>
Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 .../userspace-api/media/v4l/pixfmt-yuv-luma.rst    | 37 ++++++++++++++++++++++
 drivers/media/v4l2-core/v4l2-ioctl.c               |  1 +
 include/uapi/linux/videodev2.h                     |  1 +
 3 files changed, 39 insertions(+)

(limited to 'include')

diff --git a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
index a650ceda7cdb..b3c5779521d8 100644
--- a/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
+++ b/Documentation/userspace-api/media/v4l/pixfmt-yuv-luma.rst
@@ -36,6 +36,8 @@ are often referred to as greyscale formats.
       - Byte 2
       - Byte 3
       - Byte 4
+      - Byte 5
+      - Byte 6
 
     * .. _V4L2-PIX-FMT-GREY:
 
@@ -47,6 +49,8 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-IPU3-Y10:
 
@@ -58,6 +62,8 @@ are often referred to as greyscale formats.
       - Y'\ :sub:`2`\ [3:0] Y'\ :sub:`1`\ [9:6]
       - Y'\ :sub:`3`\ [1:0] Y'\ :sub:`2`\ [9:4]
       - Y'\ :sub:`3`\ [9:2]
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y10:
 
@@ -69,6 +75,8 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y10BPACK:
 
@@ -80,6 +88,8 @@ are often referred to as greyscale formats.
       - Y'\ :sub:`1`\ [3:0] Y'\ :sub:`2`\ [9:6]
       - Y'\ :sub:`2`\ [5:0] Y'\ :sub:`3`\ [9:8]
       - Y'\ :sub:`3`\ [7:0]
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y10P:
 
@@ -91,6 +101,8 @@ are often referred to as greyscale formats.
       - Y'\ :sub:`2`\ [9:2]
       - Y'\ :sub:`3`\ [9:2]
       - Y'\ :sub:`3`\ [1:0] Y'\ :sub:`2`\ [1:0] Y'\ :sub:`1`\ [1:0] Y'\ :sub:`0`\ [1:0]
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y12:
 
@@ -102,6 +114,8 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y012:
 
@@ -113,6 +127,8 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y12P:
 
@@ -124,6 +140,8 @@ are often referred to as greyscale formats.
       - Y'\ :sub:`1`\ [3:0] Y'\ :sub:`0`\ [3:0]
       - ...
       - ...
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y14:
 
@@ -135,6 +153,21 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
+
+    * .. _V4L2-PIX-FMT-Y14P:
+
+      - ``V4L2_PIX_FMT_Y14P``
+      - 'Y14P'
+
+      - Y'\ :sub:`0`\ [13:6]
+      - Y'\ :sub:`1`\ [13:6]
+      - Y'\ :sub:`2`\ [13:6]
+      - Y'\ :sub:`3`\ [13:6]
+      - Y'\ :sub:`1`\ [1:0] Y'\ :sub:`0`\ [5:0]
+      - Y'\ :sub:`2`\ [3:0] Y'\ :sub:`1`\ [5:2]
+      - Y'\ :sub:`3`\ [5:0] Y'\ :sub:`2`\ [5:4]
 
     * .. _V4L2-PIX-FMT-Y16:
 
@@ -146,6 +179,8 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
 
     * .. _V4L2-PIX-FMT-Y16-BE:
 
@@ -157,6 +192,8 @@ are often referred to as greyscale formats.
       - ...
       - ...
       - ...
+      - ...
+      - ...
 
 .. raw:: latex
 
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 94fe2a1d13cd..4c76d17b4629 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1322,6 +1322,7 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_Y10P:		descr = "10-bit Greyscale (MIPI Packed)"; break;
 	case V4L2_PIX_FMT_IPU3_Y10:	descr = "10-bit greyscale (IPU3 Packed)"; break;
 	case V4L2_PIX_FMT_Y12P:		descr = "12-bit Greyscale (MIPI Packed)"; break;
+	case V4L2_PIX_FMT_Y14P:		descr = "14-bit Greyscale (MIPI Packed)"; break;
 	case V4L2_PIX_FMT_Y8I:		descr = "Interleaved 8-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Y12I:		descr = "Interleaved 12-bit Greyscale"; break;
 	case V4L2_PIX_FMT_Z16:		descr = "16-bit Depth"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 760fb8ed7f69..2f9762842313 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -600,6 +600,7 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_Y10P    v4l2_fourcc('Y', '1', '0', 'P') /* 10  Greyscale, MIPI RAW10 packed */
 #define V4L2_PIX_FMT_IPU3_Y10		v4l2_fourcc('i', 'p', '3', 'y') /* IPU3 packed 10-bit greyscale */
 #define V4L2_PIX_FMT_Y12P    v4l2_fourcc('Y', '1', '2', 'P') /* 12  Greyscale, MIPI RAW12 packed */
+#define V4L2_PIX_FMT_Y14P    v4l2_fourcc('Y', '1', '4', 'P') /* 14  Greyscale, MIPI RAW14 packed */
 
 /* Palette formats */
 #define V4L2_PIX_FMT_PAL8    v4l2_fourcc('P', 'A', 'L', '8') /*  8  8-bit palette */
-- 
cgit v1.2.3


From e42ae51b7628f589477588ae5837e4e5875a2d92 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Tue, 13 Feb 2024 11:00:28 +0200
Subject: media: ipu6: Add PCI device table header

Add a header for the PCI device table of all IPU6 devices. This will be
used by the IPU bridge as well so place the table in a header.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/media/ipu6-pci-table.h | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 include/media/ipu6-pci-table.h

(limited to 'include')

diff --git a/include/media/ipu6-pci-table.h b/include/media/ipu6-pci-table.h
new file mode 100644
index 000000000000..0899d9d2f978
--- /dev/null
+++ b/include/media/ipu6-pci-table.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2024 Intel Corporation
+ */
+
+#ifndef __IPU6_PCI_TBL_H__
+#define __IPU6_PCI_TBL_H__
+
+#include <linux/pci.h>
+
+#define PCI_DEVICE_ID_INTEL_IPU6		0x9a19
+#define PCI_DEVICE_ID_INTEL_IPU6SE		0x4e19
+#define PCI_DEVICE_ID_INTEL_IPU6EP_ADLP		0x465d
+#define PCI_DEVICE_ID_INTEL_IPU6EP_ADLN		0x462e
+#define PCI_DEVICE_ID_INTEL_IPU6EP_RPLP		0xa75d
+#define PCI_DEVICE_ID_INTEL_IPU6EP_MTL		0x7d19
+
+static const struct pci_device_id ipu6_pci_tbl[] = {
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IPU6) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IPU6SE) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IPU6EP_ADLP) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IPU6EP_ADLN) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IPU6EP_RPLP) },
+	{ PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IPU6EP_MTL) },
+	{ }
+};
+
+#endif /* __IPU6_PCI_TBL_H__ */
-- 
cgit v1.2.3


From d69c8429ea80af02e89e5b3eecb78e417ad049c8 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Date: Fri, 26 Apr 2024 18:33:19 +0300
Subject: media: uapi: v4l: Don't expose generic metadata formats to userspace

The generic metadata pixel formats (V4L2_META_FMT_GENERIC_*) are meant
to be used in conjunction with device-specific media bus codes. Those
codes are work in progress and not available in the upstream kernel yet.
To make sure the generic metadata pixel formats won't be used by
userspace until we have the full infrastructure in place, keep their
definition private to the kernel for now.

Signed-off-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/uapi/linux/videodev2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 2f9762842313..fe6b67e83751 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -841,6 +841,7 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_RK_ISP1_PARAMS	v4l2_fourcc('R', 'K', '1', 'P') /* Rockchip ISP1 3A Parameters */
 #define V4L2_META_FMT_RK_ISP1_STAT_3A	v4l2_fourcc('R', 'K', '1', 'S') /* Rockchip ISP1 3A Statistics */
 
+#ifdef __KERNEL__
 /*
  * Line-based metadata formats. Remember to update v4l_fill_fmtdesc() when
  * adding new ones!
@@ -852,6 +853,7 @@ struct v4l2_pix_format {
 #define V4L2_META_FMT_GENERIC_CSI2_16	v4l2_fourcc('M', 'C', '1', 'G') /* 16-bit CSI-2 packed 8-bit metadata */
 #define V4L2_META_FMT_GENERIC_CSI2_20	v4l2_fourcc('M', 'C', '1', 'K') /* 20-bit CSI-2 packed 8-bit metadata */
 #define V4L2_META_FMT_GENERIC_CSI2_24	v4l2_fourcc('M', 'C', '1', 'O') /* 24-bit CSI-2 packed 8-bit metadata */
+#endif
 
 /* priv field value to indicates that subsequent fields are valid. */
 #define V4L2_PIX_FMT_PRIV_MAGIC		0xfeedcafe
-- 
cgit v1.2.3


From 2ff1e97587f4d398686f52c07afde3faf3da4e5c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 19 Mar 2024 10:00:09 +0000
Subject: netfs: Replace PG_fscache by setting folio->private and marking dirty

When dirty data is being written to the cache, setting/waiting on/clearing
the fscache flag is always done in tandem with setting/waiting on/clearing
the writeback flag.  The netfslib buffered write routines wait on and set
both flags and the write request cleanup clears both flags, so the fscache
flag is almost superfluous.

The reason it isn't superfluous is because the fscache flag is also used to
indicate that data just read from the server is being written to the cache.
The flag is used to prevent a race involving overlapping direct-I/O writes
to the cache.

Change this to indicate that a page is in need of being copied to the cache
by placing a magic value in folio->private and marking the folios dirty.
Then when the writeback code sees a folio marked in this way, it only
writes it to the cache and not to the server.

If a folio that has this magic value set is modified, the value is just
replaced and the folio will then be uplodaded too.

With this, PG_fscache is no longer required by the netfslib core, 9p and
afs.

Ceph and nfs, however, still need to use the old PG_fscache-based tracking.
To deal with this, a flag, NETFS_ICTX_USE_PGPRIV2, now has to be set on the
flags in the netfs_inode struct for those filesystems.  This reenables the
use of PG_fscache in that inode.  9p and afs use the netfslib write helpers
so get switched over; cifs, for the moment, does page-by-page manual access
to the cache, so doesn't use PG_fscache and is unaffected.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Matthew Wilcox (Oracle) <willy@infradead.org>
cc: Eric Van Hensbergen <ericvh@kernel.org>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Paulo Alcantara <pc@manguebit.com>
cc: Ronnie Sahlberg <ronniesahlberg@gmail.com>
cc: Shyam Prasad N <sprasad@microsoft.com>
cc: Tom Talpey <tom@talpey.com>
cc: Bharath SM <bharathsm@microsoft.com>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna@kernel.org>
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/ceph/addr.c               |  2 +-
 fs/ceph/inode.c              |  2 +
 fs/netfs/buffered_read.c     | 36 ++++++++++++-----
 fs/netfs/buffered_write.c    | 93 +++++++++++++++++++++-----------------------
 fs/netfs/fscache_io.c        | 12 ++++--
 fs/netfs/internal.h          | 10 +++--
 fs/netfs/io.c                | 18 +++++----
 fs/netfs/main.c              |  1 +
 fs/netfs/misc.c              | 10 +----
 fs/netfs/objects.c           |  6 ++-
 fs/nfs/fscache.h             |  2 +
 include/linux/fscache.h      | 22 +++++++----
 include/linux/netfs.h        | 20 ++++++++--
 include/trace/events/netfs.h |  6 ++-
 14 files changed, 143 insertions(+), 97 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index ee9caf7916fb..28ae4976a4f9 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -517,7 +517,7 @@ static void ceph_fscache_write_to_cache(struct inode *inode, u64 off, u64 len, b
 	struct fscache_cookie *cookie = ceph_fscache_cookie(ci);
 
 	fscache_write_to_cache(cookie, inode->i_mapping, off, len, i_size_read(inode),
-			       ceph_fscache_write_terminated, inode, caching);
+			       ceph_fscache_write_terminated, inode, true, caching);
 }
 #else
 static inline void ceph_set_page_fscache(struct page *page)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7b2e77517f23..99561fddcb38 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,6 +577,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 
 	/* Set parameters for the netfs library */
 	netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
+	/* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+	__set_bit(NETFS_ICTX_USE_PGPRIV2, &ci->netfs.flags);
 
 	spin_lock_init(&ci->i_ceph_lock);
 
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 3298c29b5548..6d49319c82c6 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -10,8 +10,11 @@
 #include "internal.h"
 
 /*
- * Unlock the folios in a read operation.  We need to set PG_fscache on any
+ * Unlock the folios in a read operation.  We need to set PG_writeback on any
  * folios we're going to write back before we unlock them.
+ *
+ * Note that if the deprecated NETFS_RREQ_USE_PGPRIV2 is set then we use
+ * PG_private_2 and do a direct write to the cache from here instead.
  */
 void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 {
@@ -48,14 +51,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 	xas_for_each(&xas, folio, last_page) {
 		loff_t pg_end;
 		bool pg_failed = false;
-		bool folio_started;
+		bool wback_to_cache = false;
+		bool folio_started = false;
 
 		if (xas_retry(&xas, folio))
 			continue;
 
 		pg_end = folio_pos(folio) + folio_size(folio) - 1;
 
-		folio_started = false;
 		for (;;) {
 			loff_t sreq_end;
 
@@ -63,10 +66,16 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 				pg_failed = true;
 				break;
 			}
-			if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
-				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
-				folio_start_fscache(folio);
-				folio_started = true;
+			if (test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags)) {
+				if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
+							       &subreq->flags)) {
+					trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+					folio_start_fscache(folio);
+					folio_started = true;
+				}
+			} else {
+				wback_to_cache |=
+					test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
 			}
 			pg_failed |= subreq_failed;
 			sreq_end = subreq->start + subreq->len - 1;
@@ -98,6 +107,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 				kfree(finfo);
 			}
 			folio_mark_uptodate(folio);
+			if (wback_to_cache && !WARN_ON_ONCE(folio_get_private(folio) != NULL)) {
+				trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
+				folio_attach_private(folio, NETFS_FOLIO_COPY_TO_CACHE);
+				filemap_dirty_folio(folio->mapping, folio);
+			}
 		}
 
 		if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
@@ -491,9 +505,11 @@ retry:
 	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
 
 have_folio:
-	ret = folio_wait_fscache_killable(folio);
-	if (ret < 0)
-		goto error;
+	if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) {
+		ret = folio_wait_fscache_killable(folio);
+		if (ret < 0)
+			goto error;
+	}
 have_folio_no_wait:
 	*_folio = folio;
 	_leave(" = 0");
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index f7455a579f21..57c6eab01261 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -30,21 +30,13 @@ static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
 
 static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
 {
-	if (netfs_group && !folio_get_private(folio))
-		folio_attach_private(folio, netfs_get_group(netfs_group));
-}
+	void *priv = folio_get_private(folio);
 
-#if IS_ENABLED(CONFIG_FSCACHE)
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
-	if (caching)
-		folio_start_fscache(folio);
-}
-#else
-static void netfs_folio_start_fscache(bool caching, struct folio *folio)
-{
+	if (netfs_group && (!priv || priv == NETFS_FOLIO_COPY_TO_CACHE))
+		folio_attach_private(folio, netfs_get_group(netfs_group));
+	else if (!netfs_group && priv == NETFS_FOLIO_COPY_TO_CACHE)
+		folio_detach_private(folio);
 }
-#endif
 
 /*
  * Decide how we should modify a folio.  We might be attempting to do
@@ -63,11 +55,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
 						    bool maybe_trouble)
 {
 	struct netfs_folio *finfo = netfs_folio_info(folio);
+	struct netfs_group *group = netfs_folio_group(folio);
 	loff_t pos = folio_file_pos(folio);
 
 	_enter("");
 
-	if (netfs_folio_group(folio) != netfs_group)
+	if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE)
 		return NETFS_FLUSH_CONTENT;
 
 	if (folio_test_uptodate(folio))
@@ -396,9 +389,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 				folio_clear_dirty_for_io(folio);
 			/* We make multiple writes to the folio... */
 			if (!folio_test_writeback(folio)) {
-				folio_wait_fscache(folio);
 				folio_start_writeback(folio);
-				folio_start_fscache(folio);
 				if (wreq->iter.count == 0)
 					trace_netfs_folio(folio, netfs_folio_trace_wthru);
 				else
@@ -528,6 +519,7 @@ EXPORT_SYMBOL(netfs_file_write_iter);
  */
 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
 {
+	struct netfs_group *group;
 	struct folio *folio = page_folio(vmf->page);
 	struct file *file = vmf->vma->vm_file;
 	struct inode *inode = file_inode(file);
@@ -550,7 +542,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
 		goto out;
 	}
 
-	if (netfs_folio_group(folio) != netfs_group) {
+	group = netfs_folio_group(folio);
+	if (group != netfs_group && group != NETFS_FOLIO_COPY_TO_CACHE) {
 		folio_unlock(folio);
 		err = filemap_fdatawait_range(inode->i_mapping,
 					      folio_pos(folio),
@@ -606,8 +599,6 @@ static void netfs_kill_pages(struct address_space *mapping,
 
 		trace_netfs_folio(folio, netfs_folio_trace_kill);
 		folio_clear_uptodate(folio);
-		if (folio_test_fscache(folio))
-			folio_end_fscache(folio);
 		folio_end_writeback(folio);
 		folio_lock(folio);
 		generic_error_remove_folio(mapping, folio);
@@ -643,8 +634,6 @@ static void netfs_redirty_pages(struct address_space *mapping,
 		next = folio_next_index(folio);
 		trace_netfs_folio(folio, netfs_folio_trace_redirty);
 		filemap_dirty_folio(mapping, folio);
-		if (folio_test_fscache(folio))
-			folio_end_fscache(folio);
 		folio_end_writeback(folio);
 		folio_put(folio);
 	} while (index = next, index <= last);
@@ -700,7 +689,11 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq)
 				if (!folio_test_dirty(folio)) {
 					folio_detach_private(folio);
 					gcount++;
-					trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+					if (group == NETFS_FOLIO_COPY_TO_CACHE)
+						trace_netfs_folio(folio,
+								  netfs_folio_trace_end_copy);
+					else
+						trace_netfs_folio(folio, netfs_folio_trace_clear_g);
 				} else {
 					trace_netfs_folio(folio, netfs_folio_trace_redirtied);
 				}
@@ -724,8 +717,6 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq)
 			trace_netfs_folio(folio, netfs_folio_trace_clear);
 		}
 	end_wb:
-		if (folio_test_fscache(folio))
-			folio_end_fscache(folio);
 		xas_advance(&xas, folio_next_index(folio) - 1);
 		folio_end_writeback(folio);
 	}
@@ -795,7 +786,6 @@ static void netfs_extend_writeback(struct address_space *mapping,
 				   long *_count,
 				   loff_t start,
 				   loff_t max_len,
-				   bool caching,
 				   size_t *_len,
 				   size_t *_top)
 {
@@ -846,8 +836,7 @@ static void netfs_extend_writeback(struct address_space *mapping,
 				break;
 			}
 			if (!folio_test_dirty(folio) ||
-			    folio_test_writeback(folio) ||
-			    folio_test_fscache(folio)) {
+			    folio_test_writeback(folio)) {
 				folio_unlock(folio);
 				folio_put(folio);
 				xas_reset(xas);
@@ -860,7 +849,8 @@ static void netfs_extend_writeback(struct address_space *mapping,
 			if ((const struct netfs_group *)priv != group) {
 				stop = true;
 				finfo = netfs_folio_info(folio);
-				if (finfo->netfs_group != group ||
+				if (!finfo ||
+				    finfo->netfs_group != group ||
 				    finfo->dirty_offset > 0) {
 					folio_unlock(folio);
 					folio_put(folio);
@@ -894,12 +884,14 @@ static void netfs_extend_writeback(struct address_space *mapping,
 
 		for (i = 0; i < folio_batch_count(&fbatch); i++) {
 			folio = fbatch.folios[i];
-			trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+			if (group == NETFS_FOLIO_COPY_TO_CACHE)
+				trace_netfs_folio(folio, netfs_folio_trace_copy_plus);
+			else
+				trace_netfs_folio(folio, netfs_folio_trace_store_plus);
 
 			if (!folio_clear_dirty_for_io(folio))
 				BUG();
 			folio_start_writeback(folio);
-			netfs_folio_start_fscache(caching, folio);
 			folio_unlock(folio);
 		}
 
@@ -925,14 +917,14 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
 	struct netfs_inode *ctx = netfs_inode(mapping->host);
 	unsigned long long i_size = i_size_read(&ctx->inode);
 	size_t len, max_len;
-	bool caching = netfs_is_cache_enabled(ctx);
 	long count = wbc->nr_to_write;
 	int ret;
 
-	_enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
+	_enter(",%lx,%llx-%llx", folio->index, start, end);
 
 	wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
-				   NETFS_WRITEBACK);
+				   group == NETFS_FOLIO_COPY_TO_CACHE ?
+				   NETFS_COPY_TO_CACHE : NETFS_WRITEBACK);
 	if (IS_ERR(wreq)) {
 		folio_unlock(folio);
 		return PTR_ERR(wreq);
@@ -941,7 +933,6 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
 	if (!folio_clear_dirty_for_io(folio))
 		BUG();
 	folio_start_writeback(folio);
-	netfs_folio_start_fscache(caching, folio);
 
 	count -= folio_nr_pages(folio);
 
@@ -950,7 +941,10 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
 	 * immediately lockable, is not dirty or is missing, or we reach the
 	 * end of the range.
 	 */
-	trace_netfs_folio(folio, netfs_folio_trace_store);
+	if (group == NETFS_FOLIO_COPY_TO_CACHE)
+		trace_netfs_folio(folio, netfs_folio_trace_copy);
+	else
+		trace_netfs_folio(folio, netfs_folio_trace_store);
 
 	len = wreq->len;
 	finfo = netfs_folio_info(folio);
@@ -973,7 +967,7 @@ static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
 
 		if (len < max_len)
 			netfs_extend_writeback(mapping, group, xas, &count, start,
-					       max_len, caching, &len, &wreq->upper_len);
+					       max_len, &len, &wreq->upper_len);
 	}
 
 cant_expand:
@@ -997,15 +991,18 @@ cant_expand:
 
 		iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
 				wreq->upper_len);
-		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
-		ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+		if (group != NETFS_FOLIO_COPY_TO_CACHE) {
+			__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+			ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+		} else {
+			ret = netfs_begin_write(wreq, true, netfs_write_trace_copy_to_cache);
+		}
 		if (ret == 0 || ret == -EIOCBQUEUED)
 			wbc->nr_to_write -= len / PAGE_SIZE;
 	} else {
 		_debug("write discard %zx @%llx [%llx]", len, start, i_size);
 
 		/* The dirty region was entirely beyond the EOF. */
-		fscache_clear_page_bits(mapping, start, len, caching);
 		netfs_pages_written_back(wreq);
 		ret = 0;
 	}
@@ -1058,9 +1055,11 @@ search_again:
 
 		/* Skip any dirty folio that's not in the group of interest. */
 		priv = folio_get_private(folio);
-		if ((const struct netfs_group *)priv != group) {
-			finfo = netfs_folio_info(folio);
-			if (finfo->netfs_group != group) {
+		if ((const struct netfs_group *)priv == NETFS_FOLIO_COPY_TO_CACHE) {
+			group = NETFS_FOLIO_COPY_TO_CACHE;
+		} else if ((const struct netfs_group *)priv != group) {
+			finfo = __netfs_folio_info(priv);
+			if (!finfo || finfo->netfs_group != group) {
 				folio_put(folio);
 				continue;
 			}
@@ -1099,14 +1098,10 @@ lock_again:
 		goto search_again;
 	}
 
-	if (folio_test_writeback(folio) ||
-	    folio_test_fscache(folio)) {
+	if (folio_test_writeback(folio)) {
 		folio_unlock(folio);
 		if (wbc->sync_mode != WB_SYNC_NONE) {
 			folio_wait_writeback(folio);
-#ifdef CONFIG_FSCACHE
-			folio_wait_fscache(folio);
-#endif
 			goto lock_again;
 		}
 
@@ -1265,7 +1260,8 @@ int netfs_launder_folio(struct folio *folio)
 
 	bvec_set_folio(&bvec, folio, len, offset);
 	iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
-	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+	if (group != NETFS_FOLIO_COPY_TO_CACHE)
+		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
 	ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
 
 out_put:
@@ -1274,7 +1270,6 @@ out_put:
 	kfree(finfo);
 	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
 out:
-	folio_wait_fscache(folio);
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 43a651ed8264..5028f2ae30da 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -166,6 +166,7 @@ struct fscache_write_request {
 	loff_t			start;
 	size_t			len;
 	bool			set_bits;
+	bool			using_pgpriv2;
 	netfs_io_terminated_t	term_func;
 	void			*term_func_priv;
 };
@@ -197,8 +198,9 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
 {
 	struct fscache_write_request *wreq = priv;
 
-	fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
-				wreq->set_bits);
+	if (wreq->using_pgpriv2)
+		fscache_clear_page_bits(wreq->mapping, wreq->start, wreq->len,
+					wreq->set_bits);
 
 	if (wreq->term_func)
 		wreq->term_func(wreq->term_func_priv, transferred_or_error,
@@ -212,7 +214,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
 			      loff_t start, size_t len, loff_t i_size,
 			      netfs_io_terminated_t term_func,
 			      void *term_func_priv,
-			      bool cond)
+			      bool using_pgpriv2, bool cond)
 {
 	struct fscache_write_request *wreq;
 	struct netfs_cache_resources *cres;
@@ -230,6 +232,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
 	wreq->mapping		= mapping;
 	wreq->start		= start;
 	wreq->len		= len;
+	wreq->using_pgpriv2	= using_pgpriv2;
 	wreq->set_bits		= cond;
 	wreq->term_func		= term_func;
 	wreq->term_func_priv	= term_func_priv;
@@ -257,7 +260,8 @@ abandon_end:
 abandon_free:
 	kfree(wreq);
 abandon:
-	fscache_clear_page_bits(mapping, start, len, cond);
+	if (using_pgpriv2)
+		fscache_clear_page_bits(mapping, start, len, cond);
 	if (term_func)
 		term_func(term_func_priv, ret, false);
 }
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ec7045d24400..156ab138e224 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -168,7 +168,7 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
  */
 static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
 {
-	if (netfs_group)
+	if (netfs_group && netfs_group != NETFS_FOLIO_COPY_TO_CACHE)
 		refcount_inc(&netfs_group->ref);
 	return netfs_group;
 }
@@ -178,7 +178,9 @@ static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_grou
  */
 static inline void netfs_put_group(struct netfs_group *netfs_group)
 {
-	if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+	if (netfs_group &&
+	    netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+	    refcount_dec_and_test(&netfs_group->ref))
 		netfs_group->free(netfs_group);
 }
 
@@ -187,7 +189,9 @@ static inline void netfs_put_group(struct netfs_group *netfs_group)
  */
 static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
 {
-	if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+	if (netfs_group &&
+	    netfs_group != NETFS_FOLIO_COPY_TO_CACHE &&
+	    refcount_sub_and_test(nr, &netfs_group->ref))
 		netfs_group->free(netfs_group);
 }
 
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 4261ad6c55b6..b3b9827a9709 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -99,8 +99,9 @@ static void netfs_rreq_completed(struct netfs_io_request *rreq, bool was_async)
 }
 
 /*
- * Deal with the completion of writing the data to the cache.  We have to clear
- * the PG_fscache bits on the folios involved and release the caller's ref.
+ * [DEPRECATED] Deal with the completion of writing the data to the cache.  We
+ * have to clear the PG_fscache bits on the folios involved and release the
+ * caller's ref.
  *
  * May be called in softirq mode and we inherit a ref from the caller.
  */
@@ -138,7 +139,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
 }
 
 static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
-				       bool was_async)
+				       bool was_async) /* [DEPRECATED] */
 {
 	struct netfs_io_subrequest *subreq = priv;
 	struct netfs_io_request *rreq = subreq->rreq;
@@ -161,8 +162,8 @@ static void netfs_rreq_copy_terminated(void *priv, ssize_t transferred_or_error,
 }
 
 /*
- * Perform any outstanding writes to the cache.  We inherit a ref from the
- * caller.
+ * [DEPRECATED] Perform any outstanding writes to the cache.  We inherit a ref
+ * from the caller.
  */
 static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
 {
@@ -222,7 +223,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
 		netfs_rreq_unmark_after_write(rreq, false);
 }
 
-static void netfs_rreq_write_to_cache_work(struct work_struct *work)
+static void netfs_rreq_write_to_cache_work(struct work_struct *work) /* [DEPRECATED] */
 {
 	struct netfs_io_request *rreq =
 		container_of(work, struct netfs_io_request, work);
@@ -230,7 +231,7 @@ static void netfs_rreq_write_to_cache_work(struct work_struct *work)
 	netfs_rreq_do_write_to_cache(rreq);
 }
 
-static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq)
+static void netfs_rreq_write_to_cache(struct netfs_io_request *rreq) /* [DEPRECATED] */
 {
 	rreq->work.func = netfs_rreq_write_to_cache_work;
 	if (!queue_work(system_unbound_wq, &rreq->work))
@@ -409,7 +410,8 @@ again:
 	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
 	wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
 
-	if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags))
+	if (test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags) &&
+	    test_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags))
 		return netfs_rreq_write_to_cache(rreq);
 
 	netfs_rreq_completed(rreq, was_async);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 5e77618a7940..c5a73c9ed126 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -31,6 +31,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_READAHEAD]		= "RA",
 	[NETFS_READPAGE]		= "RP",
 	[NETFS_READ_FOR_WRITE]		= "RW",
+	[NETFS_COPY_TO_CACHE]		= "CC",
 	[NETFS_WRITEBACK]		= "WB",
 	[NETFS_WRITETHROUGH]		= "WT",
 	[NETFS_LAUNDER_WRITE]		= "LW",
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 90051ced8e2a..bc1fc54fb724 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -177,13 +177,11 @@ EXPORT_SYMBOL(netfs_clear_inode_writeback);
  */
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
 {
-	struct netfs_folio *finfo = NULL;
+	struct netfs_folio *finfo;
 	size_t flen = folio_size(folio);
 
 	_enter("{%lx},%zx,%zx", folio->index, offset, length);
 
-	folio_wait_fscache(folio);
-
 	if (!folio_test_private(folio))
 		return;
 
@@ -248,12 +246,6 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
 
 	if (folio_test_private(folio))
 		return false;
-	if (folio_test_fscache(folio)) {
-		if (current_is_kswapd() || !(gfp & __GFP_FS))
-			return false;
-		folio_wait_fscache(folio);
-	}
-
 	fscache_note_page_release(netfs_i_cookie(ctx));
 	return true;
 }
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 610ceb5bd86c..72b52f070270 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -45,8 +45,12 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	refcount_set(&rreq->ref, 1);
 
 	__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
-	if (cached)
+	if (cached) {
 		__set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+		if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags))
+			/* Filesystem uses deprecated PG_private_2 marking. */
+			__set_bit(NETFS_RREQ_USE_PGPRIV2, &rreq->flags);
+	}
 	if (file && file->f_flags & O_NONBLOCK)
 		__set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
 	if (rreq->netfs_ops->init_request) {
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index e3cb4923316b..814363d1d7c7 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -81,6 +81,8 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
 static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
 {
 	netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
+	/* [DEPRECATED] Use PG_private_2 to mark folio being written to the cache. */
+	__set_bit(NETFS_ICTX_USE_PGPRIV2, &nfsi->netfs.flags);
 }
 extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
 extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index 6e8562cbcc43..9de27643607f 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -172,9 +172,12 @@ extern void __fscache_invalidate(struct fscache_cookie *, const void *, loff_t,
 extern int __fscache_begin_read_operation(struct netfs_cache_resources *, struct fscache_cookie *);
 extern int __fscache_begin_write_operation(struct netfs_cache_resources *, struct fscache_cookie *);
 
-extern void __fscache_write_to_cache(struct fscache_cookie *, struct address_space *,
-				     loff_t, size_t, loff_t, netfs_io_terminated_t, void *,
-				     bool);
+void __fscache_write_to_cache(struct fscache_cookie *cookie,
+			      struct address_space *mapping,
+			      loff_t start, size_t len, loff_t i_size,
+			      netfs_io_terminated_t term_func,
+			      void *term_func_priv,
+			      bool using_pgpriv2, bool cond);
 extern void __fscache_clear_page_bits(struct address_space *, loff_t, size_t);
 
 /**
@@ -597,7 +600,8 @@ static inline void fscache_clear_page_bits(struct address_space *mapping,
  * @i_size: The new size of the inode
  * @term_func: The function to call upon completion
  * @term_func_priv: The private data for @term_func
- * @caching: If PG_fscache has been set
+ * @using_pgpriv2: If we're using PG_private_2 to mark in-progress write
+ * @caching: If we actually want to do the caching
  *
  * Helper function for a netfs to write dirty data from an inode into the cache
  * object that's backing it.
@@ -608,19 +612,21 @@ static inline void fscache_clear_page_bits(struct address_space *mapping,
  * marked with PG_fscache.
  *
  * If given, @term_func will be called upon completion and supplied with
- * @term_func_priv.  Note that the PG_fscache flags will have been cleared by
- * this point, so the netfs must retain its own pin on the mapping.
+ * @term_func_priv.  Note that if @using_pgpriv2 is set, the PG_private_2 flags
+ * will have been cleared by this point, so the netfs must retain its own pin
+ * on the mapping.
  */
 static inline void fscache_write_to_cache(struct fscache_cookie *cookie,
 					  struct address_space *mapping,
 					  loff_t start, size_t len, loff_t i_size,
 					  netfs_io_terminated_t term_func,
 					  void *term_func_priv,
-					  bool caching)
+					  bool using_pgpriv2, bool caching)
 {
 	if (caching)
 		__fscache_write_to_cache(cookie, mapping, start, len, i_size,
-					 term_func, term_func_priv, caching);
+					 term_func, term_func_priv,
+					 using_pgpriv2, caching);
 	else if (term_func)
 		term_func(term_func_priv, -ENOBUFS, false);
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 100cbb261269..f5e9c5f84a0c 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -143,6 +143,8 @@ struct netfs_inode {
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
 #define NETFS_ICTX_NO_WRITE_STREAMING	3	/* Don't engage in write-streaming */
+#define NETFS_ICTX_USE_PGPRIV2	31		/* [DEPRECATED] Use PG_private_2 to mark
+						 * write to cache on read */
 };
 
 /*
@@ -165,16 +167,25 @@ struct netfs_folio {
 	unsigned int		dirty_len;	/* Write-streaming dirty data length */
 };
 #define NETFS_FOLIO_INFO	0x1UL	/* OR'd with folio->private. */
+#define NETFS_FOLIO_COPY_TO_CACHE ((struct netfs_group *)0x356UL) /* Write to the cache only */
 
-static inline struct netfs_folio *netfs_folio_info(struct folio *folio)
+static inline bool netfs_is_folio_info(const void *priv)
 {
-	void *priv = folio_get_private(folio);
+	return (unsigned long)priv & NETFS_FOLIO_INFO;
+}
 
-	if ((unsigned long)priv & NETFS_FOLIO_INFO)
+static inline struct netfs_folio *__netfs_folio_info(const void *priv)
+{
+	if (netfs_is_folio_info(priv))
 		return (struct netfs_folio *)((unsigned long)priv & ~NETFS_FOLIO_INFO);
 	return NULL;
 }
 
+static inline struct netfs_folio *netfs_folio_info(struct folio *folio)
+{
+	return __netfs_folio_info(folio_get_private(folio));
+}
+
 static inline struct netfs_group *netfs_folio_group(struct folio *folio)
 {
 	struct netfs_folio *finfo;
@@ -230,6 +241,7 @@ enum netfs_io_origin {
 	NETFS_READAHEAD,		/* This read was triggered by readahead */
 	NETFS_READPAGE,			/* This read is a synchronous read */
 	NETFS_READ_FOR_WRITE,		/* This read is to prepare a write */
+	NETFS_COPY_TO_CACHE,		/* This write is to copy a read to the cache */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
 	NETFS_WRITETHROUGH,		/* This write was made by netfs_perform_write() */
 	NETFS_LAUNDER_WRITE,		/* This is triggered by ->launder_folio() */
@@ -287,6 +299,8 @@ struct netfs_io_request {
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 #define NETFS_RREQ_NONBLOCK		9	/* Don't block if possible (O_NONBLOCK) */
 #define NETFS_RREQ_BLOCKED		10	/* We blocked */
+#define NETFS_RREQ_USE_PGPRIV2		31	/* [DEPRECATED] Use PG_private_2 to mark
+						 * write to cache on read */
 	const struct netfs_request_ops *netfs_ops;
 	void (*cleanup)(struct netfs_io_request *req);
 };
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 447a8c21cf57..e03fafb0c1e3 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -24,6 +24,7 @@
 	E_(netfs_read_trace_write_begin,	"WRITEBEGN")
 
 #define netfs_write_traces					\
+	EM(netfs_write_trace_copy_to_cache,	"COPY2CACH")	\
 	EM(netfs_write_trace_dio_write,		"DIO-WRITE")	\
 	EM(netfs_write_trace_launder,		"LAUNDER  ")	\
 	EM(netfs_write_trace_unbuffered_write,	"UNB-WRITE")	\
@@ -34,6 +35,7 @@
 	EM(NETFS_READAHEAD,			"RA")		\
 	EM(NETFS_READPAGE,			"RP")		\
 	EM(NETFS_READ_FOR_WRITE,		"RW")		\
+	EM(NETFS_COPY_TO_CACHE,			"CC")		\
 	EM(NETFS_WRITEBACK,			"WB")		\
 	EM(NETFS_WRITETHROUGH,			"WT")		\
 	EM(NETFS_LAUNDER_WRITE,			"LW")		\
@@ -127,7 +129,9 @@
 	EM(netfs_folio_trace_clear,		"clear")	\
 	EM(netfs_folio_trace_clear_s,		"clear-s")	\
 	EM(netfs_folio_trace_clear_g,		"clear-g")	\
-	EM(netfs_folio_trace_copy_to_cache,	"copy")		\
+	EM(netfs_folio_trace_copy,		"copy")		\
+	EM(netfs_folio_trace_copy_plus,		"copy+")	\
+	EM(netfs_folio_trace_copy_to_cache,	"mark-copy")	\
 	EM(netfs_folio_trace_end_copy,		"end-copy")	\
 	EM(netfs_folio_trace_filled_gaps,	"filled-gaps")	\
 	EM(netfs_folio_trace_kill,		"kill")		\
-- 
cgit v1.2.3


From 2e9d7e4b984a61823c41ba65e1b58b98ca9912bb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 19 Mar 2024 11:13:26 +0000
Subject: mm: Remove the PG_fscache alias for PG_private_2

Remove the PG_fscache alias for PG_private_2 and use the latter directly.
Use of this flag for marking pages undergoing writing to the cache should
be considered deprecated and the folios should be marked dirty instead and
the write done in ->writepages().

Note that PG_private_2 itself should be considered deprecated and up for
future removal by the MM folks too.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Matthew Wilcox (Oracle) <willy@infradead.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Paulo Alcantara <pc@manguebit.com>
cc: Ronnie Sahlberg <ronniesahlberg@gmail.com>
cc: Shyam Prasad N <sprasad@microsoft.com>
cc: Tom Talpey <tom@talpey.com>
cc: Bharath SM <bharathsm@microsoft.com>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Anna Schumaker <anna@kernel.org>
cc: netfs@lists.linux.dev
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/ceph/addr.c           | 11 ++++---
 fs/netfs/buffered_read.c |  4 +--
 fs/netfs/fscache_io.c    |  2 +-
 fs/netfs/io.c            |  2 +-
 fs/nfs/file.c            |  8 ++---
 fs/nfs/fscache.h         |  4 +--
 fs/nfs/write.c           |  4 +--
 fs/smb/client/file.c     | 16 +++++-----
 include/linux/netfs.h    | 80 +++---------------------------------------------
 mm/filemap.c             |  6 ++--
 10 files changed, 33 insertions(+), 104 deletions(-)

(limited to 'include')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 28ae4976a4f9..18ddacb00511 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -500,7 +500,7 @@ const struct netfs_request_ops ceph_netfs_ops = {
 #ifdef CONFIG_CEPH_FSCACHE
 static void ceph_set_page_fscache(struct page *page)
 {
-	set_page_fscache(page);
+	folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
 }
 
 static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
@@ -800,7 +800,7 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 		return AOP_WRITEPAGE_ACTIVATE;
 	}
 
-	wait_on_page_fscache(page);
+	folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
 
 	err = writepage_nounlock(page, wbc);
 	if (err == -ERESTARTSYS) {
@@ -1075,7 +1075,8 @@ get_more_pages:
 				unlock_page(page);
 				break;
 			}
-			if (PageWriteback(page) || PageFsCache(page)) {
+			if (PageWriteback(page) ||
+			    PagePrivate2(page) /* [DEPRECATED] */) {
 				if (wbc->sync_mode == WB_SYNC_NONE) {
 					doutc(cl, "%p under writeback\n", page);
 					unlock_page(page);
@@ -1083,7 +1084,7 @@ get_more_pages:
 				}
 				doutc(cl, "waiting on writeback %p\n", page);
 				wait_on_page_writeback(page);
-				wait_on_page_fscache(page);
+				folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
 			}
 
 			if (!clear_page_dirty_for_io(page)) {
@@ -1513,7 +1514,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 	if (r < 0)
 		return r;
 
-	folio_wait_fscache(folio);
+	folio_wait_private_2(folio); /* [DEPRECATED] */
 	WARN_ON_ONCE(!folio_test_locked(folio));
 	*pagep = &folio->page;
 	return 0;
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 6d49319c82c6..b3fd6e1fa322 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -70,7 +70,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 				if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE,
 							       &subreq->flags)) {
 					trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
-					folio_start_fscache(folio);
+					folio_start_private_2(folio);
 					folio_started = true;
 				}
 			} else {
@@ -506,7 +506,7 @@ retry:
 
 have_folio:
 	if (test_bit(NETFS_ICTX_USE_PGPRIV2, &ctx->flags)) {
-		ret = folio_wait_fscache_killable(folio);
+		ret = folio_wait_private_2_killable(folio);
 		if (ret < 0)
 			goto error;
 	}
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index 5028f2ae30da..38637e5c9b57 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -183,7 +183,7 @@ void __fscache_clear_page_bits(struct address_space *mapping,
 
 		rcu_read_lock();
 		xas_for_each(&xas, page, last) {
-			end_page_fscache(page);
+			folio_end_private_2(page_folio(page));
 		}
 		rcu_read_unlock();
 	}
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index b3b9827a9709..60a19f96e0ce 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -129,7 +129,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
 				continue;
 			unlocked = folio_next_index(folio) - 1;
 			trace_netfs_folio(folio, netfs_folio_trace_end_copy);
-			folio_end_fscache(folio);
+			folio_end_private_2(folio);
 			have_unlocked = true;
 		}
 	}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 407c6e15afe2..6bd127e6683d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -433,7 +433,7 @@ static void nfs_invalidate_folio(struct folio *folio, size_t offset,
 		return;
 	/* Cancel any unstarted writes on this page */
 	nfs_wb_folio_cancel(inode, folio);
-	folio_wait_fscache(folio);
+	folio_wait_private_2(folio); /* [DEPRECATED] */
 	trace_nfs_invalidate_folio(inode, folio);
 }
 
@@ -500,7 +500,7 @@ static int nfs_launder_folio(struct folio *folio)
 	dfprintk(PAGECACHE, "NFS: launder_folio(%ld, %llu)\n",
 		inode->i_ino, folio_pos(folio));
 
-	folio_wait_fscache(folio);
+	folio_wait_private_2(folio); /* [DEPRECATED] */
 	ret = nfs_wb_folio(inode, folio);
 	trace_nfs_launder_folio_done(inode, folio, ret);
 	return ret;
@@ -593,8 +593,8 @@ static vm_fault_t nfs_vm_page_mkwrite(struct vm_fault *vmf)
 	sb_start_pagefault(inode->i_sb);
 
 	/* make sure the cache has finished storing the page */
-	if (folio_test_fscache(folio) &&
-	    folio_wait_fscache_killable(folio) < 0) {
+	if (folio_test_private_2(folio) && /* [DEPRECATED] */
+	    folio_wait_private_2_killable(folio) < 0) {
 		ret = VM_FAULT_RETRY;
 		goto out;
 	}
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 814363d1d7c7..fbed0027996f 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -103,10 +103,10 @@ extern int nfs_netfs_read_folio(struct file *file, struct folio *folio);
 
 static inline bool nfs_fscache_release_folio(struct folio *folio, gfp_t gfp)
 {
-	if (folio_test_fscache(folio)) {
+	if (folio_test_private_2(folio)) { /* [DEPRECATED] */
 		if (current_is_kswapd() || !(gfp & __GFP_FS))
 			return false;
-		folio_wait_fscache(folio);
+		folio_wait_private_2(folio);
 	}
 	fscache_note_page_release(netfs_i_cookie(netfs_inode(folio->mapping->host)));
 	return true;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5de85d725fb9..2329cbb0e446 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -2120,10 +2120,10 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
 	if (folio_test_private(src))
 		return -EBUSY;
 
-	if (folio_test_fscache(src)) {
+	if (folio_test_private_2(src)) { /* [DEPRECATED] */
 		if (mode == MIGRATE_ASYNC)
 			return -EBUSY;
-		folio_wait_fscache(src);
+		folio_wait_private_2(src);
 	}
 
 	return migrate_folio(mapping, dst, src, mode);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9be37d0fe724..b39caae652f6 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3032,12 +3032,12 @@ lock_again:
 	}
 
 	if (folio_test_writeback(folio) ||
-	    folio_test_fscache(folio)) {
+	    folio_test_private_2(folio)) { /* [DEPRECATED] */
 		folio_unlock(folio);
 		if (wbc->sync_mode != WB_SYNC_NONE) {
 			folio_wait_writeback(folio);
 #ifdef CONFIG_CIFS_FSCACHE
-			folio_wait_fscache(folio);
+			folio_wait_private_2(folio);
 #endif
 			goto lock_again;
 		}
@@ -4510,8 +4510,8 @@ static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf)
 	 * be modified.  We then assume the entire folio will need writing back.
 	 */
 #ifdef CONFIG_CIFS_FSCACHE
-	if (folio_test_fscache(folio) &&
-	    folio_wait_fscache_killable(folio) < 0)
+	if (folio_test_private_2(folio) && /* [DEPRECATED] */
+	    folio_wait_private_2_killable(folio) < 0)
 		return VM_FAULT_RETRY;
 #endif
 
@@ -4977,10 +4977,10 @@ static bool cifs_release_folio(struct folio *folio, gfp_t gfp)
 {
 	if (folio_test_private(folio))
 		return 0;
-	if (folio_test_fscache(folio)) {
+	if (folio_test_private_2(folio)) { /* [DEPRECATED] */
 		if (current_is_kswapd() || !(gfp & __GFP_FS))
 			return false;
-		folio_wait_fscache(folio);
+		folio_wait_private_2(folio);
 	}
 	fscache_note_page_release(cifs_inode_cookie(folio->mapping->host));
 	return true;
@@ -4989,7 +4989,7 @@ static bool cifs_release_folio(struct folio *folio, gfp_t gfp)
 static void cifs_invalidate_folio(struct folio *folio, size_t offset,
 				 size_t length)
 {
-	folio_wait_fscache(folio);
+	folio_wait_private_2(folio); /* [DEPRECATED] */
 }
 
 static int cifs_launder_folio(struct folio *folio)
@@ -5009,7 +5009,7 @@ static int cifs_launder_folio(struct folio *folio)
 	if (folio_clear_dirty_for_io(folio))
 		rc = cifs_writepage_locked(&folio->page, &wbc);
 
-	folio_wait_fscache(folio);
+	folio_wait_private_2(folio); /* [DEPRECATED] */
 	return rc;
 }
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f5e9c5f84a0c..f36a6d8163d1 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -21,94 +21,22 @@
 
 enum netfs_sreq_ref_trace;
 
-/*
- * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
- * a page is currently backed by a local disk cache
- */
-#define folio_test_fscache(folio)	folio_test_private_2(folio)
-#define PageFsCache(page)		PagePrivate2((page))
-#define SetPageFsCache(page)		SetPagePrivate2((page))
-#define ClearPageFsCache(page)		ClearPagePrivate2((page))
-#define TestSetPageFsCache(page)	TestSetPagePrivate2((page))
-#define TestClearPageFsCache(page)	TestClearPagePrivate2((page))
-
 /**
- * folio_start_fscache - Start an fscache write on a folio.
+ * folio_start_private_2 - Start an fscache write on a folio.  [DEPRECATED]
  * @folio: The folio.
  *
  * Call this function before writing a folio to a local cache.  Starting a
  * second write before the first one finishes is not allowed.
+ *
+ * Note that this should no longer be used.
  */
-static inline void folio_start_fscache(struct folio *folio)
+static inline void folio_start_private_2(struct folio *folio)
 {
 	VM_BUG_ON_FOLIO(folio_test_private_2(folio), folio);
 	folio_get(folio);
 	folio_set_private_2(folio);
 }
 
-/**
- * folio_end_fscache - End an fscache write on a folio.
- * @folio: The folio.
- *
- * Call this function after the folio has been written to the local cache.
- * This will wake any sleepers waiting on this folio.
- */
-static inline void folio_end_fscache(struct folio *folio)
-{
-	folio_end_private_2(folio);
-}
-
-/**
- * folio_wait_fscache - Wait for an fscache write on this folio to end.
- * @folio: The folio.
- *
- * If this folio is currently being written to a local cache, wait for
- * the write to finish.  Another write may start after this one finishes,
- * unless the caller holds the folio lock.
- */
-static inline void folio_wait_fscache(struct folio *folio)
-{
-	folio_wait_private_2(folio);
-}
-
-/**
- * folio_wait_fscache_killable - Wait for an fscache write on this folio to end.
- * @folio: The folio.
- *
- * If this folio is currently being written to a local cache, wait
- * for the write to finish or for a fatal signal to be received.
- * Another write may start after this one finishes, unless the caller
- * holds the folio lock.
- *
- * Return:
- * - 0 if successful.
- * - -EINTR if a fatal signal was encountered.
- */
-static inline int folio_wait_fscache_killable(struct folio *folio)
-{
-	return folio_wait_private_2_killable(folio);
-}
-
-static inline void set_page_fscache(struct page *page)
-{
-	folio_start_fscache(page_folio(page));
-}
-
-static inline void end_page_fscache(struct page *page)
-{
-	folio_end_private_2(page_folio(page));
-}
-
-static inline void wait_on_page_fscache(struct page *page)
-{
-	folio_wait_private_2(page_folio(page));
-}
-
-static inline int wait_on_page_fscache_killable(struct page *page)
-{
-	return folio_wait_private_2_killable(page_folio(page));
-}
-
 /* Marks used on xarray-based buffers */
 #define NETFS_BUF_PUT_MARK	XA_MARK_0	/* - Page needs putting  */
 #define NETFS_BUF_PAGECACHE_MARK XA_MARK_1	/* - Page needs wb/dirty flag wrangling */
diff --git a/mm/filemap.c b/mm/filemap.c
index 30de18c4fd28..9a2e28bf298a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1540,7 +1540,7 @@ EXPORT_SYMBOL(folio_end_private_2);
  * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
  * @folio: The folio to wait on.
  *
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
+ * Wait for PG_private_2 to be cleared on a folio.
  */
 void folio_wait_private_2(struct folio *folio)
 {
@@ -1553,8 +1553,8 @@ EXPORT_SYMBOL(folio_wait_private_2);
  * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
  * @folio: The folio to wait on.
  *
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
- * fatal signal is received by the calling task.
+ * Wait for PG_private_2 to be cleared on a folio or until a fatal signal is
+ * received by the calling task.
  *
  * Return:
  * - 0 if successful.
-- 
cgit v1.2.3


From 93bf1cc0096fa1e02244078db3334ca7fa1d88c1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 18 Mar 2024 17:15:57 +0000
Subject: netfs: Make netfs_io_request::subreq_counter an atomic_t

Make the netfs_io_request::subreq_counter, used to generate values for
netfs_io_subrequest::debug_index, into an atomic_t so that it can be called
from the retry thread at the same time as the app thread issuing writes.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
---
 fs/netfs/output.c     | 2 +-
 include/linux/netfs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/output.c b/fs/netfs/output.c
index 625eb68f3e5a..fbdbb4f78234 100644
--- a/fs/netfs/output.c
+++ b/fs/netfs/output.c
@@ -37,7 +37,7 @@ struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *
 		subreq->source	= dest;
 		subreq->start	= start;
 		subreq->len	= len;
-		subreq->debug_index = wreq->subreq_counter++;
+		subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
 
 		switch (subreq->source) {
 		case NETFS_UPLOAD_TO_SERVER:
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f36a6d8163d1..ddafc6ebff42 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -202,7 +202,7 @@ struct netfs_io_request {
 	unsigned int		debug_id;
 	unsigned int		rsize;		/* Maximum read size (0 for none) */
 	unsigned int		wsize;		/* Maximum write size (0 for none) */
-	unsigned int		subreq_counter;	/* Next subreq->debug_index */
+	atomic_t		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			submitted;	/* Amount submitted for I/O so far */
-- 
cgit v1.2.3


From 31a70a71b3a730aa703bbd05713d21115dd6d33a Mon Sep 17 00:00:00 2001
From: Cezary Rojewski <cezary.rojewski@intel.com>
Date: Fri, 26 Apr 2024 11:57:26 +0200
Subject: ASoC: pcm: Reverse iterate DAIs when shutting them down

During startup snd_soc_dai_startup() is launched in ascending order and
the exact same thing is done during shutdown procedure. Reverse the
order in the latter so that it is symmetric to the former.

Signed-off-by: Cezary Rojewski <cezary.rojewski@intel.com>
Link: https://lore.kernel.org/r/20240426095733.3946951-2-cezary.rojewski@intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 4 ++++
 sound/soc/soc-pcm.c | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 2a1b6c198547..8aa6ddb7ee1e 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1235,6 +1235,10 @@ struct snd_soc_pcm_runtime {
 	     ((i) < (rtd)->dai_link->num_cpus + (rtd)->dai_link->num_codecs) &&	\
 		     ((dai) = (rtd)->dais[i]);				\
 	     (i)++)
+#define for_each_rtd_dais_reverse(rtd, i, dai)					\
+	for ((i) = (rtd)->dai_link->num_cpus + (rtd)->dai_link->num_codecs - 1;	\
+	     (i) >= 0 && ((dai) = (rtd)->dais[i]);				\
+	     (i)--)
 #define for_each_rtd_ch_maps(rtd, i, ch_maps) for_each_link_ch_maps(rtd->dai_link, i, ch_maps)
 
 void snd_soc_close_delayed_work(struct snd_soc_pcm_runtime *rtd);
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index b0e1bd7f588b..711b2f49ed88 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -724,7 +724,7 @@ static int soc_pcm_clean(struct snd_soc_pcm_runtime *rtd,
 		}
 	}
 
-	for_each_rtd_dais(rtd, i, dai)
+	for_each_rtd_dais_reverse(rtd, i, dai)
 		snd_soc_dai_shutdown(dai, substream, rollback);
 
 	snd_soc_link_shutdown(substream, rollback);
-- 
cgit v1.2.3


From 8bbc692d1abce5bc949dea9acba85fc686601c04 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Fri, 26 Apr 2024 11:03:51 +0200
Subject: ASoC: SOF: Constify stored pointer to snd_sof_dsp_ops

The core code does not modify the 'struct snd_sof_dsp_ops', stored in
'struct sof_dev_desc'.  Make the pointer to 'struct snd_sof_dsp_ops'
const to annotate this, make code a bit safer and allow individual
drivers to also define this structure as const.

Tested-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Link: https://lore.kernel.org/r/20240426-n-const-ops-var-v2-4-e553fe67ae82@kernel.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/sof.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/sof.h b/include/sound/sof.h
index 05213bb515a3..780c1a755917 100644
--- a/include/sound/sof.h
+++ b/include/sound/sof.h
@@ -166,7 +166,7 @@ struct sof_dev_desc {
 	/* default firmware name */
 	const char *default_fw_filename[SOF_IPC_TYPE_COUNT];
 
-	struct snd_sof_dsp_ops *ops;
+	const struct snd_sof_dsp_ops *ops;
 	int (*ops_init)(struct snd_sof_dev *sdev);
 	void (*ops_free)(struct snd_sof_dev *sdev);
 };
-- 
cgit v1.2.3


From 01aea123b11c7ebbdd64b2df3a4a5a7ad86a460d Mon Sep 17 00:00:00 2001
From: Peter Griffin <peter.griffin@linaro.org>
Date: Mon, 29 Apr 2024 14:02:17 +0100
Subject: dt-bindings: clock: google,gs101-clock: add HSI2 clock management
 unit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add dt schema documentation and clock IDs for the High Speed Interface
2 (HSI2) clock management unit. This CMU feeds high speed interfaces
such as PCIe and UFS.

[AD: * keep CMUs in google,gs101.h sorted alphabetically
     * resolve minor merge conflicts in google,gs101-clock.yaml
     * s/ufs_embd/ufs    s/mmc_card/mmc

Signed-off-by: Peter Griffin <peter.griffin@linaro.org>
Reviewed-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://lore.kernel.org/r/20240429-hsi0-gs101-v3-1-f233be0a2455@linaro.org
Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
---
 .../bindings/clock/google,gs101-clock.yaml         | 26 +++++++++
 include/dt-bindings/clock/google,gs101.h           | 62 ++++++++++++++++++++++
 2 files changed, 88 insertions(+)

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml b/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml
index 94dcc4f84c85..caf442ead24b 100644
--- a/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml
+++ b/Documentation/devicetree/bindings/clock/google,gs101-clock.yaml
@@ -31,6 +31,7 @@ properties:
       - google,gs101-cmu-apm
       - google,gs101-cmu-misc
       - google,gs101-cmu-hsi0
+      - google,gs101-cmu-hsi2
       - google,gs101-cmu-peric0
       - google,gs101-cmu-peric1
 
@@ -97,6 +98,31 @@ allOf:
             - const: usb31drd
             - const: usbdpdbg
 
+  - if:
+      properties:
+        compatible:
+          contains:
+            enum:
+              - google,gs101-cmu-hsi2
+
+    then:
+      properties:
+        clocks:
+          items:
+            - description: External reference clock (24.576 MHz)
+            - description: High Speed Interface bus clock (from CMU_TOP)
+            - description: High Speed Interface pcie clock (from CMU_TOP)
+            - description: High Speed Interface ufs clock (from CMU_TOP)
+            - description: High Speed Interface mmc clock (from CMU_TOP)
+
+        clock-names:
+          items:
+            - const: oscclk
+            - const: bus
+            - const: pcie
+            - const: ufs
+            - const: mmc
+
   - if:
       properties:
         compatible:
diff --git a/include/dt-bindings/clock/google,gs101.h b/include/dt-bindings/clock/google,gs101.h
index 7a2006f0edf1..442f9e9037dc 100644
--- a/include/dt-bindings/clock/google,gs101.h
+++ b/include/dt-bindings/clock/google,gs101.h
@@ -367,6 +367,68 @@
 #define CLK_GOUT_HSI0_XIU_D1_HSI0_ACLK				51
 #define CLK_GOUT_HSI0_XIU_P_HSI0_ACLK				52
 
+/* CMU_HSI2 */
+#define CLK_MOUT_HSI2_BUS_USER						1
+#define CLK_MOUT_HSI2_MMC_CARD_USER					2
+#define CLK_MOUT_HSI2_PCIE_USER						3
+#define CLK_MOUT_HSI2_UFS_EMBD_USER					4
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_003_PHY_REFCLK_IN		5
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_004_PHY_REFCLK_IN		6
+#define CLK_GOUT_HSI2_SSMT_PCIE_IA_GEN4A_1_ACLK				7
+#define CLK_GOUT_HSI2_SSMT_PCIE_IA_GEN4A_1_PCLK				8
+#define CLK_GOUT_HSI2_SSMT_PCIE_IA_GEN4B_1_ACLK				9
+#define CLK_GOUT_HSI2_SSMT_PCIE_IA_GEN4B_1_PCLK				10
+#define CLK_GOUT_HSI2_D_TZPC_HSI2_PCLK					11
+#define CLK_GOUT_HSI2_GPC_HSI2_PCLK					12
+#define CLK_GOUT_HSI2_GPIO_HSI2_PCLK					13
+#define CLK_GOUT_HSI2_HSI2_CMU_HSI2_PCLK				14
+#define CLK_GOUT_HSI2_LHM_AXI_P_HSI2_I_CLK				15
+#define CLK_GOUT_HSI2_LHS_ACEL_D_HSI2_I_CLK				16
+#define CLK_GOUT_HSI2_MMC_CARD_I_ACLK					17
+#define CLK_GOUT_HSI2_MMC_CARD_SDCLKIN					18
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_003_DBI_ACLK_UG			19
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_003_MSTR_ACLK_UG			20
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_003_SLV_ACLK_UG			21
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_003_I_DRIVER_APB_CLK		22
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_004_DBI_ACLK_UG			23
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_004_MSTR_ACLK_UG			24
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_004_SLV_ACLK_UG			25
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCIE_004_I_DRIVER_APB_CLK		26
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCS_PMA_PHY_UDBG_I_APB_PCLK		27
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCS_PMA_PIPE_PAL_PCIE_I_APB_PCLK	28
+#define CLK_GOUT_HSI2_PCIE_GEN4_1_PCS_PMA_PCIEPHY210X2_QCH_I_APB_PCLK	29
+#define CLK_GOUT_HSI2_PCIE_IA_GEN4A_1_I_CLK				30
+#define CLK_GOUT_HSI2_PCIE_IA_GEN4B_1_I_CLK				31
+#define CLK_GOUT_HSI2_PPMU_HSI2_ACLK					32
+#define CLK_GOUT_HSI2_PPMU_HSI2_PCLK					33
+#define CLK_GOUT_HSI2_QE_MMC_CARD_HSI2_ACLK				34
+#define CLK_GOUT_HSI2_QE_MMC_CARD_HSI2_PCLK				35
+#define CLK_GOUT_HSI2_QE_PCIE_GEN4A_HSI2_ACLK				36
+#define CLK_GOUT_HSI2_QE_PCIE_GEN4A_HSI2_PCLK				37
+#define CLK_GOUT_HSI2_QE_PCIE_GEN4B_HSI2_ACLK				38
+#define CLK_GOUT_HSI2_QE_PCIE_GEN4B_HSI2_PCLK				39
+#define CLK_GOUT_HSI2_QE_UFS_EMBD_HSI2_ACLK				40
+#define CLK_GOUT_HSI2_QE_UFS_EMBD_HSI2_PCLK				41
+#define CLK_GOUT_HSI2_CLK_HSI2_BUS_CLK					42
+#define CLK_GOUT_HSI2_CLK_HSI2_OSCCLK_CLK				43
+#define CLK_GOUT_HSI2_SSMT_HSI2_ACLK					44
+#define CLK_GOUT_HSI2_SSMT_HSI2_PCLK					45
+#define CLK_GOUT_HSI2_SYSMMU_HSI2_CLK_S2				46
+#define CLK_GOUT_HSI2_SYSREG_HSI2_PCLK					47
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4A_DBI_1_ACLK			48
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4A_DBI_1_PCLK			49
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4A_SLV_1_ACLK			50
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4A_SLV_1_PCLK			51
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4B_DBI_1_ACLK			52
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4B_DBI_1_PCLK			53
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4B_SLV_1_ACLK			54
+#define CLK_GOUT_HSI2_UASC_PCIE_GEN4B_SLV_1_PCLK			55
+#define CLK_GOUT_HSI2_UFS_EMBD_I_ACLK					56
+#define CLK_GOUT_HSI2_UFS_EMBD_I_CLK_UNIPRO				57
+#define CLK_GOUT_HSI2_UFS_EMBD_I_FMP_CLK				58
+#define CLK_GOUT_HSI2_XIU_D_HSI2_ACLK					59
+#define CLK_GOUT_HSI2_XIU_P_HSI2_ACLK					60
+
 /* CMU_MISC */
 #define CLK_MOUT_MISC_BUS_USER				1
 #define CLK_MOUT_MISC_SSS_USER				2
-- 
cgit v1.2.3


From 02eae0bb9538dc7dcb5a6bc2c3066bd6ca682969 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 25 Apr 2024 14:57:51 +0200
Subject: iio: core: Add iio_read_acpi_mount_matrix() helper function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ACPI "ROTM" rotation matrix parsing code atm is already duplicated
between bmc150-accel-core.c and kxcjk-1013.c and a third user of this
is coming.

Add an iio_read_acpi_mount_matrix() helper function for this.
The 2 existing copies of the code are identical, except that
the kxcjk-1013.c has slightly better error logging.

To new helper is a 1:1 copy of the kxcjk-1013.c version, the only change
is the addition of a "char *acpi_method" parameter since some bmc150
dual-accel setups (360° hinges with 1 accel in kbd/base + 1 in display)
declare both accels in a single ACPI device with 2 different method names
for the 2 matrices. This new acpi_method parameter is not "const char *"
because the pathname parameter to acpi_evaluate_object() is not const.

The 2 existing copies of this function will be removed in further patches
in this series.

Acked-by: Rafael J. Wysocki <rafael@kernel.org>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Link: https://lore.kernel.org/r/20240425125754.76010-2-hdegoede@redhat.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/Makefile            |  1 +
 drivers/iio/industrialio-acpi.c | 85 +++++++++++++++++++++++++++++++++++++++++
 include/linux/iio/iio.h         | 13 +++++++
 3 files changed, 99 insertions(+)
 create mode 100644 drivers/iio/industrialio-acpi.c

(limited to 'include')

diff --git a/drivers/iio/Makefile b/drivers/iio/Makefile
index 0ba0e1521ba4..cb80ef837e84 100644
--- a/drivers/iio/Makefile
+++ b/drivers/iio/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_IIO) += industrialio.o
 industrialio-y := industrialio-core.o industrialio-event.o inkern.o
 industrialio-$(CONFIG_IIO_BUFFER) += industrialio-buffer.o
 industrialio-$(CONFIG_IIO_TRIGGER) += industrialio-trigger.o
+industrialio-$(CONFIG_ACPI) += industrialio-acpi.o
 
 obj-$(CONFIG_IIO_CONFIGFS) += industrialio-configfs.o
 obj-$(CONFIG_IIO_GTS_HELPER) += industrialio-gts-helper.o
diff --git a/drivers/iio/industrialio-acpi.c b/drivers/iio/industrialio-acpi.c
new file mode 100644
index 000000000000..981b75d40780
--- /dev/null
+++ b/drivers/iio/industrialio-acpi.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* IIO ACPI helper functions */
+
+#include <linux/acpi.h>
+#include <linux/dev_printk.h>
+#include <linux/iio/iio.h>
+#include <linux/sprintf.h>
+
+/**
+ * iio_read_acpi_mount_matrix() - Read accelerometer mount matrix info from ACPI
+ * @dev:		Device structure
+ * @orientation:	iio_mount_matrix struct to fill
+ * @acpi_method:	ACPI method name to read the matrix from, usually "ROTM"
+ *
+ * Try to read the mount-matrix by calling the specified method on the device's
+ * ACPI firmware-node. If the device has no ACPI firmware-node; or the method
+ * does not exist then this will fail silently. This expects the method to
+ * return data in the ACPI "ROTM" format defined by Microsoft:
+ * https://learn.microsoft.com/en-us/windows-hardware/drivers/sensors/sensors-acpi-entries
+ * This is a Microsoft extension and not part of the official ACPI spec.
+ * The method name is configurable because some dual-accel setups define 2 mount
+ * matrices in a single ACPI device using separate "ROMK" and "ROMS" methods.
+ *
+ * Returns: true if the matrix was successfully, false otherwise.
+ */
+bool iio_read_acpi_mount_matrix(struct device *dev,
+				struct iio_mount_matrix *orientation,
+				char *acpi_method)
+{
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct acpi_device *adev = ACPI_COMPANION(dev);
+	char *str;
+	union acpi_object *obj, *elements;
+	acpi_status status;
+	int i, j, val[3];
+	bool ret = false;
+
+	if (!adev || !acpi_has_method(adev->handle, acpi_method))
+		return false;
+
+	status = acpi_evaluate_object(adev->handle, acpi_method, NULL, &buffer);
+	if (ACPI_FAILURE(status)) {
+		dev_err(dev, "Failed to get ACPI mount matrix: %d\n", status);
+		return false;
+	}
+
+	obj = buffer.pointer;
+	if (obj->type != ACPI_TYPE_PACKAGE || obj->package.count != 3) {
+		dev_err(dev, "Unknown ACPI mount matrix package format\n");
+		goto out_free_buffer;
+	}
+
+	elements = obj->package.elements;
+	for (i = 0; i < 3; i++) {
+		if (elements[i].type != ACPI_TYPE_STRING) {
+			dev_err(dev, "Unknown ACPI mount matrix element format\n");
+			goto out_free_buffer;
+		}
+
+		str = elements[i].string.pointer;
+		if (sscanf(str, "%d %d %d", &val[0], &val[1], &val[2]) != 3) {
+			dev_err(dev, "Incorrect ACPI mount matrix string format\n");
+			goto out_free_buffer;
+		}
+
+		for (j = 0; j < 3; j++) {
+			switch (val[j]) {
+			case -1: str = "-1"; break;
+			case 0:  str = "0";  break;
+			case 1:  str = "1";  break;
+			default:
+				dev_err(dev, "Invalid value in ACPI mount matrix: %d\n", val[j]);
+				goto out_free_buffer;
+			}
+			orientation->rotation[i * 3 + j] = str;
+		}
+	}
+
+	ret = true;
+
+out_free_buffer:
+	kfree(buffer.pointer);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iio_read_acpi_mount_matrix);
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index e370a7bb3300..55e2b22086a1 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -788,6 +788,19 @@ static inline struct dentry *iio_get_debugfs_dentry(struct iio_dev *indio_dev)
 }
 #endif
 
+#ifdef CONFIG_ACPI
+bool iio_read_acpi_mount_matrix(struct device *dev,
+				struct iio_mount_matrix *orientation,
+				char *acpi_method);
+#else
+static inline bool iio_read_acpi_mount_matrix(struct device *dev,
+					      struct iio_mount_matrix *orientation,
+					      char *acpi_method)
+{
+	return false;
+}
+#endif
+
 ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals);
 
 int iio_str_to_fixpoint(const char *str, int fract_mult, int *integer,
-- 
cgit v1.2.3


From 09415814cd1d0b90b898f81d6ad6a1c0a2e22d32 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 26 Apr 2024 17:42:10 +0200
Subject: iio: backend: change docs padding

Using tabs and maintaining the start of the docs aligned is a pain and
may lead to lot's of unrelated changes when adding new members. Hence,
let#s change things now and just have a simple space after the member
name.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240426-ad9467-new-features-v2-1-6361fc3ba1cc@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 76 +++++++++++++++++++-------------------
 include/linux/iio/backend.h        | 38 +++++++++----------
 2 files changed, 57 insertions(+), 57 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index f08ed6d70ae5..c27243e88462 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -115,8 +115,8 @@ static DEFINE_MUTEX(iio_back_lock);
 
 /**
  * iio_backend_chan_enable - Enable a backend channel
- * @back:	Backend device
- * @chan:	Channel number
+ * @back: Backend device
+ * @chan: Channel number
  *
  * RETURNS:
  * 0 on success, negative error number on failure.
@@ -129,8 +129,8 @@ EXPORT_SYMBOL_NS_GPL(iio_backend_chan_enable, IIO_BACKEND);
 
 /**
  * iio_backend_chan_disable - Disable a backend channel
- * @back:	Backend device
- * @chan:	Channel number
+ * @back: Backend device
+ * @chan: Channel number
  *
  * RETURNS:
  * 0 on success, negative error number on failure.
@@ -148,8 +148,8 @@ static void __iio_backend_disable(void *back)
 
 /**
  * devm_iio_backend_enable - Device managed backend enable
- * @dev:	Consumer device for the backend
- * @back:	Backend device
+ * @dev: Consumer device for the backend
+ * @back: Backend device
  *
  * RETURNS:
  * 0 on success, negative error number on failure.
@@ -168,9 +168,9 @@ EXPORT_SYMBOL_NS_GPL(devm_iio_backend_enable, IIO_BACKEND);
 
 /**
  * iio_backend_data_format_set - Configure the channel data format
- * @back:	Backend device
- * @chan:	Channel number
- * @data:	Data format
+ * @back: Backend device
+ * @chan: Channel number
+ * @data: Data format
  *
  * Properly configure a channel with respect to the expected data format. A
  * @struct iio_backend_data_fmt must be passed with the settings.
@@ -190,9 +190,9 @@ EXPORT_SYMBOL_NS_GPL(iio_backend_data_format_set, IIO_BACKEND);
 
 /**
  * iio_backend_data_source_set - Select data source
- * @back:	Backend device
- * @chan:	Channel number
- * @data:	Data source
+ * @back: Backend device
+ * @chan: Channel number
+ * @data: Data source
  *
  * A given backend may have different sources to stream/sync data. This allows
  * to choose that source.
@@ -212,9 +212,9 @@ EXPORT_SYMBOL_NS_GPL(iio_backend_data_source_set, IIO_BACKEND);
 
 /**
  * iio_backend_set_sampling_freq - Set channel sampling rate
- * @back:		Backend device
- * @chan:		Channel number
- * @sample_rate_hz:	Sample rate
+ * @back: Backend device
+ * @chan: Channel number
+ * @sample_rate_hz: Sample rate
  *
  * RETURNS:
  * 0 on success, negative error number on failure.
@@ -235,9 +235,9 @@ static void iio_backend_free_buffer(void *arg)
 
 /**
  * devm_iio_backend_request_buffer - Device managed buffer request
- * @dev:	Consumer device for the backend
- * @back:	Backend device
- * @indio_dev:	IIO device
+ * @dev: Consumer device for the backend
+ * @back: Backend device
+ * @indio_dev: IIO device
  *
  * Request an IIO buffer from the backend. The type of the buffer (typically
  * INDIO_BUFFER_HARDWARE) is up to the backend to decide. This is because,
@@ -300,10 +300,10 @@ static struct iio_backend *iio_backend_from_indio_dev_parent(const struct device
 
 /**
  * iio_backend_ext_info_get - IIO ext_info read callback
- * @indio_dev:	IIO device
- * @private:	Data private to the driver
- * @chan:	IIO channel
- * @buf:	Buffer where to place the attribute data
+ * @indio_dev: IIO device
+ * @private: Data private to the driver
+ * @chan: IIO channel
+ * @buf: Buffer where to place the attribute data
  *
  * This helper is intended to be used by backends that extend an IIO channel
  * (through iio_backend_extend_chan_spec()) with extended info. In that case,
@@ -335,11 +335,11 @@ EXPORT_SYMBOL_NS_GPL(iio_backend_ext_info_get, IIO_BACKEND);
 
 /**
  * iio_backend_ext_info_set - IIO ext_info write callback
- * @indio_dev:	IIO device
- * @private:	Data private to the driver
- * @chan:	IIO channel
- * @buf:	Buffer holding the sysfs attribute
- * @len:	Buffer length
+ * @indio_dev: IIO device
+ * @private: Data private to the driver
+ * @chan: IIO channel
+ * @buf: Buffer holding the sysfs attribute
+ * @len: Buffer length
  *
  * This helper is intended to be used by backends that extend an IIO channel
  * (trough iio_backend_extend_chan_spec()) with extended info. In that case,
@@ -365,9 +365,9 @@ EXPORT_SYMBOL_NS_GPL(iio_backend_ext_info_set, IIO_BACKEND);
 
 /**
  * iio_backend_extend_chan_spec - Extend an IIO channel
- * @indio_dev:	IIO device
- * @back:	Backend device
- * @chan:	IIO channel
+ * @indio_dev: IIO device
+ * @back: Backend device
+ * @chan: IIO channel
  *
  * Some backends may have their own functionalities and hence capable of
  * extending a frontend's channel.
@@ -449,8 +449,8 @@ static int __devm_iio_backend_get(struct device *dev, struct iio_backend *back)
 
 /**
  * devm_iio_backend_get - Device managed backend device get
- * @dev:	Consumer device for the backend
- * @name:	Backend name
+ * @dev: Consumer device for the backend
+ * @name: Backend name
  *
  * Get's the backend associated with @dev.
  *
@@ -501,8 +501,8 @@ EXPORT_SYMBOL_NS_GPL(devm_iio_backend_get, IIO_BACKEND);
 
 /**
  * __devm_iio_backend_get_from_fwnode_lookup - Device managed fwnode backend device get
- * @dev:	Consumer device for the backend
- * @fwnode:	Firmware node of the backend device
+ * @dev: Consumer device for the backend
+ * @fwnode: Firmware node of the backend device
  *
  * Search the backend list for a device matching @fwnode.
  * This API should not be used and it's only present for preventing the first
@@ -536,7 +536,7 @@ EXPORT_SYMBOL_NS_GPL(__devm_iio_backend_get_from_fwnode_lookup, IIO_BACKEND);
 
 /**
  * iio_backend_get_priv - Get driver private data
- * @back:	Backend device
+ * @back: Backend device
  */
 void *iio_backend_get_priv(const struct iio_backend *back)
 {
@@ -554,9 +554,9 @@ static void iio_backend_unregister(void *arg)
 
 /**
  * devm_iio_backend_register - Device managed backend device register
- * @dev:	Backend device being registered
- * @ops:	Backend ops
- * @priv:	Device private data
+ * @dev: Backend device being registered
+ * @ops: Backend ops
+ * @priv: Device private data
  *
  * @ops is mandatory. Not providing it results in -EINVAL.
  *
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index 9d144631134d..e3e62f65db14 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -24,9 +24,9 @@ enum iio_backend_data_source {
 
 /**
  * IIO_BACKEND_EX_INFO - Helper for an IIO extended channel attribute
- * @_name:	Attribute name
- * @_shared:	Whether the attribute is shared between all channels
- * @_what:	Data private to the driver
+ * @_name: Attribute name
+ * @_shared: Whether the attribute is shared between all channels
+ * @_what: Data private to the driver
  */
 #define IIO_BACKEND_EX_INFO(_name, _shared, _what) {	\
 	.name = (_name),				\
@@ -38,10 +38,10 @@ enum iio_backend_data_source {
 
 /**
  * struct iio_backend_data_fmt - Backend data format
- * @type:		Data type.
- * @sign_extend:	Bool to tell if the data is sign extended.
- * @enable:		Enable/Disable the data format module. If disabled,
- *			not formatting will happen.
+ * @type: Data type.
+ * @sign_extend: Bool to tell if the data is sign extended.
+ * @enable: Enable/Disable the data format module. If disabled,
+ *	    not formatting will happen.
  */
 struct iio_backend_data_fmt {
 	enum iio_backend_data_type type;
@@ -51,18 +51,18 @@ struct iio_backend_data_fmt {
 
 /**
  * struct iio_backend_ops - operations structure for an iio_backend
- * @enable:		Enable backend.
- * @disable:		Disable backend.
- * @chan_enable:	Enable one channel.
- * @chan_disable:	Disable one channel.
- * @data_format_set:	Configure the data format for a specific channel.
- * @data_source_set:	Configure the data source for a specific channel.
- * @set_sample_rate:	Configure the sampling rate for a specific channel.
- * @request_buffer:	Request an IIO buffer.
- * @free_buffer:	Free an IIO buffer.
- * @extend_chan_spec:	Extend an IIO channel.
- * @ext_info_set:	Extended info setter.
- * @ext_info_get:	Extended info getter.
+ * @enable: Enable backend.
+ * @disable: Disable backend.
+ * @chan_enable: Enable one channel.
+ * @chan_disable: Disable one channel.
+ * @data_format_set: Configure the data format for a specific channel.
+ * @data_source_set: Configure the data source for a specific channel.
+ * @set_sample_rate: Configure the sampling rate for a specific channel.
+ * @request_buffer: Request an IIO buffer.
+ * @free_buffer: Free an IIO buffer.
+ * @extend_chan_spec: Extend an IIO channel.
+ * @ext_info_set: Extended info setter.
+ * @ext_info_get: Extended info getter.
  **/
 struct iio_backend_ops {
 	int (*enable)(struct iio_backend *back);
-- 
cgit v1.2.3


From c66eabcc1ca64dbf20d0758ce210a85fa83f4b21 Mon Sep 17 00:00:00 2001
From: Nuno Sa <nuno.sa@analog.com>
Date: Fri, 26 Apr 2024 17:42:11 +0200
Subject: iio: backend: add API for interface tuning

This is in preparation for supporting interface tuning in one for the
devices using the axi-adc backend. The new added interfaces are all
needed for that calibration:

 * iio_backend_test_pattern_set();
 * iio_backend_chan_status();
 * iio_backend_iodelay_set();
 * iio_backend_data_sample_trigger().

Interface tuning is the process of going through a set of known points
(typically by the frontend), change some clk or data delays (or both)
and send/receive some known signal (so called test patterns in this
change). The receiving end (either frontend or the backend) is
responsible for validating the signal and see if it's good or not. The
goal for all of this is to come up with ideal delays at the data
interface level so we can have a proper, more reliable data transfer.

Also note that for some devices we can change the sampling rate
(which typically means changing some reference clock) and that can
affect the data interface. In that case, it's import to run the tuning
algorithm again as the values we had before may no longer be the best (or
even valid) ones.

Signed-off-by: Nuno Sa <nuno.sa@analog.com>
Link: https://lore.kernel.org/r/20240426-ad9467-new-features-v2-2-6361fc3ba1cc@analog.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 drivers/iio/industrialio-backend.c | 86 ++++++++++++++++++++++++++++++++++++++
 include/linux/iio/backend.h        | 36 ++++++++++++++++
 2 files changed, 122 insertions(+)

(limited to 'include')

diff --git a/drivers/iio/industrialio-backend.c b/drivers/iio/industrialio-backend.c
index c27243e88462..929aff4040ed 100644
--- a/drivers/iio/industrialio-backend.c
+++ b/drivers/iio/industrialio-backend.c
@@ -226,6 +226,92 @@ int iio_backend_set_sampling_freq(struct iio_backend *back, unsigned int chan,
 }
 EXPORT_SYMBOL_NS_GPL(iio_backend_set_sampling_freq, IIO_BACKEND);
 
+/**
+ * iio_backend_test_pattern_set - Configure a test pattern
+ * @back: Backend device
+ * @chan: Channel number
+ * @pattern: Test pattern
+ *
+ * Configure a test pattern on the backend. This is typically used for
+ * calibrating the timings on the data digital interface.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_test_pattern_set(struct iio_backend *back,
+				 unsigned int chan,
+				 enum iio_backend_test_pattern pattern)
+{
+	if (pattern >= IIO_BACKEND_TEST_PATTERN_MAX)
+		return -EINVAL;
+
+	return iio_backend_op_call(back, test_pattern_set, chan, pattern);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_test_pattern_set, IIO_BACKEND);
+
+/**
+ * iio_backend_chan_status - Get the channel status
+ * @back: Backend device
+ * @chan: Channel number
+ * @error: Error indication
+ *
+ * Get the current state of the backend channel. Typically used to check if
+ * there were any errors sending/receiving data.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_chan_status(struct iio_backend *back, unsigned int chan,
+			    bool *error)
+{
+	return iio_backend_op_call(back, chan_status, chan, error);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_chan_status, IIO_BACKEND);
+
+/**
+ * iio_backend_iodelay_set - Set digital I/O delay
+ * @back: Backend device
+ * @lane: Lane number
+ * @taps: Number of taps
+ *
+ * Controls delays on sending/receiving data. One usecase for this is to
+ * calibrate the data digital interface so we get the best results when
+ * transferring data. Note that @taps has no unit since the actual delay per tap
+ * is very backend specific. Hence, frontend devices typically should go through
+ * an array of @taps (the size of that array should typically match the size of
+ * calibration points on the frontend device) and call this API.
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_iodelay_set(struct iio_backend *back, unsigned int lane,
+			    unsigned int taps)
+{
+	return iio_backend_op_call(back, iodelay_set, lane, taps);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_iodelay_set, IIO_BACKEND);
+
+/**
+ * iio_backend_data_sample_trigger - Control when to sample data
+ * @back: Backend device
+ * @trigger: Data trigger
+ *
+ * Mostly useful for input backends. Configures the backend for when to sample
+ * data (eg: rising vs falling edge).
+ *
+ * RETURNS:
+ * 0 on success, negative error number on failure.
+ */
+int iio_backend_data_sample_trigger(struct iio_backend *back,
+				    enum iio_backend_sample_trigger trigger)
+{
+	if (trigger >= IIO_BACKEND_SAMPLE_TRIGGER_MAX)
+		return -EINVAL;
+
+	return iio_backend_op_call(back, data_sample_trigger, trigger);
+}
+EXPORT_SYMBOL_NS_GPL(iio_backend_data_sample_trigger, IIO_BACKEND);
+
 static void iio_backend_free_buffer(void *arg)
 {
 	struct iio_backend_buffer_pair *pair = arg;
diff --git a/include/linux/iio/backend.h b/include/linux/iio/backend.h
index e3e62f65db14..8099759d7242 100644
--- a/include/linux/iio/backend.h
+++ b/include/linux/iio/backend.h
@@ -49,6 +49,20 @@ struct iio_backend_data_fmt {
 	bool enable;
 };
 
+/* vendor specific from 32 */
+enum iio_backend_test_pattern {
+	IIO_BACKEND_NO_TEST_PATTERN,
+	/* modified prbs9 */
+	IIO_BACKEND_ADI_PRBS_9A = 32,
+	IIO_BACKEND_TEST_PATTERN_MAX
+};
+
+enum iio_backend_sample_trigger {
+	IIO_BACKEND_SAMPLE_TRIGGER_EDGE_FALLING,
+	IIO_BACKEND_SAMPLE_TRIGGER_EDGE_RISING,
+	IIO_BACKEND_SAMPLE_TRIGGER_MAX
+};
+
 /**
  * struct iio_backend_ops - operations structure for an iio_backend
  * @enable: Enable backend.
@@ -58,6 +72,10 @@ struct iio_backend_data_fmt {
  * @data_format_set: Configure the data format for a specific channel.
  * @data_source_set: Configure the data source for a specific channel.
  * @set_sample_rate: Configure the sampling rate for a specific channel.
+ * @test_pattern_set: Configure a test pattern.
+ * @chan_status: Get the channel status.
+ * @iodelay_set: Set digital I/O delay.
+ * @data_sample_trigger: Control when to sample data.
  * @request_buffer: Request an IIO buffer.
  * @free_buffer: Free an IIO buffer.
  * @extend_chan_spec: Extend an IIO channel.
@@ -75,6 +93,15 @@ struct iio_backend_ops {
 			       enum iio_backend_data_source data);
 	int (*set_sample_rate)(struct iio_backend *back, unsigned int chan,
 			       u64 sample_rate_hz);
+	int (*test_pattern_set)(struct iio_backend *back,
+				unsigned int chan,
+				enum iio_backend_test_pattern pattern);
+	int (*chan_status)(struct iio_backend *back, unsigned int chan,
+			   bool *error);
+	int (*iodelay_set)(struct iio_backend *back, unsigned int chan,
+			   unsigned int taps);
+	int (*data_sample_trigger)(struct iio_backend *back,
+				   enum iio_backend_sample_trigger trigger);
 	struct iio_buffer *(*request_buffer)(struct iio_backend *back,
 					     struct iio_dev *indio_dev);
 	void (*free_buffer)(struct iio_backend *back,
@@ -97,6 +124,15 @@ int iio_backend_data_source_set(struct iio_backend *back, unsigned int chan,
 				enum iio_backend_data_source data);
 int iio_backend_set_sampling_freq(struct iio_backend *back, unsigned int chan,
 				  u64 sample_rate_hz);
+int iio_backend_test_pattern_set(struct iio_backend *back,
+				 unsigned int chan,
+				 enum iio_backend_test_pattern pattern);
+int iio_backend_chan_status(struct iio_backend *back, unsigned int chan,
+			    bool *error);
+int iio_backend_iodelay_set(struct iio_backend *back, unsigned int lane,
+			    unsigned int taps);
+int iio_backend_data_sample_trigger(struct iio_backend *back,
+				    enum iio_backend_sample_trigger trigger);
 int devm_iio_backend_request_buffer(struct device *dev,
 				    struct iio_backend *back,
 				    struct iio_dev *indio_dev);
-- 
cgit v1.2.3


From cfd3bfe9507b4aa39f7e86772e60b50b799e490e Mon Sep 17 00:00:00 2001
From: Dmitrii Bundin <dmitrii.bundin.a@gmail.com>
Date: Sat, 20 Apr 2024 07:24:57 +0300
Subject: bpf: Include linux/types.h for u32

Inclusion of the header linux/btf_ids.h relies on indirect inclusion of
the header linux/types.h. Including it directly on the top level helps
to avoid potential problems if linux/types.h hasn't been included
before.

The main motivation to introduce this it is to avoid similar problems that
have shown up in the bpftool where GNU libc indirectly pulls
linux/types.h causing compile error of the form:

   error: unknown type name 'u32'
                             u32 cnt;
                             ^~~

The bpftool compile error was fixed in
62248b22d01e ("tools/resolve_btfids: fix build with musl libc").

Signed-off-by: Dmitrii Bundin <dmitrii.bundin.a@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240420042457.3198883-1-dmitrii.bundin.a@gmail.com
---
 include/linux/btf_ids.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h
index e24aabfe8ecc..c0e3e1426a82 100644
--- a/include/linux/btf_ids.h
+++ b/include/linux/btf_ids.h
@@ -3,6 +3,8 @@
 #ifndef _LINUX_BTF_IDS_H
 #define _LINUX_BTF_IDS_H
 
+#include <linux/types.h> /* for u32 */
+
 struct btf_id_set {
 	u32 cnt;
 	u32 ids[];
-- 
cgit v1.2.3


From e18fa0bbcedf82aaa1db27079ef6a43e11367592 Mon Sep 17 00:00:00 2001
From: Chiara Meiohas <cmeiohas@nvidia.com>
Date: Tue, 16 Apr 2024 15:03:50 +0300
Subject: RDMA/core: Add an option to display driver-specific QPs in the
 rdmatool

Utilize the -dd flag (driver-specific details) in the rdmatool
to view driver-specific QPs which are not exposed yet.

Add the netlink attribute to mark request to convey driver details and
use it to return QP subtype as a string.

$ rdma resource show qp link ibp8s0f1
link ibp8s0f1/1 lqpn 360 type UD state RTS sq-psn 0 comm [mlx5_ib]
link ibp8s0f1/1 lqpn 0 type SMI state RTS sq-psn 0 comm [ib_core]
link ibp8s0f1/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]

$ rdma resource show qp link ibp8s0f1 -dd
link ibp8s0f1/1 lqpn 360 type UD state RTS sq-psn 0 comm [mlx5_ib]
link ibp8s0f1/1 lqpn 465 type DRIVER subtype REG_UMR state RTS sq-psn 0 comm [mlx5_ib]
link ibp8s0f1/1 lqpn 0 type SMI state RTS sq-psn 0 comm [ib_core]
link ibp8s0f1/1 lqpn 1 type GSI state RTS sq-psn 0 comm [ib_core]

$ rdma resource show
0: ibp8s0f0: pd 3 cq 4 qp 3 cm_id 0 mr 0 ctx 0 srq 2
1: ibp8s0f1: pd 3 cq 4 qp 3 cm_id 0 mr 0 ctx 0 srq 2

$ rdma resource show -dd
0: ibp8s0f0: pd 3 cq 4 qp 4 cm_id 0 mr 0 ctx 0 srq 2
1: ibp8s0f1: pd 3 cq 4 qp 4 cm_id 0 mr 0 ctx 0 srq 2

Signed-off-by: Chiara Meiohas <cmeiohas@nvidia.com>
Link: https://lore.kernel.org/r/2607bb3ddec3cae3443c2ea19e9f700825d20a98.1713268997.git.leon@kernel.org
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/core/nldev.c    | 23 +++++++++++++++++++----
 drivers/infiniband/core/restrack.c | 12 ++++++++++--
 include/rdma/restrack.h            |  7 +++++--
 include/uapi/rdma/rdma_netlink.h   |  6 ++++++
 4 files changed, 40 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 4900a0848124..bc79ee630d8d 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -137,6 +137,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING,
 					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_RES_TYPE]		= { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_RES_SUBTYPE]		= { .type = NLA_NUL_STRING,
+					.len = RDMA_NLDEV_ATTR_EMPTY_STRING },
 	[RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_RES_USECNT]		= { .type = NLA_U64 },
 	[RDMA_NLDEV_ATTR_RES_SRQ]		= { .type = NLA_NESTED },
@@ -164,6 +166,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
 	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX]	= { .type = NLA_U32 },
 	[RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 },
 	[RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE] = { .type = NLA_U8 },
+	[RDMA_NLDEV_ATTR_DRIVER_DETAILS]	= { .type = NLA_U8 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -399,7 +402,8 @@ err:
 	return -EMSGSIZE;
 }
 
-static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
+static int fill_res_info(struct sk_buff *msg, struct ib_device *device,
+			 bool show_details)
 {
 	static const char * const names[RDMA_RESTRACK_MAX] = {
 		[RDMA_RESTRACK_PD] = "pd",
@@ -424,7 +428,7 @@ static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
 	for (i = 0; i < RDMA_RESTRACK_MAX; i++) {
 		if (!names[i])
 			continue;
-		curr = rdma_restrack_count(device, i);
+		curr = rdma_restrack_count(device, i, show_details);
 		ret = fill_res_info_entry(msg, names[i], curr);
 		if (ret)
 			goto err;
@@ -1305,6 +1309,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 			      struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	bool show_details = false;
 	struct ib_device *device;
 	struct sk_buff *msg;
 	u32 index;
@@ -1320,6 +1325,9 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (!device)
 		return -EINVAL;
 
+	if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS])
+		show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]);
+
 	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 	if (!msg) {
 		ret = -ENOMEM;
@@ -1334,7 +1342,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 		goto err_free;
 	}
 
-	ret = fill_res_info(msg, device);
+	ret = fill_res_info(msg, device, show_details);
 	if (ret)
 		goto err_free;
 
@@ -1364,7 +1372,7 @@ static int _nldev_res_get_dumpit(struct ib_device *device,
 			RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET),
 			0, NLM_F_MULTI);
 
-	if (!nlh || fill_res_info(skb, device)) {
+	if (!nlh || fill_res_info(skb, device, false)) {
 		nlmsg_cancel(skb, nlh);
 		goto out;
 	}
@@ -1534,6 +1542,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 	struct rdma_restrack_entry *res;
 	struct rdma_restrack_root *rt;
 	int err, ret = 0, idx = 0;
+	bool show_details = false;
 	struct nlattr *table_attr;
 	struct nlattr *entry_attr;
 	struct ib_device *device;
@@ -1562,6 +1571,9 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 	if (!device)
 		return -EINVAL;
 
+	if (tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS])
+		show_details = nla_get_u8(tb[RDMA_NLDEV_ATTR_DRIVER_DETAILS]);
+
 	/*
 	 * If no PORT_INDEX is supplied, we will return all QPs from that device
 	 */
@@ -1599,6 +1611,9 @@ static int res_get_common_dumpit(struct sk_buff *skb,
 	 * objects.
 	 */
 	xa_for_each(&rt->xa, id, res) {
+		if (xa_get_mark(&rt->xa, res->id, RESTRACK_DD) && !show_details)
+			goto next;
+
 		if (idx < start || !rdma_restrack_get(res))
 			goto next;
 
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 438ed3588175..3313410014cd 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -59,8 +59,10 @@ void rdma_restrack_clean(struct ib_device *dev)
  * rdma_restrack_count() - the current usage of specific object
  * @dev:  IB device
  * @type: actual type of object to operate
+ * @show_details: count driver specific objects
  */
-int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
+			bool show_details)
 {
 	struct rdma_restrack_root *rt = &dev->res[type];
 	struct rdma_restrack_entry *e;
@@ -68,8 +70,11 @@ int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type)
 	u32 cnt = 0;
 
 	xa_lock(&rt->xa);
-	xas_for_each(&xas, e, U32_MAX)
+	xas_for_each(&xas, e, U32_MAX) {
+		if (xa_get_mark(&rt->xa, e->id, RESTRACK_DD) && !show_details)
+			continue;
 		cnt++;
+	}
 	xa_unlock(&rt->xa);
 	return cnt;
 }
@@ -198,6 +203,9 @@ void rdma_restrack_add(struct rdma_restrack_entry *res)
 		ret = xa_insert(&rt->xa, res->id, res, GFP_KERNEL);
 		if (ret)
 			res->id = 0;
+
+		if (qp->qp_type >= IB_QPT_DRIVER)
+			xa_set_mark(&rt->xa, res->id, RESTRACK_DD);
 	} else if (res->type == RDMA_RESTRACK_COUNTER) {
 		/* Special case to ensure that cntn points to right counter */
 		struct rdma_counter *counter;
diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h
index 8b7c46daeb07..0d69ded73bf2 100644
--- a/include/rdma/restrack.h
+++ b/include/rdma/restrack.h
@@ -14,6 +14,9 @@
 #include <uapi/rdma/rdma_netlink.h>
 #include <linux/xarray.h>
 
+/* Mark entry as containing driver specific details, it is used to provide QP subtype for now */
+#define RESTRACK_DD XA_MARK_1
+
 struct ib_device;
 struct sk_buff;
 
@@ -116,8 +119,8 @@ struct rdma_restrack_entry {
 	u32 id;
 };
 
-int rdma_restrack_count(struct ib_device *dev,
-			enum rdma_restrack_type type);
+int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type,
+			bool show_details);
 /**
  * rdma_is_kernel_res() - check the owner of resource
  * @res:  resource entry
diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index 723bbb0f7042..a214fc259f28 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -558,6 +558,12 @@ enum rdma_nldev_attr {
 
 	RDMA_NLDEV_SYS_ATTR_PRIVILEGED_QKEY_MODE, /* u8 */
 
+	RDMA_NLDEV_ATTR_DRIVER_DETAILS,		/* u8 */
+	/*
+	 * QP subtype string, used for driver QPs
+	 */
+	RDMA_NLDEV_ATTR_RES_SUBTYPE,		/* string */
+
 	/*
 	 * Always the end
 	 */
-- 
cgit v1.2.3


From 34cfe87221363d98160f74788dd060b1c43bae0d Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Fri, 26 Apr 2024 11:39:22 +0800
Subject: virtio_net: introduce device stats feature and structures

The virtio-net device stats spec:

https://github.com/oasis-tcs/virtio-spec/commit/42f389989823039724f95bbbd243291ab0064f82

We introduce the relative feature and structures.

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/virtio_net.h | 143 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index cc65ef0f3c3e..ac9174717ef1 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -56,6 +56,7 @@
 #define VIRTIO_NET_F_MQ	22	/* Device supports Receive Flow
 					 * Steering */
 #define VIRTIO_NET_F_CTRL_MAC_ADDR 23	/* Set MAC address */
+#define VIRTIO_NET_F_DEVICE_STATS 50	/* Device can provide device-level statistics. */
 #define VIRTIO_NET_F_VQ_NOTF_COAL 52	/* Device supports virtqueue notification coalescing */
 #define VIRTIO_NET_F_NOTF_COAL	53	/* Device supports notifications coalescing */
 #define VIRTIO_NET_F_GUEST_USO4	54	/* Guest can handle USOv4 in. */
@@ -406,4 +407,146 @@ struct  virtio_net_ctrl_coal_vq {
 	struct virtio_net_ctrl_coal coal;
 };
 
+/*
+ * Device Statistics
+ */
+#define VIRTIO_NET_CTRL_STATS         8
+#define VIRTIO_NET_CTRL_STATS_QUERY   0
+#define VIRTIO_NET_CTRL_STATS_GET     1
+
+struct virtio_net_stats_capabilities {
+
+#define VIRTIO_NET_STATS_TYPE_CVQ       (1ULL << 32)
+
+#define VIRTIO_NET_STATS_TYPE_RX_BASIC  (1ULL << 0)
+#define VIRTIO_NET_STATS_TYPE_RX_CSUM   (1ULL << 1)
+#define VIRTIO_NET_STATS_TYPE_RX_GSO    (1ULL << 2)
+#define VIRTIO_NET_STATS_TYPE_RX_SPEED  (1ULL << 3)
+
+#define VIRTIO_NET_STATS_TYPE_TX_BASIC  (1ULL << 16)
+#define VIRTIO_NET_STATS_TYPE_TX_CSUM   (1ULL << 17)
+#define VIRTIO_NET_STATS_TYPE_TX_GSO    (1ULL << 18)
+#define VIRTIO_NET_STATS_TYPE_TX_SPEED  (1ULL << 19)
+
+	__le64 supported_stats_types[1];
+};
+
+struct virtio_net_ctrl_queue_stats {
+	struct {
+		__le16 vq_index;
+		__le16 reserved[3];
+		__le64 types_bitmap[1];
+	} stats[1];
+};
+
+struct virtio_net_stats_reply_hdr {
+#define VIRTIO_NET_STATS_TYPE_REPLY_CVQ       32
+
+#define VIRTIO_NET_STATS_TYPE_REPLY_RX_BASIC  0
+#define VIRTIO_NET_STATS_TYPE_REPLY_RX_CSUM   1
+#define VIRTIO_NET_STATS_TYPE_REPLY_RX_GSO    2
+#define VIRTIO_NET_STATS_TYPE_REPLY_RX_SPEED  3
+
+#define VIRTIO_NET_STATS_TYPE_REPLY_TX_BASIC  16
+#define VIRTIO_NET_STATS_TYPE_REPLY_TX_CSUM   17
+#define VIRTIO_NET_STATS_TYPE_REPLY_TX_GSO    18
+#define VIRTIO_NET_STATS_TYPE_REPLY_TX_SPEED  19
+	__u8 type;
+	__u8 reserved;
+	__le16 vq_index;
+	__le16 reserved1;
+	__le16 size;
+};
+
+struct virtio_net_stats_cvq {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 command_num;
+	__le64 ok_num;
+};
+
+struct virtio_net_stats_rx_basic {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 rx_notifications;
+
+	__le64 rx_packets;
+	__le64 rx_bytes;
+
+	__le64 rx_interrupts;
+
+	__le64 rx_drops;
+	__le64 rx_drop_overruns;
+};
+
+struct virtio_net_stats_tx_basic {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 tx_notifications;
+
+	__le64 tx_packets;
+	__le64 tx_bytes;
+
+	__le64 tx_interrupts;
+
+	__le64 tx_drops;
+	__le64 tx_drop_malformed;
+};
+
+struct virtio_net_stats_rx_csum {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 rx_csum_valid;
+	__le64 rx_needs_csum;
+	__le64 rx_csum_none;
+	__le64 rx_csum_bad;
+};
+
+struct virtio_net_stats_tx_csum {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 tx_csum_none;
+	__le64 tx_needs_csum;
+};
+
+struct virtio_net_stats_rx_gso {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 rx_gso_packets;
+	__le64 rx_gso_bytes;
+	__le64 rx_gso_packets_coalesced;
+	__le64 rx_gso_bytes_coalesced;
+};
+
+struct virtio_net_stats_tx_gso {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	__le64 tx_gso_packets;
+	__le64 tx_gso_bytes;
+	__le64 tx_gso_segments;
+	__le64 tx_gso_segments_bytes;
+	__le64 tx_gso_packets_noseg;
+	__le64 tx_gso_bytes_noseg;
+};
+
+struct virtio_net_stats_rx_speed {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	/* rx_{packets,bytes}_allowance_exceeded are too long. So rename to
+	 * short name.
+	 */
+	__le64 rx_ratelimit_packets;
+	__le64 rx_ratelimit_bytes;
+};
+
+struct virtio_net_stats_tx_speed {
+	struct virtio_net_stats_reply_hdr hdr;
+
+	/* tx_{packets,bytes}_allowance_exceeded are too long. So rename to
+	 * short name.
+	 */
+	__le64 tx_ratelimit_packets;
+	__le64 tx_ratelimit_bytes;
+};
+
 #endif /* _UAPI_LINUX_VIRTIO_NET_H */
-- 
cgit v1.2.3


From 0cfe71f45f420e412fda2395807a56c453a6e0b6 Mon Sep 17 00:00:00 2001
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Fri, 26 Apr 2024 11:39:27 +0800
Subject: netdev: add queue stats

These stats are commonly. Support reporting those via netdev-genl queue
stats.

name: rx-hw-drops
name: rx-hw-drop-overruns
name: rx-csum-unnecessary
name: rx-csum-none
name: rx-csum-bad
name: rx-hw-gro-packets
name: rx-hw-gro-bytes
name: rx-hw-gro-wire-packets
name: rx-hw-gro-wire-bytes
name: rx-hw-drop-ratelimits
name: tx-hw-drops
name: tx-hw-drop-errors
name: tx-csum-none
name: tx-needs-csum
name: tx-hw-gso-packets
name: tx-hw-gso-bytes
name: tx-hw-gso-wire-packets
name: tx-hw-gso-wire-bytes
name: tx-hw-drop-ratelimits

Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 Documentation/netlink/specs/netdev.yaml | 104 ++++++++++++++++++++++++++++++++
 include/net/netdev_queues.h             |  27 +++++++++
 include/uapi/linux/netdev.h             |  19 ++++++
 net/core/netdev-genl.c                  |  23 ++++++-
 tools/include/uapi/linux/netdev.h       |  19 ++++++
 5 files changed, 190 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 679c4130707c..2be4b3714d17 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -335,6 +335,110 @@ attribute-sets:
           Allocation failure may, or may not result in a packet drop, depending
           on driver implementation and whether system recovers quickly.
         type: uint
+      -
+        name: rx-hw-drops
+        doc: |
+          Number of all packets which entered the device, but never left it,
+          including but not limited to: packets dropped due to lack of buffer
+          space, processing errors, explicit or implicit policies and packet
+          filters.
+        type: uint
+      -
+        name: rx-hw-drop-overruns
+        doc: |
+          Number of packets dropped due to transient lack of resources, such as
+          buffer space, host descriptors etc.
+        type: uint
+      -
+        name: rx-csum-unnecessary
+        doc: Number of packets that were marked as CHECKSUM_UNNECESSARY.
+        type: uint
+      -
+        name: rx-csum-none
+        doc: Number of packets that were not checksummed by device.
+        type: uint
+      -
+        name: rx-csum-bad
+        doc: |
+          Number of packets with bad checksum. The packets are not discarded,
+          but still delivered to the stack.
+        type: uint
+      -
+        name: rx-hw-gro-packets
+        doc: |
+          Number of packets that were coalesced from smaller packets by the device.
+          Counts only packets coalesced with the HW-GRO netdevice feature,
+          LRO-coalesced packets are not counted.
+        type: uint
+      -
+        name: rx-hw-gro-bytes
+        doc: See `rx-hw-gro-packets`.
+        type: uint
+      -
+        name: rx-hw-gro-wire-packets
+        doc: |
+          Number of packets that were coalesced to bigger packetss with the HW-GRO
+          netdevice feature. LRO-coalesced packets are not counted.
+        type: uint
+      -
+        name: rx-hw-gro-wire-bytes
+        doc: See `rx-hw-gro-wire-packets`.
+        type: uint
+      -
+        name: rx-hw-drop-ratelimits
+        doc: |
+          Number of the packets dropped by the device due to the received
+          packets bitrate exceeding the device rate limit.
+        type: uint
+      -
+        name: tx-hw-drops
+        doc: |
+          Number of packets that arrived at the device but never left it,
+          encompassing packets dropped for reasons such as processing errors, as
+          well as those affected by explicitly defined policies and packet
+          filtering criteria.
+        type: uint
+      -
+        name: tx-hw-drop-errors
+        doc: Number of packets dropped because they were invalid or malformed.
+        type: uint
+      -
+        name: tx-csum-none
+        doc: |
+          Number of packets that did not require the device to calculate the
+          checksum.
+        type: uint
+      -
+        name: tx-needs-csum
+        doc: |
+          Number of packets that required the device to calculate the checksum.
+        type: uint
+      -
+        name: tx-hw-gso-packets
+        doc: |
+          Number of packets that necessitated segmentation into smaller packets
+          by the device.
+        type: uint
+      -
+        name: tx-hw-gso-bytes
+        doc: See `tx-hw-gso-packets`.
+        type: uint
+      -
+        name: tx-hw-gso-wire-packets
+        doc: |
+          Number of wire-sized packets generated by processing
+          `tx-hw-gso-packets`
+        type: uint
+      -
+        name: tx-hw-gso-wire-bytes
+        doc: See `tx-hw-gso-wire-packets`.
+        type: uint
+      -
+        name: tx-hw-drop-ratelimits
+        doc: |
+          Number of the packets dropped by the device due to the transmit
+          packets bitrate exceeding the device rate limit.
+        type: uint
 
 operations:
   list:
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index 1ec408585373..c7ac4539eafc 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -9,11 +9,38 @@ struct netdev_queue_stats_rx {
 	u64 bytes;
 	u64 packets;
 	u64 alloc_fail;
+
+	u64 hw_drops;
+	u64 hw_drop_overruns;
+
+	u64 csum_unnecessary;
+	u64 csum_none;
+	u64 csum_bad;
+
+	u64 hw_gro_packets;
+	u64 hw_gro_bytes;
+	u64 hw_gro_wire_packets;
+	u64 hw_gro_wire_bytes;
+
+	u64 hw_drop_ratelimits;
 };
 
 struct netdev_queue_stats_tx {
 	u64 bytes;
 	u64 packets;
+
+	u64 hw_drops;
+	u64 hw_drop_errors;
+
+	u64 csum_none;
+	u64 needs_csum;
+
+	u64 hw_gso_packets;
+	u64 hw_gso_bytes;
+	u64 hw_gso_wire_packets;
+	u64 hw_gso_wire_bytes;
+
+	u64 hw_drop_ratelimits;
 };
 
 /**
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index bb65ee840cda..cf24f1d9adf8 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -146,6 +146,25 @@ enum {
 	NETDEV_A_QSTATS_TX_PACKETS,
 	NETDEV_A_QSTATS_TX_BYTES,
 	NETDEV_A_QSTATS_RX_ALLOC_FAIL,
+	NETDEV_A_QSTATS_RX_HW_DROPS,
+	NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS,
+	NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY,
+	NETDEV_A_QSTATS_RX_CSUM_NONE,
+	NETDEV_A_QSTATS_RX_CSUM_BAD,
+	NETDEV_A_QSTATS_RX_HW_GRO_PACKETS,
+	NETDEV_A_QSTATS_RX_HW_GRO_BYTES,
+	NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS,
+	NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES,
+	NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS,
+	NETDEV_A_QSTATS_TX_HW_DROPS,
+	NETDEV_A_QSTATS_TX_HW_DROP_ERRORS,
+	NETDEV_A_QSTATS_TX_CSUM_NONE,
+	NETDEV_A_QSTATS_TX_NEEDS_CSUM,
+	NETDEV_A_QSTATS_TX_HW_GSO_PACKETS,
+	NETDEV_A_QSTATS_TX_HW_GSO_BYTES,
+	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS,
+	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES,
+	NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index dd6510f2c652..4b5054087309 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -489,7 +489,17 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
 {
 	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
 	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
-	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail))
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))
 		return -EMSGSIZE;
 	return 0;
 }
@@ -498,7 +508,16 @@ static int
 netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
 {
 	if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
-	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes))
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits))
 		return -EMSGSIZE;
 	return 0;
 }
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index bb65ee840cda..cf24f1d9adf8 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -146,6 +146,25 @@ enum {
 	NETDEV_A_QSTATS_TX_PACKETS,
 	NETDEV_A_QSTATS_TX_BYTES,
 	NETDEV_A_QSTATS_RX_ALLOC_FAIL,
+	NETDEV_A_QSTATS_RX_HW_DROPS,
+	NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS,
+	NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY,
+	NETDEV_A_QSTATS_RX_CSUM_NONE,
+	NETDEV_A_QSTATS_RX_CSUM_BAD,
+	NETDEV_A_QSTATS_RX_HW_GRO_PACKETS,
+	NETDEV_A_QSTATS_RX_HW_GRO_BYTES,
+	NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS,
+	NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES,
+	NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS,
+	NETDEV_A_QSTATS_TX_HW_DROPS,
+	NETDEV_A_QSTATS_TX_HW_DROP_ERRORS,
+	NETDEV_A_QSTATS_TX_CSUM_NONE,
+	NETDEV_A_QSTATS_TX_NEEDS_CSUM,
+	NETDEV_A_QSTATS_TX_HW_GSO_PACKETS,
+	NETDEV_A_QSTATS_TX_HW_GSO_BYTES,
+	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS,
+	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES,
+	NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
-- 
cgit v1.2.3


From f8193e21c72c1978d689e2dc86227f418341eff1 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Fri, 18 Aug 2023 13:12:48 +0100
Subject: media: cec: cec.h: 2.1 ms -> 2100 ms

The transfer timeout is 2100 ms, not 2.1 ms. Fix this in the
kerneldoc comment.

Link: https://lore.kernel.org/linux-media/17cd1a67-3966-237c-2e0d-2d3ae618f915@xs4all.nl
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 include/media/cec.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/media/cec.h b/include/media/cec.h
index 10c9cf6058b7..c8350258f10a 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -173,7 +173,7 @@ struct cec_adap_ops {
  *			case the transmit will finish, but will not retransmit
  *			and be marked as ABORTED.
  * @xfer_timeout_ms:	the transfer timeout in ms.
- *			If 0, then timeout after 2.1 ms.
+ *			If 0, then timeout after 2100 ms.
  * @kthread_config:	kthread used to configure a CEC adapter
  * @config_completion:	used to signal completion of the config kthread
  * @kthread:		main CEC processing thread
-- 
cgit v1.2.3


From 784c46f5467c3cd516e518de211e26611da5c0fb Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Sun, 28 Apr 2024 14:07:28 +0800
Subject: net/smc: decouple ism_client from SMC-D DMB registration

The struct 'ism_client' is specialized for s390 platform firmware ISM.
So replace it with 'void' to make SMCD DMB registration helper generic
for both Emulated-ISM and existing ISM.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-and-tested-by: Jan Karcher <jaka@linux.ibm.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/s390/net/ism_drv.c | 2 +-
 include/net/smc.h          | 4 ++--
 net/smc/smc_ism.c          | 7 ++-----
 3 files changed, 5 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c
index 43778b088ffa..e36e3ea165d3 100644
--- a/drivers/s390/net/ism_drv.c
+++ b/drivers/s390/net/ism_drv.c
@@ -745,7 +745,7 @@ static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid,
 }
 
 static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
-			     struct ism_client *client)
+			     void *client)
 {
 	return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client);
 }
diff --git a/include/net/smc.h b/include/net/smc.h
index 10684d0a33df..542d12372c18 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -47,7 +47,6 @@ struct smcd_dmb {
 #define ISM_ERROR	0xFFFF
 
 struct smcd_dev;
-struct ism_client;
 
 struct smcd_gid {
 	u64	gid;
@@ -58,7 +57,7 @@ struct smcd_ops {
 	int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid,
 				u32 vid_valid, u32 vid);
 	int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb,
-			    struct ism_client *client);
+			    void *client);
 	int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
 	int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
 	int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
@@ -78,6 +77,7 @@ struct smcd_ops {
 struct smcd_dev {
 	const struct smcd_ops *ops;
 	void *priv;
+	void *client;
 	struct list_head list;
 	spinlock_t lock;
 	struct smc_connection **conn;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index ac88de2a06a0..051726586730 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -222,7 +222,6 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
 int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
 			 struct smc_buf_desc *dmb_desc)
 {
-#if IS_ENABLED(CONFIG_ISM)
 	struct smcd_dmb dmb;
 	int rc;
 
@@ -231,7 +230,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
 	dmb.sba_idx = dmb_desc->sba_idx;
 	dmb.vlan_id = lgr->vlan_id;
 	dmb.rgid = lgr->peer_gid.gid;
-	rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client);
+	rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client);
 	if (!rc) {
 		dmb_desc->sba_idx = dmb.sba_idx;
 		dmb_desc->token = dmb.dmb_tok;
@@ -240,9 +239,6 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
 		dmb_desc->len = dmb.dmb_len;
 	}
 	return rc;
-#else
-	return 0;
-#endif
 }
 
 static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
@@ -446,6 +442,7 @@ static void smcd_register_dev(struct ism_dev *ism)
 	if (!smcd)
 		return;
 	smcd->priv = ism;
+	smcd->client = &smc_ism_client;
 	ism_set_priv(ism, &smc_ism_client, smcd);
 	if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid))
 		smc_pnetid_by_table_smcd(smcd);
-- 
cgit v1.2.3


From d1d8d0b6c7c68b0665456831fa779174ebd78f90 Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Sun, 28 Apr 2024 14:07:32 +0800
Subject: net/smc: mark optional smcd_ops and check for support when called

Some operations are not supported by new introduced Emulated-ISM, so
mark them as optional and check if the device supports them when called.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-and-tested-by: Jan Karcher <jaka@linux.ibm.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/smc.h | 14 ++++++++------
 net/smc/smc_ism.c |  9 ++++++++-
 2 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index 542d12372c18..33b753115e43 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -59,12 +59,6 @@ struct smcd_ops {
 	int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb,
 			    void *client);
 	int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
-	int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
-	int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
-	int (*set_vlan_required)(struct smcd_dev *dev);
-	int (*reset_vlan_required)(struct smcd_dev *dev);
-	int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
-			    u32 trigger_irq, u32 event_code, u64 info);
 	int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
 			 bool sf, unsigned int offset, void *data,
 			 unsigned int size);
@@ -72,6 +66,14 @@ struct smcd_ops {
 	void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid);
 	u16 (*get_chid)(struct smcd_dev *dev);
 	struct device* (*get_dev)(struct smcd_dev *dev);
+
+	/* optional operations */
+	int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
+	int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
+	int (*set_vlan_required)(struct smcd_dev *dev);
+	int (*reset_vlan_required)(struct smcd_dev *dev);
+	int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
+			    u32 trigger_irq, u32 event_code, u64 info);
 };
 
 struct smcd_dev {
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 051726586730..36459a3c739d 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -126,6 +126,8 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
 
 	if (!vlanid)			/* No valid vlan id */
 		return -EINVAL;
+	if (!smcd->ops->add_vlan_id)
+		return -EOPNOTSUPP;
 
 	/* create new vlan entry, in case we need it */
 	new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL);
@@ -171,6 +173,8 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
 
 	if (!vlanid)			/* No valid vlan id */
 		return -EINVAL;
+	if (!smcd->ops->del_vlan_id)
+		return -EOPNOTSUPP;
 
 	spin_lock_irqsave(&smcd->lock, flags);
 	list_for_each_entry(vlan, &smcd->vlan, list) {
@@ -368,7 +372,8 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
 		smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id);
 		break;
 	case ISM_EVENT_CODE_TESTLINK:	/* Activity timer */
-		if (ev_info.code == ISM_EVENT_REQUEST) {
+		if (ev_info.code == ISM_EVENT_REQUEST &&
+		    wrk->smcd->ops->signal_event) {
 			ev_info.code = ISM_EVENT_RESPONSE;
 			wrk->smcd->ops->signal_event(wrk->smcd,
 						     &peer_gid,
@@ -538,6 +543,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr)
 
 	if (lgr->peer_shutdown)
 		return 0;
+	if (!lgr->smcd->ops->signal_event)
+		return 0;
 
 	memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
 	ev_info.vlan_id = lgr->vlan_id;
-- 
cgit v1.2.3


From 4398888268582cb51b69c6ee94f551bb8d37d12f Mon Sep 17 00:00:00 2001
From: Wen Gu <guwen@linux.alibaba.com>
Date: Sun, 28 Apr 2024 14:07:35 +0800
Subject: net/smc: add operations to merge sndbuf with peer DMB

In some scenarios using Emulated-ISM device, sndbuf can share the same
physical memory region with peer DMB to avoid data copy from one side
to the other. In such case the sndbuf is only a descriptor that
describes the shared memory and does not actually occupy memory, it's
more like a ghost buffer.

      +----------+                     +----------+
      | socket A |                     | socket B |
      +----------+                     +----------+
            |                               |
       +--------+                       +--------+
       | sndbuf |                       |  DMB   |
       |  desc  |                       |  desc  |
       +--------+                       +--------+
            |                               |
            |                          +----v-----+
            +-------------------------->  memory  |
                                       +----------+

So here introduces three new SMC-D device operations to check if this
feature is supported by device, and to {attach|detach} ghost sndbuf to
peer DMB. For now only loopback-ism supports this.

Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
Reviewed-and-tested-by: Jan Karcher <jaka@linux.ibm.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/smc.h |  3 +++
 net/smc/smc_ism.c | 40 ++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_ism.h |  4 ++++
 3 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/include/net/smc.h b/include/net/smc.h
index 33b753115e43..db84e4e35080 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -74,6 +74,9 @@ struct smcd_ops {
 	int (*reset_vlan_required)(struct smcd_dev *dev);
 	int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
 			    u32 trigger_irq, u32 event_code, u64 info);
+	int (*support_dmb_nocopy)(struct smcd_dev *dev);
+	int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
+	int (*detach_dmb)(struct smcd_dev *dev, u64 token);
 };
 
 struct smcd_dev {
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 6bed0a61b746..84f98e18c7db 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -250,6 +250,46 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
 	return rc;
 }
 
+bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd)
+{
+	/* for now only loopback-ism supports
+	 * merging sndbuf with peer DMB to avoid
+	 * data copies between them.
+	 */
+	return (smcd->ops->support_dmb_nocopy &&
+		smcd->ops->support_dmb_nocopy(smcd));
+}
+
+int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
+		       struct smc_buf_desc *dmb_desc)
+{
+	struct smcd_dmb dmb;
+	int rc = 0;
+
+	if (!dev->ops->attach_dmb)
+		return -EINVAL;
+
+	memset(&dmb, 0, sizeof(dmb));
+	dmb.dmb_tok = token;
+	rc = dev->ops->attach_dmb(dev, &dmb);
+	if (!rc) {
+		dmb_desc->sba_idx = dmb.sba_idx;
+		dmb_desc->token = dmb.dmb_tok;
+		dmb_desc->cpu_addr = dmb.cpu_addr;
+		dmb_desc->dma_addr = dmb.dma_addr;
+		dmb_desc->len = dmb.dmb_len;
+	}
+	return rc;
+}
+
+int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token)
+{
+	if (!dev->ops->detach_dmb)
+		return -EINVAL;
+
+	return dev->ops->detach_dmb(dev, token);
+}
+
 static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
 				  struct sk_buff *skb,
 				  struct netlink_callback *cb)
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
index e6f57e5e1ef9..6763133dd8d0 100644
--- a/net/smc/smc_ism.h
+++ b/net/smc/smc_ism.h
@@ -48,6 +48,10 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
 int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
 			 struct smc_buf_desc *dmb_desc);
 int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd);
+int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
+		       struct smc_buf_desc *dmb_desc);
+int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token);
 int smc_ism_signal_shutdown(struct smc_link_group *lgr);
 void smc_ism_get_system_eid(u8 **eid);
 u16 smc_ism_get_chid(struct smcd_dev *dev);
-- 
cgit v1.2.3


From 5cd9fac3a369edd072e8e72a2b5f5abae57c97ed Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Sun, 28 Apr 2024 15:51:12 +0100
Subject: net: sfp-bus: constify link_modes to sfp_select_interface()

sfp_select_interface() does not modify its link_modes argument, so
make this a const pointer.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Daniel Machon <daniel.machon@microchip.com>
Link: https://lore.kernel.org/r/E1s15s0-00AHyq-8E@rmk-PC.armlinux.org.uk
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/phy/sfp-bus.c | 2 +-
 include/linux/sfp.h       | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index c6e3baf00f23..37c85f1e6534 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -355,7 +355,7 @@ EXPORT_SYMBOL_GPL(sfp_parse_support);
  * modes mask.
  */
 phy_interface_t sfp_select_interface(struct sfp_bus *bus,
-				     unsigned long *link_modes)
+				     const unsigned long *link_modes)
 {
 	if (phylink_test(link_modes, 25000baseCR_Full) ||
 	    phylink_test(link_modes, 25000baseKR_Full) ||
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 55c0ab17c9e2..5ebc57f78c95 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -554,7 +554,7 @@ bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id);
 void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id,
 		       unsigned long *support, unsigned long *interfaces);
 phy_interface_t sfp_select_interface(struct sfp_bus *bus,
-				     unsigned long *link_modes);
+				     const unsigned long *link_modes);
 
 int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo);
 int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee,
@@ -593,7 +593,7 @@ static inline void sfp_parse_support(struct sfp_bus *bus,
 }
 
 static inline phy_interface_t sfp_select_interface(struct sfp_bus *bus,
-						   unsigned long *link_modes)
+						const unsigned long *link_modes)
 {
 	return PHY_INTERFACE_MODE_NA;
 }
-- 
cgit v1.2.3


From 940325add1c54e3018277d6d783ec419262729e8 Mon Sep 17 00:00:00 2001
From: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Date: Wed, 13 Mar 2024 12:42:17 +0530
Subject: cxl/mbox: Add Get Log Capabilities and Get Supported Logs Sub-List
 commands

Adding UAPI support for
1. CXL r3.1 8.2.9.5.3 Get Log Capabilities.
2. CXL r3.1 8.2.9.5.6 Get Supported Logs Sub-List.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20240313071218.729-2-sthanneeru.opensrc@micron.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/mbox.c      | 2 ++
 drivers/cxl/cxlmem.h         | 2 ++
 include/uapi/linux/cxl_mem.h | 2 ++
 3 files changed, 6 insertions(+)

(limited to 'include')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 65185c9fa001..dbfee67b6319 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -56,6 +56,8 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
 	CXL_CMD(GET_LSA, 0x8, CXL_VARIABLE_PAYLOAD, 0),
 	CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
 	CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
+	CXL_CMD(GET_LOG_CAPS, 0x10, 0x4, 0),
+	CXL_CMD(GET_SUP_LOG_SUBLIST, 0x2, CXL_VARIABLE_PAYLOAD, 0),
 	CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0),
 	CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0),
 	CXL_CMD(GET_ALERT_CONFIG, 0, 0x10, 0),
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 36cee9c30ceb..39e06bf45a43 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -527,6 +527,8 @@ enum cxl_opcode {
 	CXL_MBOX_OP_SET_TIMESTAMP	= 0x0301,
 	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
 	CXL_MBOX_OP_GET_LOG		= 0x0401,
+	CXL_MBOX_OP_GET_LOG_CAPS	= 0x0402,
+	CXL_MBOX_OP_GET_SUP_LOG_SUBLIST = 0x0405,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
 	CXL_MBOX_OP_GET_PARTITION_INFO	= 0x4100,
 	CXL_MBOX_OP_SET_PARTITION_INFO	= 0x4101,
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 42066f4eb890..49c25056c222 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -47,6 +47,8 @@
 	___DEPRECATED(SCAN_MEDIA, "Scan Media"),                          \
 	___DEPRECATED(GET_SCAN_MEDIA, "Get Scan Media Results"),          \
 	___C(GET_TIMESTAMP, "Get Timestamp"),                             \
+	___C(GET_LOG_CAPS, "Get Log Capabilities"),			  \
+	___C(GET_SUP_LOG_SUBLIST, "Get Supported Logs Sub-List"),	  \
 	___C(MAX, "invalid / last command")
 
 #define ___C(a, b) CXL_MEM_COMMAND_ID_##a
-- 
cgit v1.2.3


From 206f9fa9d55592c8cea0ccf84a5242b7c7cf3748 Mon Sep 17 00:00:00 2001
From: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Date: Wed, 13 Mar 2024 12:42:18 +0530
Subject: cxl/mbox: Add Clear Log mailbox command

Adding UAPI support for CXL r3.1 8.2.9.5.4
Clear Log command.

This proposed patch will be useful for clearing and populating
the Vendor debug log in certain scenarios, allowing for the
aggregation of results over time.

Signed-off-by: Srinivasulu Thanneeru <sthanneeru.opensrc@micron.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/20240313071218.729-3-sthanneeru.opensrc@micron.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/mbox.c      | 10 ++++++++++
 drivers/cxl/cxlmem.h         |  1 +
 include/uapi/linux/cxl_mem.h |  1 +
 3 files changed, 12 insertions(+)

(limited to 'include')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index dbfee67b6319..0c6216d1c586 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -57,6 +57,7 @@ static struct cxl_mem_command cxl_mem_commands[CXL_MEM_COMMAND_ID_MAX] = {
 	CXL_CMD(GET_HEALTH_INFO, 0, 0x12, 0),
 	CXL_CMD(GET_LOG, 0x18, CXL_VARIABLE_PAYLOAD, CXL_CMD_FLAG_FORCE_ENABLE),
 	CXL_CMD(GET_LOG_CAPS, 0x10, 0x4, 0),
+	CXL_CMD(CLEAR_LOG, 0x10, 0, 0),
 	CXL_CMD(GET_SUP_LOG_SUBLIST, 0x2, CXL_VARIABLE_PAYLOAD, 0),
 	CXL_CMD(SET_PARTITION_INFO, 0x0a, 0, 0),
 	CXL_CMD(SET_LSA, CXL_VARIABLE_PAYLOAD, 0, 0),
@@ -333,6 +334,15 @@ static bool cxl_payload_from_user_allowed(u16 opcode, void *payload_in)
 			return false;
 		break;
 	}
+	case CXL_MBOX_OP_CLEAR_LOG: {
+		const uuid_t *uuid = (uuid_t *)payload_in;
+
+		/*
+		 * Restrict the ‘Clear log’ action to only apply to
+		 * Vendor debug logs.
+		 */
+		return uuid_equal(uuid, &DEFINE_CXL_VENDOR_DEBUG_UUID);
+	}
 	default:
 		break;
 	}
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 39e06bf45a43..19aba81cdf13 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -528,6 +528,7 @@ enum cxl_opcode {
 	CXL_MBOX_OP_GET_SUPPORTED_LOGS	= 0x0400,
 	CXL_MBOX_OP_GET_LOG		= 0x0401,
 	CXL_MBOX_OP_GET_LOG_CAPS	= 0x0402,
+	CXL_MBOX_OP_CLEAR_LOG           = 0x0403,
 	CXL_MBOX_OP_GET_SUP_LOG_SUBLIST = 0x0405,
 	CXL_MBOX_OP_IDENTIFY		= 0x4000,
 	CXL_MBOX_OP_GET_PARTITION_INFO	= 0x4100,
diff --git a/include/uapi/linux/cxl_mem.h b/include/uapi/linux/cxl_mem.h
index 49c25056c222..c6c0fe27495d 100644
--- a/include/uapi/linux/cxl_mem.h
+++ b/include/uapi/linux/cxl_mem.h
@@ -48,6 +48,7 @@
 	___DEPRECATED(GET_SCAN_MEDIA, "Get Scan Media Results"),          \
 	___C(GET_TIMESTAMP, "Get Timestamp"),                             \
 	___C(GET_LOG_CAPS, "Get Log Capabilities"),			  \
+	___C(CLEAR_LOG, "Clear Log"),					  \
 	___C(GET_SUP_LOG_SUBLIST, "Get Supported Logs Sub-List"),	  \
 	___C(MAX, "invalid / last command")
 
-- 
cgit v1.2.3


From faba7db431294e0684d08a51b1f04cda75473d93 Mon Sep 17 00:00:00 2001
From: Olivia Wen <olivia.wen@mediatek.com>
Date: Tue, 30 Apr 2024 09:15:34 +0800
Subject: remoteproc: mediatek: Add IMGSYS IPI command

Add an IPI command definition for communication with IMGSYS through
SCP mailbox.

Signed-off-by: Olivia Wen <olivia.wen@mediatek.com>
Reviewed-by: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Link: https://lore.kernel.org/r/20240430011534.9587-5-olivia.wen@mediatek.com
Signed-off-by: Mathieu Poirier <mathieu.poirier@linaro.org>
---
 include/linux/remoteproc/mtk_scp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/remoteproc/mtk_scp.h b/include/linux/remoteproc/mtk_scp.h
index 7c2b7cc9fe6c..344ff41c22c7 100644
--- a/include/linux/remoteproc/mtk_scp.h
+++ b/include/linux/remoteproc/mtk_scp.h
@@ -43,6 +43,7 @@ enum scp_ipi_id {
 	SCP_IPI_CROS_HOST_CMD,
 	SCP_IPI_VDEC_LAT,
 	SCP_IPI_VDEC_CORE,
+	SCP_IPI_IMGSYS_CMD,
 	SCP_IPI_NS_SERVICE = 0xFF,
 	SCP_IPI_MAX = 0x100,
 };
-- 
cgit v1.2.3


From 535a3692ba7245792e6f23654507865d4293c850 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Tue, 30 Apr 2024 13:28:24 +0200
Subject: bpf: Add support for kprobe session attach

Adding support to attach bpf program for entry and return probe
of the same function. This is common use case which at the moment
requires to create two kprobe multi links.

Adding new BPF_TRACE_KPROBE_SESSION attach type that instructs
kernel to attach single link program to both entry and exit probe.

It's possible to control execution of the bpf program on return
probe simply by returning zero or non zero from the entry bpf
program execution to execute or not the bpf program on return
probe respectively.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Link: https://lore.kernel.org/bpf/20240430112830.1184228-2-jolsa@kernel.org
---
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  7 ++++++-
 kernel/trace/bpf_trace.c       | 28 ++++++++++++++++++++--------
 tools/include/uapi/linux/bpf.h |  1 +
 4 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94a72593ead..90706a47f6ff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1115,6 +1115,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_GETSOCKNAME,
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
+	BPF_TRACE_KPROBE_SESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f655adf42e39..13ad74ecf2cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4016,11 +4016,15 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 		if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI &&
 		    attach_type != BPF_TRACE_KPROBE_MULTI)
 			return -EINVAL;
+		if (prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION &&
+		    attach_type != BPF_TRACE_KPROBE_SESSION)
+			return -EINVAL;
 		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
 		    attach_type != BPF_TRACE_UPROBE_MULTI)
 			return -EINVAL;
 		if (attach_type != BPF_PERF_EVENT &&
 		    attach_type != BPF_TRACE_KPROBE_MULTI &&
+		    attach_type != BPF_TRACE_KPROBE_SESSION &&
 		    attach_type != BPF_TRACE_UPROBE_MULTI)
 			return -EINVAL;
 		return 0;
@@ -5281,7 +5285,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
 	case BPF_PROG_TYPE_KPROBE:
 		if (attr->link_create.attach_type == BPF_PERF_EVENT)
 			ret = bpf_perf_link_attach(attr, prog);
-		else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI)
+		else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
+			 attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
 			ret = bpf_kprobe_multi_link_attach(attr, prog);
 		else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
 			ret = bpf_uprobe_multi_link_attach(attr, prog);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0ba722b57af3..06a9671834b6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1631,6 +1631,17 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+static bool is_kprobe_multi(const struct bpf_prog *prog)
+{
+	return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ||
+	       prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
+static inline bool is_kprobe_session(const struct bpf_prog *prog)
+{
+	return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
+}
+
 static const struct bpf_func_proto *
 kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1646,13 +1657,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_override_return_proto;
 #endif
 	case BPF_FUNC_get_func_ip:
-		if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+		if (is_kprobe_multi(prog))
 			return &bpf_get_func_ip_proto_kprobe_multi;
 		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
 			return &bpf_get_func_ip_proto_uprobe_multi;
 		return &bpf_get_func_ip_proto_kprobe;
 	case BPF_FUNC_get_attach_cookie:
-		if (prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI)
+		if (is_kprobe_multi(prog))
 			return &bpf_get_attach_cookie_proto_kmulti;
 		if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
 			return &bpf_get_attach_cookie_proto_umulti;
@@ -2834,10 +2845,11 @@ kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
 			  void *data)
 {
 	struct bpf_kprobe_multi_link *link;
+	int err;
 
 	link = container_of(fp, struct bpf_kprobe_multi_link, fp);
-	kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
-	return 0;
+	err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+	return is_kprobe_session(link->link.prog) ? err : 0;
 }
 
 static void
@@ -2981,7 +2993,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (sizeof(u64) != sizeof(void *))
 		return -EOPNOTSUPP;
 
-	if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+	if (!is_kprobe_multi(prog))
 		return -EINVAL;
 
 	flags = attr->link_create.kprobe_multi.flags;
@@ -3062,10 +3074,10 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
 	if (err)
 		goto error;
 
-	if (flags & BPF_F_KPROBE_MULTI_RETURN)
-		link->fp.exit_handler = kprobe_multi_link_exit_handler;
-	else
+	if (!(flags & BPF_F_KPROBE_MULTI_RETURN))
 		link->fp.entry_handler = kprobe_multi_link_handler;
+	if ((flags & BPF_F_KPROBE_MULTI_RETURN) || is_kprobe_session(prog))
+		link->fp.exit_handler = kprobe_multi_link_exit_handler;
 
 	link->addrs = addrs;
 	link->cookies = cookies;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d94a72593ead..90706a47f6ff 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1115,6 +1115,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_UNIX_GETSOCKNAME,
 	BPF_NETKIT_PRIMARY,
 	BPF_NETKIT_PEER,
+	BPF_TRACE_KPROBE_SESSION,
 	__MAX_BPF_ATTACH_TYPE
 };
 
-- 
cgit v1.2.3


From 38ae7142e35165571123997addd71393a9dabde5 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Sun, 17 Mar 2024 11:40:37 +0100
Subject: pwm: Make pwmchip_[sg]et_drvdata() a wrapper around dev_set_drvdata()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that a pwm_chip has a dedicated struct device, pwmchip_set_drvdata()
and pwmchip_get_drvdata() can be made thin wrappers around
dev_set_drvdata() and dev_get_drvdata() respectively and the previously
needed pointer can be dropped from struct pwm_chip.

Link: https://lore.kernel.org/r/a5e05bd2d83421a26fdef6a87d69253c0f98becf.1710670958.git.u.kleine-koenig@pengutronix.de
Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
---
 include/linux/pwm.h | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index 495e23761f34..60b92c2c75ef 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -274,7 +274,6 @@ struct pwm_ops {
  * @of_xlate: request a PWM device given a device tree PWM specifier
  * @atomic: can the driver's ->apply() be called in atomic context
  * @uses_pwmchip_alloc: signals if pwmchip_allow was used to allocate this chip
- * @driver_data: Private pointer for driver specific info
  * @pwms: array of PWM devices allocated by the framework
  */
 struct pwm_chip {
@@ -290,7 +289,6 @@ struct pwm_chip {
 
 	/* only used internally by the PWM framework */
 	bool uses_pwmchip_alloc;
-	void *driver_data;
 	struct pwm_device pwms[] __counted_by(npwm);
 };
 
@@ -301,20 +299,12 @@ static inline struct device *pwmchip_parent(const struct pwm_chip *chip)
 
 static inline void *pwmchip_get_drvdata(struct pwm_chip *chip)
 {
-	/*
-	 * After pwm_chip got a dedicated struct device, this can be replaced by
-	 * dev_get_drvdata(&chip->dev);
-	 */
-	return chip->driver_data;
+	return dev_get_drvdata(&chip->dev);
 }
 
 static inline void pwmchip_set_drvdata(struct pwm_chip *chip, void *data)
 {
-	/*
-	 * After pwm_chip got a dedicated struct device, this can be replaced by
-	 * dev_set_drvdata(&chip->dev, data);
-	 */
-	chip->driver_data = data;
+	dev_set_drvdata(&chip->dev, data);
 }
 
 #if IS_ENABLED(CONFIG_PWM)
-- 
cgit v1.2.3


From 26f812ba75890c48644a31e3cfe3dd9762138968 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 29 Apr 2024 12:43:41 -0700
Subject: kunit/fortify: Add memcpy() tests

Add fortify tests for memcpy() and memmove(). This can use a similar
method to the fortify_panic() replacement, only we can do it for what
was the WARN_ONCE(), which can be redefined.

Since this is primarily testing the fortify behaviors of the memcpy()
and memmove() defenses, the tests for memcpy() and memmove() are
identical.

Link: https://lore.kernel.org/r/20240429194342.2421639-3-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h |  6 ++-
 lib/fortify_kunit.c            | 85 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 87 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index 6aeebe0a6777..a0bb13825109 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -15,10 +15,14 @@
 #define FORTIFY_REASON(func, write)	(FIELD_PREP(BIT(0), write) | \
 					 FIELD_PREP(GENMASK(7, 1), func))
 
+/* Overridden by KUnit tests. */
 #ifndef fortify_panic
 # define fortify_panic(func, write, avail, size, retfail)	\
 	 __fortify_panic(FORTIFY_REASON(func, write), avail, size)
 #endif
+#ifndef fortify_warn_once
+# define fortify_warn_once(x...)	WARN_ONCE(x)
+#endif
 
 #define FORTIFY_READ		 0
 #define FORTIFY_WRITE		 1
@@ -609,7 +613,7 @@ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
 	const size_t __q_size = (q_size);				\
 	const size_t __p_size_field = (p_size_field);			\
 	const size_t __q_size_field = (q_size_field);			\
-	WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size,		\
+	fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size,	\
 				     __q_size, __p_size_field,		\
 				     __q_size_field, FORTIFY_FUNC_ ##op), \
 		  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c
index 601fa327c5b7..ef3e4c68b759 100644
--- a/lib/fortify_kunit.c
+++ b/lib/fortify_kunit.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Runtime test cases for CONFIG_FORTIFY_SOURCE. For testing memcpy(),
- * see FORTIFY_MEM_* tests in LKDTM (drivers/misc/lkdtm/fortify.c).
+ * Runtime test cases for CONFIG_FORTIFY_SOURCE. For additional memcpy()
+ * testing see FORTIFY_MEM_* tests in LKDTM (drivers/misc/lkdtm/fortify.c).
  *
  * For corner cases with UBSAN, try testing with:
  *
@@ -18,8 +18,10 @@
 /* We don't need to fill dmesg with the fortify WARNs during testing. */
 #ifdef DEBUG
 # define FORTIFY_REPORT_KUNIT(x...) __fortify_report(x)
+# define FORTIFY_WARN_KUNIT(x...)   WARN_ONCE(x)
 #else
 # define FORTIFY_REPORT_KUNIT(x...) do { } while (0)
+# define FORTIFY_WARN_KUNIT(x...)   do { } while (0)
 #endif
 
 /* Redefine fortify_panic() to track failures. */
@@ -30,6 +32,14 @@ void fortify_add_kunit_error(int write);
 	return (retfail);						\
 } while (0)
 
+/* Redefine fortify_warn_once() to track memcpy() failures. */
+#define fortify_warn_once(chk_func, x...) do {				\
+	bool __result = chk_func;					\
+	FORTIFY_WARN_KUNIT(__result, x);				\
+	if (__result)							\
+		fortify_add_kunit_error(1);				\
+} while (0)
+
 #include <kunit/device.h>
 #include <kunit/test.h>
 #include <kunit/test-bug.h>
@@ -818,6 +828,74 @@ static void fortify_test_strlcat(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, pad.bytes_after, 0);
 }
 
+/* Check for 0-sized arrays... */
+struct fortify_zero_sized {
+	unsigned long bytes_before;
+	char buf[0];
+	unsigned long bytes_after;
+};
+
+#define __fortify_test(memfunc)					\
+static void fortify_test_##memfunc(struct kunit *test)		\
+{								\
+	struct fortify_zero_sized zero = { };			\
+	struct fortify_padding pad = { };			\
+	char srcA[sizeof(pad.buf) + 2];				\
+	char srcB[sizeof(pad.buf) + 2];				\
+	size_t len = sizeof(pad.buf) + unconst;			\
+								\
+	memset(srcA, 'A', sizeof(srcA));			\
+	KUNIT_ASSERT_EQ(test, srcA[0], 'A');			\
+	memset(srcB, 'B', sizeof(srcB));			\
+	KUNIT_ASSERT_EQ(test, srcB[0], 'B');			\
+								\
+	memfunc(pad.buf, srcA, 0 + unconst);			\
+	KUNIT_EXPECT_EQ(test, pad.buf[0], '\0');		\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+	memfunc(pad.buf + 1, srcB, 1 + unconst);		\
+	KUNIT_EXPECT_EQ(test, pad.buf[0], '\0');		\
+	KUNIT_EXPECT_EQ(test, pad.buf[1], 'B');			\
+	KUNIT_EXPECT_EQ(test, pad.buf[2], '\0');		\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+	memfunc(pad.buf, srcA, 1 + unconst);			\
+	KUNIT_EXPECT_EQ(test, pad.buf[0], 'A');			\
+	KUNIT_EXPECT_EQ(test, pad.buf[1], 'B');			\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+	memfunc(pad.buf, srcA, len - 1);			\
+	KUNIT_EXPECT_EQ(test, pad.buf[1], 'A');			\
+	KUNIT_EXPECT_EQ(test, pad.buf[len - 1], '\0');		\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+	memfunc(pad.buf, srcA, len);				\
+	KUNIT_EXPECT_EQ(test, pad.buf[1], 'A');			\
+	KUNIT_EXPECT_EQ(test, pad.buf[len - 1], 'A');		\
+	KUNIT_EXPECT_EQ(test, pad.bytes_after, 0);		\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+	memfunc(pad.buf, srcA, len + 1);			\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 1);	\
+	memfunc(pad.buf + 1, srcB, len);			\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 2);	\
+								\
+	/* Reset error counter. */				\
+	fortify_write_overflows = 0;				\
+	/* Copy nothing into nothing: no errors. */		\
+	memfunc(zero.buf, srcB, 0 + unconst);			\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+	/* We currently explicitly ignore zero-sized dests. */	\
+	memfunc(zero.buf, srcB, 1 + unconst);			\
+	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 0);	\
+	KUNIT_EXPECT_EQ(test, fortify_write_overflows, 0);	\
+}
+__fortify_test(memcpy)
+__fortify_test(memmove)
+
 static void fortify_test_memscan(struct kunit *test)
 {
 	char haystack[] = "Where oh where is my memory range?";
@@ -977,7 +1055,8 @@ static struct kunit_case fortify_test_cases[] = {
 	KUNIT_CASE(fortify_test_strncat),
 	KUNIT_CASE(fortify_test_strlcat),
 	/* skip memset: performs bounds checking on whole structs */
-	/* skip memcpy: still using warn-and-overwrite instead of hard-fail */
+	KUNIT_CASE(fortify_test_memcpy),
+	KUNIT_CASE(fortify_test_memmove),
 	KUNIT_CASE(fortify_test_memscan),
 	KUNIT_CASE(fortify_test_memchr),
 	KUNIT_CASE(fortify_test_memchr_inv),
-- 
cgit v1.2.3


From 0e081a0ec04df58ac9caec1916ae6d9d048d50a7 Mon Sep 17 00:00:00 2001
From: Sangyun Kim <sangyun.kim@snu.ac.kr>
Date: Wed, 17 Apr 2024 12:50:43 +0900
Subject: cxl/cxl-event: include missing <linux/types.h> and <linux/uuid.h>

The linux/cxl-event.h header file uses the u8, u16, and uuid_t types,
but it doesn't include the necessary header files,
<linux/types.h> and <linux/uuid.h>.

Currently, cxl-event.h is only used by drivers/cxl/cxlmem.h,
and it doesn't cause any errors because cxlmem.h indirectly
includes the required types.

However, cxl-event.h may be used by other CXL-related code in the future,
so it's important to fix this issue by including the missing header files
directly in cxl-event.h.

Signed-off-by: Sangyun Kim <sangyun.kim@snu.ac.kr>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20240417035043.2791431-1-sangyun.kim@snu.ac.kr
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/cxl-event.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 03fa6d50d46f..812ed16ffc2f 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -3,6 +3,9 @@
 #ifndef _LINUX_CXL_EVENT_H
 #define _LINUX_CXL_EVENT_H
 
+#include <linux/types.h>
+#include <linux/uuid.h>
+
 /*
  * Common Event Record Format
  * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
-- 
cgit v1.2.3


From 575024a8aa7cf1dff49b94092f774ed1c90586be Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Sun, 28 Apr 2024 17:24:26 +0800
Subject: powercap: intel_rapl: Introduce APIs for PMU support

Introduce two new APIs rapl_package_add_pmu()/rapl_package_remove_pmu().

RAPL driver can invoke these APIs to expose its supported energy
counters via perf PMU. The new RAPL PMU is fully compatible with current
MSR RAPL PMU, including using the same PMU name and events
name/id/unit/scale, etc.

For example, use below command
 perf stat -e power/energy-pkg/ -e power/energy-ram/ FOO
to get the energy consumption if power/energy-pkg/ and power/energy-ram/
events are available in the "perf list" output.

This does not introduce any conflict because TPMI RAPL is the only user
of these APIs currently, and it never co-exists with MSR RAPL.

Note that RAPL Packages can be probed/removed dynamically, and the
events supported by each TPMI RAPL device can be different. Thus the
RAPL PMU support is done on demand, which means
1. PMU is registered only if it is needed by a RAPL Package. PMU events
   for unsupported counters are not exposed.
2. PMU is unregistered and registered when a new RAPL Package is probed
   and supports new counters that are not supported by current PMU.
   For example, on a dual-package system using TPMI RAPL, it is possible
   that Package 1 behaves as TPMI domain root and supports Psys domain.
   In this case, register PMU without Psys event when probing Package 0,
   and re-register the PMU with Psys event when probing Package 1.
3. PMU is unregistered when all registered RAPL Packages don't need PMU.

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl_common.c | 582 +++++++++++++++++++++++++++++++++++
 include/linux/intel_rapl.h           |  32 ++
 2 files changed, 614 insertions(+)

(limited to 'include')

diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c
index c4302caeb631..aac0744011a3 100644
--- a/drivers/powercap/intel_rapl_common.c
+++ b/drivers/powercap/intel_rapl_common.c
@@ -15,6 +15,8 @@
 #include <linux/list.h>
 #include <linux/log2.h>
 #include <linux/module.h>
+#include <linux/nospec.h>
+#include <linux/perf_event.h>
 #include <linux/platform_device.h>
 #include <linux/powercap.h>
 #include <linux/processor.h>
@@ -1507,6 +1509,586 @@ static int rapl_detect_domains(struct rapl_package *rp)
 	return 0;
 }
 
+#ifdef CONFIG_PERF_EVENTS
+
+/*
+ * Support for RAPL PMU
+ *
+ * Register a PMU if any of the registered RAPL Packages have the requirement
+ * of exposing its energy counters via Perf PMU.
+ *
+ * PMU Name:
+ *	power
+ *
+ * Events:
+ *	Name		Event id	RAPL Domain
+ *	energy_cores	0x01		RAPL_DOMAIN_PP0
+ *	energy_pkg	0x02		RAPL_DOMAIN_PACKAGE
+ *	energy_ram	0x03		RAPL_DOMAIN_DRAM
+ *	energy_gpu	0x04		RAPL_DOMAIN_PP1
+ *	energy_psys	0x05		RAPL_DOMAIN_PLATFORM
+ *
+ * Unit:
+ *	Joules
+ *
+ * Scale:
+ *	2.3283064365386962890625e-10
+ *	The same RAPL domain in different RAPL Packages may have different
+ *	energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as
+ *	the fixed unit for all energy counters, and covert each hardware
+ *	counter increase to N times of PMU event counter increases.
+ *
+ * This is fully compatible with the current MSR RAPL PMU. This means that
+ * userspace programs like turbostat can use the same code to handle RAPL Perf
+ * PMU, no matter what RAPL Interface driver (MSR/TPMI, etc) is running
+ * underlying on the platform.
+ *
+ * Note that RAPL Packages can be probed/removed dynamically, and the events
+ * supported by each TPMI RAPL device can be different. Thus the RAPL PMU
+ * support is done on demand, which means
+ * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for
+ *    unsupported counters are not exposed.
+ * 2. PMU is unregistered and registered when a new RAPL Package is probed and
+ *    supports new counters that are not supported by current PMU.
+ * 3. PMU is unregistered when all registered RAPL Packages don't need PMU.
+ */
+
+struct rapl_pmu {
+	struct pmu pmu;			/* Perf PMU structure */
+	u64 timer_ms;			/* Maximum expiration time to avoid counter overflow */
+	unsigned long domain_map;	/* Events supported by current registered PMU */
+	bool registered;		/* Whether the PMU has been registered or not */
+};
+
+static struct rapl_pmu rapl_pmu;
+
+/* PMU helpers */
+
+static int get_pmu_cpu(struct rapl_package *rp)
+{
+	int cpu;
+
+	if (!rp->has_pmu)
+		return nr_cpu_ids;
+
+	/* Only TPMI RAPL is supported for now */
+	if (rp->priv->type != RAPL_IF_TPMI)
+		return nr_cpu_ids;
+
+	/* TPMI RAPL uses any CPU in the package for PMU */
+	for_each_online_cpu(cpu)
+		if (topology_physical_package_id(cpu) == rp->id)
+			return cpu;
+
+	return nr_cpu_ids;
+}
+
+static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
+{
+	if (!rp->has_pmu)
+		return false;
+
+	/* Only TPMI RAPL is supported for now */
+	if (rp->priv->type != RAPL_IF_TPMI)
+		return false;
+
+	/* TPMI RAPL uses any CPU in the package for PMU */
+	return topology_physical_package_id(cpu) == rp->id;
+}
+
+static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
+{
+	struct rapl_package *rp = event->pmu_private;
+
+	return &rp->pmu_data;
+}
+
+/* PMU event callbacks */
+
+static u64 event_read_counter(struct perf_event *event)
+{
+	struct rapl_package *rp = event->pmu_private;
+	u64 val;
+	int ret;
+
+	/* Return 0 for unsupported events */
+	if (event->hw.idx < 0)
+		return 0;
+
+	ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val);
+
+	/* Return 0 for failed read */
+	if (ret)
+		return 0;
+
+	return val;
+}
+
+static void __rapl_pmu_event_start(struct perf_event *event)
+{
+	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+
+	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+		return;
+
+	event->hw.state = 0;
+
+	list_add_tail(&event->active_entry, &data->active_list);
+
+	local64_set(&event->hw.prev_count, event_read_counter(event));
+	if (++data->n_active == 1)
+		hrtimer_start(&data->hrtimer, data->timer_interval,
+			      HRTIMER_MODE_REL_PINNED);
+}
+
+static void rapl_pmu_event_start(struct perf_event *event, int mode)
+{
+	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&data->lock, flags);
+	__rapl_pmu_event_start(event);
+	raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static u64 rapl_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+	u64 prev_raw_count, new_raw_count;
+	s64 delta, sdelta;
+
+	/*
+	 * Follow the generic code to drain hwc->prev_count.
+	 * The loop is not expected to run for multiple times.
+	 */
+	prev_raw_count = local64_read(&hwc->prev_count);
+	do {
+		new_raw_count = event_read_counter(event);
+	} while (!local64_try_cmpxchg(&hwc->prev_count,
+		&prev_raw_count, new_raw_count));
+
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (event-)time and add that to the generic event.
+	 */
+	delta = new_raw_count - prev_raw_count;
+
+	/*
+	 * Scale delta to smallest unit (2^-32)
+	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
+	 * or use ldexp(count, -32).
+	 * Watts = Joules/Time delta
+	 */
+	sdelta = delta * data->scale[event->hw.flags];
+
+	local64_add(sdelta, &event->count);
+
+	return new_raw_count;
+}
+
+static void rapl_pmu_event_stop(struct perf_event *event, int mode)
+{
+	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&data->lock, flags);
+
+	/* Mark event as deactivated and stopped */
+	if (!(hwc->state & PERF_HES_STOPPED)) {
+		WARN_ON_ONCE(data->n_active <= 0);
+		if (--data->n_active == 0)
+			hrtimer_cancel(&data->hrtimer);
+
+		list_del(&event->active_entry);
+
+		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+		hwc->state |= PERF_HES_STOPPED;
+	}
+
+	/* Check if update of sw counter is necessary */
+	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+		/*
+		 * Drain the remaining delta count out of a event
+		 * that we are disabling:
+		 */
+		rapl_event_update(event);
+		hwc->state |= PERF_HES_UPTODATE;
+	}
+
+	raw_spin_unlock_irqrestore(&data->lock, flags);
+}
+
+static int rapl_pmu_event_add(struct perf_event *event, int mode)
+{
+	struct rapl_package_pmu_data *data = event_to_pmu_data(event);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&data->lock, flags);
+
+	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+	if (mode & PERF_EF_START)
+		__rapl_pmu_event_start(event);
+
+	raw_spin_unlock_irqrestore(&data->lock, flags);
+
+	return 0;
+}
+
+static void rapl_pmu_event_del(struct perf_event *event, int flags)
+{
+	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+/* RAPL PMU event ids, same as shown in sysfs */
+enum perf_rapl_events {
+	PERF_RAPL_PP0 = 1,	/* all cores */
+	PERF_RAPL_PKG,		/* entire package */
+	PERF_RAPL_RAM,		/* DRAM */
+	PERF_RAPL_PP1,		/* gpu */
+	PERF_RAPL_PSYS,		/* psys */
+	PERF_RAPL_MAX
+};
+#define RAPL_EVENT_MASK GENMASK(7, 0)
+
+static const int event_to_domain[PERF_RAPL_MAX] = {
+	[PERF_RAPL_PP0]		= RAPL_DOMAIN_PP0,
+	[PERF_RAPL_PKG]		= RAPL_DOMAIN_PACKAGE,
+	[PERF_RAPL_RAM]		= RAPL_DOMAIN_DRAM,
+	[PERF_RAPL_PP1]		= RAPL_DOMAIN_PP1,
+	[PERF_RAPL_PSYS]	= RAPL_DOMAIN_PLATFORM,
+};
+
+static int rapl_pmu_event_init(struct perf_event *event)
+{
+	struct rapl_package *pos, *rp = NULL;
+	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
+	int domain, idx;
+
+	/* Only look at RAPL events */
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Check for supported events only */
+	if (!cfg || cfg >= PERF_RAPL_MAX)
+		return -EINVAL;
+
+	if (event->cpu < 0)
+		return -EINVAL;
+
+	/* Find out which Package the event belongs to */
+	list_for_each_entry(pos, &rapl_packages, plist) {
+		if (is_rp_pmu_cpu(pos, event->cpu)) {
+			rp = pos;
+			break;
+		}
+	}
+	if (!rp)
+		return -ENODEV;
+
+	/* Find out which RAPL Domain the event belongs to */
+	domain = event_to_domain[cfg];
+
+	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
+	event->pmu_private = rp;	/* Which package */
+	event->hw.flags = domain;	/* Which domain */
+
+	event->hw.idx = -1;
+	/* Find out the index in rp->domains[] to get domain pointer */
+	for (idx = 0; idx < rp->nr_domains; idx++) {
+		if (rp->domains[idx].id == domain) {
+			event->hw.idx = idx;
+			break;
+		}
+	}
+
+	return 0;
+}
+
+static void rapl_pmu_event_read(struct perf_event *event)
+{
+	rapl_event_update(event);
+}
+
+static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
+{
+	struct rapl_package_pmu_data *data =
+		container_of(hrtimer, struct rapl_package_pmu_data, hrtimer);
+	struct perf_event *event;
+	unsigned long flags;
+
+	if (!data->n_active)
+		return HRTIMER_NORESTART;
+
+	raw_spin_lock_irqsave(&data->lock, flags);
+
+	list_for_each_entry(event, &data->active_list, active_entry)
+		rapl_event_update(event);
+
+	raw_spin_unlock_irqrestore(&data->lock, flags);
+
+	hrtimer_forward_now(hrtimer, data->timer_interval);
+
+	return HRTIMER_RESTART;
+}
+
+/* PMU sysfs attributes */
+
+/*
+ * There are no default events, but we need to create "events" group (with
+ * empty attrs) before updating it with detected events.
+ */
+static struct attribute *attrs_empty[] = {
+	NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+	.name = "events",
+	.attrs = attrs_empty,
+};
+
+static ssize_t cpumask_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct rapl_package *rp;
+	cpumask_var_t cpu_mask;
+	int cpu;
+	int ret;
+
+	if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpus_read_lock();
+
+	cpumask_clear(cpu_mask);
+
+	/* Choose a cpu for each RAPL Package */
+	list_for_each_entry(rp, &rapl_packages, plist) {
+		cpu = get_pmu_cpu(rp);
+		if (cpu < nr_cpu_ids)
+			cpumask_set_cpu(cpu, cpu_mask);
+	}
+	cpus_read_unlock();
+
+	ret = cpumap_print_to_pagebuf(true, buf, cpu_mask);
+
+	free_cpumask_var(cpu_mask);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(cpumask);
+
+static struct attribute *pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static struct attribute_group pmu_cpumask_group = {
+	.attrs = pmu_cpumask_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *pmu_format_attr[] = {
+	&format_attr_event.attr,
+	NULL
+};
+
+static struct attribute_group pmu_format_group = {
+	.name = "format",
+	.attrs = pmu_format_attr,
+};
+
+static const struct attribute_group *pmu_attr_groups[] = {
+	&pmu_events_group,
+	&pmu_cpumask_group,
+	&pmu_format_group,
+	NULL
+};
+
+#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
+static struct perf_pmu_events_attr event_attr_##v = {				\
+	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
+	.event_str	= str,							\
+}
+
+RAPL_EVENT_ATTR_STR(energy-cores,	rapl_cores,	"event=0x01");
+RAPL_EVENT_ATTR_STR(energy-pkg,		rapl_pkg,	"event=0x02");
+RAPL_EVENT_ATTR_STR(energy-ram,		rapl_ram,	"event=0x03");
+RAPL_EVENT_ATTR_STR(energy-gpu,		rapl_gpu,	"event=0x04");
+RAPL_EVENT_ATTR_STR(energy-psys,	rapl_psys,	"event=0x05");
+
+RAPL_EVENT_ATTR_STR(energy-cores.unit,	rapl_unit_cores,	"Joules");
+RAPL_EVENT_ATTR_STR(energy-pkg.unit,	rapl_unit_pkg,		"Joules");
+RAPL_EVENT_ATTR_STR(energy-ram.unit,	rapl_unit_ram,		"Joules");
+RAPL_EVENT_ATTR_STR(energy-gpu.unit,	rapl_unit_gpu,		"Joules");
+RAPL_EVENT_ATTR_STR(energy-psys.unit,	rapl_unit_psys,		"Joules");
+
+RAPL_EVENT_ATTR_STR(energy-cores.scale,	rapl_scale_cores,	"2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-pkg.scale,	rapl_scale_pkg,		"2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-ram.scale,	rapl_scale_ram,		"2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-gpu.scale,	rapl_scale_gpu,		"2.3283064365386962890625e-10");
+RAPL_EVENT_ATTR_STR(energy-psys.scale,	rapl_scale_psys,	"2.3283064365386962890625e-10");
+
+#define RAPL_EVENT_GROUP(_name, domain)			\
+static struct attribute *pmu_attr_##_name[] = {		\
+	&event_attr_rapl_##_name.attr.attr,		\
+	&event_attr_rapl_unit_##_name.attr.attr,	\
+	&event_attr_rapl_scale_##_name.attr.attr,	\
+	NULL						\
+};							\
+static umode_t is_visible_##_name(struct kobject *kobj, struct attribute *attr, int event)	\
+{											\
+	return rapl_pmu.domain_map & BIT(domain) ? attr->mode : 0;	\
+}							\
+static struct attribute_group pmu_group_##_name = {	\
+	.name  = "events",				\
+	.attrs = pmu_attr_##_name,			\
+	.is_visible = is_visible_##_name,		\
+}
+
+RAPL_EVENT_GROUP(cores,	RAPL_DOMAIN_PP0);
+RAPL_EVENT_GROUP(pkg,	RAPL_DOMAIN_PACKAGE);
+RAPL_EVENT_GROUP(ram,	RAPL_DOMAIN_DRAM);
+RAPL_EVENT_GROUP(gpu,	RAPL_DOMAIN_PP1);
+RAPL_EVENT_GROUP(psys,	RAPL_DOMAIN_PLATFORM);
+
+static const struct attribute_group *pmu_attr_update[] = {
+	&pmu_group_cores,
+	&pmu_group_pkg,
+	&pmu_group_ram,
+	&pmu_group_gpu,
+	&pmu_group_psys,
+	NULL
+};
+
+static int rapl_pmu_update(struct rapl_package *rp)
+{
+	int ret = 0;
+
+	/* Return if PMU already covers all events supported by current RAPL Package */
+	if (rapl_pmu.registered && !(rp->domain_map & (~rapl_pmu.domain_map)))
+		goto end;
+
+	/* Unregister previous registered PMU */
+	if (rapl_pmu.registered)
+		perf_pmu_unregister(&rapl_pmu.pmu);
+
+	rapl_pmu.registered = false;
+	rapl_pmu.domain_map |= rp->domain_map;
+
+	memset(&rapl_pmu.pmu, 0, sizeof(struct pmu));
+	rapl_pmu.pmu.attr_groups = pmu_attr_groups;
+	rapl_pmu.pmu.attr_update = pmu_attr_update;
+	rapl_pmu.pmu.task_ctx_nr = perf_invalid_context;
+	rapl_pmu.pmu.event_init = rapl_pmu_event_init;
+	rapl_pmu.pmu.add = rapl_pmu_event_add;
+	rapl_pmu.pmu.del = rapl_pmu_event_del;
+	rapl_pmu.pmu.start = rapl_pmu_event_start;
+	rapl_pmu.pmu.stop = rapl_pmu_event_stop;
+	rapl_pmu.pmu.read = rapl_pmu_event_read;
+	rapl_pmu.pmu.module = THIS_MODULE;
+	rapl_pmu.pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT;
+	ret = perf_pmu_register(&rapl_pmu.pmu, "power", -1);
+	if (ret) {
+		pr_info("Failed to register PMU\n");
+		return ret;
+	}
+
+	rapl_pmu.registered = true;
+end:
+	rp->has_pmu = true;
+	return ret;
+}
+
+int rapl_package_add_pmu(struct rapl_package *rp)
+{
+	struct rapl_package_pmu_data *data = &rp->pmu_data;
+	int idx;
+
+	if (rp->has_pmu)
+		return -EEXIST;
+
+	guard(cpus_read_lock)();
+
+	for (idx = 0; idx < rp->nr_domains; idx++) {
+		struct rapl_domain *rd = &rp->domains[idx];
+		int domain = rd->id;
+		u64 val;
+
+		if (!test_bit(domain, &rp->domain_map))
+			continue;
+
+		/*
+		 * The RAPL PMU granularity is 2^-32 Joules
+		 * data->scale[]: times of 2^-32 Joules for each ENERGY COUNTER increase
+		 */
+		val = rd->energy_unit * (1ULL << 32);
+		do_div(val, ENERGY_UNIT_SCALE * 1000000);
+		data->scale[domain] = val;
+
+		if (!rapl_pmu.timer_ms) {
+			struct rapl_primitive_info *rpi = get_rpi(rp, ENERGY_COUNTER);
+
+			/*
+			 * Calculate the timer rate:
+			 * Use reference of 200W for scaling the timeout to avoid counter
+			 * overflows.
+			 *
+			 * max_count = rpi->mask >> rpi->shift + 1
+			 * max_energy_pj = max_count * rd->energy_unit
+			 * max_time_sec = (max_energy_pj / 1000000000) / 200w
+			 *
+			 * rapl_pmu.timer_ms = max_time_sec * 1000 / 2
+			 */
+			val = (rpi->mask >> rpi->shift) + 1;
+			val *= rd->energy_unit;
+			do_div(val, 1000000 * 200 * 2);
+			rapl_pmu.timer_ms = val;
+
+			pr_debug("%llu ms overflow timer\n", rapl_pmu.timer_ms);
+		}
+
+		pr_debug("Domain %s: hw unit %lld * 2^-32 Joules\n", rd->name, data->scale[domain]);
+	}
+
+	/* Initialize per package PMU data */
+	raw_spin_lock_init(&data->lock);
+	INIT_LIST_HEAD(&data->active_list);
+	data->timer_interval = ms_to_ktime(rapl_pmu.timer_ms);
+	hrtimer_init(&data->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	data->hrtimer.function = rapl_hrtimer_handle;
+
+	return rapl_pmu_update(rp);
+}
+EXPORT_SYMBOL_GPL(rapl_package_add_pmu);
+
+void rapl_package_remove_pmu(struct rapl_package *rp)
+{
+	struct rapl_package *pos;
+
+	if (!rp->has_pmu)
+		return;
+
+	guard(cpus_read_lock)();
+
+	list_for_each_entry(pos, &rapl_packages, plist) {
+		/* PMU is still needed */
+		if (pos->has_pmu && pos != rp)
+			return;
+	}
+
+	perf_pmu_unregister(&rapl_pmu.pmu);
+	memset(&rapl_pmu, 0, sizeof(struct rapl_pmu));
+}
+EXPORT_SYMBOL_GPL(rapl_package_remove_pmu);
+#endif
+
 /* called from CPU hotplug notifier, hotplug lock held */
 void rapl_remove_package_cpuslocked(struct rapl_package *rp)
 {
diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h
index f3196f82fd8a..c0397423d3a8 100644
--- a/include/linux/intel_rapl.h
+++ b/include/linux/intel_rapl.h
@@ -158,6 +158,26 @@ struct rapl_if_priv {
 	void *rpi;
 };
 
+#ifdef CONFIG_PERF_EVENTS
+/**
+ * struct rapl_package_pmu_data: Per package data for PMU support
+ * @scale:		Scale of 2^-32 Joules for each energy counter increase.
+ * @lock:		Lock to protect n_active and active_list.
+ * @n_active:		Number of active events.
+ * @active_list:	List of active events.
+ * @timer_interval:	Maximum timer expiration time before counter overflow.
+ * @hrtimer:		Periodically update the counter to prevent overflow.
+ */
+struct rapl_package_pmu_data {
+	u64 scale[RAPL_DOMAIN_MAX];
+	raw_spinlock_t lock;
+	int n_active;
+	struct list_head active_list;
+	ktime_t timer_interval;
+	struct hrtimer hrtimer;
+};
+#endif
+
 /* maximum rapl package domain name: package-%d-die-%d */
 #define PACKAGE_DOMAIN_NAME_LENGTH 30
 
@@ -176,6 +196,10 @@ struct rapl_package {
 	struct cpumask cpumask;
 	char name[PACKAGE_DOMAIN_NAME_LENGTH];
 	struct rapl_if_priv *priv;
+#ifdef CONFIG_PERF_EVENTS
+	bool has_pmu;
+	struct rapl_package_pmu_data pmu_data;
+#endif
 };
 
 struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
@@ -188,4 +212,12 @@ struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv,
 struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu);
 void rapl_remove_package(struct rapl_package *rp);
 
+#ifdef CONFIG_PERF_EVENTS
+int rapl_package_add_pmu(struct rapl_package *rp);
+void rapl_package_remove_pmu(struct rapl_package *rp);
+#else
+static inline int rapl_package_add_pmu(struct rapl_package *rp) { return 0; }
+static inline void rapl_package_remove_pmu(struct rapl_package *rp) { }
+#endif
+
 #endif /* __INTEL_RAPL_H__ */
-- 
cgit v1.2.3


From e049249013b13297817a1b0a33bbedec0ebca293 Mon Sep 17 00:00:00 2001
From: Ruidong Tian <tianruidong@linux.alibaba.com>
Date: Thu, 25 Apr 2024 17:34:26 +0800
Subject: ACPICA: AEST: Add support for the AEST V2 table

ACPICA commit ebb49799c78891cbe370f1264844664a3d8b6f35

AEST V2 was published[1], add V2 support based on AEST V1.

[1]: https://developer.arm.com/documentation/den0085/latest/

Link: https://github.com/acpica/acpica/commit/ebb4979
Signed-off-by: Ruidong Tian <tianruidong@linux.alibaba.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/acpi/actbl2.h | 88 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 82 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h
index 64a3f9ae2641..fdfe87fc523c 100644
--- a/include/acpi/actbl2.h
+++ b/include/acpi/actbl2.h
@@ -78,8 +78,8 @@
  *
  * AEST - Arm Error Source Table
  *
- * Conforms to: ACPI for the Armv8 RAS Extensions 1.1 Platform Design Document
- * September 2020.
+ * Conforms to: ACPI for the Armv8 RAS Extensions 1.1(Sep 2020) and
+ * 2.0(May 2023) Platform Design Document.
  *
  ******************************************************************************/
 
@@ -109,7 +109,9 @@ struct acpi_aest_hdr {
 #define ACPI_AEST_SMMU_ERROR_NODE           2
 #define ACPI_AEST_VENDOR_ERROR_NODE         3
 #define ACPI_AEST_GIC_ERROR_NODE            4
-#define ACPI_AEST_NODE_TYPE_RESERVED        5	/* 5 and above are reserved */
+#define ACPI_AEST_PCIE_ERROR_NODE           5
+#define ACPI_AEST_PROXY_ERROR_NODE          6
+#define ACPI_AEST_NODE_TYPE_RESERVED        7 /* 7 and above are reserved */
 
 /*
  * AEST subtables (Error nodes)
@@ -188,6 +190,12 @@ typedef struct acpi_aest_vendor {
 
 } acpi_aest_vendor;
 
+struct acpi_aest_vendor_v2 {
+	char acpi_hid[8];
+	u32 acpi_uid;
+	u8 vendor_specific_data[16];
+};
+
 /* 4: Gic Error */
 
 typedef struct acpi_aest_gic {
@@ -204,6 +212,18 @@ typedef struct acpi_aest_gic {
 #define ACPI_AEST_GIC_ITS                   3
 #define ACPI_AEST_GIC_RESERVED              4	/* 4 and above are reserved */
 
+/* 5: PCIe Error */
+
+struct acpi_aest_pcie {
+	u32 iort_node_reference;
+};
+
+/* 6: Proxy Error */
+
+struct acpi_aest_proxy {
+	u64 node_address;
+};
+
 /* Node Interface Structure */
 
 typedef struct acpi_aest_node_interface {
@@ -219,11 +239,57 @@ typedef struct acpi_aest_node_interface {
 
 } acpi_aest_node_interface;
 
+/* Node Interface Structure V2 */
+
+struct acpi_aest_node_interface_header {
+	u8 type;
+	u8 group_format;
+	u8 reserved[2];
+	u32 flags;
+	u64 address;
+	u32 error_record_index;
+	u32 error_record_count;
+};
+
+#define ACPI_AEST_NODE_GROUP_FORMAT_4K          0
+#define ACPI_AEST_NODE_GROUP_FORMAT_16K         1
+#define ACPI_AEST_NODE_GROUP_FORMAT_64K         2
+
+struct acpi_aest_node_interface_common {
+	u32 error_node_device;
+	u32 processor_affinity;
+	u64 error_group_register_base;
+	u64 fault_inject_register_base;
+	u64 interrupt_config_register_base;
+};
+
+struct acpi_aest_node_interface_4k {
+	u64 error_record_implemented;
+	u64 error_status_reporting;
+	u64 addressing_mode;
+	struct acpi_aest_node_interface_common common;
+};
+
+struct acpi_aest_node_interface_16k {
+	u64 error_record_implemented[4];
+	u64 error_status_reporting[4];
+	u64 addressing_mode[4];
+	struct acpi_aest_node_interface_common common;
+};
+
+struct acpi_aest_node_interface_64k {
+	u64 error_record_implemented[14];
+	u64 error_status_reporting[14];
+	u64 addressing_mode[14];
+	struct acpi_aest_node_interface_common common;
+};
+
 /* Values for Type field above */
 
-#define ACPI_AEST_NODE_SYSTEM_REGISTER      0
-#define ACPI_AEST_NODE_MEMORY_MAPPED        1
-#define ACPI_AEST_XFACE_RESERVED            2	/* 2 and above are reserved */
+#define ACPI_AEST_NODE_SYSTEM_REGISTER			0
+#define ACPI_AEST_NODE_MEMORY_MAPPED			1
+#define ACPI_AEST_NODE_SINGLE_RECORD_MEMORY_MAPPED	2
+#define ACPI_AEST_XFACE_RESERVED			3   /* 2 and above are reserved */
 
 /* Node Interrupt Structure */
 
@@ -237,6 +303,16 @@ typedef struct acpi_aest_node_interrupt {
 
 } acpi_aest_node_interrupt;
 
+/* Node Interrupt Structure V2 */
+
+struct acpi_aest_node_interrupt_v2 {
+	u8 type;
+	u8 reserved[2];
+	u8 flags;
+	u32 gsiv;
+	u8 reserved1[4];
+};
+
 /* Values for Type field above */
 
 #define ACPI_AEST_NODE_FAULT_HANDLING       0
-- 
cgit v1.2.3


From 6aec00139d3a83e2394d4bcb0084e872b4036e8f Mon Sep 17 00:00:00 2001
From: Alison Schofield <alison.schofield@intel.com>
Date: Tue, 30 Apr 2024 10:28:06 -0700
Subject: cxl/core: Add region info to cxl_general_media and cxl_dram events

User space may need to know which region, if any, maps the DPAs
(device physical addresses) reported in a cxl_general_media or
cxl_dram event. Since the mapping can change, the kernel provides
this information at the time the event occurs. This informs user
space that at event <timestamp> this <region> mapped this <DPA>
to this <HPA>.

Add the same region info that is included in the cxl_poison trace
event: the DPA->HPA translation, region name, and region uuid.

The new fields are inserted in the trace event and no existing
fields are modified. If the DPA is not mapped, user will see:
hpa=ULLONG_MAX, region="", and uuid=0

This work must be protected by dpa_rwsem & region_rwsem since
it is looking up region mappings.

Signed-off-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Link: https://lore.kernel.org/r/dd8d708b7a7ebfb64a27020a5eb338091336b34d.1714496730.git.alison.schofield@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/mbox.c   | 36 ++++++++++++++++++++++++++++++------
 drivers/cxl/core/trace.h  | 44 +++++++++++++++++++++++++++++++++++---------
 include/linux/cxl-event.h | 10 ++++++++++
 3 files changed, 75 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 65185c9fa001..97e938657773 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -842,14 +842,38 @@ void cxl_event_trace_record(const struct cxl_memdev *cxlmd,
 			    enum cxl_event_type event_type,
 			    const uuid_t *uuid, union cxl_event *evt)
 {
-	if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
-		trace_cxl_general_media(cxlmd, type, &evt->gen_media);
-	else if (event_type == CXL_CPER_EVENT_DRAM)
-		trace_cxl_dram(cxlmd, type, &evt->dram);
-	else if (event_type == CXL_CPER_EVENT_MEM_MODULE)
+	if (event_type == CXL_CPER_EVENT_MEM_MODULE) {
 		trace_cxl_memory_module(cxlmd, type, &evt->mem_module);
-	else
+		return;
+	}
+	if (event_type == CXL_CPER_EVENT_GENERIC) {
 		trace_cxl_generic_event(cxlmd, type, uuid, &evt->generic);
+		return;
+	}
+
+	if (trace_cxl_general_media_enabled() || trace_cxl_dram_enabled()) {
+		u64 dpa, hpa = ULLONG_MAX;
+		struct cxl_region *cxlr;
+
+		/*
+		 * These trace points are annotated with HPA and region
+		 * translations. Take topology mutation locks and lookup
+		 * { HPA, REGION } from { DPA, MEMDEV } in the event record.
+		 */
+		guard(rwsem_read)(&cxl_region_rwsem);
+		guard(rwsem_read)(&cxl_dpa_rwsem);
+
+		dpa = le64_to_cpu(evt->common.phys_addr) & CXL_DPA_MASK;
+		cxlr = cxl_dpa_to_region(cxlmd, dpa);
+		if (cxlr)
+			hpa = cxl_trace_hpa(cxlr, cxlmd, dpa);
+
+		if (event_type == CXL_CPER_EVENT_GEN_MEDIA)
+			trace_cxl_general_media(cxlmd, type, cxlr, hpa,
+						&evt->gen_media);
+		else if (event_type == CXL_CPER_EVENT_DRAM)
+			trace_cxl_dram(cxlmd, type, cxlr, hpa, &evt->dram);
+	}
 }
 EXPORT_SYMBOL_NS_GPL(cxl_event_trace_record, CXL);
 
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index e303e618aa05..07a0394b1d99 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -316,9 +316,9 @@ TRACE_EVENT(cxl_generic_event,
 TRACE_EVENT(cxl_general_media,
 
 	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
-		 struct cxl_event_gen_media *rec),
+		 struct cxl_region *cxlr, u64 hpa, struct cxl_event_gen_media *rec),
 
-	TP_ARGS(cxlmd, log, rec),
+	TP_ARGS(cxlmd, log, cxlr, hpa, rec),
 
 	TP_STRUCT__entry(
 		CXL_EVT_TP_entry
@@ -330,10 +330,13 @@ TRACE_EVENT(cxl_general_media,
 		__field(u8, channel)
 		__field(u32, device)
 		__array(u8, comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE)
-		__field(u16, validity_flags)
 		/* Following are out of order to pack trace record */
+		__field(u64, hpa)
+		__field_struct(uuid_t, region_uuid)
+		__field(u16, validity_flags)
 		__field(u8, rank)
 		__field(u8, dpa_flags)
+		__string(region_name, cxlr ? dev_name(&cxlr->dev) : "")
 	),
 
 	TP_fast_assign(
@@ -354,18 +357,28 @@ TRACE_EVENT(cxl_general_media,
 		memcpy(__entry->comp_id, &rec->component_id,
 			CXL_EVENT_GEN_MED_COMP_ID_SIZE);
 		__entry->validity_flags = get_unaligned_le16(&rec->validity_flags);
+		__entry->hpa = hpa;
+		if (cxlr) {
+			__assign_str(region_name, dev_name(&cxlr->dev));
+			uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
+		} else {
+			__assign_str(region_name, "");
+			uuid_copy(&__entry->region_uuid, &uuid_null);
+		}
 	),
 
 	CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' " \
 		"descriptor='%s' type='%s' transaction_type='%s' channel=%u rank=%u " \
-		"device=%x comp_id=%s validity_flags='%s'",
+		"device=%x comp_id=%s validity_flags='%s' " \
+		"hpa=%llx region=%s region_uuid=%pUb",
 		__entry->dpa, show_dpa_flags(__entry->dpa_flags),
 		show_event_desc_flags(__entry->descriptor),
 		show_mem_event_type(__entry->type),
 		show_trans_type(__entry->transaction_type),
 		__entry->channel, __entry->rank, __entry->device,
 		__print_hex(__entry->comp_id, CXL_EVENT_GEN_MED_COMP_ID_SIZE),
-		show_valid_flags(__entry->validity_flags)
+		show_valid_flags(__entry->validity_flags),
+		__entry->hpa, __get_str(region_name), &__entry->region_uuid
 	)
 );
 
@@ -400,9 +413,9 @@ TRACE_EVENT(cxl_general_media,
 TRACE_EVENT(cxl_dram,
 
 	TP_PROTO(const struct cxl_memdev *cxlmd, enum cxl_event_log_type log,
-		 struct cxl_event_dram *rec),
+		 struct cxl_region *cxlr, u64 hpa, struct cxl_event_dram *rec),
 
-	TP_ARGS(cxlmd, log, rec),
+	TP_ARGS(cxlmd, log, cxlr, hpa, rec),
 
 	TP_STRUCT__entry(
 		CXL_EVT_TP_entry
@@ -417,10 +430,13 @@ TRACE_EVENT(cxl_dram,
 		__field(u32, nibble_mask)
 		__field(u32, row)
 		__array(u8, cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE)
+		__field(u64, hpa)
+		__field_struct(uuid_t, region_uuid)
 		__field(u8, rank)	/* Out of order to pack trace record */
 		__field(u8, bank_group)	/* Out of order to pack trace record */
 		__field(u8, bank)	/* Out of order to pack trace record */
 		__field(u8, dpa_flags)	/* Out of order to pack trace record */
+		__string(region_name, cxlr ? dev_name(&cxlr->dev) : "")
 	),
 
 	TP_fast_assign(
@@ -444,12 +460,21 @@ TRACE_EVENT(cxl_dram,
 		__entry->column = get_unaligned_le16(rec->column);
 		memcpy(__entry->cor_mask, &rec->correction_mask,
 			CXL_EVENT_DER_CORRECTION_MASK_SIZE);
+		__entry->hpa = hpa;
+		if (cxlr) {
+			__assign_str(region_name, dev_name(&cxlr->dev));
+			uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
+		} else {
+			__assign_str(region_name, "");
+			uuid_copy(&__entry->region_uuid, &uuid_null);
+		}
 	),
 
 	CXL_EVT_TP_printk("dpa=%llx dpa_flags='%s' descriptor='%s' type='%s' " \
 		"transaction_type='%s' channel=%u rank=%u nibble_mask=%x " \
 		"bank_group=%u bank=%u row=%u column=%u cor_mask=%s " \
-		"validity_flags='%s'",
+		"validity_flags='%s' " \
+		"hpa=%llx region=%s region_uuid=%pUb",
 		__entry->dpa, show_dpa_flags(__entry->dpa_flags),
 		show_event_desc_flags(__entry->descriptor),
 		show_mem_event_type(__entry->type),
@@ -458,7 +483,8 @@ TRACE_EVENT(cxl_dram,
 		__entry->bank_group, __entry->bank,
 		__entry->row, __entry->column,
 		__print_hex(__entry->cor_mask, CXL_EVENT_DER_CORRECTION_MASK_SIZE),
-		show_dram_valid_flags(__entry->validity_flags)
+		show_dram_valid_flags(__entry->validity_flags),
+		__entry->hpa, __get_str(region_name), &__entry->region_uuid
 	)
 );
 
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 03fa6d50d46f..5342755777cc 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -91,11 +91,21 @@ struct cxl_event_mem_module {
 	u8 reserved[0x3d];
 } __packed;
 
+/*
+ * General Media or DRAM Event Common Fields
+ * - provides common access to phys_addr
+ */
+struct cxl_event_common {
+	struct cxl_event_record_hdr hdr;
+	__le64 phys_addr;
+} __packed;
+
 union cxl_event {
 	struct cxl_event_generic generic;
 	struct cxl_event_gen_media gen_media;
 	struct cxl_event_dram dram;
 	struct cxl_event_mem_module mem_module;
+	struct cxl_event_common common;
 } __packed;
 
 /*
-- 
cgit v1.2.3


From dfac21b1eeefe8ae472e3fee511cfdcf7d03bb4b Mon Sep 17 00:00:00 2001
From: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Sun, 28 Apr 2024 21:04:41 -0700
Subject: ACPI: Move acpi_blacklisted() declaration to asm/acpi.h

The function acpi_blacklisted() is defined only when CONFIG_X86 is
enabled and is only used by X86 arch code. To align with its usage and
definition conditions, move its declaration to asm/acpi.h

Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
[ rjw: Added empty code line in a header file ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/acpi.h | 2 ++
 include/linux/acpi.h        | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index f896eed4516c..5af926c050f0 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -56,6 +56,8 @@ static inline void disable_acpi(void)
 
 extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
 
+extern int acpi_blacklisted(void);
+
 static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
 static inline void acpi_disable_pci(void)
 {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 34829f2c517a..2009f7ea5963 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -421,7 +421,7 @@ extern char *wmi_get_acpi_device_uid(const char *guid);
 
 extern char acpi_video_backlight_string[];
 extern long acpi_is_video_device(acpi_handle handle);
-extern int acpi_blacklisted(void);
+
 extern void acpi_osi_setup(char *str);
 extern bool acpi_osi_is_win8(void);
 
-- 
cgit v1.2.3


From 05d6d492097c55f2d153fc3fd33cbe78e1e28e0a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 29 Apr 2024 13:30:09 +0000
Subject: inet: introduce dst_rtable() helper

I added dst_rt6_info() in commit
e8dfd42c17fa ("ipv6: introduce dst_rt6_info() helper")

This patch does a similar change for IPv4.

Instead of (struct rtable *)dst casts, we can use :

 #define dst_rtable(_ptr) \
             container_of_const(_ptr, struct rtable, dst)

Patch is smaller than IPv6 one, because IPv4 has skb_rtable() helper.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Link: https://lore.kernel.org/r/20240429133009.1227754-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/infiniband/core/addr.c   | 12 +++---------
 drivers/net/vrf.c                |  2 +-
 drivers/s390/net/qeth_core.h     |  5 ++---
 include/linux/skbuff.h           |  9 ---------
 include/net/ip.h                 |  4 ++--
 include/net/route.h              | 11 +++++++++++
 net/atm/clip.c                   |  2 +-
 net/core/dst_cache.c             |  2 +-
 net/core/filter.c                |  3 +--
 net/ipv4/af_inet.c               |  2 +-
 net/ipv4/icmp.c                  | 26 ++++++++++++++------------
 net/ipv4/ip_input.c              |  2 +-
 net/ipv4/ip_output.c             |  8 ++++----
 net/ipv4/route.c                 | 24 +++++++++++-------------
 net/ipv4/udp.c                   |  2 +-
 net/ipv4/xfrm4_policy.c          |  2 +-
 net/l2tp/l2tp_ip.c               |  2 +-
 net/mpls/mpls_iptunnel.c         |  2 +-
 net/netfilter/ipvs/ip_vs_xmit.c  |  2 +-
 net/netfilter/nf_flow_table_ip.c |  4 ++--
 net/netfilter/nft_rt.c           |  2 +-
 net/sctp/protocol.c              |  4 ++--
 net/tipc/udp_media.c             |  2 +-
 23 files changed, 64 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index f20dfe70fa0e..be0743dac3ff 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -348,16 +348,10 @@ static int dst_fetch_ha(const struct dst_entry *dst,
 
 static bool has_gateway(const struct dst_entry *dst, sa_family_t family)
 {
-	const struct rtable *rt;
-	const struct rt6_info *rt6;
+	if (family == AF_INET)
+		return dst_rtable(dst)->rt_uses_gateway;
 
-	if (family == AF_INET) {
-		rt = container_of(dst, struct rtable, dst);
-		return rt->rt_uses_gateway;
-	}
-
-	rt6 = dst_rt6_info(dst);
-	return rt6->rt6i_flags & RTF_GATEWAY;
+	return dst_rt6_info(dst)->rt6i_flags & RTF_GATEWAY;
 }
 
 static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr,
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 784b9b2d275e..3a252ac5dd28 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -860,7 +860,7 @@ static int vrf_rt6_create(struct net_device *dev)
 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct rtable *rt = (struct rtable *)dst;
+	struct rtable *rt = dst_rtable(dst);
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 5f17a2a5d0e3..41fe8a043d61 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -970,9 +970,8 @@ static inline struct dst_entry *qeth_dst_check_rcu(struct sk_buff *skb,
 static inline __be32 qeth_next_hop_v4_rcu(struct sk_buff *skb,
 					  struct dst_entry *dst)
 {
-	struct rtable *rt = (struct rtable *) dst;
-
-	return (rt) ? rt_nexthop(rt, ip_hdr(skb)->daddr) : ip_hdr(skb)->daddr;
+	return (dst) ? rt_nexthop(dst_rtable(dst), ip_hdr(skb)->daddr) :
+		       ip_hdr(skb)->daddr;
 }
 
 static inline struct in6_addr *qeth_next_hop_v6_rcu(struct sk_buff *skb,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f76825e5b92a..adf75d69770c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1180,15 +1180,6 @@ static inline bool skb_dst_is_noref(const struct sk_buff *skb)
 	return (skb->_skb_refdst & SKB_DST_NOREF) && skb_dst(skb);
 }
 
-/**
- * skb_rtable - Returns the skb &rtable
- * @skb: buffer
- */
-static inline struct rtable *skb_rtable(const struct sk_buff *skb)
-{
-	return (struct rtable *)skb_dst(skb);
-}
-
 /* For mangling skb->pkt_type from user space side from applications
  * such as nft, tc, etc, we only allow a conservative subset of
  * possible pkt_types to be set.
diff --git a/include/net/ip.h b/include/net/ip.h
index 25cb688bdc62..6d735e00d3f3 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -423,7 +423,7 @@ int ip_decrease_ttl(struct iphdr *iph)
 
 static inline int ip_mtu_locked(const struct dst_entry *dst)
 {
-	const struct rtable *rt = (const struct rtable *)dst;
+	const struct rtable *rt = dst_rtable(dst);
 
 	return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
 }
@@ -461,7 +461,7 @@ static inline bool ip_sk_ignore_df(const struct sock *sk)
 static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 						    bool forwarding)
 {
-	const struct rtable *rt = container_of(dst, struct rtable, dst);
+	const struct rtable *rt = dst_rtable(dst);
 	struct net *net = dev_net(dst->dev);
 	unsigned int mtu;
 
diff --git a/include/net/route.h b/include/net/route.h
index 630d1ef6868a..93833cfe9c96 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -75,6 +75,17 @@ struct rtable {
 				rt_pmtu:31;
 };
 
+#define dst_rtable(_ptr) container_of_const(_ptr, struct rtable, dst)
+
+/**
+ * skb_rtable - Returns the skb &rtable
+ * @skb: buffer
+ */
+static inline struct rtable *skb_rtable(const struct sk_buff *skb)
+{
+	return dst_rtable(skb_dst(skb));
+}
+
 static inline bool rt_is_input_route(const struct rtable *rt)
 {
 	return rt->rt_is_input != 0;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 362e8d25a79e..42b910cb4e8e 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -345,7 +345,7 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
 		dev->stats.tx_dropped++;
 		return NETDEV_TX_OK;
 	}
-	rt = (struct rtable *) dst;
+	rt = dst_rtable(dst);
 	if (rt->rt_gw_family == AF_INET)
 		daddr = &rt->rt_gw4;
 	else
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index b17171345d64..0c0bdb058c5b 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -83,7 +83,7 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
 		return NULL;
 
 	*saddr = idst->in_saddr.s_addr;
-	return container_of(dst, struct rtable, dst);
+	return dst_rtable(dst);
 }
 EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6d319c76188b..29165744c505 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2317,8 +2317,7 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 
 	rcu_read_lock();
 	if (!nh) {
-		struct dst_entry *dst = skb_dst(skb);
-		struct rtable *rt = container_of(dst, struct rtable, dst);
+		struct rtable *rt = skb_rtable(skb);
 
 		neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
 	} else if (nh->nh_family == AF_INET6) {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a7cfeda28bb2..486a8d4f53b1 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1307,8 +1307,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
 
 int inet_sk_rebuild_header(struct sock *sk)
 {
+	struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
 	struct inet_sock *inet = inet_sk(sk);
-	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
 	__be32 daddr;
 	struct ip_options_rcu *inet_opt;
 	struct flowi4 *fl4;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 437e782b9663..207482d30dc7 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -483,6 +483,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
 					struct icmp_bxm *param)
 {
 	struct net_device *route_lookup_dev;
+	struct dst_entry *dst, *dst2;
 	struct rtable *rt, *rt2;
 	struct flowi4 fl4_dec;
 	int err;
@@ -508,16 +509,17 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	/* No need to clone since we're just using its address. */
 	rt2 = rt;
 
-	rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
-					   flowi4_to_flowi(fl4), NULL, 0);
-	if (!IS_ERR(rt)) {
+	dst = xfrm_lookup(net, &rt->dst,
+			  flowi4_to_flowi(fl4), NULL, 0);
+	rt = dst_rtable(dst);
+	if (!IS_ERR(dst)) {
 		if (rt != rt2)
 			return rt;
-	} else if (PTR_ERR(rt) == -EPERM) {
+	} else if (PTR_ERR(dst) == -EPERM) {
 		rt = NULL;
-	} else
+	} else {
 		return rt;
-
+	}
 	err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
 	if (err)
 		goto relookup_failed;
@@ -551,19 +553,19 @@ static struct rtable *icmp_route_lookup(struct net *net,
 	if (err)
 		goto relookup_failed;
 
-	rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
-					    flowi4_to_flowi(&fl4_dec), NULL,
-					    XFRM_LOOKUP_ICMP);
-	if (!IS_ERR(rt2)) {
+	dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL,
+			   XFRM_LOOKUP_ICMP);
+	rt2 = dst_rtable(dst2);
+	if (!IS_ERR(dst2)) {
 		dst_release(&rt->dst);
 		memcpy(fl4, &fl4_dec, sizeof(*fl4));
 		rt = rt2;
-	} else if (PTR_ERR(rt2) == -EPERM) {
+	} else if (PTR_ERR(dst2) == -EPERM) {
 		if (rt)
 			dst_release(&rt->dst);
 		return rt2;
 	} else {
-		err = PTR_ERR(rt2);
+		err = PTR_ERR(dst2);
 		goto relookup_failed;
 	}
 	return rt;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 5e9c8156656a..d6fbcbd2358a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -616,7 +616,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
 			hint = ip_extract_route_hint(net, skb,
-					       ((struct rtable *)dst)->rt_type);
+						     dst_rtable(dst)->rt_type);
 
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 1fe794967211..b455bd05a7d5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -198,7 +198,7 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
 static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
-	struct rtable *rt = (struct rtable *)dst;
+	struct rtable *rt = dst_rtable(dst);
 	struct net_device *dev = dst->dev;
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
@@ -475,7 +475,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
 		goto packet_routed;
 
 	/* Make sure we can route this packet. */
-	rt = (struct rtable *)__sk_dst_check(sk, 0);
+	rt = dst_rtable(__sk_dst_check(sk, 0));
 	if (!rt) {
 		__be32 daddr;
 
@@ -971,7 +971,7 @@ static int __ip_append_data(struct sock *sk,
 	bool zc = false;
 	unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
 	int csummode = CHECKSUM_NONE;
-	struct rtable *rt = (struct rtable *)cork->dst;
+	struct rtable *rt = dst_rtable(cork->dst);
 	bool paged, hold_tskey, extra_uref = false;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
@@ -1390,7 +1390,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
 	struct inet_sock *inet = inet_sk(sk);
 	struct net *net = sock_net(sk);
 	struct ip_options *opt = NULL;
-	struct rtable *rt = (struct rtable *)cork->dst;
+	struct rtable *rt = dst_rtable(cork->dst);
 	struct iphdr *iph;
 	u8 pmtudisc, ttl;
 	__be16 df = 0;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f89ff2e5a05b..0fd9a3d7ac4a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -819,7 +819,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 	u32 mark = skb->mark;
 	__u8 tos = iph->tos;
 
-	rt = (struct rtable *) dst;
+	rt = dst_rtable(dst);
 
 	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 	__ip_do_redirect(rt, skb, &fl4, true);
@@ -827,7 +827,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
-	struct rtable *rt = (struct rtable *)dst;
+	struct rtable *rt = dst_rtable(dst);
 	struct dst_entry *ret = dst;
 
 	if (rt) {
@@ -1044,7 +1044,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 			      struct sk_buff *skb, u32 mtu,
 			      bool confirm_neigh)
 {
-	struct rtable *rt = (struct rtable *) dst;
+	struct rtable *rt = dst_rtable(dst);
 	struct flowi4 fl4;
 
 	ip_rt_build_flow_key(&fl4, sk, skb);
@@ -1115,7 +1115,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 
 	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
 
-	rt = (struct rtable *)odst;
+	rt = dst_rtable(odst);
 	if (odst->obsolete && !odst->ops->check(odst, 0)) {
 		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 		if (IS_ERR(rt))
@@ -1124,7 +1124,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
 		new = true;
 	}
 
-	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
+	__ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
 
 	if (!dst_check(&rt->dst, 0)) {
 		if (new)
@@ -1181,7 +1181,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
 INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
 							 u32 cookie)
 {
-	struct rtable *rt = (struct rtable *) dst;
+	struct rtable *rt = dst_rtable(dst);
 
 	/* All IPV4 dsts are created with ->obsolete set to the value
 	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
@@ -1516,10 +1516,8 @@ void rt_del_uncached_list(struct rtable *rt)
 
 static void ipv4_dst_destroy(struct dst_entry *dst)
 {
-	struct rtable *rt = (struct rtable *)dst;
-
 	ip_dst_metrics_put(dst);
-	rt_del_uncached_list(rt);
+	rt_del_uncached_list(dst_rtable(dst));
 }
 
 void rt_flush_dev(struct net_device *dev)
@@ -2820,7 +2818,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 
 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 {
-	struct rtable *ort = (struct rtable *) dst_orig;
+	struct rtable *ort = dst_rtable(dst_orig);
 	struct rtable *rt;
 
 	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
@@ -2865,9 +2863,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 
 	if (flp4->flowi4_proto) {
 		flp4->flowi4_oif = rt->dst.dev->ifindex;
-		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
-							flowi4_to_flowi(flp4),
-							sk, 0);
+		rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
+						  flowi4_to_flowi(flp4),
+						  sk, 0));
 	}
 
 	return rt;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 6e2446295089..fe55ff5d379b 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1217,7 +1217,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	}
 
 	if (connected)
-		rt = (struct rtable *)sk_dst_check(sk, 0);
+		rt = dst_rtable(sk_dst_check(sk, 0));
 
 	if (!rt) {
 		struct net *net = sock_net(sk);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 1dda59e0aeab..fccbbd3e1a4b 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -69,7 +69,7 @@ static int xfrm4_get_saddr(struct net *net, int oif,
 static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
 			  const struct flowi *fl)
 {
-	struct rtable *rt = (struct rtable *)xdst->route;
+	struct rtable *rt = dst_rtable(xdst->route);
 	const struct flowi4 *fl4 = &fl->u.ip4;
 
 	xdst->u.rt.rt_iif = fl4->flowi4_iif;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 970af3983d11..19c8cc5289d5 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -459,7 +459,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	fl4 = &inet->cork.fl.u.ip4;
 	if (connected)
-		rt = (struct rtable *)__sk_dst_check(sk, 0);
+		rt = dst_rtable(__sk_dst_check(sk, 0));
 
 	rcu_read_lock();
 	if (!rt) {
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 606349c8df0e..4385fd3b13be 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -81,7 +81,7 @@ static int mpls_xmit(struct sk_buff *skb)
 			ttl = net->mpls.default_ttl;
 		else
 			ttl = ip_hdr(skb)->ttl;
-		rt = (struct rtable *)dst;
+		rt = dst_rtable(dst);
 	} else if (dst->ops->family == AF_INET6) {
 		if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED)
 			ttl = tun_encap_info->default_ttl;
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 6e8b9d100ad2..3313bceb6cc9 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -318,7 +318,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
 	if (dest) {
 		dest_dst = __ip_vs_dst_check(dest);
 		if (likely(dest_dst))
-			rt = (struct rtable *) dest_dst->dst_cache;
+			rt = dst_rtable(dest_dst->dst_cache);
 		else {
 			dest_dst = ip_vs_dest_dst_alloc();
 			spin_lock_bh(&dest->dst_lock);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 100887beed31..c2c005234dcd 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -434,7 +434,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 		return NF_ACCEPT;
 
 	if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
-		rt = (struct rtable *)tuplehash->tuple.dst_cache;
+		rt = dst_rtable(tuplehash->tuple.dst_cache);
 		memset(skb->cb, 0, sizeof(struct inet_skb_parm));
 		IPCB(skb)->iif = skb->dev->ifindex;
 		IPCB(skb)->flags = IPSKB_FORWARDED;
@@ -446,7 +446,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 
 	switch (tuplehash->tuple.xmit_type) {
 	case FLOW_OFFLOAD_XMIT_NEIGH:
-		rt = (struct rtable *)tuplehash->tuple.dst_cache;
+		rt = dst_rtable(tuplehash->tuple.dst_cache);
 		outdev = rt->dst.dev;
 		skb->dev = outdev;
 		nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
index 2434c624aafd..14d88394bcb7 100644
--- a/net/netfilter/nft_rt.c
+++ b/net/netfilter/nft_rt.c
@@ -73,7 +73,7 @@ void nft_rt_get_eval(const struct nft_expr *expr,
 		if (nft_pf(pkt) != NFPROTO_IPV4)
 			goto err;
 
-		*dest = (__force u32)rt_nexthop((const struct rtable *)dst,
+		*dest = (__force u32)rt_nexthop(dst_rtable(dst),
 						ip_hdr(skb)->daddr);
 		break;
 	case NFT_RT_NEXTHOP6:
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e849f368ed91..5a7436a13b74 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -552,7 +552,7 @@ static void sctp_v4_get_saddr(struct sctp_sock *sk,
 			      struct flowi *fl)
 {
 	union sctp_addr *saddr = &t->saddr;
-	struct rtable *rt = (struct rtable *)t->dst;
+	struct rtable *rt = dst_rtable(t->dst);
 
 	if (rt) {
 		saddr->v4.sin_family = AF_INET;
@@ -1085,7 +1085,7 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 	skb_reset_inner_mac_header(skb);
 	skb_reset_inner_transport_header(skb);
 	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
-	udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr,
+	udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr,
 			    fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
 			    sctp_sk(sk)->udp_port, t->encap_port, false, false);
 	return 0;
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index f892b0903dba..b849a3d133a0 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -174,7 +174,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
 	local_bh_disable();
 	ndst = dst_cache_get(cache);
 	if (dst->proto == htons(ETH_P_IP)) {
-		struct rtable *rt = (struct rtable *)ndst;
+		struct rtable *rt = dst_rtable(ndst);
 
 		if (!rt) {
 			struct flowi4 fl = {
-- 
cgit v1.2.3


From a86a0661b86f310c0b73a30c829648864f0b2619 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 29 Apr 2024 13:40:21 +0000
Subject: net: move sysctl_max_skb_frags to net_hotdata

sysctl_max_skb_frags is used in TCP and MPTCP fast paths,
move it to net_hodata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240429134025.1233626-2-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/skbuff.h     | 2 --
 include/net/hotdata.h      | 1 +
 net/core/hotdata.c         | 1 +
 net/core/skbuff.c          | 5 +----
 net/core/sysctl_net_core.c | 2 +-
 net/ipv4/tcp.c             | 3 ++-
 net/mptcp/protocol.c       | 3 ++-
 7 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index adf75d69770c..36b133f04d30 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -353,8 +353,6 @@ struct sk_buff;
 
 #define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS
 
-extern int sysctl_max_skb_frags;
-
 /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
  * segment using its current segmentation instead.
  */
diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 003667a1efd6..a6cff6590426 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -38,6 +38,7 @@ struct net_hotdata {
 	int			max_backlog;
 	int			dev_tx_weight;
 	int			dev_rx_weight;
+	int			sysctl_max_skb_frags;
 };
 
 #define inet_ehash_secret	net_hotdata.tcp_protocol.secret
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index c8a7a451c18a..f17cbb4807b9 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -18,5 +18,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.max_backlog = 1000,
 	.dev_tx_weight = 64,
 	.dev_rx_weight = 64,
+	.sysctl_max_skb_frags = MAX_SKB_FRAGS,
 };
 EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0c8b82750000..65779b8f0b12 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -109,9 +109,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 #define SKB_SMALL_HEAD_HEADROOM						\
 	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
 
-int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
-EXPORT_SYMBOL(sysctl_max_skb_frags);
-
 /* kcm_write_msgs() relies on casting paged frags to bio_vec to use
  * iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
  * netmem is a page.
@@ -7040,7 +7037,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
 ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
 			     ssize_t maxsize, gfp_t gfp)
 {
-	size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+	size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
 	struct page *pages[8], **ppages = pages;
 	ssize_t spliced = 0, ret = 0;
 	unsigned int i;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 903ab4a51c17..e75375d54b9e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -595,7 +595,7 @@ static struct ctl_table net_core_table[] = {
 	},
 	{
 		.procname	= "max_skb_frags",
-		.data		= &sysctl_max_skb_frags,
+		.data		= &net_hotdata.sysctl_max_skb_frags,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4ec0f4feee00..388f6e115bf1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -280,6 +280,7 @@
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+#include <net/hotdata.h>
 #include <net/rps.h>
 
 /* Track pending CMSGs. */
@@ -1188,7 +1189,7 @@ new_segment:
 
 			if (!skb_can_coalesce(skb, i, pfrag->page,
 					      pfrag->offset)) {
-				if (i >= READ_ONCE(sysctl_max_skb_frags)) {
+				if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
 					tcp_mark_push(tp, skb);
 					goto new_segment;
 				}
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4b13ca362efa..aff17597e6a7 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -20,6 +20,7 @@
 #include <net/transp_v6.h>
 #endif
 #include <net/mptcp.h>
+#include <net/hotdata.h>
 #include <net/xfrm.h>
 #include <asm/ioctls.h>
 #include "protocol.h"
@@ -1272,7 +1273,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 
 		i = skb_shinfo(skb)->nr_frags;
 		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
-		if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
+		if (!can_coalesce && i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
 			tcp_mark_push(tcp_sk(ssk), skb);
 			goto alloc_skb;
 		}
-- 
cgit v1.2.3


From d480dc76d9f8a72671a6f9a7d987d2389c2e2ee4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 29 Apr 2024 13:40:22 +0000
Subject: net: move sysctl_skb_defer_max to net_hotdata

sysctl_skb_defer_max is used in TCP fast path,
move it to net_hodata.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240429134025.1233626-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h      | 1 +
 net/core/dev.c             | 1 -
 net/core/dev.h             | 1 -
 net/core/hotdata.c         | 1 +
 net/core/skbuff.c          | 2 +-
 net/core/sysctl_net_core.c | 2 +-
 6 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index a6cff6590426..290499f72e18 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -39,6 +39,7 @@ struct net_hotdata {
 	int			dev_tx_weight;
 	int			dev_rx_weight;
 	int			sysctl_max_skb_frags;
+	int			sysctl_skb_defer_max;
 };
 
 #define inet_ehash_secret	net_hotdata.tcp_protocol.secret
diff --git a/net/core/dev.c b/net/core/dev.c
index c9e59eff8ec8..cd7ba50eac15 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4450,7 +4450,6 @@ EXPORT_SYMBOL(__dev_direct_xmit);
  *************************************************************************/
 static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
 
-unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int weight_p __read_mostly = 64;           /* old backlog weight */
 int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
diff --git a/net/core/dev.h b/net/core/dev.h
index 8572d2c8dc4a..b7b518bc2be5 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -36,7 +36,6 @@ int dev_addr_init(struct net_device *dev);
 void dev_addr_check(struct net_device *dev);
 
 /* sysctls not referred to from outside net/core/ */
-extern unsigned int	sysctl_skb_defer_max;
 extern int		netdev_unregister_timeout_secs;
 extern int		weight_p;
 extern int		dev_weight_rx_bias;
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index f17cbb4807b9..a359ff160d54 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -19,5 +19,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.dev_tx_weight = 64,
 	.dev_rx_weight = 64,
 	.sysctl_max_skb_frags = MAX_SKB_FRAGS,
+	.sysctl_skb_defer_max = 64,
 };
 EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 65779b8f0b12..5f382e94b4d1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6985,7 +6985,7 @@ nodefer:	kfree_skb_napi_cache(skb);
 	DEBUG_NET_WARN_ON_ONCE(skb->destructor);
 
 	sd = &per_cpu(softnet_data, cpu);
-	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
 	if (READ_ONCE(sd->defer_count) >= defer_max)
 		goto nodefer;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index e75375d54b9e..118c78615543 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -654,7 +654,7 @@ static struct ctl_table net_core_table[] = {
 	},
 	{
 		.procname	= "skb_defer_max",
-		.data		= &sysctl_skb_defer_max,
+		.data		= &net_hotdata.sysctl_skb_defer_max,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-- 
cgit v1.2.3


From dda4d96acb20c02920f6d9a20fdc3f4846192aeb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 29 Apr 2024 13:40:23 +0000
Subject: tcp: move tcp_out_of_memory() to net/ipv4/tcp.c

tcp_out_of_memory() has a single caller: tcp_check_oom().

Following patch will also make sk_memory_allocated()
not anymore visible from <net/sock.h> and <net/tcp.h>

Add const qualifier to sock argument of tcp_out_of_memory()
and tcp_check_oom().

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240429134025.1233626-4-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/tcp.h | 10 +---------
 net/ipv4/tcp.c    | 10 +++++++++-
 2 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fe98fb01879b..0a51e6a45bce 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -296,14 +296,6 @@ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
 	return seq3 - seq2 >= seq1 - seq2;
 }
 
-static inline bool tcp_out_of_memory(struct sock *sk)
-{
-	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
-		return true;
-	return false;
-}
-
 static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 {
 	sk_wmem_queued_add(sk, -skb->truesize);
@@ -316,7 +308,7 @@ static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
 
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
-bool tcp_check_oom(struct sock *sk, int shift);
+bool tcp_check_oom(const struct sock *sk, int shift);
 
 
 extern struct proto tcp_prot;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 388f6e115bf1..0a3aa3047083 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2752,7 +2752,15 @@ static bool tcp_too_many_orphans(int shift)
 		READ_ONCE(sysctl_tcp_max_orphans);
 }
 
-bool tcp_check_oom(struct sock *sk, int shift)
+static bool tcp_out_of_memory(const struct sock *sk)
+{
+	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+	    sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
+		return true;
+	return false;
+}
+
+bool tcp_check_oom(const struct sock *sk, int shift)
 {
 	bool too_many_orphans, out_of_socket_memory;
 
-- 
cgit v1.2.3


From f3d93817fba30a8d3508fa990405039c0820dca3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 29 Apr 2024 13:40:24 +0000
Subject: net: add <net/proto_memory.h>

Move some proto memory definitions out of <net/sock.h>

Very few files need them, and following patch
will include <net/hotdata.h> from <net/proto_memory.h>

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240429134025.1233626-5-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/proto_memory.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++
 include/net/sock.h         | 78 -------------------------------------------
 net/core/sock.c            |  1 +
 net/core/sysctl_net_core.c |  1 +
 net/ipv4/proc.c            |  1 +
 net/ipv4/tcp.c             |  1 +
 net/ipv4/tcp_input.c       |  1 +
 net/ipv4/tcp_output.c      |  1 +
 net/sctp/sm_statefuns.c    |  1 +
 9 files changed, 90 insertions(+), 78 deletions(-)
 create mode 100644 include/net/proto_memory.h

(limited to 'include')

diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
new file mode 100644
index 000000000000..41404d4bb6f0
--- /dev/null
+++ b/include/net/proto_memory.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _PROTO_MEMORY_H
+#define _PROTO_MEMORY_H
+
+#include <net/sock.h>
+
+/* 1 MB per cpu, in page units */
+#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
+extern int sysctl_mem_pcpu_rsv;
+
+static inline bool sk_has_memory_pressure(const struct sock *sk)
+{
+	return sk->sk_prot->memory_pressure != NULL;
+}
+
+static inline bool
+proto_memory_pressure(const struct proto *prot)
+{
+	if (!prot->memory_pressure)
+		return false;
+	return !!READ_ONCE(*prot->memory_pressure);
+}
+
+static inline bool sk_under_global_memory_pressure(const struct sock *sk)
+{
+	return proto_memory_pressure(sk->sk_prot);
+}
+
+static inline bool sk_under_memory_pressure(const struct sock *sk)
+{
+	if (!sk->sk_prot->memory_pressure)
+		return false;
+
+	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
+		return true;
+
+	return !!READ_ONCE(*sk->sk_prot->memory_pressure);
+}
+
+static inline long
+proto_memory_allocated(const struct proto *prot)
+{
+	return max(0L, atomic_long_read(prot->memory_allocated));
+}
+
+static inline long
+sk_memory_allocated(const struct sock *sk)
+{
+	return proto_memory_allocated(sk->sk_prot);
+}
+
+static inline void proto_memory_pcpu_drain(struct proto *proto)
+{
+	int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0);
+
+	if (val)
+		atomic_long_add(val, proto->memory_allocated);
+}
+
+static inline void
+sk_memory_allocated_add(const struct sock *sk, int val)
+{
+	struct proto *proto = sk->sk_prot;
+
+	val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val);
+
+	if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv)))
+		proto_memory_pcpu_drain(proto);
+}
+
+static inline void
+sk_memory_allocated_sub(const struct sock *sk, int val)
+{
+	struct proto *proto = sk->sk_prot;
+
+	val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val);
+
+	if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv)))
+		proto_memory_pcpu_drain(proto);
+}
+
+#endif /* _PROTO_MEMORY_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 48bcc845202f..0450494a1766 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1371,75 +1371,6 @@ static inline int sk_under_cgroup_hierarchy(struct sock *sk,
 #endif
 }
 
-static inline bool sk_has_memory_pressure(const struct sock *sk)
-{
-	return sk->sk_prot->memory_pressure != NULL;
-}
-
-static inline bool sk_under_global_memory_pressure(const struct sock *sk)
-{
-	return sk->sk_prot->memory_pressure &&
-		!!READ_ONCE(*sk->sk_prot->memory_pressure);
-}
-
-static inline bool sk_under_memory_pressure(const struct sock *sk)
-{
-	if (!sk->sk_prot->memory_pressure)
-		return false;
-
-	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
-	    mem_cgroup_under_socket_pressure(sk->sk_memcg))
-		return true;
-
-	return !!READ_ONCE(*sk->sk_prot->memory_pressure);
-}
-
-static inline long
-proto_memory_allocated(const struct proto *prot)
-{
-	return max(0L, atomic_long_read(prot->memory_allocated));
-}
-
-static inline long
-sk_memory_allocated(const struct sock *sk)
-{
-	return proto_memory_allocated(sk->sk_prot);
-}
-
-/* 1 MB per cpu, in page units */
-#define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
-extern int sysctl_mem_pcpu_rsv;
-
-static inline void proto_memory_pcpu_drain(struct proto *proto)
-{
-	int val = this_cpu_xchg(*proto->per_cpu_fw_alloc, 0);
-
-	if (val)
-		atomic_long_add(val, proto->memory_allocated);
-}
-
-static inline void
-sk_memory_allocated_add(const struct sock *sk, int val)
-{
-	struct proto *proto = sk->sk_prot;
-
-	val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val);
-
-	if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv)))
-		proto_memory_pcpu_drain(proto);
-}
-
-static inline void
-sk_memory_allocated_sub(const struct sock *sk, int val)
-{
-	struct proto *proto = sk->sk_prot;
-
-	val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val);
-
-	if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv)))
-		proto_memory_pcpu_drain(proto);
-}
-
 #define SK_ALLOC_PERCPU_COUNTER_BATCH 16
 
 static inline void sk_sockets_allocated_dec(struct sock *sk)
@@ -1466,15 +1397,6 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
 	return percpu_counter_sum_positive(prot->sockets_allocated);
 }
 
-static inline bool
-proto_memory_pressure(struct proto *prot)
-{
-	if (!prot->memory_pressure)
-		return false;
-	return !!READ_ONCE(*prot->memory_pressure);
-}
-
-
 #ifdef CONFIG_PROC_FS
 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
 struct prot_inuse {
diff --git a/net/core/sock.c b/net/core/sock.c
index fe9195186c13..e0692b752369 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -127,6 +127,7 @@
 #include <net/net_namespace.h>
 #include <net/request_sock.h>
 #include <net/sock.h>
+#include <net/proto_memory.h>
 #include <linux/net_tstamp.h>
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 118c78615543..a452a330d0ed 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -24,6 +24,7 @@
 #include <net/busy_poll.h>
 #include <net/pkt_sched.h>
 #include <net/hotdata.h>
+#include <net/proto_memory.h>
 #include <net/rps.h>
 
 #include "dev.h"
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 914bc9c35cc7..6c4664c681ca 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -33,6 +33,7 @@
 #include <net/protocol.h>
 #include <net/tcp.h>
 #include <net/mptcp.h>
+#include <net/proto_memory.h>
 #include <net/udp.h>
 #include <net/udplite.h>
 #include <linux/bottom_half.h>
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0a3aa3047083..e1f0efbb29d6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -272,6 +272,7 @@
 #include <net/inet_common.h>
 #include <net/tcp.h>
 #include <net/mptcp.h>
+#include <net/proto_memory.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
 #include <net/sock.h>
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53e1150f706f..ad8fa129fcfe 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,7 @@
 #include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/tcp.h>
+#include <net/proto_memory.h>
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ea7ad7d99245..57edf66ff91b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,6 +39,7 @@
 
 #include <net/tcp.h>
 #include <net/mptcp.h>
+#include <net/proto_memory.h>
 
 #include <linux/compiler.h>
 #include <linux/gfp.h>
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 08fdf1251f46..5adf0c0a6c1a 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -38,6 +38,7 @@
 #include <linux/inet.h>
 #include <linux/slab.h>
 #include <net/sock.h>
+#include <net/proto_memory.h>
 #include <net/inet_ecn.h>
 #include <linux/skbuff.h>
 #include <net/sctp/sctp.h>
-- 
cgit v1.2.3


From c204fef97ee62ca3310f43d12bbda4eb10266c7c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 29 Apr 2024 13:40:25 +0000
Subject: net: move sysctl_mem_pcpu_rsv to net_hotdata

sysctl_mem_pcpu_rsv is used in TCP fast path,
move it to net_hodata for better cache locality.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://lore.kernel.org/r/20240429134025.1233626-6-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/hotdata.h      | 1 +
 include/net/proto_memory.h | 6 +++---
 net/core/hotdata.c         | 5 +++--
 net/core/sock.c            | 1 -
 net/core/sysctl_net_core.c | 2 +-
 5 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/hotdata.h b/include/net/hotdata.h
index 290499f72e18..30e9570beb2a 100644
--- a/include/net/hotdata.h
+++ b/include/net/hotdata.h
@@ -40,6 +40,7 @@ struct net_hotdata {
 	int			dev_rx_weight;
 	int			sysctl_max_skb_frags;
 	int			sysctl_skb_defer_max;
+	int			sysctl_mem_pcpu_rsv;
 };
 
 #define inet_ehash_secret	net_hotdata.tcp_protocol.secret
diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h
index 41404d4bb6f0..a6ab2f4f5e28 100644
--- a/include/net/proto_memory.h
+++ b/include/net/proto_memory.h
@@ -3,10 +3,10 @@
 #define _PROTO_MEMORY_H
 
 #include <net/sock.h>
+#include <net/hotdata.h>
 
 /* 1 MB per cpu, in page units */
 #define SK_MEMORY_PCPU_RESERVE (1 << (20 - PAGE_SHIFT))
-extern int sysctl_mem_pcpu_rsv;
 
 static inline bool sk_has_memory_pressure(const struct sock *sk)
 {
@@ -65,7 +65,7 @@ sk_memory_allocated_add(const struct sock *sk, int val)
 
 	val = this_cpu_add_return(*proto->per_cpu_fw_alloc, val);
 
-	if (unlikely(val >= READ_ONCE(sysctl_mem_pcpu_rsv)))
+	if (unlikely(val >= READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv)))
 		proto_memory_pcpu_drain(proto);
 }
 
@@ -76,7 +76,7 @@ sk_memory_allocated_sub(const struct sock *sk, int val)
 
 	val = this_cpu_sub_return(*proto->per_cpu_fw_alloc, val);
 
-	if (unlikely(val <= -READ_ONCE(sysctl_mem_pcpu_rsv)))
+	if (unlikely(val <= -READ_ONCE(net_hotdata.sysctl_mem_pcpu_rsv)))
 		proto_memory_pcpu_drain(proto);
 }
 
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index a359ff160d54..d0aaaaa556f2 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -1,9 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
-#include <net/hotdata.h>
 #include <linux/cache.h>
 #include <linux/jiffies.h>
 #include <linux/list.h>
-
+#include <net/hotdata.h>
+#include <net/proto_memory.h>
 
 struct net_hotdata net_hotdata __cacheline_aligned = {
 	.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
@@ -20,5 +20,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = {
 	.dev_rx_weight = 64,
 	.sysctl_max_skb_frags = MAX_SKB_FRAGS,
 	.sysctl_skb_defer_max = 64,
+	.sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
 };
 EXPORT_SYMBOL(net_hotdata);
diff --git a/net/core/sock.c b/net/core/sock.c
index e0692b752369..8d6e638b5426 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -284,7 +284,6 @@ __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 EXPORT_SYMBOL(sysctl_rmem_max);
 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-int sysctl_mem_pcpu_rsv __read_mostly = SK_MEMORY_PCPU_RESERVE;
 
 int sysctl_tstamp_allow_data __read_mostly = 1;
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index a452a330d0ed..6da5995ac86a 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -416,7 +416,7 @@ static struct ctl_table net_core_table[] = {
 	},
 	{
 		.procname	= "mem_pcpu_rsv",
-		.data		= &sysctl_mem_pcpu_rsv,
+		.data		= &net_hotdata.sysctl_mem_pcpu_rsv,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
-- 
cgit v1.2.3


From a4a87fa4e96c7746e009de06a567688fd9af6013 Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Tue, 30 Apr 2024 09:08:52 +0200
Subject: xfrm: Add Direction to the SA in or out

This patch introduces the 'dir' attribute, 'in' or 'out', to the
xfrm_state, SA, enhancing usability by delineating the scope of values
based on direction. An input SA will restrict values pertinent to input,
effectively segregating them from output-related values.
And an output SA will restrict attributes for output. This change aims
to streamline the configuration process and improve the overall
consistency of SA attributes during configuration.

This feature sets the groundwork for future patches, including
the upcoming IP-TFS patch.

Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h        |   1 +
 include/uapi/linux/xfrm.h |   6 ++
 net/xfrm/xfrm_compat.c    |   7 ++-
 net/xfrm/xfrm_device.c    |   6 ++
 net/xfrm/xfrm_replay.c    |   3 +-
 net/xfrm/xfrm_state.c     |   8 +++
 net/xfrm/xfrm_user.c      | 138 ++++++++++++++++++++++++++++++++++++++++++++--
 7 files changed, 160 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 57c743b7e4fe..7c9be06f8302 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -291,6 +291,7 @@ struct xfrm_state {
 	/* Private data of this transformer, format is opaque,
 	 * interpreted by xfrm_type methods. */
 	void			*data;
+	u8			dir;
 };
 
 static inline struct net *xs_net(struct xfrm_state *x)
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 6a77328be114..18ceaba8486e 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -141,6 +141,11 @@ enum {
 	XFRM_POLICY_MAX	= 3
 };
 
+enum xfrm_sa_dir {
+	XFRM_SA_DIR_IN	= 1,
+	XFRM_SA_DIR_OUT = 2
+};
+
 enum {
 	XFRM_SHARE_ANY,		/* No limitations */
 	XFRM_SHARE_SESSION,	/* For this session only */
@@ -315,6 +320,7 @@ enum xfrm_attr_type_t {
 	XFRMA_SET_MARK_MASK,	/* __u32 */
 	XFRMA_IF_ID,		/* __u32 */
 	XFRMA_MTIMER_THRESH,	/* __u32 in seconds for input SA */
+	XFRMA_SA_DIR,		/* __u8 */
 	__XFRMA_MAX
 
 #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK	/* Compatibility */
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index 655fe4ff8621..703d4172c7d7 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -98,6 +98,7 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = {
 };
 
 static const struct nla_policy compat_policy[XFRMA_MAX+1] = {
+	[XFRMA_UNSPEC]          = { .strict_start_type = XFRMA_SA_DIR },
 	[XFRMA_SA]		= { .len = XMSGSIZE(compat_xfrm_usersa_info)},
 	[XFRMA_POLICY]		= { .len = XMSGSIZE(compat_xfrm_userpolicy_info)},
 	[XFRMA_LASTUSED]	= { .type = NLA_U64},
@@ -129,6 +130,7 @@ static const struct nla_policy compat_policy[XFRMA_MAX+1] = {
 	[XFRMA_SET_MARK_MASK]	= { .type = NLA_U32 },
 	[XFRMA_IF_ID]		= { .type = NLA_U32 },
 	[XFRMA_MTIMER_THRESH]	= { .type = NLA_U32 },
+	[XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
 };
 
 static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb,
@@ -277,9 +279,10 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
 	case XFRMA_SET_MARK_MASK:
 	case XFRMA_IF_ID:
 	case XFRMA_MTIMER_THRESH:
+	case XFRMA_SA_DIR:
 		return xfrm_nla_cpy(dst, src, nla_len(src));
 	default:
-		BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH);
+		BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR);
 		pr_warn_once("unsupported nla_type %d\n", src->nla_type);
 		return -EOPNOTSUPP;
 	}
@@ -434,7 +437,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
 	int err;
 
 	if (type > XFRMA_MAX) {
-		BUILD_BUG_ON(XFRMA_MAX != XFRMA_MTIMER_THRESH);
+		BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_DIR);
 		NL_SET_ERR_MSG(extack, "Bad attribute");
 		return -EOPNOTSUPP;
 	}
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 6346690d5c69..2455a76a1cff 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -253,6 +253,12 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
 		return -EINVAL;
 	}
 
+	if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) ||
+	    (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) {
+		NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction");
+		return -EINVAL;
+	}
+
 	is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET;
 
 	/* We don't yet support UDP encapsulation and TFC padding. */
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index ce56d659c55a..bc56c6305725 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -778,7 +778,8 @@ int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack)
 		}
 
 		if (x->props.flags & XFRM_STATE_ESN) {
-			if (replay_esn->replay_window == 0) {
+			if (replay_esn->replay_window == 0 &&
+			    (!x->dir || x->dir == XFRM_SA_DIR_IN)) {
 				NL_SET_ERR_MSG(extack, "ESN replay window must be > 0");
 				return -EINVAL;
 			}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 0c306473a79d..649bb739df0d 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1292,6 +1292,7 @@ found:
 		if (km_query(x, tmpl, pol) == 0) {
 			spin_lock_bh(&net->xfrm.xfrm_state_lock);
 			x->km.state = XFRM_STATE_ACQ;
+			x->dir = XFRM_SA_DIR_OUT;
 			list_add(&x->km.all, &net->xfrm.state_all);
 			XFRM_STATE_INSERT(bydst, &x->bydst,
 					  net->xfrm.state_bydst + h,
@@ -1744,6 +1745,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
 	x->lastused = orig->lastused;
 	x->new_mapping = 0;
 	x->new_mapping_sport = 0;
+	x->dir = orig->dir;
 
 	return x;
 
@@ -1864,8 +1866,14 @@ int xfrm_state_update(struct xfrm_state *x)
 	}
 
 	if (x1->km.state == XFRM_STATE_ACQ) {
+		if (x->dir && x1->dir != x->dir)
+			goto out;
+
 		__xfrm_state_insert(x);
 		x = NULL;
+	} else {
+		if (x1->dir != x->dir)
+			goto out;
 	}
 	err = 0;
 
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 810b520493f3..f5eb3af4fb81 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -130,7 +130,7 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_a
 }
 
 static inline int verify_replay(struct xfrm_usersa_info *p,
-				struct nlattr **attrs,
+				struct nlattr **attrs, u8 sa_dir,
 				struct netlink_ext_ack *extack)
 {
 	struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL];
@@ -168,6 +168,30 @@ static inline int verify_replay(struct xfrm_usersa_info *p,
 		return -EINVAL;
 	}
 
+	if (sa_dir == XFRM_SA_DIR_OUT)  {
+		if (rs->replay_window) {
+			NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA");
+			return -EINVAL;
+		}
+		if (rs->seq || rs->seq_hi) {
+			NL_SET_ERR_MSG(extack,
+				       "Replay seq and seq_hi should be 0 for output SA");
+			return -EINVAL;
+		}
+		if (rs->bmp_len) {
+			NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA");
+			return -EINVAL;
+		}
+	}
+
+	if (sa_dir == XFRM_SA_DIR_IN)  {
+		if (rs->oseq || rs->oseq_hi) {
+			NL_SET_ERR_MSG(extack,
+				       "Replay oseq and oseq_hi should be 0 for input SA");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 
@@ -176,6 +200,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 			     struct netlink_ext_ack *extack)
 {
 	int err;
+	u8 sa_dir = attrs[XFRMA_SA_DIR] ? nla_get_u8(attrs[XFRMA_SA_DIR]) : 0;
 
 	err = -EINVAL;
 	switch (p->family) {
@@ -334,7 +359,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 		goto out;
 	if ((err = verify_sec_ctx_len(attrs, extack)))
 		goto out;
-	if ((err = verify_replay(p, attrs, extack)))
+	if ((err = verify_replay(p, attrs, sa_dir, extack)))
 		goto out;
 
 	err = -EINVAL;
@@ -358,6 +383,77 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
 			err = -EINVAL;
 			goto out;
 		}
+
+		if (sa_dir == XFRM_SA_DIR_OUT) {
+			NL_SET_ERR_MSG(extack,
+				       "MTIMER_THRESH attribute should not be set on output SA");
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (sa_dir == XFRM_SA_DIR_OUT) {
+		if (p->flags & XFRM_STATE_DECAP_DSCP) {
+			NL_SET_ERR_MSG(extack, "Flag DECAP_DSCP should not be set for output SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (p->flags & XFRM_STATE_ICMP) {
+			NL_SET_ERR_MSG(extack, "Flag ICMP should not be set for output SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (p->flags & XFRM_STATE_WILDRECV) {
+			NL_SET_ERR_MSG(extack, "Flag WILDRECV should not be set for output SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (p->replay_window) {
+			NL_SET_ERR_MSG(extack, "Replay window should be 0 for output SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (attrs[XFRMA_REPLAY_VAL]) {
+			struct xfrm_replay_state *replay;
+
+			replay = nla_data(attrs[XFRMA_REPLAY_VAL]);
+
+			if (replay->seq || replay->bitmap) {
+				NL_SET_ERR_MSG(extack,
+					       "Replay seq and bitmap should be 0 for output SA");
+				err = -EINVAL;
+				goto out;
+			}
+		}
+	}
+
+	if (sa_dir == XFRM_SA_DIR_IN) {
+		if (p->flags & XFRM_STATE_NOPMTUDISC) {
+			NL_SET_ERR_MSG(extack, "Flag NOPMTUDISC should not be set for input SA");
+			err = -EINVAL;
+			goto out;
+		}
+
+		if (attrs[XFRMA_SA_EXTRA_FLAGS]) {
+			u32 xflags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]);
+
+			if (xflags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP) {
+				NL_SET_ERR_MSG(extack, "Flag DONT_ENCAP_DSCP should not be set for input SA");
+				err = -EINVAL;
+				goto out;
+			}
+
+			if (xflags & XFRM_SA_XFLAG_OSEQ_MAY_WRAP) {
+				NL_SET_ERR_MSG(extack, "Flag OSEQ_MAY_WRAP should not be set for input SA");
+				err = -EINVAL;
+				goto out;
+			}
+
+		}
 	}
 
 out:
@@ -734,6 +830,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	if (attrs[XFRMA_IF_ID])
 		x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
 
+	if (attrs[XFRMA_SA_DIR])
+		x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);
+
 	err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack);
 	if (err)
 		goto error;
@@ -1182,8 +1281,13 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 		if (ret)
 			goto out;
 	}
-	if (x->mapping_maxage)
+	if (x->mapping_maxage) {
 		ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
+		if (ret)
+			goto out;
+	}
+	if (x->dir)
+		ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
 out:
 	return ret;
 }
@@ -1618,6 +1722,9 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		goto out;
 
+	if (attrs[XFRMA_SA_DIR])
+		x->dir = nla_get_u8(attrs[XFRMA_SA_DIR]);
+
 	resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq);
 	if (IS_ERR(resp_skb)) {
 		err = PTR_ERR(resp_skb);
@@ -2402,7 +2509,8 @@ static inline unsigned int xfrm_aevent_msgsize(struct xfrm_state *x)
 	       + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur))
 	       + nla_total_size(sizeof(struct xfrm_mark))
 	       + nla_total_size(4) /* XFRM_AE_RTHR */
-	       + nla_total_size(4); /* XFRM_AE_ETHR */
+	       + nla_total_size(4) /* XFRM_AE_ETHR */
+	       + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */
 }
 
 static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
@@ -2459,6 +2567,12 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		goto out_cancel;
 
+	if (x->dir) {
+		err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
+		if (err)
+			goto out_cancel;
+	}
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -3018,6 +3132,7 @@ EXPORT_SYMBOL_GPL(xfrm_msg_min);
 #undef XMSGSIZE
 
 const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
+	[XFRMA_UNSPEC]		= { .strict_start_type = XFRMA_SA_DIR },
 	[XFRMA_SA]		= { .len = sizeof(struct xfrm_usersa_info)},
 	[XFRMA_POLICY]		= { .len = sizeof(struct xfrm_userpolicy_info)},
 	[XFRMA_LASTUSED]	= { .type = NLA_U64},
@@ -3049,6 +3164,7 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_SET_MARK_MASK]	= { .type = NLA_U32 },
 	[XFRMA_IF_ID]		= { .type = NLA_U32 },
 	[XFRMA_MTIMER_THRESH]   = { .type = NLA_U32 },
+	[XFRMA_SA_DIR]          = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
 };
 EXPORT_SYMBOL_GPL(xfrma_policy);
 
@@ -3189,8 +3305,9 @@ static void xfrm_netlink_rcv(struct sk_buff *skb)
 
 static inline unsigned int xfrm_expire_msgsize(void)
 {
-	return NLMSG_ALIGN(sizeof(struct xfrm_user_expire))
-	       + nla_total_size(sizeof(struct xfrm_mark));
+	return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) +
+	       nla_total_size(sizeof(struct xfrm_mark)) +
+	       nla_total_size(sizeof_field(struct xfrm_state, dir));
 }
 
 static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c)
@@ -3217,6 +3334,12 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
 	if (err)
 		return err;
 
+	if (x->dir) {
+		err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir);
+		if (err)
+			return err;
+	}
+
 	nlmsg_end(skb, nlh);
 	return 0;
 }
@@ -3324,6 +3447,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
 	if (x->mapping_maxage)
 		l += nla_total_size(sizeof(x->mapping_maxage));
 
+	if (x->dir)
+		l += nla_total_size(sizeof(x->dir));
+
 	return l;
 }
 
-- 
cgit v1.2.3


From 601a0867f86cbb5e137ce485a7eb60cbf9fc5180 Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Tue, 30 Apr 2024 09:09:09 +0200
Subject: xfrm: Add dir validation to "out" data path lookup

Introduces validation for the x->dir attribute within the XFRM output
data lookup path. If the configured direction does not match the expected
direction, output, increment the XfrmOutStateDirError counter and drop
the packet to ensure data integrity and correct flow handling.

grep -vw 0 /proc/net/xfrm_stat
XfrmOutPolError         	1
XfrmOutStateDirError    	1

Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 Documentation/networking/xfrm_proc.rst | 3 +++
 include/uapi/linux/snmp.h              | 1 +
 net/xfrm/xfrm_policy.c                 | 6 ++++++
 net/xfrm/xfrm_proc.c                   | 1 +
 4 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/xfrm_proc.rst b/Documentation/networking/xfrm_proc.rst
index 0a771c5a7399..5ac3acf4cf51 100644
--- a/Documentation/networking/xfrm_proc.rst
+++ b/Documentation/networking/xfrm_proc.rst
@@ -111,3 +111,6 @@ XfrmOutPolError:
 
 XfrmOutStateInvalid:
 	State is invalid, perhaps expired
+
+XfrmOutStateDirError:
+        State direction mismatch (lookup found an input state on the output path, expected output or no direction)
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index a0819c6a5988..23792b8412bd 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -337,6 +337,7 @@ enum
 	LINUX_MIB_XFRMFWDHDRERROR,		/* XfrmFwdHdrError*/
 	LINUX_MIB_XFRMOUTSTATEINVALID,		/* XfrmOutStateInvalid */
 	LINUX_MIB_XFRMACQUIREERROR,		/* XfrmAcquireError */
+	LINUX_MIB_XFRMOUTSTATEDIRERROR,		/* XfrmOutStateDirError */
 	__LINUX_MIB_XFRMMAX
 };
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 6affe5cd85d8..298b3a9eb48d 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2489,6 +2489,12 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
 
 		x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
 				    family, policy->if_id);
+		if (x && x->dir && x->dir != XFRM_SA_DIR_OUT) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEDIRERROR);
+			xfrm_state_put(x);
+			error = -EINVAL;
+			goto fail;
+		}
 
 		if (x && x->km.state == XFRM_STATE_VALID) {
 			xfrm[nx++] = x;
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 5f9bf8e5c933..98606f1078f7 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -41,6 +41,7 @@ static const struct snmp_mib xfrm_mib_list[] = {
 	SNMP_MIB_ITEM("XfrmFwdHdrError", LINUX_MIB_XFRMFWDHDRERROR),
 	SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID),
 	SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
+	SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3


From 304b44f0d5a4c2f91f82f7c31538d00485fb484c Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Tue, 30 Apr 2024 09:09:29 +0200
Subject: xfrm: Add dir validation to "in" data path lookup

Introduces validation for the x->dir attribute within the XFRM input
data lookup path. If the configured direction does not match the
expected direction, input, increment the XfrmInStateDirError counter
and drop the packet to ensure data integrity and correct flow handling.

grep -vw 0 /proc/net/xfrm_stat
XfrmInStateDirError     	1

Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 Documentation/networking/xfrm_proc.rst |  3 +++
 include/uapi/linux/snmp.h              |  1 +
 net/ipv6/xfrm6_input.c                 |  7 +++++++
 net/xfrm/xfrm_input.c                  | 11 +++++++++++
 net/xfrm/xfrm_proc.c                   |  1 +
 5 files changed, 23 insertions(+)

(limited to 'include')

diff --git a/Documentation/networking/xfrm_proc.rst b/Documentation/networking/xfrm_proc.rst
index 5ac3acf4cf51..973d1571acac 100644
--- a/Documentation/networking/xfrm_proc.rst
+++ b/Documentation/networking/xfrm_proc.rst
@@ -73,6 +73,9 @@ XfrmAcquireError:
 XfrmFwdHdrError:
 	Forward routing of a packet is not allowed
 
+XfrmInStateDirError:
+        State direction mismatch (lookup found an output state on the input path, expected input or no direction)
+
 Outbound errors
 ~~~~~~~~~~~~~~~
 XfrmOutError:
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 23792b8412bd..adf5fd78dd50 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -338,6 +338,7 @@ enum
 	LINUX_MIB_XFRMOUTSTATEINVALID,		/* XfrmOutStateInvalid */
 	LINUX_MIB_XFRMACQUIREERROR,		/* XfrmAcquireError */
 	LINUX_MIB_XFRMOUTSTATEDIRERROR,		/* XfrmOutStateDirError */
+	LINUX_MIB_XFRMINSTATEDIRERROR,		/* XfrmInStateDirError */
 	__LINUX_MIB_XFRMMAX
 };
 
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 2c6aeb090b7a..d5bac0d76b6e 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -266,6 +266,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
 		if (!x)
 			continue;
 
+		if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR);
+			xfrm_state_put(x);
+			x = NULL;
+			continue;
+		}
+
 		spin_lock(&x->lock);
 
 		if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) &&
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 161f535c8b94..71b42de6e3c9 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -466,6 +466,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	if (encap_type < 0 || (xo && xo->flags & XFRM_GRO)) {
 		x = xfrm_input_state(skb);
 
+		if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR);
+			goto drop;
+		}
+
 		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
 			if (x->km.state == XFRM_STATE_ACQ)
 				XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
@@ -571,6 +576,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 			goto drop;
 		}
 
+		if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR);
+			xfrm_state_put(x);
+			goto drop;
+		}
+
 		skb->mark = xfrm_smark_get(skb->mark, x);
 
 		sp->xvec[sp->len++] = x;
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 98606f1078f7..eeb984be03a7 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -42,6 +42,7 @@ static const struct snmp_mib xfrm_mib_list[] = {
 	SNMP_MIB_ITEM("XfrmOutStateInvalid", LINUX_MIB_XFRMOUTSTATEINVALID),
 	SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
 	SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR),
+	SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR),
 	SNMP_MIB_SENTINEL
 };
 
-- 
cgit v1.2.3


From 73142cab3af1b99157837297f437b306d7a70bff Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@kernel.org>
Date: Wed, 7 Feb 2024 16:35:47 +0100
Subject: fprobe: Add entry/exit callbacks types

We are going to store callbacks in following change,
so this will ease up the code.

Link: https://lore.kernel.org/all/20240207153550.856536-2-jolsa@kernel.org/

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/fprobe.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h
index 3e03758151f4..f39869588117 100644
--- a/include/linux/fprobe.h
+++ b/include/linux/fprobe.h
@@ -7,6 +7,16 @@
 #include <linux/ftrace.h>
 #include <linux/rethook.h>
 
+struct fprobe;
+
+typedef int (*fprobe_entry_cb)(struct fprobe *fp, unsigned long entry_ip,
+			       unsigned long ret_ip, struct pt_regs *regs,
+			       void *entry_data);
+
+typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip,
+			       unsigned long ret_ip, struct pt_regs *regs,
+			       void *entry_data);
+
 /**
  * struct fprobe - ftrace based probe.
  * @ops: The ftrace_ops.
@@ -34,12 +44,8 @@ struct fprobe {
 	size_t			entry_data_size;
 	int			nr_maxactive;
 
-	int (*entry_handler)(struct fprobe *fp, unsigned long entry_ip,
-			     unsigned long ret_ip, struct pt_regs *regs,
-			     void *entry_data);
-	void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip,
-			     unsigned long ret_ip, struct pt_regs *regs,
-			     void *entry_data);
+	fprobe_entry_cb entry_handler;
+	fprobe_exit_cb  exit_handler;
 };
 
 /* This fprobe is soft-disabled. */
-- 
cgit v1.2.3


From b0e28a4b5becea84ae6fca5cbd8a6b80a134e223 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Thu, 18 Apr 2024 12:09:08 -0700
Subject: ftrace: make extra rcu_is_watching() validation check optional

Introduce CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING config option to
control whether ftrace low-level code performs additional
rcu_is_watching()-based validation logic in an attempt to catch noinstr
violations.

This check is expected to never be true and is mostly useful for
low-level validation of ftrace subsystem invariants. For most users it
should probably be kept disabled to eliminate unnecessary runtime
overhead.

This improves BPF multi-kretprobe (relying on ftrace and rethook
infrastructure) runtime throughput by 2%, according to BPF benchmarks ([0]).

  [0] https://lore.kernel.org/bpf/CAEf4BzauQ2WKMjZdc9s0rBWa01BYbgwHN6aNDXQSHYia47pQ-w@mail.gmail.com/

Link: https://lore.kernel.org/all/20240418190909.704286-1-andrii@kernel.org/

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/trace_recursion.h |  2 +-
 kernel/trace/Kconfig            | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index d48cd92d2364..24ea8ac049b4 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -135,7 +135,7 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip);
 # define do_ftrace_record_recursion(ip, pip)	do { } while (0)
 #endif
 
-#ifdef CONFIG_ARCH_WANTS_NO_INSTR
+#ifdef CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING
 # define trace_warn_on_no_rcu(ip)					\
 	({								\
 		bool __ret = !rcu_is_watching();			\
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 47345bf1d4a9..d093e16d01c1 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -974,6 +974,19 @@ config FTRACE_RECORD_RECURSION_SIZE
 	  This file can be reset, but the limit can not change in
 	  size at runtime.
 
+config FTRACE_VALIDATE_RCU_IS_WATCHING
+	bool "Validate RCU is on during ftrace execution"
+	depends on FUNCTION_TRACER
+	depends on ARCH_WANTS_NO_INSTR
+	help
+	  All callbacks that attach to the function tracing have some sort of
+	  protection against recursion. This option is only to verify that
+	  ftrace (and other users of ftrace_test_recursion_trylock()) are not
+	  called outside of RCU, as if they are, it can cause a race. But it
+	  also has a noticeable overhead when enabled.
+
+	  If unsure, say N
+
 config RING_BUFFER_RECORD_RECURSION
 	bool "Record functions that recurse in the ring buffer"
 	depends on FTRACE_RECORD_RECURSION
-- 
cgit v1.2.3


From a3b00f10da808bd4a354f890b551cba471082d0e Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Apr 2024 14:52:13 -0700
Subject: objpool: enable inlining objpool_push() and objpool_pop() operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

objpool_push() and objpool_pop() are very performance-critical functions
and can be called very frequently in kretprobe triggering path.

As such, it makes sense to allow compiler to inline them completely to
eliminate function calls overhead. Luckily, their logic is quite well
isolated and doesn't have any sprawling dependencies.

This patch moves both objpool_push() and objpool_pop() into
include/linux/objpool.h and marks them as static inline functions,
enabling inlining. To avoid anyone using internal helpers
(objpool_try_get_slot, objpool_try_add_slot), rename them to use leading
underscores.

We used kretprobe microbenchmark from BPF selftests (bench trig-kprobe
and trig-kprobe-multi benchmarks) running no-op BPF kretprobe/kretprobe.multi
programs in a tight loop to evaluate the effect. BPF own overhead in
this case is minimal and it mostly stresses the rest of in-kernel
kretprobe infrastructure overhead. Results are in millions of calls per
second. This is not super scientific, but shows the trend nevertheless.

BEFORE
======
kretprobe      :    9.794 ± 0.086M/s
kretprobe-multi:   10.219 ± 0.032M/s

AFTER
=====
kretprobe      :    9.937 ± 0.174M/s (+1.5%)
kretprobe-multi:   10.440 ± 0.108M/s (+2.2%)

Link: https://lore.kernel.org/all/20240424215214.3956041-2-andrii@kernel.org/

Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/objpool.h | 101 +++++++++++++++++++++++++++++++++++++++++++++++-
 lib/objpool.c           | 100 -----------------------------------------------
 2 files changed, 99 insertions(+), 102 deletions(-)

(limited to 'include')

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index 15aff4a17f0c..d8b1f7b91128 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -5,6 +5,10 @@
 
 #include <linux/types.h>
 #include <linux/refcount.h>
+#include <linux/atomic.h>
+#include <linux/cpumask.h>
+#include <linux/irqflags.h>
+#include <linux/smp.h>
 
 /*
  * objpool: ring-array based lockless MPMC queue
@@ -118,13 +122,94 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 		 gfp_t gfp, void *context, objpool_init_obj_cb objinit,
 		 objpool_fini_cb release);
 
+/* try to retrieve object from slot */
+static inline void *__objpool_try_get_slot(struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	/* load head snapshot, other cpus may change it */
+	uint32_t head = smp_load_acquire(&slot->head);
+
+	while (head != READ_ONCE(slot->last)) {
+		void *obj;
+
+		/*
+		 * data visibility of 'last' and 'head' could be out of
+		 * order since memory updating of 'last' and 'head' are
+		 * performed in push() and pop() independently
+		 *
+		 * before any retrieving attempts, pop() must guarantee
+		 * 'last' is behind 'head', that is to say, there must
+		 * be available objects in slot, which could be ensured
+		 * by condition 'last != head && last - head <= nr_objs'
+		 * that is equivalent to 'last - head - 1 < nr_objs' as
+		 * 'last' and 'head' are both unsigned int32
+		 */
+		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
+			head = READ_ONCE(slot->head);
+			continue;
+		}
+
+		/* obj must be retrieved before moving forward head */
+		obj = READ_ONCE(slot->entries[head & slot->mask]);
+
+		/* move head forward to mark it's consumption */
+		if (try_cmpxchg_release(&slot->head, &head, head + 1))
+			return obj;
+	}
+
+	return NULL;
+}
+
 /**
  * objpool_pop() - allocate an object from objpool
  * @pool: object pool
  *
  * return value: object ptr or NULL if failed
  */
-void *objpool_pop(struct objpool_head *pool);
+static inline void *objpool_pop(struct objpool_head *pool)
+{
+	void *obj = NULL;
+	unsigned long flags;
+	int i, cpu;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+
+	cpu = raw_smp_processor_id();
+	for (i = 0; i < num_possible_cpus(); i++) {
+		obj = __objpool_try_get_slot(pool, cpu);
+		if (obj)
+			break;
+		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
+	}
+	raw_local_irq_restore(flags);
+
+	return obj;
+}
+
+/* adding object to slot, abort if the slot was already full */
+static inline int
+__objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
+{
+	struct objpool_slot *slot = pool->cpu_slots[cpu];
+	uint32_t head, tail;
+
+	/* loading tail and head as a local snapshot, tail first */
+	tail = READ_ONCE(slot->tail);
+
+	do {
+		head = READ_ONCE(slot->head);
+		/* fault caught: something must be wrong */
+		WARN_ON_ONCE(tail - head > pool->nr_objs);
+	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
+
+	/* now the tail position is reserved for the given obj */
+	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
+	/* update sequence to make this obj available for pop() */
+	smp_store_release(&slot->last, tail + 1);
+
+	return 0;
+}
 
 /**
  * objpool_push() - reclaim the object and return back to objpool
@@ -134,7 +219,19 @@ void *objpool_pop(struct objpool_head *pool);
  * return: 0 or error code (it fails only when user tries to push
  * the same object multiple times or wrong "objects" into objpool)
  */
-int objpool_push(void *obj, struct objpool_head *pool);
+static inline int objpool_push(void *obj, struct objpool_head *pool)
+{
+	unsigned long flags;
+	int rc;
+
+	/* disable local irq to avoid preemption & interruption */
+	raw_local_irq_save(flags);
+	rc = __objpool_try_add_slot(obj, pool, raw_smp_processor_id());
+	raw_local_irq_restore(flags);
+
+	return rc;
+}
+
 
 /**
  * objpool_drop() - discard the object and deref objpool
diff --git a/lib/objpool.c b/lib/objpool.c
index cfdc02420884..f696308fc026 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -152,106 +152,6 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 }
 EXPORT_SYMBOL_GPL(objpool_init);
 
-/* adding object to slot, abort if the slot was already full */
-static inline int
-objpool_try_add_slot(void *obj, struct objpool_head *pool, int cpu)
-{
-	struct objpool_slot *slot = pool->cpu_slots[cpu];
-	uint32_t head, tail;
-
-	/* loading tail and head as a local snapshot, tail first */
-	tail = READ_ONCE(slot->tail);
-
-	do {
-		head = READ_ONCE(slot->head);
-		/* fault caught: something must be wrong */
-		WARN_ON_ONCE(tail - head > pool->nr_objs);
-	} while (!try_cmpxchg_acquire(&slot->tail, &tail, tail + 1));
-
-	/* now the tail position is reserved for the given obj */
-	WRITE_ONCE(slot->entries[tail & slot->mask], obj);
-	/* update sequence to make this obj available for pop() */
-	smp_store_release(&slot->last, tail + 1);
-
-	return 0;
-}
-
-/* reclaim an object to object pool */
-int objpool_push(void *obj, struct objpool_head *pool)
-{
-	unsigned long flags;
-	int rc;
-
-	/* disable local irq to avoid preemption & interruption */
-	raw_local_irq_save(flags);
-	rc = objpool_try_add_slot(obj, pool, raw_smp_processor_id());
-	raw_local_irq_restore(flags);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(objpool_push);
-
-/* try to retrieve object from slot */
-static inline void *objpool_try_get_slot(struct objpool_head *pool, int cpu)
-{
-	struct objpool_slot *slot = pool->cpu_slots[cpu];
-	/* load head snapshot, other cpus may change it */
-	uint32_t head = smp_load_acquire(&slot->head);
-
-	while (head != READ_ONCE(slot->last)) {
-		void *obj;
-
-		/*
-		 * data visibility of 'last' and 'head' could be out of
-		 * order since memory updating of 'last' and 'head' are
-		 * performed in push() and pop() independently
-		 *
-		 * before any retrieving attempts, pop() must guarantee
-		 * 'last' is behind 'head', that is to say, there must
-		 * be available objects in slot, which could be ensured
-		 * by condition 'last != head && last - head <= nr_objs'
-		 * that is equivalent to 'last - head - 1 < nr_objs' as
-		 * 'last' and 'head' are both unsigned int32
-		 */
-		if (READ_ONCE(slot->last) - head - 1 >= pool->nr_objs) {
-			head = READ_ONCE(slot->head);
-			continue;
-		}
-
-		/* obj must be retrieved before moving forward head */
-		obj = READ_ONCE(slot->entries[head & slot->mask]);
-
-		/* move head forward to mark it's consumption */
-		if (try_cmpxchg_release(&slot->head, &head, head + 1))
-			return obj;
-	}
-
-	return NULL;
-}
-
-/* allocate an object from object pool */
-void *objpool_pop(struct objpool_head *pool)
-{
-	void *obj = NULL;
-	unsigned long flags;
-	int i, cpu;
-
-	/* disable local irq to avoid preemption & interruption */
-	raw_local_irq_save(flags);
-
-	cpu = raw_smp_processor_id();
-	for (i = 0; i < num_possible_cpus(); i++) {
-		obj = objpool_try_get_slot(pool, cpu);
-		if (obj)
-			break;
-		cpu = cpumask_next_wrap(cpu, cpu_possible_mask, -1, 1);
-	}
-	raw_local_irq_restore(flags);
-
-	return obj;
-}
-EXPORT_SYMBOL_GPL(objpool_pop);
-
 /* release whole objpool forcely */
 void objpool_free(struct objpool_head *pool)
 {
-- 
cgit v1.2.3


From 78d0b16127daa26d016c215a089ae330878291f7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko <andrii@kernel.org>
Date: Wed, 24 Apr 2024 14:52:14 -0700
Subject: objpool: cache nr_possible_cpus() and avoid caching nr_cpu_ids
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Profiling shows that calling nr_possible_cpus() in objpool_pop() takes
a noticeable amount of CPU (when profiled on 80-core machine), as we
need to recalculate number of set bits in a CPU bit mask. This number
can't change, so there is no point in paying the price for recalculating
it. As such, cache this value in struct objpool_head and use it in
objpool_pop().

On the other hand, cached pool->nr_cpus isn't necessary, as it's not
used in hot path and is also a pretty trivial value to retrieve. So drop
pool->nr_cpus in favor of using nr_cpu_ids everywhere. This way the size
of struct objpool_head remains the same, which is a nice bonus.

Same BPF selftests benchmarks were used to evaluate the effect. Using
changes in previous patch (inlining of objpool_pop/objpool_push) as
baseline, here are the differences:

BASELINE
========
kretprobe      :    9.937 ± 0.174M/s
kretprobe-multi:   10.440 ± 0.108M/s

AFTER
=====
kretprobe      :   10.106 ± 0.120M/s (+1.7%)
kretprobe-multi:   10.515 ± 0.180M/s (+0.7%)

Link: https://lore.kernel.org/all/20240424215214.3956041-3-andrii@kernel.org/

Cc: Matt (Qiang) Wu <wuqiang.matt@bytedance.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 include/linux/objpool.h |  6 +++---
 lib/objpool.c           | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/objpool.h b/include/linux/objpool.h
index d8b1f7b91128..cb1758eaa2d3 100644
--- a/include/linux/objpool.h
+++ b/include/linux/objpool.h
@@ -73,7 +73,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
  * struct objpool_head - object pooling metadata
  * @obj_size:   object size, aligned to sizeof(void *)
  * @nr_objs:    total objs (to be pre-allocated with objpool)
- * @nr_cpus:    local copy of nr_cpu_ids
+ * @nr_possible_cpus: cached value of num_possible_cpus()
  * @capacity:   max objs can be managed by one objpool_slot
  * @gfp:        gfp flags for kmalloc & vmalloc
  * @ref:        refcount of objpool
@@ -85,7 +85,7 @@ typedef int (*objpool_fini_cb)(struct objpool_head *head, void *context);
 struct objpool_head {
 	int                     obj_size;
 	int                     nr_objs;
-	int                     nr_cpus;
+	int                     nr_possible_cpus;
 	int                     capacity;
 	gfp_t                   gfp;
 	refcount_t              ref;
@@ -176,7 +176,7 @@ static inline void *objpool_pop(struct objpool_head *pool)
 	raw_local_irq_save(flags);
 
 	cpu = raw_smp_processor_id();
-	for (i = 0; i < num_possible_cpus(); i++) {
+	for (i = 0; i < pool->nr_possible_cpus; i++) {
 		obj = __objpool_try_get_slot(pool, cpu);
 		if (obj)
 			break;
diff --git a/lib/objpool.c b/lib/objpool.c
index f696308fc026..234f9d0bd081 100644
--- a/lib/objpool.c
+++ b/lib/objpool.c
@@ -50,7 +50,7 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 {
 	int i, cpu_count = 0;
 
-	for (i = 0; i < pool->nr_cpus; i++) {
+	for (i = 0; i < nr_cpu_ids; i++) {
 
 		struct objpool_slot *slot;
 		int nodes, size, rc;
@@ -60,8 +60,8 @@ objpool_init_percpu_slots(struct objpool_head *pool, int nr_objs,
 			continue;
 
 		/* compute how many objects to be allocated with this slot */
-		nodes = nr_objs / num_possible_cpus();
-		if (cpu_count < (nr_objs % num_possible_cpus()))
+		nodes = nr_objs / pool->nr_possible_cpus;
+		if (cpu_count < (nr_objs % pool->nr_possible_cpus))
 			nodes++;
 		cpu_count++;
 
@@ -103,7 +103,7 @@ static void objpool_fini_percpu_slots(struct objpool_head *pool)
 	if (!pool->cpu_slots)
 		return;
 
-	for (i = 0; i < pool->nr_cpus; i++)
+	for (i = 0; i < nr_cpu_ids; i++)
 		kvfree(pool->cpu_slots[i]);
 	kfree(pool->cpu_slots);
 }
@@ -130,13 +130,13 @@ int objpool_init(struct objpool_head *pool, int nr_objs, int object_size,
 
 	/* initialize objpool pool */
 	memset(pool, 0, sizeof(struct objpool_head));
-	pool->nr_cpus = nr_cpu_ids;
+	pool->nr_possible_cpus = num_possible_cpus();
 	pool->obj_size = object_size;
 	pool->capacity = capacity;
 	pool->gfp = gfp & ~__GFP_ZERO;
 	pool->context = context;
 	pool->release = release;
-	slot_size = pool->nr_cpus * sizeof(struct objpool_slot);
+	slot_size = nr_cpu_ids * sizeof(struct objpool_slot);
 	pool->cpu_slots = kzalloc(slot_size, pool->gfp);
 	if (!pool->cpu_slots)
 		return -ENOMEM;
-- 
cgit v1.2.3


From cd7eb8f83fcf258f71e293f7fc52a70be8ed0128 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@linaro.org>
Date: Sun, 28 Apr 2024 17:26:44 +0300
Subject: mm/slab: make __free(kfree) accept error pointers

Currently, if an automatically freed allocation is an error pointer that
will lead to a crash.  An example of this is in wm831x_gpio_dbg_show().

   171	char *label __free(kfree) = gpiochip_dup_line_label(chip, i);
   172	if (IS_ERR(label)) {
   173		dev_err(wm831x->dev, "Failed to duplicate label\n");
   174		continue;
   175  }

The auto clean up function should check for error pointers as well,
otherwise we're going to keep hitting issues like this.

Fixes: 54da6a092431 ("locking: Introduce __cleanup() based infrastructure")
Cc: <stable@vger.kernel.org>
Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slab.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index e53cbfa18325..739b21262507 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -266,7 +266,7 @@ void kfree(const void *objp);
 void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);
 
-DEFINE_FREE(kfree, void *, if (_T) kfree(_T))
+DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
 
 /**
  * ksize - Report actual allocation size of associated object
@@ -792,7 +792,7 @@ static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t fla
 extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
 		      __realloc_size(3);
 extern void kvfree(const void *addr);
-DEFINE_FREE(kvfree, void *, if (_T) kvfree(_T))
+DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))
 
 extern void kvfree_sensitive(const void *addr, size_t len);
 
-- 
cgit v1.2.3


From 948e1a53c2e95ad4c03cc6201edcb5d92e87d841 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 23 Apr 2024 16:05:24 +0100
Subject: KVM: arm64: Simplify vgic-v3 hypercalls

Consolidate the GICv3 VMCR accessor hypercalls into the APR save/restore
hypercalls so that all of the EL2 GICv3 state is covered by a single pair
of hypercalls.

Signed-off-by: Fuad Tabba <tabba@google.com>
Acked-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240423150538.2103045-17-tabba@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_asm.h   |  8 ++------
 arch/arm64/include/asm/kvm_hyp.h   |  4 ++--
 arch/arm64/kvm/arm.c               |  5 ++---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 24 ++++++------------------
 arch/arm64/kvm/hyp/vgic-v3-sr.c    | 27 +++++++++++++++++++++++----
 arch/arm64/kvm/vgic/vgic-v2.c      |  9 +--------
 arch/arm64/kvm/vgic/vgic-v3.c      | 23 ++---------------------
 arch/arm64/kvm/vgic/vgic.c         | 11 -----------
 arch/arm64/kvm/vgic/vgic.h         |  2 --
 include/kvm/arm_vgic.h             |  1 -
 10 files changed, 38 insertions(+), 76 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 24b5e6b23417..a6330460d9e5 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -73,10 +73,8 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_range,
 	__KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context,
 	__KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff,
-	__KVM_HOST_SMCCC_FUNC___vgic_v3_read_vmcr,
-	__KVM_HOST_SMCCC_FUNC___vgic_v3_write_vmcr,
-	__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
-	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs,
+	__KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs,
+	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps,
 	__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
 	__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
@@ -241,8 +239,6 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_gic_config(void);
-extern u64 __vgic_v3_read_vmcr(void);
-extern void __vgic_v3_write_vmcr(u32 vmcr);
 extern void __vgic_v3_init_lrs(void);
 
 extern u64 __kvm_get_mdcr_el2(void);
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 3e2a1ac0c9bb..3e80464f8953 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -80,8 +80,8 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if);
 void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if);
 void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if);
 void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if);
-void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if);
-void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
+void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if);
 int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_NVHE_HYPERVISOR__
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 7f87fbb452c5..b6b6f60becdf 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -784,9 +784,8 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
 	 * doorbells to be signalled, should an interrupt become pending.
 	 */
 	preempt_disable();
-	kvm_vgic_vmcr_sync(vcpu);
 	vcpu_set_flag(vcpu, IN_WFI);
-	vgic_v4_put(vcpu);
+	kvm_vgic_put(vcpu);
 	preempt_enable();
 
 	kvm_vcpu_halt(vcpu);
@@ -794,7 +793,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 	vcpu_clear_flag(vcpu, IN_WFI);
-	vgic_v4_load(vcpu);
+	kvm_vgic_load(vcpu);
 	preempt_enable();
 }
 
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 26561c562f7a..d5c48dc98f67 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -175,16 +175,6 @@ static void handle___vgic_v3_get_gic_config(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __vgic_v3_get_gic_config();
 }
 
-static void handle___vgic_v3_read_vmcr(struct kvm_cpu_context *host_ctxt)
-{
-	cpu_reg(host_ctxt, 1) = __vgic_v3_read_vmcr();
-}
-
-static void handle___vgic_v3_write_vmcr(struct kvm_cpu_context *host_ctxt)
-{
-	__vgic_v3_write_vmcr(cpu_reg(host_ctxt, 1));
-}
-
 static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
 {
 	__vgic_v3_init_lrs();
@@ -195,18 +185,18 @@ static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2();
 }
 
-static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
 
-	__vgic_v3_save_aprs(kern_hyp_va(cpu_if));
+	__vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if));
 }
 
-static void handle___vgic_v3_restore_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
 
-	__vgic_v3_restore_aprs(kern_hyp_va(cpu_if));
+	__vgic_v3_restore_vmcr_aprs(kern_hyp_va(cpu_if));
 }
 
 static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt)
@@ -337,10 +327,8 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
 	HANDLE_FUNC(__kvm_flush_cpu_context),
 	HANDLE_FUNC(__kvm_timer_set_cntvoff),
-	HANDLE_FUNC(__vgic_v3_read_vmcr),
-	HANDLE_FUNC(__vgic_v3_write_vmcr),
-	HANDLE_FUNC(__vgic_v3_save_aprs),
-	HANDLE_FUNC(__vgic_v3_restore_aprs),
+	HANDLE_FUNC(__vgic_v3_save_vmcr_aprs),
+	HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
 	HANDLE_FUNC(__pkvm_vcpu_init_traps),
 	HANDLE_FUNC(__pkvm_init_vm),
 	HANDLE_FUNC(__pkvm_init_vcpu),
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 6cb638b184b1..7b397fad26f2 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -330,7 +330,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
 		write_gicreg(0, ICH_HCR_EL2);
 }
 
-void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
+static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
 {
 	u64 val;
 	u32 nr_pre_bits;
@@ -363,7 +363,7 @@ void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
 	}
 }
 
-void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
+static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
 {
 	u64 val;
 	u32 nr_pre_bits;
@@ -455,16 +455,35 @@ u64 __vgic_v3_get_gic_config(void)
 	return val;
 }
 
-u64 __vgic_v3_read_vmcr(void)
+static u64 __vgic_v3_read_vmcr(void)
 {
 	return read_gicreg(ICH_VMCR_EL2);
 }
 
-void __vgic_v3_write_vmcr(u32 vmcr)
+static void __vgic_v3_write_vmcr(u32 vmcr)
 {
 	write_gicreg(vmcr, ICH_VMCR_EL2);
 }
 
+void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+	__vgic_v3_save_aprs(cpu_if);
+	if (cpu_if->vgic_sre)
+		cpu_if->vgic_vmcr = __vgic_v3_read_vmcr();
+}
+
+void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
+{
+	/*
+	 * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
+	 * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
+	 * VMCR_EL2 save/restore in the world switch.
+	 */
+	if (cpu_if->vgic_sre)
+		__vgic_v3_write_vmcr(cpu_if->vgic_vmcr);
+	__vgic_v3_restore_aprs(cpu_if);
+}
+
 static int __vgic_v3_bpr_min(void)
 {
 	/* See Pseudocode for VPriorityGroup */
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 7e9cdb78f7ce..ae5a44d5702d 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -464,17 +464,10 @@ void vgic_v2_load(struct kvm_vcpu *vcpu)
 		       kvm_vgic_global_state.vctrl_base + GICH_APR);
 }
 
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-
-	cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
-}
-
 void vgic_v2_put(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
 
-	vgic_v2_vmcr_sync(vcpu);
+	cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
 	cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
 }
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 4ea3340786b9..ed6e412cd74b 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -722,15 +722,7 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
 
-	/*
-	 * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
-	 * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
-	 * VMCR_EL2 save/restore in the world switch.
-	 */
-	if (likely(cpu_if->vgic_sre))
-		kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr);
-
-	kvm_call_hyp(__vgic_v3_restore_aprs, cpu_if);
+	kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if);
 
 	if (has_vhe())
 		__vgic_v3_activate_traps(cpu_if);
@@ -738,24 +730,13 @@ void vgic_v3_load(struct kvm_vcpu *vcpu)
 	WARN_ON(vgic_v4_load(vcpu));
 }
 
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
-
-	if (likely(cpu_if->vgic_sre))
-		cpu_if->vgic_vmcr = kvm_call_hyp_ret(__vgic_v3_read_vmcr);
-}
-
 void vgic_v3_put(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
 
+	kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if);
 	WARN_ON(vgic_v4_put(vcpu));
 
-	vgic_v3_vmcr_sync(vcpu);
-
-	kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
-
 	if (has_vhe())
 		__vgic_v3_deactivate_traps(cpu_if);
 }
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 4ec93587c8cd..fcc5747f51e9 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -939,17 +939,6 @@ void kvm_vgic_put(struct kvm_vcpu *vcpu)
 		vgic_v3_put(vcpu);
 }
 
-void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu)
-{
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
-		return;
-
-	if (kvm_vgic_global_state.type == VGIC_V2)
-		vgic_v2_vmcr_sync(vcpu);
-	else
-		vgic_v3_vmcr_sync(vcpu);
-}
-
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index 0c2b82de8fa3..4b93528e6a89 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -214,7 +214,6 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
 void vgic_v2_init_lrs(void);
 void vgic_v2_load(struct kvm_vcpu *vcpu);
 void vgic_v2_put(struct kvm_vcpu *vcpu);
-void vgic_v2_vmcr_sync(struct kvm_vcpu *vcpu);
 
 void vgic_v2_save_state(struct kvm_vcpu *vcpu);
 void vgic_v2_restore_state(struct kvm_vcpu *vcpu);
@@ -253,7 +252,6 @@ bool vgic_v3_check_base(struct kvm *kvm);
 
 void vgic_v3_load(struct kvm_vcpu *vcpu);
 void vgic_v3_put(struct kvm_vcpu *vcpu);
-void vgic_v3_vmcr_sync(struct kvm_vcpu *vcpu);
 
 bool vgic_has_its(struct kvm *kvm);
 int kvm_vgic_register_its_device(void);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 47035946648e..0c3cce31e0a2 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -388,7 +388,6 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 
 void kvm_vgic_load(struct kvm_vcpu *vcpu);
 void kvm_vgic_put(struct kvm_vcpu *vcpu);
-void kvm_vgic_vmcr_sync(struct kvm_vcpu *vcpu);
 
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)	((k)->arch.vgic.initialized)
-- 
cgit v1.2.3


From 5e4a264bf8b55d5a1b4c8cd96ac10b99f98d7487 Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Fri, 26 Apr 2024 20:34:00 -0700
Subject: acpi/ghes: Process CXL Component Events

BIOS can configure memory devices as firmware first.  This will send CXL
events to the firmware instead of the OS.  The firmware can then inform
the OS of these events via UEFI.

UEFI v2.10 section N.2.14 defines a Common Platform Error Record (CPER)
format for CXL Component Events.  The format is mostly the same as the
CXL Common Event Record Format.  The difference lies in the use of a
GUID as the CPER Section Type which matches the UUID defined in CXL 3.1
Table 8-43.

Currently a configuration such as this will trace a non standard event
in the log omitting useful details of the event.  In addition the CXL
sub-system contains additional region and HPA information useful to the
user.[0]

The CXL code is required to be called from process context as it needs
to take a device lock.  The GHES code may be in interrupt context.  This
complicated the use of a callback.  Dan Williams suggested the use of
work items as an atomic way of switching between the callback execution
and a default handler.[1]

The use of a kfifo simplifies queue processing by providing lock free
fifo operations.  cxl_cper_kfifo_get() allows easier management of the
kfifo between the ghes and cxl modules.

CXL 3.1 Table 8-127 requires a device to have a queue depth of 1 for
each of the four event logs.  A combined queue depth of 32 is chosen to
provide room for 8 entries of each log type.

Add GHES support to detect CXL CPER records.  Add the ability for the
CXL sub-system to register a work queue to process the events.

This patch adds back the functionality which was removed to fix the
report by Dan Carpenter[2].

Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Suggested-by: Dan Carpenter <dan.carpenter@linaro.org>
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Link: http://lore.kernel.org/r/cover.1711598777.git.alison.schofield@intel.com [0]
Link: http://lore.kernel.org/r/65d111eb87115_6c745294ac@dwillia2-xfh.jf.intel.com.notmuch [1]
Link: http://lore.kernel.org/r/b963c490-2c13-4b79-bbe7-34c6568423c7@moroto.mountain [2]
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Tested-by: Smita Koralahalli <Smita.KoralahalliChannabasappa@amd.com>
Link: https://lore.kernel.org/r/20240426-cxl-cper3-v4-1-58076cce1624@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/acpi/apei/ghes.c  | 110 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/cxl-event.h |  27 ++++++++++++
 2 files changed, 137 insertions(+)

(limited to 'include')

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 512067cac170..2247a1535b52 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -26,6 +26,8 @@
 #include <linux/interrupt.h>
 #include <linux/timer.h>
 #include <linux/cper.h>
+#include <linux/cleanup.h>
+#include <linux/cxl-event.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
 #include <linux/ratelimit.h>
@@ -33,6 +35,7 @@
 #include <linux/irq_work.h>
 #include <linux/llist.h>
 #include <linux/genalloc.h>
+#include <linux/kfifo.h>
 #include <linux/pci.h>
 #include <linux/pfn.h>
 #include <linux/aer.h>
@@ -673,6 +676,101 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
 	schedule_work(&entry->work);
 }
 
+/* CXL Event record UUIDs are formated as GUIDs and reported in section type */
+
+/*
+ * General Media Event Record
+ * CXL rev 3.0 Section 8.2.9.2.1.1; Table 8-43
+ */
+#define CPER_SEC_CXL_GEN_MEDIA_GUID					\
+	GUID_INIT(0xfbcd0a77, 0xc260, 0x417f,				\
+		  0x85, 0xa9, 0x08, 0x8b, 0x16, 0x21, 0xeb, 0xa6)
+
+/*
+ * DRAM Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.2; Table 8-44
+ */
+#define CPER_SEC_CXL_DRAM_GUID						\
+	GUID_INIT(0x601dcbb3, 0x9c06, 0x4eab,				\
+		  0xb8, 0xaf, 0x4e, 0x9b, 0xfb, 0x5c, 0x96, 0x24)
+
+/*
+ * Memory Module Event Record
+ * CXL rev 3.0 section 8.2.9.2.1.3; Table 8-45
+ */
+#define CPER_SEC_CXL_MEM_MODULE_GUID					\
+	GUID_INIT(0xfe927475, 0xdd59, 0x4339,				\
+		  0xa5, 0x86, 0x79, 0xba, 0xb1, 0x13, 0xb7, 0x74)
+
+/* Room for 8 entries for each of the 4 event log queues */
+#define CXL_CPER_FIFO_DEPTH 32
+DEFINE_KFIFO(cxl_cper_fifo, struct cxl_cper_work_data, CXL_CPER_FIFO_DEPTH);
+
+/* Synchronize schedule_work() with cxl_cper_work changes */
+static DEFINE_SPINLOCK(cxl_cper_work_lock);
+struct work_struct *cxl_cper_work;
+
+static void cxl_cper_post_event(enum cxl_event_type event_type,
+				struct cxl_cper_event_rec *rec)
+{
+	struct cxl_cper_work_data wd;
+
+	if (rec->hdr.length <= sizeof(rec->hdr) ||
+	    rec->hdr.length > sizeof(*rec)) {
+		pr_err(FW_WARN "CXL CPER Invalid section length (%u)\n",
+		       rec->hdr.length);
+		return;
+	}
+
+	if (!(rec->hdr.validation_bits & CPER_CXL_COMP_EVENT_LOG_VALID)) {
+		pr_err(FW_WARN "CXL CPER invalid event\n");
+		return;
+	}
+
+	guard(spinlock_irqsave)(&cxl_cper_work_lock);
+
+	if (!cxl_cper_work)
+		return;
+
+	wd.event_type = event_type;
+	memcpy(&wd.rec, rec, sizeof(wd.rec));
+
+	if (!kfifo_put(&cxl_cper_fifo, wd)) {
+		pr_err_ratelimited("CXL CPER kfifo overflow\n");
+		return;
+	}
+
+	schedule_work(cxl_cper_work);
+}
+
+int cxl_cper_register_work(struct work_struct *work)
+{
+	if (cxl_cper_work)
+		return -EINVAL;
+
+	guard(spinlock)(&cxl_cper_work_lock);
+	cxl_cper_work = work;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_register_work, CXL);
+
+int cxl_cper_unregister_work(struct work_struct *work)
+{
+	if (cxl_cper_work != work)
+		return -EINVAL;
+
+	guard(spinlock)(&cxl_cper_work_lock);
+	cxl_cper_work = NULL;
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_unregister_work, CXL);
+
+int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
+{
+	return kfifo_get(&cxl_cper_fifo, wd);
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cper_kfifo_get, CXL);
+
 static bool ghes_do_proc(struct ghes *ghes,
 			 const struct acpi_hest_generic_status *estatus)
 {
@@ -707,6 +805,18 @@ static bool ghes_do_proc(struct ghes *ghes,
 		}
 		else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
 			queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
+			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_event(CXL_CPER_EVENT_GEN_MEDIA, rec);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM_GUID)) {
+			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_event(CXL_CPER_EVENT_DRAM, rec);
+		} else if (guid_equal(sec_type, &CPER_SEC_CXL_MEM_MODULE_GUID)) {
+			struct cxl_cper_event_rec *rec = acpi_hest_get_payload(gdata);
+
+			cxl_cper_post_event(CXL_CPER_EVENT_MEM_MODULE, rec);
 		} else {
 			void *err = acpi_hest_get_payload(gdata);
 
diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 03fa6d50d46f..a0067c49e2ca 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -3,6 +3,8 @@
 #ifndef _LINUX_CXL_EVENT_H
 #define _LINUX_CXL_EVENT_H
 
+#include <linux/workqueue_types.h>
+
 /*
  * Common Event Record Format
  * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
@@ -140,4 +142,29 @@ struct cxl_cper_event_rec {
 	union cxl_event event;
 } __packed;
 
+struct cxl_cper_work_data {
+	enum cxl_event_type event_type;
+	struct cxl_cper_event_rec rec;
+};
+
+#ifdef CONFIG_ACPI_APEI_GHES
+int cxl_cper_register_work(struct work_struct *work);
+int cxl_cper_unregister_work(struct work_struct *work);
+int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
+#else
+static inline int cxl_cper_register_work(struct work_struct *work);
+{
+	return 0;
+}
+
+static inline int cxl_cper_unregister_work(struct work_struct *work);
+{
+	return 0;
+}
+static inline int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_CXL_EVENT_H */
-- 
cgit v1.2.3


From 74e797d79cf131d9b1a54001cf64a0b492a83e60 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Mar 2024 08:51:38 +0000
Subject: mm: Provide a means of invalidation without using launder_folio

Implement a replacement for launder_folio.  The key feature of
invalidate_inode_pages2() is that it locks each folio individually, unmaps
it to prevent mmap'd accesses interfering and calls the ->launder_folio()
address_space op to flush it.  This has problems: firstly, each folio is
written individually as one or more small writes; secondly, adjacent folios
cannot be added so easily into the laundry; thirdly, it's yet another op to
implement.

Instead, use the invalidate lock to cause anyone wanting to add a folio to
the inode to wait, then unmap all the folios if we have mmaps, then,
conditionally, use ->writepages() to flush any dirty data back and then
discard all pages.

The invalidate lock prevents ->read_iter(), ->write_iter() and faulting
through mmap all from adding pages for the duration.

This is then used from netfslib to handle the flusing in unbuffered and
direct writes.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Matthew Wilcox <willy@infradead.org>
cc: Miklos Szeredi <miklos@szeredi.hu>
cc: Trond Myklebust <trond.myklebust@hammerspace.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Alexander Viro <viro@zeniv.linux.org.uk>
cc: Christian Brauner <brauner@kernel.org>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-mm@kvack.org
cc: linux-fsdevel@vger.kernel.org
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: linux-nfs@vger.kernel.org
cc: devel@lists.orangefs.org
---
 fs/netfs/direct_write.c | 28 +++++++++++++++++++++----
 include/linux/pagemap.h |  2 ++
 mm/filemap.c            | 54 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index bee047e20f5d..2b81cd4aae6e 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -132,12 +132,14 @@ out:
 ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
 	struct netfs_inode *ictx = netfs_inode(inode);
-	unsigned long long end;
 	ssize_t ret;
+	loff_t pos = iocb->ki_pos;
+	unsigned long long end = pos + iov_iter_count(from) - 1;
 
-	_enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+	_enter("%llx,%zx,%llx", pos, iov_iter_count(from), i_size_read(inode));
 
 	if (!iov_iter_count(from))
 		return 0;
@@ -157,7 +159,25 @@ ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = file_update_time(file);
 	if (ret < 0)
 		goto out;
-	ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+	if (iocb->ki_flags & IOCB_NOWAIT) {
+		/* We could block if there are any pages in the range. */
+		ret = -EAGAIN;
+		if (filemap_range_has_page(mapping, pos, end))
+			if (filemap_invalidate_inode(inode, true, pos, end))
+				goto out;
+	} else {
+		ret = filemap_write_and_wait_range(mapping, pos, end);
+		if (ret < 0)
+			goto out;
+	}
+
+	/*
+	 * After a write we want buffered reads to be sure to go to disk to get
+	 * the new data.  We invalidate clean cached page from the region we're
+	 * about to write.  We do this *before* the write so that we can return
+	 * without clobbering -EIOCBQUEUED from ->direct_IO().
+	 */
+	ret = filemap_invalidate_inode(inode, true, pos, end);
 	if (ret < 0)
 		goto out;
 	end = iocb->ki_pos + iov_iter_count(from);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2df35e65557d..c5e33e2ca48a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -40,6 +40,8 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping);
 int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
 int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
 		loff_t start_byte, loff_t end_byte);
+int filemap_invalidate_inode(struct inode *inode, bool flush,
+			     loff_t start, loff_t end);
 
 static inline int filemap_fdatawait(struct address_space *mapping)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 9a2e28bf298a..1d6b3a369077 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4134,6 +4134,60 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
 }
 EXPORT_SYMBOL(filemap_release_folio);
 
+/**
+ * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
+ * @inode: The inode to flush
+ * @flush: Set to write back rather than simply invalidate.
+ * @start: First byte to in range.
+ * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
+ *       onwards.
+ *
+ * Invalidate all the folios on an inode that contribute to the specified
+ * range, possibly writing them back first.  Whilst the operation is
+ * undertaken, the invalidate lock is held to prevent new folios from being
+ * installed.
+ */
+int filemap_invalidate_inode(struct inode *inode, bool flush,
+			     loff_t start, loff_t end)
+{
+	struct address_space *mapping = inode->i_mapping;
+	pgoff_t first = start >> PAGE_SHIFT;
+	pgoff_t last = end >> PAGE_SHIFT;
+	pgoff_t nr = end == LLONG_MAX ? ULONG_MAX : last - first + 1;
+
+	if (!mapping || !mapping->nrpages || end < start)
+		goto out;
+
+	/* Prevent new folios from being added to the inode. */
+	filemap_invalidate_lock(mapping);
+
+	if (!mapping->nrpages)
+		goto unlock;
+
+	unmap_mapping_pages(mapping, first, nr, false);
+
+	/* Write back the data if we're asked to. */
+	if (flush) {
+		struct writeback_control wbc = {
+			.sync_mode	= WB_SYNC_ALL,
+			.nr_to_write	= LONG_MAX,
+			.range_start	= start,
+			.range_end	= end,
+		};
+
+		filemap_fdatawrite_wbc(mapping, &wbc);
+	}
+
+	/* Wait for writeback to complete on all folios and discard. */
+	truncate_inode_pages_range(mapping, start, end);
+
+unlock:
+	filemap_invalidate_unlock(mapping);
+out:
+	return filemap_check_errors(mapping);
+}
+EXPORT_SYMBOL_GPL(filemap_invalidate_inode);
+
 #ifdef CONFIG_CACHESTAT_SYSCALL
 /**
  * filemap_cachestat() - compute the page cache statistics of a mapping
-- 
cgit v1.2.3


From b4ff7b178bda0ce4ec9f799c6a85579ba17f0df3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 27 Mar 2024 17:58:39 +0000
Subject: netfs: Remove ->launder_folio() support

Remove support for ->launder_folio() from netfslib and expect filesystems
to use filemap_invalidate_inode() instead.  netfs_launder_folio() can then
be got rid of.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Eric Van Hensbergen <ericvh@kernel.org>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: Steve French <sfrench@samba.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: linux-mm@kvack.org
cc: linux-fsdevel@vger.kernel.org
cc: netfs@lists.linux.dev
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: ceph-devel@vger.kernel.org
cc: linux-cifs@vger.kernel.org
cc: devel@lists.orangefs.org
---
 fs/netfs/buffered_write.c    | 74 --------------------------------------------
 fs/netfs/main.c              |  1 -
 include/linux/netfs.h        |  2 --
 include/trace/events/netfs.h |  3 --
 4 files changed, 80 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 57c6eab01261..d8f66ce94575 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1200,77 +1200,3 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(netfs_writepages);
-
-/*
- * Deal with the disposition of a laundered folio.
- */
-static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
-{
-	if (wreq->error) {
-		pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
-		mapping_set_error(wreq->mapping, wreq->error);
-	}
-}
-
-/**
- * netfs_launder_folio - Clean up a dirty folio that's being invalidated
- * @folio: The folio to clean
- *
- * This is called to write back a folio that's being invalidated when an inode
- * is getting torn down.  Ideally, writepages would be used instead.
- */
-int netfs_launder_folio(struct folio *folio)
-{
-	struct netfs_io_request *wreq;
-	struct address_space *mapping = folio->mapping;
-	struct netfs_folio *finfo = netfs_folio_info(folio);
-	struct netfs_group *group = netfs_folio_group(folio);
-	struct bio_vec bvec;
-	unsigned long long i_size = i_size_read(mapping->host);
-	unsigned long long start = folio_pos(folio);
-	size_t offset = 0, len;
-	int ret = 0;
-
-	if (finfo) {
-		offset = finfo->dirty_offset;
-		start += offset;
-		len = finfo->dirty_len;
-	} else {
-		len = folio_size(folio);
-	}
-	len = min_t(unsigned long long, len, i_size - start);
-
-	wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
-	if (IS_ERR(wreq)) {
-		ret = PTR_ERR(wreq);
-		goto out;
-	}
-
-	if (!folio_clear_dirty_for_io(folio))
-		goto out_put;
-
-	trace_netfs_folio(folio, netfs_folio_trace_launder);
-
-	_debug("launder %llx-%llx", start, start + len - 1);
-
-	/* Speculatively write to the cache.  We have to fix this up later if
-	 * the store fails.
-	 */
-	wreq->cleanup = netfs_cleanup_launder_folio;
-
-	bvec_set_folio(&bvec, folio, len, offset);
-	iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
-	if (group != NETFS_FOLIO_COPY_TO_CACHE)
-		__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
-	ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
-
-out_put:
-	folio_detach_private(folio);
-	netfs_put_group(group);
-	kfree(finfo);
-	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
-out:
-	_leave(" = %d", ret);
-	return ret;
-}
-EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index c5a73c9ed126..844efbb2e7a2 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -34,7 +34,6 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
 	[NETFS_COPY_TO_CACHE]		= "CC",
 	[NETFS_WRITEBACK]		= "WB",
 	[NETFS_WRITETHROUGH]		= "WT",
-	[NETFS_LAUNDER_WRITE]		= "LW",
 	[NETFS_UNBUFFERED_WRITE]	= "UW",
 	[NETFS_DIO_READ]		= "DR",
 	[NETFS_DIO_WRITE]		= "DW",
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index ddafc6ebff42..3af589dabd7f 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -172,7 +172,6 @@ enum netfs_io_origin {
 	NETFS_COPY_TO_CACHE,		/* This write is to copy a read to the cache */
 	NETFS_WRITEBACK,		/* This write was triggered by writepages */
 	NETFS_WRITETHROUGH,		/* This write was made by netfs_perform_write() */
-	NETFS_LAUNDER_WRITE,		/* This is triggered by ->launder_folio() */
 	NETFS_UNBUFFERED_WRITE,		/* This is an unbuffered write */
 	NETFS_DIO_READ,			/* This is a direct I/O read */
 	NETFS_DIO_WRITE,		/* This is a direct I/O write */
@@ -352,7 +351,6 @@ int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
 void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length);
 bool netfs_release_folio(struct folio *folio, gfp_t gfp);
-int netfs_launder_folio(struct folio *folio);
 
 /* VMA operations API. */
 vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index e03fafb0c1e3..30769103638f 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -26,7 +26,6 @@
 #define netfs_write_traces					\
 	EM(netfs_write_trace_copy_to_cache,	"COPY2CACH")	\
 	EM(netfs_write_trace_dio_write,		"DIO-WRITE")	\
-	EM(netfs_write_trace_launder,		"LAUNDER  ")	\
 	EM(netfs_write_trace_unbuffered_write,	"UNB-WRITE")	\
 	EM(netfs_write_trace_writeback,		"WRITEBACK")	\
 	E_(netfs_write_trace_writethrough,	"WRITETHRU")
@@ -38,7 +37,6 @@
 	EM(NETFS_COPY_TO_CACHE,			"CC")		\
 	EM(NETFS_WRITEBACK,			"WB")		\
 	EM(NETFS_WRITETHROUGH,			"WT")		\
-	EM(NETFS_LAUNDER_WRITE,			"LW")		\
 	EM(NETFS_UNBUFFERED_WRITE,		"UW")		\
 	EM(NETFS_DIO_READ,			"DR")		\
 	E_(NETFS_DIO_WRITE,			"DW")
@@ -135,7 +133,6 @@
 	EM(netfs_folio_trace_end_copy,		"end-copy")	\
 	EM(netfs_folio_trace_filled_gaps,	"filled-gaps")	\
 	EM(netfs_folio_trace_kill,		"kill")		\
-	EM(netfs_folio_trace_launder,		"launder")	\
 	EM(netfs_folio_trace_mkwrite,		"mkwrite")	\
 	EM(netfs_folio_trace_mkwrite_plus,	"mkwrite+")	\
 	EM(netfs_folio_trace_read_gaps,		"read-gaps")	\
-- 
cgit v1.2.3


From d9f85a04fb0eee0171f451fb4c4875b8a00eeaec Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 15 Mar 2024 14:37:18 +0000
Subject: netfs: Use mempools for allocating requests and subrequests

Use mempools for allocating requests and subrequests in an effort to make
sure that allocation always succeeds so that when performing writeback we
can always make progress.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/internal.h   |  2 ++
 fs/netfs/main.c       | 51 ++++++++++++++++++++++++++++++++++++++------
 fs/netfs/objects.c    | 59 ++++++++++++++++++++++++++++++++++-----------------
 include/linux/netfs.h |  5 +++--
 4 files changed, 89 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 156ab138e224..c67da478cd2b 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -37,6 +37,8 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
 extern unsigned int netfs_debug;
 extern struct list_head netfs_io_requests;
 extern spinlock_t netfs_proc_lock;
+extern mempool_t netfs_request_pool;
+extern mempool_t netfs_subrequest_pool;
 
 #ifdef CONFIG_PROC_FS
 static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 844efbb2e7a2..4805b9377364 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,7 @@
 
 #include <linux/module.h>
 #include <linux/export.h>
+#include <linux/mempool.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include "internal.h"
@@ -23,6 +24,11 @@ unsigned netfs_debug;
 module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
 
+static struct kmem_cache *netfs_request_slab;
+static struct kmem_cache *netfs_subrequest_slab;
+mempool_t netfs_request_pool;
+mempool_t netfs_subrequest_pool;
+
 #ifdef CONFIG_PROC_FS
 LIST_HEAD(netfs_io_requests);
 DEFINE_SPINLOCK(netfs_proc_lock);
@@ -98,25 +104,54 @@ static int __init netfs_init(void)
 {
 	int ret = -ENOMEM;
 
+	netfs_request_slab = kmem_cache_create("netfs_request",
+					       sizeof(struct netfs_io_request), 0,
+					       SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+					       NULL);
+	if (!netfs_request_slab)
+		goto error_req;
+
+	if (mempool_init_slab_pool(&netfs_request_pool, 100, netfs_request_slab) < 0)
+		goto error_reqpool;
+
+	netfs_subrequest_slab = kmem_cache_create("netfs_subrequest",
+						  sizeof(struct netfs_io_subrequest), 0,
+						  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
+						  NULL);
+	if (!netfs_subrequest_slab)
+		goto error_subreq;
+
+	if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
+		goto error_subreqpool;
+
 	if (!proc_mkdir("fs/netfs", NULL))
-		goto error;
+		goto error_proc;
 	if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
 			     &netfs_requests_seq_ops))
-		goto error_proc;
+		goto error_procfile;
 #ifdef CONFIG_FSCACHE_STATS
 	if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
 				netfs_stats_show))
-		goto error_proc;
+		goto error_procfile;
 #endif
 
 	ret = fscache_init();
 	if (ret < 0)
-		goto error_proc;
+		goto error_fscache;
 	return 0;
 
-error_proc:
+error_fscache:
+error_procfile:
 	remove_proc_entry("fs/netfs", NULL);
-error:
+error_proc:
+	mempool_exit(&netfs_subrequest_pool);
+error_subreqpool:
+	kmem_cache_destroy(netfs_subrequest_slab);
+error_subreq:
+	mempool_exit(&netfs_request_pool);
+error_reqpool:
+	kmem_cache_destroy(netfs_request_slab);
+error_req:
 	return ret;
 }
 fs_initcall(netfs_init);
@@ -125,5 +160,9 @@ static void __exit netfs_exit(void)
 {
 	fscache_exit();
 	remove_proc_entry("fs/netfs", NULL);
+	mempool_exit(&netfs_subrequest_pool);
+	kmem_cache_destroy(netfs_subrequest_slab);
+	mempool_exit(&netfs_request_pool);
+	kmem_cache_destroy(netfs_request_slab);
 }
 module_exit(netfs_exit);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 8acc03a64059..1a4e2ce735ce 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -6,6 +6,8 @@
  */
 
 #include <linux/slab.h>
+#include <linux/mempool.h>
+#include <linux/delay.h>
 #include "internal.h"
 
 /*
@@ -20,17 +22,22 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	struct inode *inode = file ? file_inode(file) : mapping->host;
 	struct netfs_inode *ctx = netfs_inode(inode);
 	struct netfs_io_request *rreq;
+	mempool_t *mempool = ctx->ops->request_pool ?: &netfs_request_pool;
+	struct kmem_cache *cache = mempool->pool_data;
 	bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
 			      origin == NETFS_DIO_READ ||
 			      origin == NETFS_DIO_WRITE);
 	bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
 	int ret;
 
-	rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
-		       GFP_KERNEL);
-	if (!rreq)
-		return ERR_PTR(-ENOMEM);
+	for (;;) {
+		rreq = mempool_alloc(mempool, GFP_KERNEL);
+		if (rreq)
+			break;
+		msleep(10);
+	}
 
+	memset(rreq, 0, kmem_cache_size(cache));
 	rreq->start	= start;
 	rreq->len	= len;
 	rreq->upper_len	= len;
@@ -56,7 +63,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	if (rreq->netfs_ops->init_request) {
 		ret = rreq->netfs_ops->init_request(rreq, file);
 		if (ret < 0) {
-			kfree(rreq);
+			mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
 			return ERR_PTR(ret);
 		}
 	}
@@ -88,6 +95,14 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
 	}
 }
 
+static void netfs_free_request_rcu(struct rcu_head *rcu)
+{
+	struct netfs_io_request *rreq = container_of(rcu, struct netfs_io_request, rcu);
+
+	mempool_free(rreq, rreq->netfs_ops->request_pool ?: &netfs_request_pool);
+	netfs_stat_d(&netfs_n_rh_rreq);
+}
+
 static void netfs_free_request(struct work_struct *work)
 {
 	struct netfs_io_request *rreq =
@@ -110,8 +125,7 @@ static void netfs_free_request(struct work_struct *work)
 		}
 		kvfree(rreq->direct_bv);
 	}
-	kfree_rcu(rreq, rcu);
-	netfs_stat_d(&netfs_n_rh_rreq);
+	call_rcu(&rreq->rcu, netfs_free_request_rcu);
 }
 
 void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
@@ -143,20 +157,25 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
 struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq)
 {
 	struct netfs_io_subrequest *subreq;
-
-	subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
-			 sizeof(struct netfs_io_subrequest),
-			 GFP_KERNEL);
-	if (subreq) {
-		INIT_WORK(&subreq->work, NULL);
-		INIT_LIST_HEAD(&subreq->rreq_link);
-		refcount_set(&subreq->ref, 2);
-		subreq->rreq = rreq;
-		subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
-		netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
-		netfs_stat(&netfs_n_rh_sreq);
+	mempool_t *mempool = rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool;
+	struct kmem_cache *cache = mempool->pool_data;
+
+	for (;;) {
+		subreq = mempool_alloc(rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool,
+				       GFP_KERNEL);
+		if (subreq)
+			break;
+		msleep(10);
 	}
 
+	memset(subreq, 0, kmem_cache_size(cache));
+	INIT_WORK(&subreq->work, NULL);
+	INIT_LIST_HEAD(&subreq->rreq_link);
+	refcount_set(&subreq->ref, 2);
+	subreq->rreq = rreq;
+	subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
+	netfs_get_request(rreq, netfs_rreq_trace_get_subreq);
+	netfs_stat(&netfs_n_rh_sreq);
 	return subreq;
 }
 
@@ -178,7 +197,7 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
 	trace_netfs_sreq(subreq, netfs_sreq_trace_free);
 	if (rreq->netfs_ops->free_subrequest)
 		rreq->netfs_ops->free_subrequest(subreq);
-	kfree(subreq);
+	mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
 	netfs_stat_d(&netfs_n_rh_sreq);
 	netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
 }
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 3af589dabd7f..0b6c2c2d3c23 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -20,6 +20,7 @@
 #include <linux/uio.h>
 
 enum netfs_sreq_ref_trace;
+typedef struct mempool_s mempool_t;
 
 /**
  * folio_start_private_2 - Start an fscache write on a folio.  [DEPRECATED]
@@ -236,8 +237,8 @@ struct netfs_io_request {
  * Operations the network filesystem can/must provide to the helpers.
  */
 struct netfs_request_ops {
-	unsigned int	io_request_size;	/* Alloc size for netfs_io_request struct */
-	unsigned int	io_subrequest_size;	/* Alloc size for netfs_io_subrequest struct */
+	mempool_t *request_pool;
+	mempool_t *subrequest_pool;
 	int (*init_request)(struct netfs_io_request *rreq, struct file *file);
 	void (*free_request)(struct netfs_io_request *rreq);
 	void (*free_subrequest)(struct netfs_io_subrequest *rreq);
-- 
cgit v1.2.3


From 7ba167c4c73ed96eb002c98a9d7d49317dfb0191 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 18 Mar 2024 16:57:31 +0000
Subject: netfs: Switch to using unsigned long long rather than loff_t

Switch to using unsigned long long rather than loff_t in netfslib to avoid
problems with the sign flipping in the maths when we're dealing with the
byte at position 0x7fffffffffffffff.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: netfs@lists.linux.dev
cc: ceph-devel@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
---
 fs/cachefiles/io.c           |  2 +-
 fs/ceph/addr.c               |  2 +-
 fs/netfs/buffered_read.c     |  4 +++-
 fs/netfs/buffered_write.c    |  2 +-
 fs/netfs/io.c                |  6 +++---
 fs/netfs/main.c              |  2 +-
 fs/netfs/output.c            |  4 ++--
 include/linux/netfs.h        | 16 +++++++++-------
 include/trace/events/netfs.h |  6 +++---
 9 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 1d685357e67f..5ba5c7814fe4 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -493,7 +493,7 @@ out_no_object:
  * boundary as appropriate.
  */
 static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
-						    loff_t i_size)
+						    unsigned long long i_size)
 {
 	return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
 					  subreq->start, &subreq->len, i_size,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 74bfd10b1b1a..8c16bc5250ef 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -193,7 +193,7 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
 	 * block, but do not exceed the file size, unless the original
 	 * request already exceeds it.
 	 */
-	new_end = min(round_up(end, lo->stripe_unit), rreq->i_size);
+	new_end = umin(round_up(end, lo->stripe_unit), rreq->i_size);
 	if (new_end > end && new_end <= rreq->start + max_len)
 		rreq->len = new_end - rreq->start;
 
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 1622cce535a3..47603f08680e 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -130,7 +130,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
 }
 
 static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
-					 loff_t *_start, size_t *_len, loff_t i_size)
+					 unsigned long long *_start,
+					 unsigned long long *_len,
+					 unsigned long long i_size)
 {
 	struct netfs_cache_resources *cres = &rreq->cache_resources;
 
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index d8f66ce94575..eba49bfafe64 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -664,7 +664,7 @@ static void netfs_pages_written_back(struct netfs_io_request *wreq)
 	last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
 	xas_for_each(&xas, folio, last) {
 		WARN(!folio_test_writeback(folio),
-		     "bad %zx @%llx page %lx %lx\n",
+		     "bad %llx @%llx page %lx %lx\n",
 		     wreq->len, wreq->start, folio->index, last);
 
 		if ((finfo = netfs_folio_info(folio))) {
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 8de581ac0cfb..6cfecfcd02e1 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -476,7 +476,7 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
 
 set:
 	if (subreq->len > rreq->len)
-		pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+		pr_warn("R=%08x[%u] SREQ>RREQ %zx > %llx\n",
 			rreq->debug_id, subreq->debug_index,
 			subreq->len, rreq->len);
 
@@ -513,7 +513,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
 	subreq->start		= rreq->start + rreq->submitted;
 	subreq->len		= io_iter->count;
 
-	_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
+	_debug("slice %llx,%zx,%llx", subreq->start, subreq->len, rreq->submitted);
 	list_add_tail(&subreq->rreq_link, &rreq->subrequests);
 
 	/* Call out to the cache to find out what it can do with the remaining
@@ -588,7 +588,7 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 	atomic_set(&rreq->nr_outstanding, 1);
 	io_iter = rreq->io_iter;
 	do {
-		_debug("submit %llx + %zx >= %llx",
+		_debug("submit %llx + %llx >= %llx",
 		       rreq->start, rreq->submitted, rreq->i_size);
 		if (rreq->origin == NETFS_DIO_READ &&
 		    rreq->start + rreq->submitted >= rreq->i_size)
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 4805b9377364..5f0f438e5d21 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -62,7 +62,7 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
 
 	rreq = list_entry(v, struct netfs_io_request, proc_link);
 	seq_printf(m,
-		   "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+		   "%08x %s %3d %2lx %4d %3d @%04llx %llx/%llx",
 		   rreq->debug_id,
 		   netfs_origins[rreq->origin],
 		   refcount_read(&rreq->ref),
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
index e586396d6b72..85374322f10f 100644
--- a/fs/netfs/output.c
+++ b/fs/netfs/output.c
@@ -439,7 +439,7 @@ static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
  */
 int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
 {
-	_enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+	_enter("ic=%zu sb=%llu ws=%u cp=%zu tp=%u",
 	       wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
 
 	wreq->iter.count += copied;
@@ -457,7 +457,7 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
 {
 	int ret = -EIOCBQUEUED;
 
-	_enter("ic=%zu sb=%zu ws=%u",
+	_enter("ic=%zu sb=%llu ws=%u",
 	       wreq->iter.count, wreq->submitted, wreq->wsize);
 
 	if (wreq->submitted < wreq->io_iter.count)
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 0b6c2c2d3c23..88269681d4fc 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -149,7 +149,7 @@ struct netfs_io_subrequest {
 	struct work_struct	work;
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
-	loff_t			start;		/* Where to start the I/O */
+	unsigned long long	start;		/* Where to start the I/O */
 	size_t			len;		/* Size of the I/O */
 	size_t			transferred;	/* Amount of data transferred */
 	refcount_t		ref;
@@ -205,15 +205,15 @@ struct netfs_io_request {
 	atomic_t		subreq_counter;	/* Next subreq->debug_index */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
-	size_t			submitted;	/* Amount submitted for I/O so far */
-	size_t			len;		/* Length of the request */
 	size_t			upper_len;	/* Length can be extended to here */
+	unsigned long long	submitted;	/* Amount submitted for I/O so far */
+	unsigned long long	len;		/* Length of the request */
 	size_t			transferred;	/* Amount to be indicated as transferred */
 	short			error;		/* 0 or error that occurred */
 	enum netfs_io_origin	origin;		/* Origin of the request */
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
-	loff_t			i_size;		/* Size of the file */
-	loff_t			start;		/* Start position */
+	unsigned long long	i_size;		/* Size of the file */
+	unsigned long long	start;		/* Start position */
 	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
 	refcount_t		ref;
 	unsigned long		flags;
@@ -294,13 +294,15 @@ struct netfs_cache_ops {
 
 	/* Expand readahead request */
 	void (*expand_readahead)(struct netfs_cache_resources *cres,
-				 loff_t *_start, size_t *_len, loff_t i_size);
+				 unsigned long long *_start,
+				 unsigned long long *_len,
+				 unsigned long long i_size);
 
 	/* Prepare a read operation, shortening it to a cached/uncached
 	 * boundary as appropriate.
 	 */
 	enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq,
-					     loff_t i_size);
+					     unsigned long long i_size);
 
 	/* Prepare a write operation, working out what part of the write we can
 	 * actually do.
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 30769103638f..7126d2ea459c 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -280,7 +280,7 @@ TRACE_EVENT(netfs_sreq,
 		    __entry->start	= sreq->start;
 			   ),
 
-	    TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx/%zx e=%d",
+	    TP_printk("R=%08x[%x] %s %s f=%02x s=%llx %zx/%zx e=%d",
 		      __entry->rreq, __entry->index,
 		      __print_symbolic(__entry->source, netfs_sreq_sources),
 		      __print_symbolic(__entry->what, netfs_sreq_traces),
@@ -320,7 +320,7 @@ TRACE_EVENT(netfs_failure,
 		    __entry->start	= sreq ? sreq->start : 0;
 			   ),
 
-	    TP_printk("R=%08x[%d] %s f=%02x s=%llx %zx/%zx %s e=%d",
+	    TP_printk("R=%08x[%x] %s f=%02x s=%llx %zx/%zx %s e=%d",
 		      __entry->rreq, __entry->index,
 		      __print_symbolic(__entry->source, netfs_sreq_sources),
 		      __entry->flags,
@@ -436,7 +436,7 @@ TRACE_EVENT(netfs_write,
 		    __field(unsigned int,		cookie		)
 		    __field(enum netfs_write_trace,	what		)
 		    __field(unsigned long long,		start		)
-		    __field(size_t,			len		)
+		    __field(unsigned long long,		len		)
 			     ),
 
 	    TP_fast_assign(
-- 
cgit v1.2.3


From 288ace2f57c9d06dd2e42bd80d03747d879a4068 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 18 Mar 2024 16:52:05 +0000
Subject: netfs: New writeback implementation

The current netfslib writeback implementation creates writeback requests of
contiguous folio data and then separately tiles subrequests over the space
twice, once for the server and once for the cache.  This creates a few
issues:

 (1) Every time there's a discontiguity or a change between writing to only
     one destination or writing to both, it must create a new request.
     This makes it harder to do vectored writes.

 (2) The folios don't have the writeback mark removed until the end of the
     request - and a request could be hundreds of megabytes.

 (3) In future, I want to support a larger cache granularity, which will
     require aggregation of some folios that contain unmodified data (which
     only need to go to the cache) and some which contain modifications
     (which need to be uploaded and stored to the cache) - but, currently,
     these are treated as discontiguous.

There's also a move to get everyone to use writeback_iter() to extract
writable folios from the pagecache.  That said, currently writeback_iter()
has some issues that make it less than ideal:

 (1) there's no way to cancel the iteration, even if you find a "temporary"
     error that means the current folio and all subsequent folios are going
     to fail;

 (2) there's no way to filter the folios being written back - something
     that will impact Ceph with it's ordered snap system;

 (3) and if you get a folio you can't immediately deal with (say you need
     to flush the preceding writes), you are left with a folio hanging in
     the locked state for the duration, when really we should unlock it and
     relock it later.

In this new implementation, I use writeback_iter() to pump folios,
progressively creating two parallel, but separate streams and cleaning up
the finished folios as the subrequests complete.  Either or both streams
can contain gaps, and the subrequests in each stream can be of variable
size, don't need to align with each other and don't need to align with the
folios.

Indeed, subrequests can cross folio boundaries, may cover several folios or
a folio may be spanned by multiple folios, e.g.:

         +---+---+-----+-----+---+----------+
Folios:  |   |   |     |     |   |          |
         +---+---+-----+-----+---+----------+

           +------+------+     +----+----+
Upload:    |      |      |.....|    |    |
           +------+------+     +----+----+

         +------+------+------+------+------+
Cache:   |      |      |      |      |      |
         +------+------+------+------+------+

The progressive subrequest construction permits the algorithm to be
preparing both the next upload to the server and the next write to the
cache whilst the previous ones are already in progress.  Throttling can be
applied to control the rate of production of subrequests - and, in any
case, we probably want to write them to the server in ascending order,
particularly if the file will be extended.

Content crypto can also be prepared at the same time as the subrequests and
run asynchronously, with the prepped requests being stalled until the
crypto catches up with them.  This might also be useful for transport
crypto, but that happens at a lower layer, so probably would be harder to
pull off.

The algorithm is split into three parts:

 (1) The issuer.  This walks through the data, packaging it up, encrypting
     it and creating subrequests.  The part of this that generates
     subrequests only deals with file positions and spans and so is usable
     for DIO/unbuffered writes as well as buffered writes.

 (2) The collector. This asynchronously collects completed subrequests,
     unlocks folios, frees crypto buffers and performs any retries.  This
     runs in a work queue so that the issuer can return to the caller for
     writeback (so that the VM can have its kswapd thread back) or async
     writes.

 (3) The retryer.  This pauses the issuer, waits for all outstanding
     subrequests to complete and then goes through the failed subrequests
     to reissue them.  This may involve reprepping them (with cifs, the
     credits must be renegotiated, and a subrequest may need splitting),
     and doing RMW for content crypto if there's a conflicting change on
     the server.

[!] Note that some of the functions are prefixed with "new_" to avoid
clashes with existing functions.  These will be renamed in a later patch
that cuts over to the new algorithm.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Eric Van Hensbergen <ericvh@kernel.org>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
---
 fs/netfs/Makefile            |   4 +-
 fs/netfs/buffered_write.c    |   4 -
 fs/netfs/internal.h          |  27 ++
 fs/netfs/objects.c           |  17 +
 fs/netfs/write_collect.c     | 803 +++++++++++++++++++++++++++++++++++++++++++
 fs/netfs/write_issue.c       | 683 ++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h        |  68 +++-
 include/trace/events/netfs.h | 232 ++++++++++++-
 8 files changed, 1829 insertions(+), 9 deletions(-)
 create mode 100644 fs/netfs/write_collect.c
 create mode 100644 fs/netfs/write_issue.c

(limited to 'include')

diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index d4d1d799819e..1eb86e34b5a9 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -11,7 +11,9 @@ netfs-y := \
 	main.o \
 	misc.o \
 	objects.o \
-	output.o
+	output.o \
+	write_collect.o \
+	write_issue.o
 
 netfs-$(CONFIG_NETFS_STATS) += stats.o
 
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index eba49bfafe64..84ac95ee4b4d 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -74,16 +74,12 @@ static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
 
 	if (file->f_mode & FMODE_READ)
 		goto no_write_streaming;
-	if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
-		goto no_write_streaming;
 
 	if (netfs_is_cache_enabled(ctx)) {
 		/* We don't want to get a streaming write on a file that loses
 		 * caching service temporarily because the backing store got
 		 * culled.
 		 */
-		if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
-			set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
 		goto no_write_streaming;
 	}
 
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index c67da478cd2b..dc11d1f67363 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -150,6 +150,33 @@ static inline void netfs_stat_d(atomic_t *stat)
 #define netfs_stat_d(x) do {} while(0)
 #endif
 
+/*
+ * write_collect.c
+ */
+int netfs_folio_written_back(struct folio *folio);
+void netfs_write_collection_worker(struct work_struct *work);
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
+
+/*
+ * write_issue.c
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+						struct file *file,
+						loff_t start,
+						enum netfs_io_origin origin);
+void netfs_reissue_write(struct netfs_io_stream *stream,
+			 struct netfs_io_subrequest *subreq);
+int netfs_advance_write(struct netfs_io_request *wreq,
+			struct netfs_io_stream *stream,
+			loff_t start, size_t len, bool to_eof);
+struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+				   struct folio *folio, size_t copied, bool to_page_end,
+				   struct folio **writethrough_cache);
+int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			       struct folio *writethrough_cache);
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
+
 /*
  * Miscellaneous functions.
  */
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index 1a4e2ce735ce..c90d482b1650 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -47,6 +47,10 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
 	rreq->inode	= inode;
 	rreq->i_size	= i_size_read(inode);
 	rreq->debug_id	= atomic_inc_return(&debug_ids);
+	rreq->wsize	= INT_MAX;
+	spin_lock_init(&rreq->lock);
+	INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
+	INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
 	INIT_LIST_HEAD(&rreq->subrequests);
 	INIT_WORK(&rreq->work, NULL);
 	refcount_set(&rreq->ref, 1);
@@ -85,6 +89,8 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
 void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
 {
 	struct netfs_io_subrequest *subreq;
+	struct netfs_io_stream *stream;
+	int s;
 
 	while (!list_empty(&rreq->subrequests)) {
 		subreq = list_first_entry(&rreq->subrequests,
@@ -93,6 +99,17 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
 		netfs_put_subrequest(subreq, was_async,
 				     netfs_sreq_trace_put_clear);
 	}
+
+	for (s = 0; s < ARRAY_SIZE(rreq->io_streams); s++) {
+		stream = &rreq->io_streams[s];
+		while (!list_empty(&stream->subrequests)) {
+			subreq = list_first_entry(&stream->subrequests,
+						  struct netfs_io_subrequest, rreq_link);
+			list_del(&subreq->rreq_link);
+			netfs_put_subrequest(subreq, was_async,
+					     netfs_sreq_trace_put_clear);
+		}
+	}
 }
 
 static void netfs_free_request_rcu(struct rcu_head *rcu)
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
new file mode 100644
index 000000000000..b8c1d3ca724a
--- /dev/null
+++ b/fs/netfs/write_collect.c
@@ -0,0 +1,803 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem write subrequest result collection, assessment
+ * and retrying.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+/* Notes made in the collector */
+#define HIT_PENDING		0x01	/* A front op was still pending */
+#define SOME_EMPTY		0x02	/* One of more streams are empty */
+#define ALL_EMPTY		0x04	/* All streams are empty */
+#define MAYBE_DISCONTIG		0x08	/* A front op may be discontiguous (rounded to PAGE_SIZE) */
+#define NEED_REASSESS		0x10	/* Need to loop round and reassess */
+#define REASSESS_DISCONTIG	0x20	/* Reassess discontiguity if contiguity advances */
+#define MADE_PROGRESS		0x40	/* Made progress cleaning up a stream or the folio set */
+#define BUFFERED		0x80	/* The pagecache needs cleaning up */
+#define NEED_RETRY		0x100	/* A front op requests retrying */
+#define SAW_FAILURE		0x200	/* One stream or hit a permanent failure */
+
+/*
+ * Successful completion of write of a folio to the server and/or cache.  Note
+ * that we are not allowed to lock the folio here on pain of deadlocking with
+ * truncate.
+ */
+int netfs_folio_written_back(struct folio *folio)
+{
+	enum netfs_folio_trace why = netfs_folio_trace_clear;
+	struct netfs_folio *finfo;
+	struct netfs_group *group = NULL;
+	int gcount = 0;
+
+	if ((finfo = netfs_folio_info(folio))) {
+		/* Streaming writes cannot be redirtied whilst under writeback,
+		 * so discard the streaming record.
+		 */
+		folio_detach_private(folio);
+		group = finfo->netfs_group;
+		gcount++;
+		kfree(finfo);
+		why = netfs_folio_trace_clear_s;
+		goto end_wb;
+	}
+
+	if ((group = netfs_folio_group(folio))) {
+		if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+			why = netfs_folio_trace_clear_cc;
+			folio_detach_private(folio);
+			goto end_wb;
+		}
+
+		/* Need to detach the group pointer if the page didn't get
+		 * redirtied.  If it has been redirtied, then it must be within
+		 * the same group.
+		 */
+		why = netfs_folio_trace_redirtied;
+		if (!folio_test_dirty(folio)) {
+			folio_detach_private(folio);
+			gcount++;
+			why = netfs_folio_trace_clear_g;
+		}
+	}
+
+end_wb:
+	trace_netfs_folio(folio, why);
+	folio_end_writeback(folio);
+	return gcount;
+}
+
+/*
+ * Get hold of a folio we have under writeback.  We don't want to get the
+ * refcount on it.
+ */
+static struct folio *netfs_writeback_lookup_folio(struct netfs_io_request *wreq, loff_t pos)
+{
+	XA_STATE(xas, &wreq->mapping->i_pages, pos / PAGE_SIZE);
+	struct folio *folio;
+
+	rcu_read_lock();
+
+	for (;;) {
+		xas_reset(&xas);
+		folio = xas_load(&xas);
+		if (xas_retry(&xas, folio))
+			continue;
+
+		if (!folio || xa_is_value(folio))
+			kdebug("R=%08x: folio %lx (%llx) not present",
+			       wreq->debug_id, xas.xa_index, pos / PAGE_SIZE);
+		BUG_ON(!folio || xa_is_value(folio));
+
+		if (folio == xas_reload(&xas))
+			break;
+	}
+
+	rcu_read_unlock();
+
+	if (WARN_ONCE(!folio_test_writeback(folio),
+		      "R=%08x: folio %lx is not under writeback\n",
+		      wreq->debug_id, folio->index)) {
+		trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);
+	}
+	return folio;
+}
+
+/*
+ * Unlock any folios we've finished with.
+ */
+static void netfs_writeback_unlock_folios(struct netfs_io_request *wreq,
+					  unsigned long long collected_to,
+					  unsigned int *notes)
+{
+	for (;;) {
+		struct folio *folio;
+		struct netfs_folio *finfo;
+		unsigned long long fpos, fend;
+		size_t fsize, flen;
+
+		folio = netfs_writeback_lookup_folio(wreq, wreq->cleaned_to);
+
+		fpos = folio_pos(folio);
+		fsize = folio_size(folio);
+		finfo = netfs_folio_info(folio);
+		flen = finfo ? finfo->dirty_offset + finfo->dirty_len : fsize;
+
+		fend = min_t(unsigned long long, fpos + flen, wreq->i_size);
+
+		trace_netfs_collect_folio(wreq, folio, fend, collected_to);
+
+		if (fpos + fsize > wreq->contiguity) {
+			trace_netfs_collect_contig(wreq, fpos + fsize,
+						   netfs_contig_trace_unlock);
+			wreq->contiguity = fpos + fsize;
+		}
+
+		/* Unlock any folio we've transferred all of. */
+		if (collected_to < fend)
+			break;
+
+		wreq->nr_group_rel += netfs_folio_written_back(folio);
+		wreq->cleaned_to = fpos + fsize;
+		*notes |= MADE_PROGRESS;
+
+		if (fpos + fsize >= collected_to)
+			break;
+	}
+}
+
+/*
+ * Perform retries on the streams that need it.
+ */
+static void netfs_retry_write_stream(struct netfs_io_request *wreq,
+				     struct netfs_io_stream *stream)
+{
+	struct list_head *next;
+
+	_enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
+
+	if (unlikely(stream->failed))
+		return;
+
+	/* If there's no renegotiation to do, just resend each failed subreq. */
+	if (!stream->prepare_write) {
+		struct netfs_io_subrequest *subreq;
+
+		list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+			if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+				break;
+			if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+				__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+				netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+				netfs_reissue_write(stream, subreq);
+			}
+		}
+		return;
+	}
+
+	if (list_empty(&stream->subrequests))
+		return;
+	next = stream->subrequests.next;
+
+	do {
+		struct netfs_io_subrequest *subreq = NULL, *from, *to, *tmp;
+		unsigned long long start, len;
+		size_t part;
+		bool boundary = false;
+
+		/* Go through the stream and find the next span of contiguous
+		 * data that we then rejig (cifs, for example, needs the wsize
+		 * renegotiating) and reissue.
+		 */
+		from = list_entry(next, struct netfs_io_subrequest, rreq_link);
+		to = from;
+		start = from->start + from->transferred;
+		len   = from->len   - from->transferred;
+
+		if (test_bit(NETFS_SREQ_FAILED, &from->flags) ||
+		    !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags))
+			return;
+
+		list_for_each_continue(next, &stream->subrequests) {
+			subreq = list_entry(next, struct netfs_io_subrequest, rreq_link);
+			if (subreq->start + subreq->transferred != start + len ||
+			    test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags) ||
+			    !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags))
+				break;
+			to = subreq;
+			len += to->len;
+		}
+
+		/* Work through the sublist. */
+		subreq = from;
+		list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
+			if (!len)
+				break;
+			/* Renegotiate max_len (wsize) */
+			trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+			__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+			__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+			stream->prepare_write(subreq);
+
+			part = min(len, subreq->max_len);
+			subreq->len = part;
+			subreq->start = start;
+			subreq->transferred = 0;
+			len -= part;
+			start += part;
+			if (len && subreq == to &&
+			    __test_and_clear_bit(NETFS_SREQ_BOUNDARY, &to->flags))
+				boundary = true;
+
+			netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+			netfs_reissue_write(stream, subreq);
+			if (subreq == to)
+				break;
+		}
+
+		/* If we managed to use fewer subreqs, we can discard the
+		 * excess; if we used the same number, then we're done.
+		 */
+		if (!len) {
+			if (subreq == to)
+				continue;
+			list_for_each_entry_safe_from(subreq, tmp,
+						      &stream->subrequests, rreq_link) {
+				trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+				list_del(&subreq->rreq_link);
+				netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+				if (subreq == to)
+					break;
+			}
+			continue;
+		}
+
+		/* We ran out of subrequests, so we need to allocate some more
+		 * and insert them after.
+		 */
+		do {
+			subreq = netfs_alloc_subrequest(wreq);
+			subreq->source		= to->source;
+			subreq->start		= start;
+			subreq->max_len		= len;
+			subreq->max_nr_segs	= INT_MAX;
+			subreq->debug_index	= atomic_inc_return(&wreq->subreq_counter);
+			subreq->stream_nr	= to->stream_nr;
+			__set_bit(NETFS_SREQ_RETRYING, &subreq->flags);
+
+			trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+					     refcount_read(&subreq->ref),
+					     netfs_sreq_trace_new);
+			netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+
+			list_add(&subreq->rreq_link, &to->rreq_link);
+			to = list_next_entry(to, rreq_link);
+			trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+
+			switch (stream->source) {
+			case NETFS_UPLOAD_TO_SERVER:
+				netfs_stat(&netfs_n_wh_upload);
+				subreq->max_len = min(len, wreq->wsize);
+				break;
+			case NETFS_WRITE_TO_CACHE:
+				netfs_stat(&netfs_n_wh_write);
+				break;
+			default:
+				WARN_ON_ONCE(1);
+			}
+
+			stream->prepare_write(subreq);
+
+			part = min(len, subreq->max_len);
+			subreq->len = subreq->transferred + part;
+			len -= part;
+			start += part;
+			if (!len && boundary) {
+				__set_bit(NETFS_SREQ_BOUNDARY, &to->flags);
+				boundary = false;
+			}
+
+			netfs_reissue_write(stream, subreq);
+			if (!len)
+				break;
+
+		} while (len);
+
+	} while (!list_is_head(next, &stream->subrequests));
+}
+
+/*
+ * Perform retries on the streams that need it.  If we're doing content
+ * encryption and the server copy changed due to a third-party write, we may
+ * need to do an RMW cycle and also rewrite the data to the cache.
+ */
+static void netfs_retry_writes(struct netfs_io_request *wreq)
+{
+	struct netfs_io_subrequest *subreq;
+	struct netfs_io_stream *stream;
+	int s;
+
+	/* Wait for all outstanding I/O to quiesce before performing retries as
+	 * we may need to renegotiate the I/O sizes.
+	 */
+	for (s = 0; s < NR_IO_STREAMS; s++) {
+		stream = &wreq->io_streams[s];
+		if (!stream->active)
+			continue;
+
+		list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+			wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
+				    TASK_UNINTERRUPTIBLE);
+		}
+	}
+
+	// TODO: Enc: Fetch changed partial pages
+	// TODO: Enc: Reencrypt content if needed.
+	// TODO: Enc: Wind back transferred point.
+	// TODO: Enc: Mark cache pages for retry.
+
+	for (s = 0; s < NR_IO_STREAMS; s++) {
+		stream = &wreq->io_streams[s];
+		if (stream->need_retry) {
+			stream->need_retry = false;
+			netfs_retry_write_stream(wreq, stream);
+		}
+	}
+}
+
+/*
+ * Collect and assess the results of various write subrequests.  We may need to
+ * retry some of the results - or even do an RMW cycle for content crypto.
+ *
+ * Note that we have a number of parallel, overlapping lists of subrequests,
+ * one to the server and one to the local cache for example, which may not be
+ * the same size or starting position and may not even correspond in boundary
+ * alignment.
+ */
+static void netfs_collect_write_results(struct netfs_io_request *wreq)
+{
+	struct netfs_io_subrequest *front, *remove;
+	struct netfs_io_stream *stream;
+	unsigned long long collected_to;
+	unsigned int notes;
+	int s;
+
+	_enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+	trace_netfs_collect(wreq);
+	trace_netfs_rreq(wreq, netfs_rreq_trace_collect);
+
+reassess_streams:
+	smp_rmb();
+	collected_to = ULLONG_MAX;
+	if (wreq->origin == NETFS_WRITEBACK)
+		notes = ALL_EMPTY | BUFFERED | MAYBE_DISCONTIG;
+	else if (wreq->origin == NETFS_WRITETHROUGH)
+		notes = ALL_EMPTY | BUFFERED;
+	else
+		notes = ALL_EMPTY;
+
+	/* Remove completed subrequests from the front of the streams and
+	 * advance the completion point on each stream.  We stop when we hit
+	 * something that's in progress.  The issuer thread may be adding stuff
+	 * to the tail whilst we're doing this.
+	 *
+	 * We must not, however, merge in discontiguities that span whole
+	 * folios that aren't under writeback.  This is made more complicated
+	 * by the folios in the gap being of unpredictable sizes - if they even
+	 * exist - but we don't want to look them up.
+	 */
+	for (s = 0; s < NR_IO_STREAMS; s++) {
+		loff_t rstart, rend;
+
+		stream = &wreq->io_streams[s];
+		/* Read active flag before list pointers */
+		if (!smp_load_acquire(&stream->active))
+			continue;
+
+		front = stream->front;
+		while (front) {
+			trace_netfs_collect_sreq(wreq, front);
+			//_debug("sreq [%x] %llx %zx/%zx",
+			//       front->debug_index, front->start, front->transferred, front->len);
+
+			/* Stall if there may be a discontinuity. */
+			rstart = round_down(front->start, PAGE_SIZE);
+			if (rstart > wreq->contiguity) {
+				if (wreq->contiguity > stream->collected_to) {
+					trace_netfs_collect_gap(wreq, stream,
+								wreq->contiguity, 'D');
+					stream->collected_to = wreq->contiguity;
+				}
+				notes |= REASSESS_DISCONTIG;
+				break;
+			}
+			rend = round_up(front->start + front->len, PAGE_SIZE);
+			if (rend > wreq->contiguity) {
+				trace_netfs_collect_contig(wreq, rend,
+							   netfs_contig_trace_collect);
+				wreq->contiguity = rend;
+				if (notes & REASSESS_DISCONTIG)
+					notes |= NEED_REASSESS;
+			}
+			notes &= ~MAYBE_DISCONTIG;
+
+			/* Stall if the front is still undergoing I/O. */
+			if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+				notes |= HIT_PENDING;
+				break;
+			}
+			smp_rmb(); /* Read counters after I-P flag. */
+
+			if (stream->failed) {
+				stream->collected_to = front->start + front->len;
+				notes |= MADE_PROGRESS | SAW_FAILURE;
+				goto cancel;
+			}
+			if (front->start + front->transferred > stream->collected_to) {
+				stream->collected_to = front->start + front->transferred;
+				stream->transferred = stream->collected_to - wreq->start;
+				notes |= MADE_PROGRESS;
+			}
+			if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
+				stream->failed = true;
+				stream->error = front->error;
+				if (stream->source == NETFS_UPLOAD_TO_SERVER)
+					mapping_set_error(wreq->mapping, front->error);
+				notes |= NEED_REASSESS | SAW_FAILURE;
+				break;
+			}
+			if (front->transferred < front->len) {
+				stream->need_retry = true;
+				notes |= NEED_RETRY | MADE_PROGRESS;
+				break;
+			}
+
+		cancel:
+			/* Remove if completely consumed. */
+			spin_lock(&wreq->lock);
+
+			remove = front;
+			list_del_init(&front->rreq_link);
+			front = list_first_entry_or_null(&stream->subrequests,
+							 struct netfs_io_subrequest, rreq_link);
+			stream->front = front;
+			if (!front) {
+				unsigned long long jump_to = atomic64_read(&wreq->issued_to);
+
+				if (stream->collected_to < jump_to) {
+					trace_netfs_collect_gap(wreq, stream, jump_to, 'A');
+					stream->collected_to = jump_to;
+				}
+			}
+
+			spin_unlock(&wreq->lock);
+			netfs_put_subrequest(remove, false,
+					     notes & SAW_FAILURE ?
+					     netfs_sreq_trace_put_cancel :
+					     netfs_sreq_trace_put_done);
+		}
+
+		if (front)
+			notes &= ~ALL_EMPTY;
+		else
+			notes |= SOME_EMPTY;
+
+		if (stream->collected_to < collected_to)
+			collected_to = stream->collected_to;
+	}
+
+	if (collected_to != ULLONG_MAX && collected_to > wreq->collected_to)
+		wreq->collected_to = collected_to;
+
+	/* If we have an empty stream, we need to jump it forward over any gap
+	 * otherwise the collection point will never advance.
+	 *
+	 * Note that the issuer always adds to the stream with the lowest
+	 * so-far submitted start, so if we see two consecutive subreqs in one
+	 * stream with nothing between then in another stream, then the second
+	 * stream has a gap that can be jumped.
+	 */
+	if (notes & SOME_EMPTY) {
+		unsigned long long jump_to = wreq->start + wreq->len;
+
+		for (s = 0; s < NR_IO_STREAMS; s++) {
+			stream = &wreq->io_streams[s];
+			if (stream->active &&
+			    stream->front &&
+			    stream->front->start < jump_to)
+				jump_to = stream->front->start;
+		}
+
+		for (s = 0; s < NR_IO_STREAMS; s++) {
+			stream = &wreq->io_streams[s];
+			if (stream->active &&
+			    !stream->front &&
+			    stream->collected_to < jump_to) {
+				trace_netfs_collect_gap(wreq, stream, jump_to, 'B');
+				stream->collected_to = jump_to;
+			}
+		}
+	}
+
+	for (s = 0; s < NR_IO_STREAMS; s++) {
+		stream = &wreq->io_streams[s];
+		if (stream->active)
+			trace_netfs_collect_stream(wreq, stream);
+	}
+
+	trace_netfs_collect_state(wreq, wreq->collected_to, notes);
+
+	/* Unlock any folios that we have now finished with. */
+	if (notes & BUFFERED) {
+		unsigned long long clean_to = min(wreq->collected_to, wreq->contiguity);
+
+		if (wreq->cleaned_to < clean_to)
+			netfs_writeback_unlock_folios(wreq, clean_to, &notes);
+	} else {
+		wreq->cleaned_to = wreq->collected_to;
+	}
+
+	// TODO: Discard encryption buffers
+
+	/* If all streams are discontiguous with the last folio we cleared, we
+	 * may need to skip a set of folios.
+	 */
+	if ((notes & (MAYBE_DISCONTIG | ALL_EMPTY)) == MAYBE_DISCONTIG) {
+		unsigned long long jump_to = ULLONG_MAX;
+
+		for (s = 0; s < NR_IO_STREAMS; s++) {
+			stream = &wreq->io_streams[s];
+			if (stream->active && stream->front &&
+			    stream->front->start < jump_to)
+				jump_to = stream->front->start;
+		}
+
+		trace_netfs_collect_contig(wreq, jump_to, netfs_contig_trace_jump);
+		wreq->contiguity = jump_to;
+		wreq->cleaned_to = jump_to;
+		wreq->collected_to = jump_to;
+		for (s = 0; s < NR_IO_STREAMS; s++) {
+			stream = &wreq->io_streams[s];
+			if (stream->collected_to < jump_to)
+				stream->collected_to = jump_to;
+		}
+		//cond_resched();
+		notes |= MADE_PROGRESS;
+		goto reassess_streams;
+	}
+
+	if (notes & NEED_RETRY)
+		goto need_retry;
+	if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+		trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
+		clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
+		wake_up_bit(&wreq->flags, NETFS_RREQ_PAUSE);
+	}
+
+	if (notes & NEED_REASSESS) {
+		//cond_resched();
+		goto reassess_streams;
+	}
+	if (notes & MADE_PROGRESS) {
+		//cond_resched();
+		goto reassess_streams;
+	}
+
+out:
+	netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+	wreq->nr_group_rel = 0;
+	_leave(" = %x", notes);
+	return;
+
+need_retry:
+	/* Okay...  We're going to have to retry one or both streams.  Note
+	 * that any partially completed op will have had any wholly transferred
+	 * folios removed from it.
+	 */
+	_debug("retry");
+	netfs_retry_writes(wreq);
+	goto out;
+}
+
+/*
+ * Perform the collection of subrequests, folios and encryption buffers.
+ */
+void netfs_write_collection_worker(struct work_struct *work)
+{
+	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+	size_t transferred;
+	int s;
+
+	_enter("R=%x", wreq->debug_id);
+
+	netfs_see_request(wreq, netfs_rreq_trace_see_work);
+	if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
+		netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+		return;
+	}
+
+	netfs_collect_write_results(wreq);
+
+	/* We're done when the app thread has finished posting subreqs and all
+	 * the queues in all the streams are empty.
+	 */
+	if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
+		netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+		return;
+	}
+	smp_rmb(); /* Read ALL_QUEUED before lists. */
+
+	transferred = LONG_MAX;
+	for (s = 0; s < NR_IO_STREAMS; s++) {
+		struct netfs_io_stream *stream = &wreq->io_streams[s];
+		if (!stream->active)
+			continue;
+		if (!list_empty(&stream->subrequests)) {
+			netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
+			return;
+		}
+		if (stream->transferred < transferred)
+			transferred = stream->transferred;
+	}
+
+	/* Okay, declare that all I/O is complete. */
+	wreq->transferred = transferred;
+	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+	if (wreq->io_streams[1].active &&
+	    wreq->io_streams[1].failed) {
+		/* Cache write failure doesn't prevent writeback completion
+		 * unless we're in disconnected mode.
+		 */
+		ictx->ops->invalidate_cache(wreq);
+	}
+
+	if (wreq->cleanup)
+		wreq->cleanup(wreq);
+
+	if (wreq->origin == NETFS_DIO_WRITE &&
+	    wreq->mapping->nrpages) {
+		/* mmap may have got underfoot and we may now have folios
+		 * locally covering the region we just wrote.  Attempt to
+		 * discard the folios, but leave in place any modified locally.
+		 * ->write_iter() is prevented from interfering by the DIO
+		 * counter.
+		 */
+		pgoff_t first = wreq->start >> PAGE_SHIFT;
+		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+		invalidate_inode_pages2_range(wreq->mapping, first, last);
+	}
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_end(wreq->inode);
+
+	_debug("finished");
+	trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+	clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+	wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+	if (wreq->iocb) {
+		wreq->iocb->ki_pos += wreq->transferred;
+		if (wreq->iocb->ki_complete)
+			wreq->iocb->ki_complete(
+				wreq->iocb, wreq->error ? wreq->error : wreq->transferred);
+		wreq->iocb = VFS_PTR_POISON;
+	}
+
+	netfs_clear_subrequests(wreq, false);
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+}
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+{
+	if (!work_pending(&wreq->work)) {
+		netfs_get_request(wreq, netfs_rreq_trace_get_work);
+		if (!queue_work(system_unbound_wq, &wreq->work))
+			netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+	}
+}
+
+/**
+ * new_netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * @_op: The I/O request that has terminated.
+ * @transferred_or_error: The amount of data transferred or an error code.
+ * @was_async: The termination was asynchronous
+ *
+ * This tells the library that a contributory write I/O operation has
+ * terminated, one way or another, and that it should collect the results.
+ *
+ * The caller indicates in @transferred_or_error the outcome of the operation,
+ * supplying a positive value to indicate the number of bytes transferred or a
+ * negative error code.  The library will look after reissuing I/O operations
+ * as appropriate and writing downloaded data to the cache.
+ *
+ * If @was_async is true, the caller might be running in softirq or interrupt
+ * context and we can't sleep.
+ *
+ * When this is called, ownership of the subrequest is transferred back to the
+ * library, along with a ref.
+ *
+ * Note that %_op is a void* so that the function can be passed to
+ * kiocb::term_func without the need for a casting wrapper.
+ */
+void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+					   bool was_async)
+{
+	struct netfs_io_subrequest *subreq = _op;
+	struct netfs_io_request *wreq = subreq->rreq;
+	struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
+
+	_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+	switch (subreq->source) {
+	case NETFS_UPLOAD_TO_SERVER:
+		netfs_stat(&netfs_n_wh_upload_done);
+		break;
+	case NETFS_WRITE_TO_CACHE:
+		netfs_stat(&netfs_n_wh_write_done);
+		break;
+	case NETFS_INVALID_WRITE:
+		break;
+	default:
+		BUG();
+	}
+
+	if (IS_ERR_VALUE(transferred_or_error)) {
+		subreq->error = transferred_or_error;
+		if (subreq->error == -EAGAIN)
+			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+		else
+			set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+		trace_netfs_failure(wreq, subreq, transferred_or_error, netfs_fail_write);
+
+		switch (subreq->source) {
+		case NETFS_WRITE_TO_CACHE:
+			netfs_stat(&netfs_n_wh_write_failed);
+			break;
+		case NETFS_UPLOAD_TO_SERVER:
+			netfs_stat(&netfs_n_wh_upload_failed);
+			break;
+		default:
+			break;
+		}
+		trace_netfs_rreq(wreq, netfs_rreq_trace_set_pause);
+		set_bit(NETFS_RREQ_PAUSE, &wreq->flags);
+	} else {
+		if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+			 "Subreq excess write: R=%x[%x] %zd > %zu - %zu",
+			 wreq->debug_id, subreq->debug_index,
+			 transferred_or_error, subreq->len, subreq->transferred))
+			transferred_or_error = subreq->len - subreq->transferred;
+
+		subreq->error = 0;
+		subreq->transferred += transferred_or_error;
+
+		if (subreq->transferred < subreq->len)
+			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+	}
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+	clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+	wake_up_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS);
+
+	/* If we are at the head of the queue, wake up the collector,
+	 * transferring a ref to it if we were the ones to do so.
+	 */
+	if (list_is_first(&subreq->rreq_link, &stream->subrequests))
+		netfs_wake_write_collector(wreq, was_async);
+
+	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+}
+EXPORT_SYMBOL(new_netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
new file mode 100644
index 000000000000..1b2e69934fbd
--- /dev/null
+++ b/fs/netfs/write_issue.c
@@ -0,0 +1,683 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level (buffered) writeback.
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ *
+ * To support network filesystems with local caching, we manage a situation
+ * that can be envisioned like the following:
+ *
+ *               +---+---+-----+-----+---+----------+
+ *    Folios:    |   |   |     |     |   |          |
+ *               +---+---+-----+-----+---+----------+
+ *
+ *                 +------+------+     +----+----+
+ *    Upload:      |      |      |.....|    |    |
+ *  (Stream 0)     +------+------+     +----+----+
+ *
+ *               +------+------+------+------+------+
+ *    Cache:     |      |      |      |      |      |
+ *  (Stream 1)   +------+------+------+------+------+
+ *
+ * Where we have a sequence of folios of varying sizes that we need to overlay
+ * with multiple parallel streams of I/O requests, where the I/O requests in a
+ * stream may also be of various sizes (in cifs, for example, the sizes are
+ * negotiated with the server; in something like ceph, they may represent the
+ * sizes of storage objects).
+ *
+ * The sequence in each stream may contain gaps and noncontiguous subrequests
+ * may be glued together into single vectored write RPCs.
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include "internal.h"
+
+/*
+ * Kill all dirty folios in the event of an unrecoverable error, starting with
+ * a locked folio we've already obtained from writeback_iter().
+ */
+static void netfs_kill_dirty_pages(struct address_space *mapping,
+				   struct writeback_control *wbc,
+				   struct folio *folio)
+{
+	int error = 0;
+
+	do {
+		enum netfs_folio_trace why = netfs_folio_trace_kill;
+		struct netfs_group *group = NULL;
+		struct netfs_folio *finfo = NULL;
+		void *priv;
+
+		priv = folio_detach_private(folio);
+		if (priv) {
+			finfo = __netfs_folio_info(priv);
+			if (finfo) {
+				/* Kill folio from streaming write. */
+				group = finfo->netfs_group;
+				why = netfs_folio_trace_kill_s;
+			} else {
+				group = priv;
+				if (group == NETFS_FOLIO_COPY_TO_CACHE) {
+					/* Kill copy-to-cache folio */
+					why = netfs_folio_trace_kill_cc;
+					group = NULL;
+				} else {
+					/* Kill folio with group */
+					why = netfs_folio_trace_kill_g;
+				}
+			}
+		}
+
+		trace_netfs_folio(folio, why);
+
+		folio_start_writeback(folio);
+		folio_unlock(folio);
+		folio_end_writeback(folio);
+
+		netfs_put_group(group);
+		kfree(finfo);
+
+	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+}
+
+/*
+ * Create a write request and set it up appropriately for the origin type.
+ */
+struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
+						struct file *file,
+						loff_t start,
+						enum netfs_io_origin origin)
+{
+	struct netfs_io_request *wreq;
+	struct netfs_inode *ictx;
+
+	wreq = netfs_alloc_request(mapping, file, start, 0, origin);
+	if (IS_ERR(wreq))
+		return wreq;
+
+	_enter("R=%x", wreq->debug_id);
+
+	ictx = netfs_inode(wreq->inode);
+	if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+		fscache_begin_write_operation(&wreq->cache_resources, netfs_i_cookie(ictx));
+
+	wreq->contiguity = wreq->start;
+	wreq->cleaned_to = wreq->start;
+	INIT_WORK(&wreq->work, netfs_write_collection_worker);
+
+	wreq->io_streams[0].stream_nr		= 0;
+	wreq->io_streams[0].source		= NETFS_UPLOAD_TO_SERVER;
+	wreq->io_streams[0].prepare_write	= ictx->ops->prepare_write;
+	wreq->io_streams[0].issue_write		= ictx->ops->issue_write;
+	wreq->io_streams[0].collected_to	= start;
+	wreq->io_streams[0].transferred		= LONG_MAX;
+
+	wreq->io_streams[1].stream_nr		= 1;
+	wreq->io_streams[1].source		= NETFS_WRITE_TO_CACHE;
+	wreq->io_streams[1].collected_to	= start;
+	wreq->io_streams[1].transferred		= LONG_MAX;
+	if (fscache_resources_valid(&wreq->cache_resources)) {
+		wreq->io_streams[1].avail	= true;
+		wreq->io_streams[1].prepare_write = wreq->cache_resources.ops->prepare_write_subreq;
+		wreq->io_streams[1].issue_write = wreq->cache_resources.ops->issue_write;
+	}
+
+	return wreq;
+}
+
+/**
+ * netfs_prepare_write_failed - Note write preparation failed
+ * @subreq: The subrequest to mark
+ *
+ * Mark a subrequest to note that preparation for write failed.
+ */
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq)
+{
+	__set_bit(NETFS_SREQ_FAILED, &subreq->flags);
+	trace_netfs_sreq(subreq, netfs_sreq_trace_prep_failed);
+}
+EXPORT_SYMBOL(netfs_prepare_write_failed);
+
+/*
+ * Prepare a write subrequest.  We need to allocate a new subrequest
+ * if we don't have one.
+ */
+static void netfs_prepare_write(struct netfs_io_request *wreq,
+				struct netfs_io_stream *stream,
+				loff_t start)
+{
+	struct netfs_io_subrequest *subreq;
+
+	subreq = netfs_alloc_subrequest(wreq);
+	subreq->source		= stream->source;
+	subreq->start		= start;
+	subreq->max_len		= ULONG_MAX;
+	subreq->max_nr_segs	= INT_MAX;
+	subreq->stream_nr	= stream->stream_nr;
+
+	_enter("R=%x[%x]", wreq->debug_id, subreq->debug_index);
+
+	trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+			     refcount_read(&subreq->ref),
+			     netfs_sreq_trace_new);
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+	switch (stream->source) {
+	case NETFS_UPLOAD_TO_SERVER:
+		netfs_stat(&netfs_n_wh_upload);
+		subreq->max_len = wreq->wsize;
+		break;
+	case NETFS_WRITE_TO_CACHE:
+		netfs_stat(&netfs_n_wh_write);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
+	}
+
+	if (stream->prepare_write)
+		stream->prepare_write(subreq);
+
+	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+	/* We add to the end of the list whilst the collector may be walking
+	 * the list.  The collector only goes nextwards and uses the lock to
+	 * remove entries off of the front.
+	 */
+	spin_lock(&wreq->lock);
+	list_add_tail(&subreq->rreq_link, &stream->subrequests);
+	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+		stream->front = subreq;
+		if (!stream->active) {
+			stream->collected_to = stream->front->start;
+			/* Write list pointers before active flag */
+			smp_store_release(&stream->active, true);
+		}
+	}
+
+	spin_unlock(&wreq->lock);
+
+	stream->construct = subreq;
+}
+
+/*
+ * Set the I/O iterator for the filesystem/cache to use and dispatch the I/O
+ * operation.  The operation may be asynchronous and should call
+ * netfs_write_subrequest_terminated() when complete.
+ */
+static void netfs_do_issue_write(struct netfs_io_stream *stream,
+				 struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *wreq = subreq->rreq;
+
+	_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
+
+	if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
+		return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+
+	// TODO: Use encrypted buffer
+	if (test_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags)) {
+		subreq->io_iter = wreq->io_iter;
+		iov_iter_advance(&subreq->io_iter,
+				 subreq->start + subreq->transferred - wreq->start);
+		iov_iter_truncate(&subreq->io_iter,
+				 subreq->len - subreq->transferred);
+	} else {
+		iov_iter_xarray(&subreq->io_iter, ITER_SOURCE, &wreq->mapping->i_pages,
+				subreq->start + subreq->transferred,
+				subreq->len   - subreq->transferred);
+	}
+
+	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+	stream->issue_write(subreq);
+}
+
+void netfs_reissue_write(struct netfs_io_stream *stream,
+			 struct netfs_io_subrequest *subreq)
+{
+	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+	netfs_do_issue_write(stream, subreq);
+}
+
+static void netfs_issue_write(struct netfs_io_request *wreq,
+			      struct netfs_io_stream *stream)
+{
+	struct netfs_io_subrequest *subreq = stream->construct;
+
+	if (!subreq)
+		return;
+	stream->construct = NULL;
+
+	if (subreq->start + subreq->len > wreq->start + wreq->submitted)
+		wreq->len = wreq->submitted = subreq->start + subreq->len - wreq->start;
+	netfs_do_issue_write(stream, subreq);
+}
+
+/*
+ * Add data to the write subrequest, dispatching each as we fill it up or if it
+ * is discontiguous with the previous.  We only fill one part at a time so that
+ * we can avoid overrunning the credits obtained (cifs) and try to parallelise
+ * content-crypto preparation with network writes.
+ */
+int netfs_advance_write(struct netfs_io_request *wreq,
+			struct netfs_io_stream *stream,
+			loff_t start, size_t len, bool to_eof)
+{
+	struct netfs_io_subrequest *subreq = stream->construct;
+	size_t part;
+
+	if (!stream->avail) {
+		_leave("no write");
+		return len;
+	}
+
+	_enter("R=%x[%x]", wreq->debug_id, subreq ? subreq->debug_index : 0);
+
+	if (subreq && start != subreq->start + subreq->len) {
+		netfs_issue_write(wreq, stream);
+		subreq = NULL;
+	}
+
+	if (!stream->construct)
+		netfs_prepare_write(wreq, stream, start);
+	subreq = stream->construct;
+
+	part = min(subreq->max_len - subreq->len, len);
+	_debug("part %zx/%zx %zx/%zx", subreq->len, subreq->max_len, part, len);
+	subreq->len += part;
+	subreq->nr_segs++;
+
+	if (subreq->len >= subreq->max_len ||
+	    subreq->nr_segs >= subreq->max_nr_segs ||
+	    to_eof) {
+		netfs_issue_write(wreq, stream);
+		subreq = NULL;
+	}
+
+	return part;
+}
+
+/*
+ * Write some of a pending folio data back to the server.
+ */
+static int netfs_write_folio(struct netfs_io_request *wreq,
+			     struct writeback_control *wbc,
+			     struct folio *folio)
+{
+	struct netfs_io_stream *upload = &wreq->io_streams[0];
+	struct netfs_io_stream *cache  = &wreq->io_streams[1];
+	struct netfs_io_stream *stream;
+	struct netfs_group *fgroup; /* TODO: Use this with ceph */
+	struct netfs_folio *finfo;
+	size_t fsize = folio_size(folio), flen = fsize, foff = 0;
+	loff_t fpos = folio_pos(folio), i_size;
+	bool to_eof = false, streamw = false;
+	bool debug = false;
+
+	_enter("");
+
+	/* netfs_perform_write() may shift i_size around the page or from out
+	 * of the page to beyond it, but cannot move i_size into or through the
+	 * page since we have it locked.
+	 */
+	i_size = i_size_read(wreq->inode);
+
+	if (fpos >= i_size) {
+		/* mmap beyond eof. */
+		_debug("beyond eof");
+		folio_start_writeback(folio);
+		folio_unlock(folio);
+		wreq->nr_group_rel += netfs_folio_written_back(folio);
+		netfs_put_group_many(wreq->group, wreq->nr_group_rel);
+		wreq->nr_group_rel = 0;
+		return 0;
+	}
+
+	if (fpos + fsize > wreq->i_size)
+		wreq->i_size = i_size;
+
+	fgroup = netfs_folio_group(folio);
+	finfo = netfs_folio_info(folio);
+	if (finfo) {
+		foff = finfo->dirty_offset;
+		flen = foff + finfo->dirty_len;
+		streamw = true;
+	}
+
+	if (wreq->origin == NETFS_WRITETHROUGH) {
+		to_eof = false;
+		if (flen > i_size - fpos)
+			flen = i_size - fpos;
+	} else if (flen > i_size - fpos) {
+		flen = i_size - fpos;
+		if (!streamw)
+			folio_zero_segment(folio, flen, fsize);
+		to_eof = true;
+	} else if (flen == i_size - fpos) {
+		to_eof = true;
+	}
+	flen -= foff;
+
+	_debug("folio %zx %zx %zx", foff, flen, fsize);
+
+	/* Deal with discontinuities in the stream of dirty pages.  These can
+	 * arise from a number of sources:
+	 *
+	 * (1) Intervening non-dirty pages from random-access writes, multiple
+	 *     flushers writing back different parts simultaneously and manual
+	 *     syncing.
+	 *
+	 * (2) Partially-written pages from write-streaming.
+	 *
+	 * (3) Pages that belong to a different write-back group (eg.  Ceph
+	 *     snapshots).
+	 *
+	 * (4) Actually-clean pages that were marked for write to the cache
+	 *     when they were read.  Note that these appear as a special
+	 *     write-back group.
+	 */
+	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+		netfs_issue_write(wreq, upload);
+	} else if (fgroup != wreq->group) {
+		/* We can't write this page to the server yet. */
+		kdebug("wrong group");
+		folio_redirty_for_writepage(wbc, folio);
+		folio_unlock(folio);
+		netfs_issue_write(wreq, upload);
+		netfs_issue_write(wreq, cache);
+		return 0;
+	}
+
+	if (foff > 0)
+		netfs_issue_write(wreq, upload);
+	if (streamw)
+		netfs_issue_write(wreq, cache);
+
+	/* Flip the page to the writeback state and unlock.  If we're called
+	 * from write-through, then the page has already been put into the wb
+	 * state.
+	 */
+	if (wreq->origin == NETFS_WRITEBACK)
+		folio_start_writeback(folio);
+	folio_unlock(folio);
+
+	if (fgroup == NETFS_FOLIO_COPY_TO_CACHE) {
+		if (!fscache_resources_valid(&wreq->cache_resources)) {
+			trace_netfs_folio(folio, netfs_folio_trace_cancel_copy);
+			netfs_issue_write(wreq, upload);
+			netfs_folio_written_back(folio);
+			return 0;
+		}
+		trace_netfs_folio(folio, netfs_folio_trace_store_copy);
+	} else if (!upload->construct) {
+		trace_netfs_folio(folio, netfs_folio_trace_store);
+	} else {
+		trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+	}
+
+	/* Move the submission point forward to allow for write-streaming data
+	 * not starting at the front of the page.  We don't do write-streaming
+	 * with the cache as the cache requires DIO alignment.
+	 *
+	 * Also skip uploading for data that's been read and just needs copying
+	 * to the cache.
+	 */
+	for (int s = 0; s < NR_IO_STREAMS; s++) {
+		stream = &wreq->io_streams[s];
+		stream->submit_max_len = fsize;
+		stream->submit_off = foff;
+		stream->submit_len = flen;
+		if ((stream->source == NETFS_WRITE_TO_CACHE && streamw) ||
+		    (stream->source == NETFS_UPLOAD_TO_SERVER &&
+		     fgroup == NETFS_FOLIO_COPY_TO_CACHE)) {
+			stream->submit_off = UINT_MAX;
+			stream->submit_len = 0;
+			stream->submit_max_len = 0;
+		}
+	}
+
+	/* Attach the folio to one or more subrequests.  For a big folio, we
+	 * could end up with thousands of subrequests if the wsize is small -
+	 * but we might need to wait during the creation of subrequests for
+	 * network resources (eg. SMB credits).
+	 */
+	for (;;) {
+		ssize_t part;
+		size_t lowest_off = ULONG_MAX;
+		int choose_s = -1;
+
+		/* Always add to the lowest-submitted stream first. */
+		for (int s = 0; s < NR_IO_STREAMS; s++) {
+			stream = &wreq->io_streams[s];
+			if (stream->submit_len > 0 &&
+			    stream->submit_off < lowest_off) {
+				lowest_off = stream->submit_off;
+				choose_s = s;
+			}
+		}
+
+		if (choose_s < 0)
+			break;
+		stream = &wreq->io_streams[choose_s];
+
+		part = netfs_advance_write(wreq, stream, fpos + stream->submit_off,
+					   stream->submit_len, to_eof);
+		atomic64_set(&wreq->issued_to, fpos + stream->submit_off);
+		stream->submit_off += part;
+		stream->submit_max_len -= part;
+		if (part > stream->submit_len)
+			stream->submit_len = 0;
+		else
+			stream->submit_len -= part;
+		if (part > 0)
+			debug = true;
+	}
+
+	atomic64_set(&wreq->issued_to, fpos + fsize);
+
+	if (!debug)
+		kdebug("R=%x: No submit", wreq->debug_id);
+
+	if (flen < fsize)
+		for (int s = 0; s < NR_IO_STREAMS; s++)
+			netfs_issue_write(wreq, &wreq->io_streams[s]);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * Write some of the pending data back to the server
+ */
+int new_netfs_writepages(struct address_space *mapping,
+			 struct writeback_control *wbc)
+{
+	struct netfs_inode *ictx = netfs_inode(mapping->host);
+	struct netfs_io_request *wreq = NULL;
+	struct folio *folio;
+	int error = 0;
+
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		mutex_lock(&ictx->wb_lock);
+	else if (!mutex_trylock(&ictx->wb_lock))
+		return 0;
+
+	/* Need the first folio to be able to set up the op. */
+	folio = writeback_iter(mapping, wbc, NULL, &error);
+	if (!folio)
+		goto out;
+
+	wreq = netfs_create_write_req(mapping, NULL, folio_pos(folio), NETFS_WRITEBACK);
+	if (IS_ERR(wreq)) {
+		error = PTR_ERR(wreq);
+		goto couldnt_start;
+	}
+
+	trace_netfs_write(wreq, netfs_write_trace_writeback);
+
+	do {
+		_debug("wbiter %lx %llx", folio->index, wreq->start + wreq->submitted);
+
+		/* It appears we don't have to handle cyclic writeback wrapping. */
+		WARN_ON_ONCE(wreq && folio_pos(folio) < wreq->start + wreq->submitted);
+
+		if (netfs_folio_group(folio) != NETFS_FOLIO_COPY_TO_CACHE &&
+		    unlikely(!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))) {
+			set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+			wreq->netfs_ops->begin_writeback(wreq);
+		}
+
+		error = netfs_write_folio(wreq, wbc, folio);
+		if (error < 0)
+			break;
+	} while ((folio = writeback_iter(mapping, wbc, folio, &error)));
+
+	for (int s = 0; s < NR_IO_STREAMS; s++)
+		netfs_issue_write(wreq, &wreq->io_streams[s]);
+	smp_wmb(); /* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+	mutex_unlock(&ictx->wb_lock);
+
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	_leave(" = %d", error);
+	return error;
+
+couldnt_start:
+	netfs_kill_dirty_pages(mapping, wbc, folio);
+out:
+	mutex_unlock(&ictx->wb_lock);
+	_leave(" = %d", error);
+	return error;
+}
+EXPORT_SYMBOL(new_netfs_writepages);
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+	struct netfs_io_request *wreq = NULL;
+	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
+
+	mutex_lock(&ictx->wb_lock);
+
+	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp,
+				      iocb->ki_pos, NETFS_WRITETHROUGH);
+	if (IS_ERR(wreq)) {
+		mutex_unlock(&ictx->wb_lock);
+		return wreq;
+	}
+
+	wreq->io_streams[0].avail = true;
+	trace_netfs_write(wreq, netfs_write_trace_writethrough);
+	return wreq;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache.  Data has been copied into the pagecache that we need to append
+ * to the request.  If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+				   struct folio *folio, size_t copied, bool to_page_end,
+				   struct folio **writethrough_cache)
+{
+	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
+	       wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
+
+	if (!*writethrough_cache) {
+		if (folio_test_dirty(folio))
+			/* Sigh.  mmap. */
+			folio_clear_dirty_for_io(folio);
+
+		/* We can make multiple writes to the folio... */
+		folio_start_writeback(folio);
+		if (wreq->len == 0)
+			trace_netfs_folio(folio, netfs_folio_trace_wthru);
+		else
+			trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+		*writethrough_cache = folio;
+	}
+
+	wreq->len += copied;
+	if (!to_page_end)
+		return 0;
+
+	*writethrough_cache = NULL;
+	return netfs_write_folio(wreq, wbc, folio);
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			       struct folio *writethrough_cache)
+{
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+	int ret;
+
+	_enter("R=%x", wreq->debug_id);
+
+	if (writethrough_cache)
+		netfs_write_folio(wreq, wbc, writethrough_cache);
+
+	netfs_issue_write(wreq, &wreq->io_streams[0]);
+	netfs_issue_write(wreq, &wreq->io_streams[1]);
+	smp_wmb(); /* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+
+	mutex_unlock(&ictx->wb_lock);
+
+	ret = wreq->error;
+	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+	return ret;
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.
+ */
+int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len)
+{
+	struct netfs_io_stream *upload = &wreq->io_streams[0];
+	ssize_t part;
+	loff_t start = wreq->start;
+	int error = 0;
+
+	_enter("%zx", len);
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_begin(wreq->inode);
+
+	while (len) {
+		// TODO: Prepare content encryption
+
+		_debug("unbuffered %zx", len);
+		part = netfs_advance_write(wreq, upload, start, len, false);
+		start += part;
+		len -= part;
+		if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
+			trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
+			wait_on_bit(&wreq->flags, NETFS_RREQ_PAUSE, TASK_UNINTERRUPTIBLE);
+		}
+		if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
+			break;
+	}
+
+	netfs_issue_write(wreq, upload);
+
+	smp_wmb(); /* Write lists before ALL_QUEUED. */
+	set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
+	if (list_empty(&upload->subrequests))
+		netfs_wake_write_collector(wreq, false);
+
+	_leave(" = %d", error);
+	return error;
+}
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 88269681d4fc..42dba05a428b 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -64,6 +64,7 @@ struct netfs_inode {
 #if IS_ENABLED(CONFIG_FSCACHE)
 	struct fscache_cookie	*cache;
 #endif
+	struct mutex		wb_lock;	/* Writeback serialisation */
 	loff_t			remote_i_size;	/* Size of the remote file */
 	loff_t			zero_point;	/* Size after which we assume there's no data
 						 * on the server */
@@ -71,7 +72,6 @@ struct netfs_inode {
 #define NETFS_ICTX_ODIRECT	0		/* The file has DIO in progress */
 #define NETFS_ICTX_UNBUFFERED	1		/* I/O should not use the pagecache */
 #define NETFS_ICTX_WRITETHROUGH	2		/* Write-through caching */
-#define NETFS_ICTX_NO_WRITE_STREAMING	3	/* Don't engage in write-streaming */
 #define NETFS_ICTX_USE_PGPRIV2	31		/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 };
@@ -126,6 +126,33 @@ static inline struct netfs_group *netfs_folio_group(struct folio *folio)
 	return priv;
 }
 
+/*
+ * Stream of I/O subrequests going to a particular destination, such as the
+ * server or the local cache.  This is mainly intended for writing where we may
+ * have to write to multiple destinations concurrently.
+ */
+struct netfs_io_stream {
+	/* Submission tracking */
+	struct netfs_io_subrequest *construct;	/* Op being constructed */
+	unsigned int		submit_off;	/* Folio offset we're submitting from */
+	unsigned int		submit_len;	/* Amount of data left to submit */
+	unsigned int		submit_max_len;	/* Amount I/O can be rounded up to */
+	void (*prepare_write)(struct netfs_io_subrequest *subreq);
+	void (*issue_write)(struct netfs_io_subrequest *subreq);
+	/* Collection tracking */
+	struct list_head	subrequests;	/* Contributory I/O operations */
+	struct netfs_io_subrequest *front;	/* Op being collected */
+	unsigned long long	collected_to;	/* Position we've collected results to */
+	size_t			transferred;	/* The amount transferred from this stream */
+	enum netfs_io_source	source;		/* Where to read from/write to */
+	unsigned short		error;		/* Aggregate error for the stream */
+	unsigned char		stream_nr;	/* Index of stream in parent table */
+	bool			avail;		/* T if stream is available */
+	bool			active;		/* T if stream is active */
+	bool			need_retry;	/* T if this stream needs retrying */
+	bool			failed;		/* T if this stream failed */
+};
+
 /*
  * Resources required to do operations on a cache.
  */
@@ -150,13 +177,16 @@ struct netfs_io_subrequest {
 	struct list_head	rreq_link;	/* Link in rreq->subrequests */
 	struct iov_iter		io_iter;	/* Iterator for this subrequest */
 	unsigned long long	start;		/* Where to start the I/O */
+	size_t			max_len;	/* Maximum size of the I/O */
 	size_t			len;		/* Size of the I/O */
 	size_t			transferred;	/* Amount of data transferred */
 	refcount_t		ref;
 	short			error;		/* 0 or error that occurred */
 	unsigned short		debug_index;	/* Index in list (for debugging output) */
+	unsigned int		nr_segs;	/* Number of segs in io_iter */
 	unsigned int		max_nr_segs;	/* 0 or max number of segments in an iterator */
 	enum netfs_io_source	source;		/* Where to read from/write to */
+	unsigned char		stream_nr;	/* I/O stream this belongs to */
 	unsigned long		flags;
 #define NETFS_SREQ_COPY_TO_CACHE	0	/* Set if should copy the data to the cache */
 #define NETFS_SREQ_CLEAR_TAIL		1	/* Set if the rest of the read should be cleared */
@@ -164,6 +194,11 @@ struct netfs_io_subrequest {
 #define NETFS_SREQ_SEEK_DATA_READ	3	/* Set if ->read() should SEEK_DATA first */
 #define NETFS_SREQ_NO_PROGRESS		4	/* Set if we didn't manage to read any data */
 #define NETFS_SREQ_ONDEMAND		5	/* Set if it's from on-demand read mode */
+#define NETFS_SREQ_BOUNDARY		6	/* Set if ends on hard boundary (eg. ceph object) */
+#define NETFS_SREQ_IN_PROGRESS		8	/* Unlocked when the subrequest completes */
+#define NETFS_SREQ_NEED_RETRY		9	/* Set if the filesystem requests a retry */
+#define NETFS_SREQ_RETRYING		10	/* Set if we're retrying */
+#define NETFS_SREQ_FAILED		11	/* Set if the subreq failed unretryably */
 };
 
 enum netfs_io_origin {
@@ -194,6 +229,9 @@ struct netfs_io_request {
 	struct netfs_cache_resources cache_resources;
 	struct list_head	proc_link;	/* Link in netfs_iorequests */
 	struct list_head	subrequests;	/* Contributory I/O operations */
+	struct netfs_io_stream	io_streams[2];	/* Streams of parallel I/O operations */
+#define NR_IO_STREAMS 2 //wreq->nr_io_streams
+	struct netfs_group	*group;		/* Writeback group being written back */
 	struct iov_iter		iter;		/* Unencrypted-side iterator */
 	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
@@ -203,6 +241,8 @@ struct netfs_io_request {
 	unsigned int		rsize;		/* Maximum read size (0 for none) */
 	unsigned int		wsize;		/* Maximum write size (0 for none) */
 	atomic_t		subreq_counter;	/* Next subreq->debug_index */
+	unsigned int		nr_group_rel;	/* Number of refs to release on ->group */
+	spinlock_t		lock;		/* Lock for queuing subreqs */
 	atomic_t		nr_outstanding;	/* Number of ops in progress */
 	atomic_t		nr_copy_ops;	/* Number of copy-to-cache ops in progress */
 	size_t			upper_len;	/* Length can be extended to here */
@@ -214,6 +254,10 @@ struct netfs_io_request {
 	bool			direct_bv_unpin; /* T if direct_bv[] must be unpinned */
 	unsigned long long	i_size;		/* Size of the file */
 	unsigned long long	start;		/* Start position */
+	atomic64_t		issued_to;	/* Write issuer folio cursor */
+	unsigned long long	contiguity;	/* Tracking for gaps in the writeback sequence */
+	unsigned long long	collected_to;	/* Point we've collected to */
+	unsigned long long	cleaned_to;	/* Position we've cleaned folios to */
 	pgoff_t			no_unlock_folio; /* Don't unlock this folio after read */
 	refcount_t		ref;
 	unsigned long		flags;
@@ -227,6 +271,9 @@ struct netfs_io_request {
 #define NETFS_RREQ_UPLOAD_TO_SERVER	8	/* Need to write to the server */
 #define NETFS_RREQ_NONBLOCK		9	/* Don't block if possible (O_NONBLOCK) */
 #define NETFS_RREQ_BLOCKED		10	/* We blocked */
+#define NETFS_RREQ_PAUSE		11	/* Pause subrequest generation */
+#define NETFS_RREQ_USE_IO_ITER		12	/* Use ->io_iter rather than ->i_pages */
+#define NETFS_RREQ_ALL_QUEUED		13	/* All subreqs are now queued */
 #define NETFS_RREQ_USE_PGPRIV2		31	/* [DEPRECATED] Use PG_private_2 to mark
 						 * write to cache on read */
 	const struct netfs_request_ops *netfs_ops;
@@ -258,6 +305,9 @@ struct netfs_request_ops {
 	/* Write request handling */
 	void (*create_write_requests)(struct netfs_io_request *wreq,
 				      loff_t start, size_t len);
+	void (*begin_writeback)(struct netfs_io_request *wreq);
+	void (*prepare_write)(struct netfs_io_subrequest *subreq);
+	void (*issue_write)(struct netfs_io_subrequest *subreq);
 	void (*invalidate_cache)(struct netfs_io_request *wreq);
 };
 
@@ -292,6 +342,9 @@ struct netfs_cache_ops {
 		     netfs_io_terminated_t term_func,
 		     void *term_func_priv);
 
+	/* Write data to the cache from a netfs subrequest. */
+	void (*issue_write)(struct netfs_io_subrequest *subreq);
+
 	/* Expand readahead request */
 	void (*expand_readahead)(struct netfs_cache_resources *cres,
 				 unsigned long long *_start,
@@ -304,6 +357,13 @@ struct netfs_cache_ops {
 	enum netfs_io_source (*prepare_read)(struct netfs_io_subrequest *subreq,
 					     unsigned long long i_size);
 
+	/* Prepare a write subrequest, working out if we're allowed to do it
+	 * and finding out the maximum amount of data to gather before
+	 * attempting to submit.  If we're not permitted to do it, the
+	 * subrequest should be marked failed.
+	 */
+	void (*prepare_write_subreq)(struct netfs_io_subrequest *subreq);
+
 	/* Prepare a write operation, working out what part of the write we can
 	 * actually do.
 	 */
@@ -349,6 +409,8 @@ int netfs_write_begin(struct netfs_inode *, struct file *,
 		      struct folio **, void **fsdata);
 int netfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
+int new_netfs_writepages(struct address_space *mapping,
+			struct writeback_control *wbc);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
 int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
@@ -372,8 +434,11 @@ size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
 struct netfs_io_subrequest *netfs_create_write_request(
 	struct netfs_io_request *wreq, enum netfs_io_source dest,
 	loff_t start, size_t len, work_func_t worker);
+void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq);
 void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
 				       bool was_async);
+void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+					   bool was_async);
 void netfs_queue_write_request(struct netfs_io_subrequest *subreq);
 
 int netfs_start_io_read(struct inode *inode);
@@ -415,6 +480,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
 #if IS_ENABLED(CONFIG_FSCACHE)
 	ctx->cache = NULL;
 #endif
+	mutex_init(&ctx->wb_lock);
 	/* ->releasepage() drives zero_point */
 	if (use_zero_point) {
 		ctx->zero_point = ctx->remote_i_size;
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 7126d2ea459c..e7700172ae7e 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -44,14 +44,18 @@
 #define netfs_rreq_traces					\
 	EM(netfs_rreq_trace_assess,		"ASSESS ")	\
 	EM(netfs_rreq_trace_copy,		"COPY   ")	\
+	EM(netfs_rreq_trace_collect,		"COLLECT")	\
 	EM(netfs_rreq_trace_done,		"DONE   ")	\
 	EM(netfs_rreq_trace_free,		"FREE   ")	\
 	EM(netfs_rreq_trace_redirty,		"REDIRTY")	\
 	EM(netfs_rreq_trace_resubmit,		"RESUBMT")	\
+	EM(netfs_rreq_trace_set_pause,		"PAUSE  ")	\
 	EM(netfs_rreq_trace_unlock,		"UNLOCK ")	\
 	EM(netfs_rreq_trace_unmark,		"UNMARK ")	\
 	EM(netfs_rreq_trace_wait_ip,		"WAIT-IP")	\
+	EM(netfs_rreq_trace_wait_pause,		"WT-PAUS")	\
 	EM(netfs_rreq_trace_wake_ip,		"WAKE-IP")	\
+	EM(netfs_rreq_trace_unpause,		"UNPAUSE")	\
 	E_(netfs_rreq_trace_write_done,		"WR-DONE")
 
 #define netfs_sreq_sources					\
@@ -64,11 +68,15 @@
 	E_(NETFS_INVALID_WRITE,			"INVL")
 
 #define netfs_sreq_traces					\
+	EM(netfs_sreq_trace_discard,		"DSCRD")	\
 	EM(netfs_sreq_trace_download_instead,	"RDOWN")	\
+	EM(netfs_sreq_trace_fail,		"FAIL ")	\
 	EM(netfs_sreq_trace_free,		"FREE ")	\
 	EM(netfs_sreq_trace_limited,		"LIMIT")	\
 	EM(netfs_sreq_trace_prepare,		"PREP ")	\
+	EM(netfs_sreq_trace_prep_failed,	"PRPFL")	\
 	EM(netfs_sreq_trace_resubmit_short,	"SHORT")	\
+	EM(netfs_sreq_trace_retry,		"RETRY")	\
 	EM(netfs_sreq_trace_submit,		"SUBMT")	\
 	EM(netfs_sreq_trace_terminated,		"TERM ")	\
 	EM(netfs_sreq_trace_write,		"WRITE")	\
@@ -88,6 +96,7 @@
 #define netfs_rreq_ref_traces					\
 	EM(netfs_rreq_trace_get_for_outstanding,"GET OUTSTND")	\
 	EM(netfs_rreq_trace_get_subreq,		"GET SUBREQ ")	\
+	EM(netfs_rreq_trace_get_work,		"GET WORK   ")	\
 	EM(netfs_rreq_trace_put_complete,	"PUT COMPLT ")	\
 	EM(netfs_rreq_trace_put_discard,	"PUT DISCARD")	\
 	EM(netfs_rreq_trace_put_failed,		"PUT FAILED ")	\
@@ -95,6 +104,8 @@
 	EM(netfs_rreq_trace_put_return,		"PUT RETURN ")	\
 	EM(netfs_rreq_trace_put_subreq,		"PUT SUBREQ ")	\
 	EM(netfs_rreq_trace_put_work,		"PUT WORK   ")	\
+	EM(netfs_rreq_trace_put_work_complete,	"PUT WORK CP")	\
+	EM(netfs_rreq_trace_put_work_nq,	"PUT WORK NQ")	\
 	EM(netfs_rreq_trace_see_work,		"SEE WORK   ")	\
 	E_(netfs_rreq_trace_new,		"NEW        ")
 
@@ -103,11 +114,14 @@
 	EM(netfs_sreq_trace_get_resubmit,	"GET RESUBMIT")	\
 	EM(netfs_sreq_trace_get_short_read,	"GET SHORTRD")	\
 	EM(netfs_sreq_trace_new,		"NEW        ")	\
+	EM(netfs_sreq_trace_put_cancel,		"PUT CANCEL ")	\
 	EM(netfs_sreq_trace_put_clear,		"PUT CLEAR  ")	\
 	EM(netfs_sreq_trace_put_discard,	"PUT DISCARD")	\
+	EM(netfs_sreq_trace_put_done,		"PUT DONE   ")	\
 	EM(netfs_sreq_trace_put_failed,		"PUT FAILED ")	\
 	EM(netfs_sreq_trace_put_merged,		"PUT MERGED ")	\
 	EM(netfs_sreq_trace_put_no_copy,	"PUT NO COPY")	\
+	EM(netfs_sreq_trace_put_oom,		"PUT OOM    ")	\
 	EM(netfs_sreq_trace_put_wip,		"PUT WIP    ")	\
 	EM(netfs_sreq_trace_put_work,		"PUT WORK   ")	\
 	E_(netfs_sreq_trace_put_terminated,	"PUT TERM   ")
@@ -124,7 +138,9 @@
 	EM(netfs_streaming_filled_page,		"mod-streamw-f") \
 	EM(netfs_streaming_cont_filled_page,	"mod-streamw-f+") \
 	/* The rest are for writeback */			\
+	EM(netfs_folio_trace_cancel_copy,	"cancel-copy")	\
 	EM(netfs_folio_trace_clear,		"clear")	\
+	EM(netfs_folio_trace_clear_cc,		"clear-cc")	\
 	EM(netfs_folio_trace_clear_s,		"clear-s")	\
 	EM(netfs_folio_trace_clear_g,		"clear-g")	\
 	EM(netfs_folio_trace_copy,		"copy")		\
@@ -133,16 +149,26 @@
 	EM(netfs_folio_trace_end_copy,		"end-copy")	\
 	EM(netfs_folio_trace_filled_gaps,	"filled-gaps")	\
 	EM(netfs_folio_trace_kill,		"kill")		\
+	EM(netfs_folio_trace_kill_cc,		"kill-cc")	\
+	EM(netfs_folio_trace_kill_g,		"kill-g")	\
+	EM(netfs_folio_trace_kill_s,		"kill-s")	\
 	EM(netfs_folio_trace_mkwrite,		"mkwrite")	\
 	EM(netfs_folio_trace_mkwrite_plus,	"mkwrite+")	\
+	EM(netfs_folio_trace_not_under_wback,	"!wback")	\
 	EM(netfs_folio_trace_read_gaps,		"read-gaps")	\
 	EM(netfs_folio_trace_redirty,		"redirty")	\
 	EM(netfs_folio_trace_redirtied,		"redirtied")	\
 	EM(netfs_folio_trace_store,		"store")	\
+	EM(netfs_folio_trace_store_copy,	"store-copy")	\
 	EM(netfs_folio_trace_store_plus,	"store+")	\
 	EM(netfs_folio_trace_wthru,		"wthru")	\
 	E_(netfs_folio_trace_wthru_plus,	"wthru+")
 
+#define netfs_collect_contig_traces				\
+	EM(netfs_contig_trace_collect,		"Collect")	\
+	EM(netfs_contig_trace_jump,		"-->JUMP-->")	\
+	E_(netfs_contig_trace_unlock,		"Unlock")
+
 #ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
 #define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY
 
@@ -159,6 +185,7 @@ enum netfs_failure { netfs_failures } __mode(byte);
 enum netfs_rreq_ref_trace { netfs_rreq_ref_traces } __mode(byte);
 enum netfs_sreq_ref_trace { netfs_sreq_ref_traces } __mode(byte);
 enum netfs_folio_trace { netfs_folio_traces } __mode(byte);
+enum netfs_collect_contig_trace { netfs_collect_contig_traces } __mode(byte);
 
 #endif
 
@@ -180,6 +207,7 @@ netfs_failures;
 netfs_rreq_ref_traces;
 netfs_sreq_ref_traces;
 netfs_folio_traces;
+netfs_collect_contig_traces;
 
 /*
  * Now redefine the EM() and E_() macros to map the enums to the strings that
@@ -413,16 +441,18 @@ TRACE_EVENT(netfs_write_iter,
 		    __field(unsigned long long,		start		)
 		    __field(size_t,			len		)
 		    __field(unsigned int,		flags		)
+		    __field(unsigned int,		ino		)
 			     ),
 
 	    TP_fast_assign(
 		    __entry->start	= iocb->ki_pos;
 		    __entry->len	= iov_iter_count(from);
+		    __entry->ino	= iocb->ki_filp->f_inode->i_ino;
 		    __entry->flags	= iocb->ki_flags;
 			   ),
 
-	    TP_printk("WRITE-ITER s=%llx l=%zx f=%x",
-		      __entry->start, __entry->len, __entry->flags)
+	    TP_printk("WRITE-ITER i=%x s=%llx l=%zx f=%x",
+		      __entry->ino, __entry->start, __entry->len, __entry->flags)
 	    );
 
 TRACE_EVENT(netfs_write,
@@ -434,6 +464,7 @@ TRACE_EVENT(netfs_write,
 	    TP_STRUCT__entry(
 		    __field(unsigned int,		wreq		)
 		    __field(unsigned int,		cookie		)
+		    __field(unsigned int,		ino		)
 		    __field(enum netfs_write_trace,	what		)
 		    __field(unsigned long long,		start		)
 		    __field(unsigned long long,		len		)
@@ -444,18 +475,213 @@ TRACE_EVENT(netfs_write,
 		    struct fscache_cookie *__cookie = netfs_i_cookie(__ctx);
 		    __entry->wreq	= wreq->debug_id;
 		    __entry->cookie	= __cookie ? __cookie->debug_id : 0;
+		    __entry->ino	= wreq->inode->i_ino;
 		    __entry->what	= what;
 		    __entry->start	= wreq->start;
 		    __entry->len	= wreq->len;
 			   ),
 
-	    TP_printk("R=%08x %s c=%08x by=%llx-%llx",
+	    TP_printk("R=%08x %s c=%08x i=%x by=%llx-%llx",
 		      __entry->wreq,
 		      __print_symbolic(__entry->what, netfs_write_traces),
 		      __entry->cookie,
+		      __entry->ino,
 		      __entry->start, __entry->start + __entry->len - 1)
 	    );
 
+TRACE_EVENT(netfs_collect,
+	    TP_PROTO(const struct netfs_io_request *wreq),
+
+	    TP_ARGS(wreq),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		wreq		)
+		    __field(unsigned int,		len		)
+		    __field(unsigned long long,		transferred	)
+		    __field(unsigned long long,		start		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->start	= wreq->start;
+		    __entry->len	= wreq->len;
+		    __entry->transferred = wreq->transferred;
+			   ),
+
+	    TP_printk("R=%08x s=%llx-%llx",
+		      __entry->wreq,
+		      __entry->start + __entry->transferred,
+		      __entry->start + __entry->len)
+	    );
+
+TRACE_EVENT(netfs_collect_contig,
+	    TP_PROTO(const struct netfs_io_request *wreq, unsigned long long to,
+		     enum netfs_collect_contig_trace type),
+
+	    TP_ARGS(wreq, to, type),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		wreq)
+		    __field(enum netfs_collect_contig_trace, type)
+		    __field(unsigned long long,		contiguity)
+		    __field(unsigned long long,		to)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->type	= type;
+		    __entry->contiguity	= wreq->contiguity;
+		    __entry->to		= to;
+			   ),
+
+	    TP_printk("R=%08x %llx -> %llx %s",
+		      __entry->wreq,
+		      __entry->contiguity,
+		      __entry->to,
+		      __print_symbolic(__entry->type, netfs_collect_contig_traces))
+	    );
+
+TRACE_EVENT(netfs_collect_sreq,
+	    TP_PROTO(const struct netfs_io_request *wreq,
+		     const struct netfs_io_subrequest *subreq),
+
+	    TP_ARGS(wreq, subreq),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,		wreq		)
+		    __field(unsigned int,		subreq		)
+		    __field(unsigned int,		stream		)
+		    __field(unsigned int,		len		)
+		    __field(unsigned int,		transferred	)
+		    __field(unsigned long long,		start		)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->subreq	= subreq->debug_index;
+		    __entry->stream	= subreq->stream_nr;
+		    __entry->start	= subreq->start;
+		    __entry->len	= subreq->len;
+		    __entry->transferred = subreq->transferred;
+			   ),
+
+	    TP_printk("R=%08x[%u:%02x] s=%llx t=%x/%x",
+		      __entry->wreq, __entry->stream, __entry->subreq,
+		      __entry->start, __entry->transferred, __entry->len)
+	    );
+
+TRACE_EVENT(netfs_collect_folio,
+	    TP_PROTO(const struct netfs_io_request *wreq,
+		     const struct folio *folio,
+		     unsigned long long fend,
+		     unsigned long long collected_to),
+
+	    TP_ARGS(wreq, folio, fend, collected_to),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	wreq		)
+		    __field(unsigned long,	index		)
+		    __field(unsigned long long,	fend		)
+		    __field(unsigned long long,	cleaned_to	)
+		    __field(unsigned long long,	collected_to	)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->index	= folio->index;
+		    __entry->fend	= fend;
+		    __entry->cleaned_to	= wreq->cleaned_to;
+		    __entry->collected_to = collected_to;
+			   ),
+
+	    TP_printk("R=%08x ix=%05lx r=%llx-%llx t=%llx/%llx",
+		      __entry->wreq, __entry->index,
+		      (unsigned long long)__entry->index * PAGE_SIZE, __entry->fend,
+		      __entry->cleaned_to, __entry->collected_to)
+	    );
+
+TRACE_EVENT(netfs_collect_state,
+	    TP_PROTO(const struct netfs_io_request *wreq,
+		     unsigned long long collected_to,
+		     unsigned int notes),
+
+	    TP_ARGS(wreq, collected_to, notes),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	wreq		)
+		    __field(unsigned int,	notes		)
+		    __field(unsigned long long,	collected_to	)
+		    __field(unsigned long long,	cleaned_to	)
+		    __field(unsigned long long,	contiguity	)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->notes	= notes;
+		    __entry->collected_to = collected_to;
+		    __entry->cleaned_to	= wreq->cleaned_to;
+		    __entry->contiguity = wreq->contiguity;
+			   ),
+
+	    TP_printk("R=%08x cto=%llx fto=%llx ctg=%llx n=%x",
+		      __entry->wreq, __entry->collected_to,
+		      __entry->cleaned_to, __entry->contiguity,
+		      __entry->notes)
+	    );
+
+TRACE_EVENT(netfs_collect_gap,
+	    TP_PROTO(const struct netfs_io_request *wreq,
+		     const struct netfs_io_stream *stream,
+		     unsigned long long jump_to, char type),
+
+	    TP_ARGS(wreq, stream, jump_to, type),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	wreq)
+		    __field(unsigned char,	stream)
+		    __field(unsigned char,	type)
+		    __field(unsigned long long,	from)
+		    __field(unsigned long long,	to)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->stream	= stream->stream_nr;
+		    __entry->from	= stream->collected_to;
+		    __entry->to		= jump_to;
+		    __entry->type	= type;
+			   ),
+
+	    TP_printk("R=%08x[%x:] %llx->%llx %c",
+		      __entry->wreq, __entry->stream,
+		      __entry->from, __entry->to, __entry->type)
+	    );
+
+TRACE_EVENT(netfs_collect_stream,
+	    TP_PROTO(const struct netfs_io_request *wreq,
+		     const struct netfs_io_stream *stream),
+
+	    TP_ARGS(wreq, stream),
+
+	    TP_STRUCT__entry(
+		    __field(unsigned int,	wreq)
+		    __field(unsigned char,	stream)
+		    __field(unsigned long long,	collected_to)
+		    __field(unsigned long long,	front)
+			     ),
+
+	    TP_fast_assign(
+		    __entry->wreq	= wreq->debug_id;
+		    __entry->stream	= stream->stream_nr;
+		    __entry->collected_to = stream->collected_to;
+		    __entry->front	= stream->front ? stream->front->start : UINT_MAX;
+			   ),
+
+	    TP_printk("R=%08x[%x:] cto=%llx frn=%llx",
+		      __entry->wreq, __entry->stream,
+		      __entry->collected_to, __entry->front)
+	    );
+
 #undef EM
 #undef E_
 #endif /* _TRACE_NETFS_H */
-- 
cgit v1.2.3


From 5fb70e7275a61dd404f684370e1add7fe0ebe9c5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 18 Mar 2024 20:29:53 +0000
Subject: netfs, 9p: Implement helpers for new write code

Implement the helpers for the new write code in 9p.  There's now an
optional ->prepare_write() that allows the filesystem to set the parameters
for the next write, such as maximum size and maximum segment count, and an
->issue_write() that is called to initiate an (asynchronous) write
operation.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Eric Van Hensbergen <ericvh@kernel.org>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: v9fs@lists.linux.dev
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
---
 fs/9p/vfs_addr.c        | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/9p/client.h |  2 ++
 net/9p/Kconfig          |  1 +
 net/9p/client.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+)

(limited to 'include')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 5a943c122d83..07d03efdd594 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -26,6 +26,40 @@
 #include "cache.h"
 #include "fid.h"
 
+/*
+ * Writeback calls this when it finds a folio that needs uploading.  This isn't
+ * called if writeback only has copy-to-cache to deal with.
+ */
+static void v9fs_begin_writeback(struct netfs_io_request *wreq)
+{
+	struct p9_fid *fid;
+
+	fid = v9fs_fid_find_inode(wreq->inode, true, INVALID_UID, true);
+	if (!fid) {
+		WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+			  wreq->inode->i_ino);
+		return;
+	}
+
+	wreq->wsize = fid->clnt->msize - P9_IOHDRSZ;
+	if (fid->iounit)
+		wreq->wsize = min(wreq->wsize, fid->iounit);
+	wreq->netfs_priv = fid;
+	wreq->io_streams[0].avail = true;
+}
+
+/*
+ * Issue a subrequest to write to the server.
+ */
+static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
+{
+	struct p9_fid *fid = subreq->rreq->netfs_priv;
+	int err, len;
+
+	len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+	netfs_write_subrequest_terminated(subreq, len ?: err, false);
+}
+
 static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
 {
 	struct p9_fid *fid = subreq->rreq->netfs_priv;
@@ -92,6 +126,14 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
 			rreq->origin == NETFS_UNBUFFERED_WRITE ||
 			rreq->origin == NETFS_DIO_WRITE);
 
+#if 0 // TODO: Cut over
+	if (rreq->origin == NETFS_WRITEBACK)
+		return 0; /* We don't get the write handle until we find we
+			   * have actually dirty data and not just
+			   * copy-to-cache data.
+			   */
+#endif
+
 	if (file) {
 		fid = file->private_data;
 		if (!fid)
@@ -103,6 +145,10 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
 			goto no_fid;
 	}
 
+	rreq->wsize = fid->clnt->msize - P9_IOHDRSZ;
+	if (fid->iounit)
+		rreq->wsize = min(rreq->wsize, fid->iounit);
+
 	/* we might need to read from a fid that was opened write-only
 	 * for read-modify-write of page cache, use the writeback fid
 	 * for that */
@@ -131,6 +177,8 @@ const struct netfs_request_ops v9fs_req_ops = {
 	.init_request		= v9fs_init_request,
 	.free_request		= v9fs_free_request,
 	.issue_read		= v9fs_issue_read,
+	.begin_writeback	= v9fs_begin_writeback,
+	.issue_write		= v9fs_issue_write,
 	.create_write_requests	= v9fs_create_write_requests,
 };
 
diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 78ebcf782ce5..4f785098c67a 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -207,6 +207,8 @@ int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err
 int p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
 		int *err);
 int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err);
+struct netfs_io_subrequest;
+void p9_client_write_subreq(struct netfs_io_subrequest *subreq);
 int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset);
 int p9dirent_read(struct p9_client *clnt, char *buf, int len,
 		  struct p9_dirent *dirent);
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
index 00ebce9e5a65..bcdab9c23b40 100644
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
@@ -5,6 +5,7 @@
 
 menuconfig NET_9P
 	tristate "Plan 9 Resource Sharing Support (9P2000)"
+	select NETFS_SUPPORT
 	help
 	  If you say Y here, you will get experimental support for
 	  Plan 9 resource sharing via the 9P2000 protocol.
diff --git a/net/9p/client.c b/net/9p/client.c
index f7e90b4769bb..00774656eeac 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -18,6 +18,7 @@
 #include <linux/sched/signal.h>
 #include <linux/uaccess.h>
 #include <linux/uio.h>
+#include <linux/netfs.h>
 #include <net/9p/9p.h>
 #include <linux/parser.h>
 #include <linux/seq_file.h>
@@ -1661,6 +1662,54 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
 }
 EXPORT_SYMBOL(p9_client_write);
 
+void
+p9_client_write_subreq(struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *wreq = subreq->rreq;
+	struct p9_fid *fid = wreq->netfs_priv;
+	struct p9_client *clnt = fid->clnt;
+	struct p9_req_t *req;
+	unsigned long long start = subreq->start + subreq->transferred;
+	int written, len = subreq->len - subreq->transferred;
+	int err;
+
+	p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu len %d\n",
+		 fid->fid, start, len);
+
+	/* Don't bother zerocopy for small IO (< 1024) */
+	if (clnt->trans_mod->zc_request && len > 1024) {
+		req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &subreq->io_iter,
+				       0, wreq->len, P9_ZC_HDR_SZ, "dqd",
+				       fid->fid, start, len);
+	} else {
+		req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+				    start, len, &subreq->io_iter);
+	}
+	if (IS_ERR(req)) {
+		netfs_write_subrequest_terminated(subreq, PTR_ERR(req), false);
+		return;
+	}
+
+	err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+	if (err) {
+		trace_9p_protocol_dump(clnt, &req->rc);
+		p9_req_put(clnt, req);
+		netfs_write_subrequest_terminated(subreq, err, false);
+		return;
+	}
+
+	if (written > len) {
+		pr_err("bogus RWRITE count (%d > %u)\n", written, len);
+		written = len;
+	}
+
+	p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len);
+
+	p9_req_put(clnt, req);
+	netfs_write_subrequest_terminated(subreq, written, false);
+}
+EXPORT_SYMBOL(p9_client_write_subreq);
+
 struct p9_wstat *p9_client_stat(struct p9_fid *fid)
 {
 	int err;
-- 
cgit v1.2.3


From 2df86547b23dabcd02ab000a24ed7813606c269f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 8 Mar 2024 12:36:05 +0000
Subject: netfs: Cut over to using new writeback code

Cut over to using the new writeback code.  The old code is #ifdef'd out or
otherwise removed from compilation to avoid conflicts and will be removed
in a future patch.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Eric Van Hensbergen <ericvh@kernel.org>
cc: Latchesar Ionkov <lucho@ionkov.net>
cc: Dominique Martinet <asmadeus@codewreck.org>
cc: Christian Schoenebeck <linux_oss@crudebyte.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: v9fs@lists.linux.dev
cc: linux-afs@lists.infradead.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
---
 fs/9p/vfs_addr.c          |  6 ++----
 fs/afs/file.c             |  3 +--
 fs/afs/internal.h         |  1 -
 fs/afs/write.c            |  2 ++
 fs/netfs/Makefile         |  1 -
 fs/netfs/buffered_write.c | 45 ++++++++++++++++++++++++---------------------
 fs/netfs/direct_write.c   | 26 ++++++++++++++------------
 fs/netfs/internal.h       | 21 ++++++---------------
 fs/netfs/write_collect.c  |  8 ++++----
 fs/netfs/write_issue.c    | 18 +++++++++---------
 include/linux/netfs.h     |  9 ---------
 11 files changed, 62 insertions(+), 78 deletions(-)

(limited to 'include')

diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 07d03efdd594..4845e655bc39 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -60,6 +60,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
 	netfs_write_subrequest_terminated(subreq, len ?: err, false);
 }
 
+#if 0 // TODO: Remove
 static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
 {
 	struct p9_fid *fid = subreq->rreq->netfs_priv;
@@ -91,6 +92,7 @@ static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t sta
 	if (subreq)
 		netfs_queue_write_request(subreq);
 }
+#endif
 
 /**
  * v9fs_issue_read - Issue a read from 9P
@@ -121,18 +123,15 @@ static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
 {
 	struct p9_fid *fid;
 	bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
-			rreq->origin == NETFS_WRITEBACK ||
 			rreq->origin == NETFS_WRITETHROUGH ||
 			rreq->origin == NETFS_UNBUFFERED_WRITE ||
 			rreq->origin == NETFS_DIO_WRITE);
 
-#if 0 // TODO: Cut over
 	if (rreq->origin == NETFS_WRITEBACK)
 		return 0; /* We don't get the write handle until we find we
 			   * have actually dirty data and not just
 			   * copy-to-cache data.
 			   */
-#endif
 
 	if (file) {
 		fid = file->private_data;
@@ -179,7 +178,6 @@ const struct netfs_request_ops v9fs_req_ops = {
 	.issue_read		= v9fs_issue_read,
 	.begin_writeback	= v9fs_begin_writeback,
 	.issue_write		= v9fs_issue_write,
-	.create_write_requests	= v9fs_create_write_requests,
 };
 
 const struct address_space_operations v9fs_addr_operations = {
diff --git a/fs/afs/file.c b/fs/afs/file.c
index db9ebae84fa2..8f983e3ecae7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -353,7 +353,7 @@ static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
 	if (file)
 		rreq->netfs_priv = key_get(afs_file_key(file));
 	rreq->rsize = 256 * 1024;
-	rreq->wsize = 256 * 1024;
+	rreq->wsize = 256 * 1024 * 1024;
 	return 0;
 }
 
@@ -399,7 +399,6 @@ const struct netfs_request_ops afs_req_ops = {
 	.issue_read		= afs_issue_read,
 	.update_i_size		= afs_update_i_size,
 	.invalidate_cache	= afs_netfs_invalidate_cache,
-	.create_write_requests	= afs_create_write_requests,
 	.begin_writeback	= afs_begin_writeback,
 	.prepare_write		= afs_prepare_write,
 	.issue_write		= afs_issue_write,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index dcf0ae0323d3..887245f9336d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1605,7 +1605,6 @@ extern int afs_writepages(struct address_space *, struct writeback_control *);
 extern int afs_fsync(struct file *, loff_t, loff_t, int);
 extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
 extern void afs_prune_wb_keys(struct afs_vnode *);
-void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
 
 /*
  * xattr.c
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 34595f482718..35db74627563 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -156,6 +156,7 @@ try_next_key:
 	return afs_put_operation(op);
 }
 
+#if 0 // TODO: Remove
 static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
 {
 	struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
@@ -193,6 +194,7 @@ void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size
 	if (subreq)
 		netfs_queue_write_request(subreq);
 }
+#endif
 
 /*
  * Writeback calls this when it finds a folio that needs uploading.  This isn't
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 1eb86e34b5a9..8e6781e0b10b 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -11,7 +11,6 @@ netfs-y := \
 	main.o \
 	misc.o \
 	objects.o \
-	output.o \
 	write_collect.o \
 	write_issue.o
 
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 33ea4c20e7e7..ee8d9e3216bd 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -26,8 +26,6 @@ enum netfs_how_to_modify {
 	NETFS_FLUSH_CONTENT,		/* Flush incompatible content. */
 };
 
-static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
-
 static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
 {
 	void *priv = folio_get_private(folio);
@@ -180,7 +178,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	};
 	struct netfs_io_request *wreq = NULL;
 	struct netfs_folio *finfo;
-	struct folio *folio;
+	struct folio *folio, *writethrough = NULL;
 	enum netfs_how_to_modify howto;
 	enum netfs_folio_trace trace;
 	unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
@@ -209,7 +207,6 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 		}
 		if (!is_sync_kiocb(iocb))
 			wreq->iocb = iocb;
-		wreq->cleanup = netfs_cleanup_buffered_write;
 		netfs_stat(&netfs_n_wh_writethrough);
 	} else {
 		netfs_stat(&netfs_n_wh_buffered_write);
@@ -253,6 +250,16 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 		offset = pos & (flen - 1);
 		part = min_t(size_t, flen - offset, part);
 
+		/* Wait for writeback to complete.  The writeback engine owns
+		 * the info in folio->private and may change it until it
+		 * removes the WB mark.
+		 */
+		if (folio_get_private(folio) &&
+		    folio_wait_writeback_killable(folio)) {
+			ret = written ? -EINTR : -ERESTARTSYS;
+			goto error_folio_unlock;
+		}
+
 		if (signal_pending(current)) {
 			ret = written ? -EINTR : -ERESTARTSYS;
 			goto error_folio_unlock;
@@ -327,6 +334,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 				maybe_trouble = true;
 				iov_iter_revert(iter, copied);
 				copied = 0;
+				folio_unlock(folio);
 				goto retry;
 			}
 			netfs_set_group(folio, netfs_group);
@@ -382,23 +390,14 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 
 		if (likely(!wreq)) {
 			folio_mark_dirty(folio);
+			folio_unlock(folio);
 		} else {
-			if (folio_test_dirty(folio))
-				/* Sigh.  mmap. */
-				folio_clear_dirty_for_io(folio);
-			/* We make multiple writes to the folio... */
-			if (!folio_test_writeback(folio)) {
-				folio_start_writeback(folio);
-				if (wreq->iter.count == 0)
-					trace_netfs_folio(folio, netfs_folio_trace_wthru);
-				else
-					trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
-			}
-			netfs_advance_writethrough(wreq, copied,
-						   offset + copied == flen);
+			netfs_advance_writethrough(wreq, &wbc, folio, copied,
+						   offset + copied == flen,
+						   &writethrough);
+			/* Folio unlocked */
 		}
 	retry:
-		folio_unlock(folio);
 		folio_put(folio);
 		folio = NULL;
 
@@ -407,7 +406,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 
 out:
 	if (unlikely(wreq)) {
-		ret2 = netfs_end_writethrough(wreq, iocb);
+		ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
 		wbc_detach_inode(&wbc);
 		if (ret2 == -EIOCBQUEUED)
 			return ret2;
@@ -529,11 +528,13 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
 
 	sb_start_pagefault(inode->i_sb);
 
-	if (folio_wait_writeback_killable(folio))
+	if (folio_lock_killable(folio) < 0)
 		goto out;
 
-	if (folio_lock_killable(folio) < 0)
+	if (folio_wait_writeback_killable(folio)) {
+		ret = VM_FAULT_LOCKED;
 		goto out;
+	}
 
 	/* Can we see a streaming write here? */
 	if (WARN_ON(!folio_test_uptodate(folio))) {
@@ -573,6 +574,7 @@ out:
 }
 EXPORT_SYMBOL(netfs_page_mkwrite);
 
+#if 0 // TODO: Remove
 /*
  * Kill all the pages in the given range
  */
@@ -1199,3 +1201,4 @@ out:
 	return ret;
 }
 EXPORT_SYMBOL(netfs_writepages);
+#endif
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 36b6db504500..608ba6416919 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -34,6 +34,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 	unsigned long long start = iocb->ki_pos;
 	unsigned long long end = start + iov_iter_count(iter);
 	ssize_t ret, n;
+	size_t len = iov_iter_count(iter);
 	bool async = !is_sync_kiocb(iocb);
 
 	_enter("");
@@ -46,13 +47,17 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 
 	_debug("uw %llx-%llx", start, end);
 
-	wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
-				   start, end - start,
-				   iocb->ki_flags & IOCB_DIRECT ?
-				   NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+	wreq = netfs_create_write_req(iocb->ki_filp->f_mapping, iocb->ki_filp, start,
+				      iocb->ki_flags & IOCB_DIRECT ?
+				      NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
 	if (IS_ERR(wreq))
 		return PTR_ERR(wreq);
 
+	wreq->io_streams[0].avail = true;
+	trace_netfs_write(wreq, (iocb->ki_flags & IOCB_DIRECT ?
+				 netfs_write_trace_dio_write :
+				 netfs_write_trace_unbuffered_write));
+
 	{
 		/* If this is an async op and we're not using a bounce buffer,
 		 * we have to save the source buffer as the iterator is only
@@ -63,7 +68,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 		 * request.
 		 */
 		if (async || user_backed_iter(iter)) {
-			n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+			n = netfs_extract_user_iter(iter, len, &wreq->iter, 0);
 			if (n < 0) {
 				ret = n;
 				goto out;
@@ -71,7 +76,6 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 			wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
 			wreq->direct_bv_count = n;
 			wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
-			wreq->len = iov_iter_count(&wreq->iter);
 		} else {
 			wreq->iter = *iter;
 		}
@@ -79,6 +83,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 		wreq->io_iter = wreq->iter;
 	}
 
+	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+
 	/* Copy the data into the bounce buffer and encrypt it. */
 	// TODO
 
@@ -87,10 +93,7 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 	if (async)
 		wreq->iocb = iocb;
 	wreq->cleanup = netfs_cleanup_dio_write;
-	ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
-				iocb->ki_flags & IOCB_DIRECT ?
-				netfs_write_trace_dio_write :
-				netfs_write_trace_unbuffered_write);
+	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), iov_iter_count(&wreq->io_iter));
 	if (ret < 0) {
 		_debug("begin = %zd", ret);
 		goto out;
@@ -100,9 +103,8 @@ static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov
 		trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
 		wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
 			    TASK_UNINTERRUPTIBLE);
-
+		smp_rmb(); /* Read error/transferred after RIP flag */
 		ret = wreq->error;
-		_debug("waited = %zd", ret);
 		if (ret == 0) {
 			ret = wreq->transferred;
 			iocb->ki_pos += ret;
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 5d3f74a70fa7..95e281a8af78 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -92,15 +92,6 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
 	trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), what);
 }
 
-/*
- * output.c
- */
-int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
-		      enum netfs_write_trace what);
-struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
-int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
-
 /*
  * stats.c
  */
@@ -172,12 +163,12 @@ void netfs_reissue_write(struct netfs_io_stream *stream,
 int netfs_advance_write(struct netfs_io_request *wreq,
 			struct netfs_io_stream *stream,
 			loff_t start, size_t len, bool to_eof);
-struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len);
-int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
-				   struct folio *folio, size_t copied, bool to_page_end,
-				   struct folio **writethrough_cache);
-int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
-			       struct folio *writethrough_cache);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			       struct folio *folio, size_t copied, bool to_page_end,
+			       struct folio **writethrough_cache);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			   struct folio *writethrough_cache);
 int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
 
 /*
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index b8c1d3ca724a..f14c08bf605d 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -709,7 +709,7 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
 }
 
 /**
- * new_netfs_write_subrequest_terminated - Note the termination of a write operation.
+ * netfs_write_subrequest_terminated - Note the termination of a write operation.
  * @_op: The I/O request that has terminated.
  * @transferred_or_error: The amount of data transferred or an error code.
  * @was_async: The termination was asynchronous
@@ -731,8 +731,8 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
  * Note that %_op is a void* so that the function can be passed to
  * kiocb::term_func without the need for a casting wrapper.
  */
-void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
-					   bool was_async)
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+				       bool was_async)
 {
 	struct netfs_io_subrequest *subreq = _op;
 	struct netfs_io_request *wreq = subreq->rreq;
@@ -800,4 +800,4 @@ void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_err
 
 	netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
 }
-EXPORT_SYMBOL(new_netfs_write_subrequest_terminated);
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 7ea86e33382c..e190043bc0da 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -494,8 +494,8 @@ static int netfs_write_folio(struct netfs_io_request *wreq,
 /*
  * Write some of the pending data back to the server
  */
-int new_netfs_writepages(struct address_space *mapping,
-			 struct writeback_control *wbc)
+int netfs_writepages(struct address_space *mapping,
+		     struct writeback_control *wbc)
 {
 	struct netfs_inode *ictx = netfs_inode(mapping->host);
 	struct netfs_io_request *wreq = NULL;
@@ -556,12 +556,12 @@ out:
 	_leave(" = %d", error);
 	return error;
 }
-EXPORT_SYMBOL(new_netfs_writepages);
+EXPORT_SYMBOL(netfs_writepages);
 
 /*
  * Begin a write operation for writing through the pagecache.
  */
-struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
 {
 	struct netfs_io_request *wreq = NULL;
 	struct netfs_inode *ictx = netfs_inode(file_inode(iocb->ki_filp));
@@ -586,9 +586,9 @@ struct netfs_io_request *new_netfs_begin_writethrough(struct kiocb *iocb, size_t
  * to the request.  If we've added more than wsize then we need to create a new
  * subrequest.
  */
-int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
-				   struct folio *folio, size_t copied, bool to_page_end,
-				   struct folio **writethrough_cache)
+int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			       struct folio *folio, size_t copied, bool to_page_end,
+			       struct folio **writethrough_cache)
 {
 	_enter("R=%x ic=%zu ws=%u cp=%zu tp=%u",
 	       wreq->debug_id, wreq->iter.count, wreq->wsize, copied, to_page_end);
@@ -618,8 +618,8 @@ int new_netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeba
 /*
  * End a write operation used when writing through the pagecache.
  */
-int new_netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
-			       struct folio *writethrough_cache)
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+			   struct folio *writethrough_cache)
 {
 	struct netfs_inode *ictx = netfs_inode(wreq->inode);
 	int ret;
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 42dba05a428b..c2ba364041b0 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -303,8 +303,6 @@ struct netfs_request_ops {
 	void (*update_i_size)(struct inode *inode, loff_t i_size);
 
 	/* Write request handling */
-	void (*create_write_requests)(struct netfs_io_request *wreq,
-				      loff_t start, size_t len);
 	void (*begin_writeback)(struct netfs_io_request *wreq);
 	void (*prepare_write)(struct netfs_io_subrequest *subreq);
 	void (*issue_write)(struct netfs_io_subrequest *subreq);
@@ -409,8 +407,6 @@ int netfs_write_begin(struct netfs_inode *, struct file *,
 		      struct folio **, void **fsdata);
 int netfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
-int new_netfs_writepages(struct address_space *mapping,
-			struct writeback_control *wbc);
 bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio);
 int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc);
 void netfs_clear_inode_writeback(struct inode *inode, const void *aux);
@@ -431,14 +427,9 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
 				iov_iter_extraction_t extraction_flags);
 size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
 			size_t max_size, size_t max_segs);
-struct netfs_io_subrequest *netfs_create_write_request(
-	struct netfs_io_request *wreq, enum netfs_io_source dest,
-	loff_t start, size_t len, work_func_t worker);
 void netfs_prepare_write_failed(struct netfs_io_subrequest *subreq);
 void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
 				       bool was_async);
-void new_netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
-					   bool was_async);
 void netfs_queue_write_request(struct netfs_io_subrequest *subreq);
 
 int netfs_start_io_read(struct inode *inode);
-- 
cgit v1.2.3


From d41ca44c20c3578154f30b07aa85ed4a951f34ab Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Mon, 18 Mar 2024 20:14:32 +0000
Subject: netfs: Miscellaneous tidy ups

Do a couple of miscellaneous tidy ups:

 (1) Add a qualifier into a file banner comment.

 (2) Put the writeback folio traces back into alphabetical order.

 (3) Remove some unused folio traces.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
---
 fs/netfs/buffered_write.c    | 2 +-
 include/trace/events/netfs.h | 6 +-----
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b21d331eaa1e..825e6632ee4f 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0-only
-/* Network filesystem high-level write support.
+/* Network filesystem high-level buffered write support.
  *
  * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index e7700172ae7e..4ba553a6d71b 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -141,12 +141,9 @@
 	EM(netfs_folio_trace_cancel_copy,	"cancel-copy")	\
 	EM(netfs_folio_trace_clear,		"clear")	\
 	EM(netfs_folio_trace_clear_cc,		"clear-cc")	\
-	EM(netfs_folio_trace_clear_s,		"clear-s")	\
 	EM(netfs_folio_trace_clear_g,		"clear-g")	\
-	EM(netfs_folio_trace_copy,		"copy")		\
-	EM(netfs_folio_trace_copy_plus,		"copy+")	\
+	EM(netfs_folio_trace_clear_s,		"clear-s")	\
 	EM(netfs_folio_trace_copy_to_cache,	"mark-copy")	\
-	EM(netfs_folio_trace_end_copy,		"end-copy")	\
 	EM(netfs_folio_trace_filled_gaps,	"filled-gaps")	\
 	EM(netfs_folio_trace_kill,		"kill")		\
 	EM(netfs_folio_trace_kill_cc,		"kill-cc")	\
@@ -156,7 +153,6 @@
 	EM(netfs_folio_trace_mkwrite_plus,	"mkwrite+")	\
 	EM(netfs_folio_trace_not_under_wback,	"!wback")	\
 	EM(netfs_folio_trace_read_gaps,		"read-gaps")	\
-	EM(netfs_folio_trace_redirty,		"redirty")	\
 	EM(netfs_folio_trace_redirtied,		"redirtied")	\
 	EM(netfs_folio_trace_store,		"store")	\
 	EM(netfs_folio_trace_store_copy,	"store-copy")	\
-- 
cgit v1.2.3


From 1ecb146f7cd82e44277de448d4f736b98741f3cb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 15 Mar 2024 15:15:44 +0000
Subject: netfs, afs: Use writeback retry to deal with alternate keys

Use a hook in the new writeback code's retry algorithm to rotate the keys
once all the outstanding subreqs have failed rather than doing it
separately on each subreq.

Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
---
 fs/afs/file.c            |   1 +
 fs/afs/internal.h        |   1 +
 fs/afs/write.c           | 191 +++++++++++++++++++++++------------------------
 fs/netfs/write_collect.c |   9 ++-
 include/linux/netfs.h    |   2 +
 5 files changed, 104 insertions(+), 100 deletions(-)

(limited to 'include')

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 8f983e3ecae7..c3f0c45ae9a9 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -368,6 +368,7 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
 static void afs_free_request(struct netfs_io_request *rreq)
 {
 	key_put(rreq->netfs_priv);
+	afs_put_wb_key(rreq->netfs_priv2);
 }
 
 static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 887245f9336d..6e1d3c4daf72 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1601,6 +1601,7 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
 void afs_prepare_write(struct netfs_io_subrequest *subreq);
 void afs_issue_write(struct netfs_io_subrequest *subreq);
 void afs_begin_writeback(struct netfs_io_request *wreq);
+void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream);
 extern int afs_writepages(struct address_space *, struct writeback_control *);
 extern int afs_fsync(struct file *, loff_t, loff_t, int);
 extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index b8505a8b622a..e959640694c2 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -29,43 +29,39 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign
 
 /*
  * Find a key to use for the writeback.  We cached the keys used to author the
- * writes on the vnode.  *_wbk will contain the last writeback key used or NULL
- * and we need to start from there if it's set.
+ * writes on the vnode.  wreq->netfs_priv2 will contain the last writeback key
+ * record used or NULL and we need to start from there if it's set.
+ * wreq->netfs_priv will be set to the key itself or NULL.
  */
-static int afs_get_writeback_key(struct afs_vnode *vnode,
-				 struct afs_wb_key **_wbk)
+static void afs_get_writeback_key(struct netfs_io_request *wreq)
 {
-	struct afs_wb_key *wbk = NULL;
-	struct list_head *p;
-	int ret = -ENOKEY, ret2;
+	struct afs_wb_key *wbk, *old = wreq->netfs_priv2;
+	struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
+
+	key_put(wreq->netfs_priv);
+	wreq->netfs_priv = NULL;
+	wreq->netfs_priv2 = NULL;
 
 	spin_lock(&vnode->wb_lock);
-	if (*_wbk)
-		p = (*_wbk)->vnode_link.next;
+	if (old)
+		wbk = list_next_entry(old, vnode_link);
 	else
-		p = vnode->wb_keys.next;
+		wbk = list_first_entry(&vnode->wb_keys, struct afs_wb_key, vnode_link);
 
-	while (p != &vnode->wb_keys) {
-		wbk = list_entry(p, struct afs_wb_key, vnode_link);
+	list_for_each_entry_from(wbk, &vnode->wb_keys, vnode_link) {
 		_debug("wbk %u", key_serial(wbk->key));
-		ret2 = key_validate(wbk->key);
-		if (ret2 == 0) {
+		if (key_validate(wbk->key) == 0) {
 			refcount_inc(&wbk->usage);
+			wreq->netfs_priv = key_get(wbk->key);
+			wreq->netfs_priv2 = wbk;
 			_debug("USE WB KEY %u", key_serial(wbk->key));
 			break;
 		}
-
-		wbk = NULL;
-		if (ret == -ENOKEY)
-			ret = ret2;
-		p = p->next;
 	}
 
 	spin_unlock(&vnode->wb_lock);
-	if (*_wbk)
-		afs_put_wb_key(*_wbk);
-	*_wbk = wbk;
-	return 0;
+
+	afs_put_wb_key(old);
 }
 
 static void afs_store_data_success(struct afs_operation *op)
@@ -88,72 +84,91 @@ static const struct afs_operation_ops afs_store_data_operation = {
 };
 
 /*
- * write to a file
+ * Prepare a subrequest to write to the server.  This sets the max_len
+ * parameter.
  */
-static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos)
+void afs_prepare_write(struct netfs_io_subrequest *subreq)
 {
+	//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
+	//	subreq->max_len = 512 * 1024;
+	//else
+	subreq->max_len = 256 * 1024 * 1024;
+}
+
+/*
+ * Issue a subrequest to write to the server.
+ */
+static void afs_issue_write_worker(struct work_struct *work)
+{
+	struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
+	struct netfs_io_request *wreq = subreq->rreq;
 	struct afs_operation *op;
-	struct afs_wb_key *wbk = NULL;
-	loff_t size = iov_iter_count(iter);
+	struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
+	unsigned long long pos = subreq->start + subreq->transferred;
+	size_t len = subreq->len - subreq->transferred;
 	int ret = -ENOKEY;
 
-	_enter("%s{%llx:%llu.%u},%llx,%llx",
+	_enter("R=%x[%x],%s{%llx:%llu.%u},%llx,%zx",
+	       wreq->debug_id, subreq->debug_index,
 	       vnode->volume->name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
-	       size, pos);
+	       pos, len);
 
-	ret = afs_get_writeback_key(vnode, &wbk);
-	if (ret) {
-		_leave(" = %d [no keys]", ret);
-		return ret;
-	}
+#if 0 // Error injection
+	if (subreq->debug_index == 3)
+		return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
 
-	op = afs_alloc_operation(wbk->key, vnode->volume);
-	if (IS_ERR(op)) {
-		afs_put_wb_key(wbk);
-		return -ENOMEM;
+	if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
+		set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
 	}
+#endif
+
+	op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
+	if (IS_ERR(op))
+		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
 
 	afs_op_set_vnode(op, 0, vnode);
-	op->file[0].dv_delta = 1;
+	op->file[0].dv_delta	= 1;
 	op->file[0].modification = true;
-	op->store.pos = pos;
-	op->store.size = size;
-	op->flags |= AFS_OPERATION_UNINTR;
-	op->ops = &afs_store_data_operation;
+	op->store.pos		= pos;
+	op->store.size		= len;
+	op->flags		|= AFS_OPERATION_UNINTR;
+	op->ops			= &afs_store_data_operation;
 
-try_next_key:
 	afs_begin_vnode_operation(op);
 
-	op->store.write_iter = iter;
-	op->store.i_size = max(pos + size, vnode->netfs.remote_i_size);
-	op->mtime = inode_get_mtime(&vnode->netfs.inode);
+	op->store.write_iter	= &subreq->io_iter;
+	op->store.i_size	= umax(pos + len, vnode->netfs.remote_i_size);
+	op->mtime		= inode_get_mtime(&vnode->netfs.inode);
 
 	afs_wait_for_operation(op);
-
-	switch (afs_op_error(op)) {
+	ret = afs_put_operation(op);
+	switch (ret) {
 	case -EACCES:
 	case -EPERM:
 	case -ENOKEY:
 	case -EKEYEXPIRED:
 	case -EKEYREJECTED:
 	case -EKEYREVOKED:
-		_debug("next");
-
-		ret = afs_get_writeback_key(vnode, &wbk);
-		if (ret == 0) {
-			key_put(op->key);
-			op->key = key_get(wbk->key);
-			goto try_next_key;
-		}
+		/* If there are more keys we can try, use the retry algorithm
+		 * to rotate the keys.
+		 */
+		if (wreq->netfs_priv2)
+			set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
 		break;
 	}
 
-	afs_put_wb_key(wbk);
-	_leave(" = %d", afs_op_error(op));
-	return afs_put_operation(op);
+	netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
+}
+
+void afs_issue_write(struct netfs_io_subrequest *subreq)
+{
+	subreq->work.func = afs_issue_write_worker;
+	if (!queue_work(system_unbound_wq, &subreq->work))
+		WARN_ON_ONCE(1);
 }
 
 /*
@@ -162,52 +177,32 @@ try_next_key:
  */
 void afs_begin_writeback(struct netfs_io_request *wreq)
 {
+	afs_get_writeback_key(wreq);
 	wreq->io_streams[0].avail = true;
 }
 
 /*
- * Prepare a subrequest to write to the server.  This sets the max_len
- * parameter.
- */
-void afs_prepare_write(struct netfs_io_subrequest *subreq)
-{
-	//if (test_bit(NETFS_SREQ_RETRYING, &subreq->flags))
-	//	subreq->max_len = 512 * 1024;
-	//else
-	subreq->max_len = 256 * 1024 * 1024;
-}
-
-/*
- * Issue a subrequest to write to the server.
+ * Prepare to retry the writes in request.  Use this to try rotating the
+ * available writeback keys.
  */
-static void afs_issue_write_worker(struct work_struct *work)
+void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *stream)
 {
-	struct netfs_io_subrequest *subreq = container_of(work, struct netfs_io_subrequest, work);
-	struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
-	ssize_t ret;
-
-	_enter("%x[%x],%zx",
-	       subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
-
-#if 0 // Error injection
-	if (subreq->debug_index == 3)
-		return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
+	struct netfs_io_subrequest *subreq =
+		list_first_entry(&stream->subrequests,
+				 struct netfs_io_subrequest, rreq_link);
 
-	if (!test_bit(NETFS_SREQ_RETRYING, &subreq->flags)) {
-		set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
-		return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+	switch (subreq->error) {
+	case -EACCES:
+	case -EPERM:
+	case -ENOKEY:
+	case -EKEYEXPIRED:
+	case -EKEYREJECTED:
+	case -EKEYREVOKED:
+		afs_get_writeback_key(wreq);
+		if (!wreq->netfs_priv)
+			stream->failed = true;
+		break;
 	}
-#endif
-
-	ret = afs_store_data(vnode, &subreq->io_iter, subreq->start);
-	netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
-}
-
-void afs_issue_write(struct netfs_io_subrequest *subreq)
-{
-	subreq->work.func = afs_issue_write_worker;
-	if (!queue_work(system_unbound_wq, &subreq->work))
-		WARN_ON_ONCE(1);
 }
 
 /*
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index f14c08bf605d..60112e4b2c5e 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -163,6 +163,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
 
 	_enter("R=%x[%x:]", wreq->debug_id, stream->stream_nr);
 
+	if (list_empty(&stream->subrequests))
+		return;
+
+	if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+	    wreq->netfs_ops->retry_request)
+		wreq->netfs_ops->retry_request(wreq, stream);
+
 	if (unlikely(stream->failed))
 		return;
 
@@ -182,8 +189,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
 		return;
 	}
 
-	if (list_empty(&stream->subrequests))
-		return;
 	next = stream->subrequests.next;
 
 	do {
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index c2ba364041b0..298552f5122c 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -235,6 +235,7 @@ struct netfs_io_request {
 	struct iov_iter		iter;		/* Unencrypted-side iterator */
 	struct iov_iter		io_iter;	/* I/O (Encrypted-side) iterator */
 	void			*netfs_priv;	/* Private data for the netfs */
+	void			*netfs_priv2;	/* Private data for the netfs */
 	struct bio_vec		*direct_bv;	/* DIO buffer list (when handling iovec-iter) */
 	unsigned int		direct_bv_count; /* Number of elements in direct_bv[] */
 	unsigned int		debug_id;
@@ -306,6 +307,7 @@ struct netfs_request_ops {
 	void (*begin_writeback)(struct netfs_io_request *wreq);
 	void (*prepare_write)(struct netfs_io_subrequest *subreq);
 	void (*issue_write)(struct netfs_io_subrequest *subreq);
+	void (*retry_request)(struct netfs_io_request *wreq, struct netfs_io_stream *stream);
 	void (*invalidate_cache)(struct netfs_io_request *wreq);
 };
 
-- 
cgit v1.2.3


From 69c3c023af25edb5433a2db824d3e7cc328f0183 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 6 Oct 2023 18:16:15 +0100
Subject: cifs: Implement netfslib hooks

Provide implementation of the netfslib hooks that will be used by netfslib
to ask cifs to set up and perform operations.  Of particular note are

 (*) cifs_clamp_length() - This is used to negotiate the size of the next
     subrequest in a read request, taking into account the credit available
     and the rsize.  The credits are attached to the subrequest.

 (*) cifs_req_issue_read() - This is used to issue a subrequest that has
     been set up and clamped.

 (*) cifs_prepare_write() - This prepares to fill a subrequest by picking a
     channel, reopening the file and requesting credits so that we can set
     the maximum size of the subrequest and also sets the maximum number of
     segments if we're doing RDMA.

 (*) cifs_issue_write() - This releases any unneeded credits and issues an
     asynchronous data write for the contiguous slice of file covered by
     the subrequest.  This should possibly be folded in to all
     ->async_writev() ops and that called directly.

 (*) cifs_begin_writeback() - This gets the cached writable handle through
     which we do writeback (this does not affect writethrough, unbuffered
     or direct writes).

At this point, cifs is not wired up to actually *use* netfslib; that will
be done in a subsequent patch.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Steve French <sfrench@samba.org>
cc: Shyam Prasad N <nspmangalore@gmail.com>
cc: Rohith Surabattula <rohiths.msft@gmail.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: linux-cifs@vger.kernel.org
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: linux-mm@kvack.org
---
 fs/netfs/buffered_write.c    |   6 +
 fs/smb/client/Kconfig        |   1 +
 fs/smb/client/cifsfs.c       |   2 +-
 fs/smb/client/cifsfs.h       |   1 +
 fs/smb/client/cifsglob.h     |  28 ++--
 fs/smb/client/file.c         | 315 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/netfs.h        |   1 +
 include/trace/events/netfs.h |   1 +
 8 files changed, 345 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index 825e6632ee4f..1121601536d1 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -405,6 +405,9 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 	} while (iov_iter_count(iter));
 
 out:
+	if (likely(written) && ctx->ops->post_modify)
+		ctx->ops->post_modify(inode);
+
 	if (unlikely(wreq)) {
 		ret2 = netfs_end_writethrough(wreq, &wbc, writethrough);
 		wbc_detach_inode(&wbc);
@@ -521,6 +524,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
 	struct folio *folio = page_folio(vmf->page);
 	struct file *file = vmf->vma->vm_file;
 	struct inode *inode = file_inode(file);
+	struct netfs_inode *ictx = netfs_inode(inode);
 	vm_fault_t ret = VM_FAULT_RETRY;
 	int err;
 
@@ -567,6 +571,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
 		trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
 	netfs_set_group(folio, netfs_group);
 	file_update_time(file);
+	if (ictx->ops->post_modify)
+		ictx->ops->post_modify(inode);
 	ret = VM_FAULT_LOCKED;
 out:
 	sb_end_pagefault(inode->i_sb);
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index 2927bd174a88..2517dc242386 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -2,6 +2,7 @@
 config CIFS
 	tristate "SMB3 and CIFS support (advanced network filesystem)"
 	depends on INET
+	select NETFS_SUPPORT
 	select NLS
 	select NLS_UCS2_UTILS
 	select CRYPTO
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 763d17870e0b..8f7165567be7 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1758,7 +1758,7 @@ static int cifs_init_netfs(void)
 {
 	cifs_io_request_cachep =
 		kmem_cache_create("cifs_io_request",
-				  sizeof(struct netfs_io_request), 0,
+				  sizeof(struct cifs_io_request), 0,
 				  SLAB_HWCACHE_ALIGN, NULL);
 	if (!cifs_io_request_cachep)
 		goto nomem_req;
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 1acf6bfc06de..922c10d7cfdd 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -84,6 +84,7 @@ extern const struct inode_operations cifs_namespace_inode_operations;
 
 
 /* Functions related to files and directories */
+extern const struct netfs_request_ops cifs_req_ops;
 extern const struct file_operations cifs_file_ops;
 extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
 extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 611f59c6d2c0..4e9033b2f191 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1491,15 +1491,24 @@ struct cifs_aio_ctx {
 	bool			direct_io;
 };
 
+struct cifs_io_request {
+	struct netfs_io_request		rreq;
+	struct cifsFileInfo		*cfile;
+};
+
 /* asynchronous read support */
 struct cifs_io_subrequest {
-	struct netfs_io_subrequest	subreq;
-	struct cifsFileInfo		*cfile;
-	struct address_space		*mapping;
-	struct cifs_aio_ctx		*ctx;
+	union {
+		struct netfs_io_subrequest subreq;
+		struct netfs_io_request *rreq;
+		struct cifs_io_request *req;
+	};
 	ssize_t				got_bytes;
 	pid_t				pid;
+	unsigned int			xid;
 	int				result;
+	bool				have_xid;
+	bool				replay;
 	struct kvec			iov[2];
 	struct TCP_Server_Info		*server;
 #ifdef CONFIG_CIFS_SMB_DIRECT
@@ -1507,15 +1516,16 @@ struct cifs_io_subrequest {
 #endif
 	struct cifs_credits		credits;
 
-	enum writeback_sync_modes	sync_mode;
-	bool				uncached;
-	bool				replay;
-	struct bio_vec			*bv;
-
 	// TODO: Remove following elements
 	struct list_head		list;
 	struct completion		done;
 	struct work_struct		work;
+	struct cifsFileInfo		*cfile;
+	struct address_space		*mapping;
+	struct cifs_aio_ctx		*ctx;
+	enum writeback_sync_modes	sync_mode;
+	bool				uncached;
+	struct bio_vec			*bv;
 };
 
 /*
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 331446fc3d22..c9c33b3a54f3 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -36,6 +36,321 @@
 #include "fs_context.h"
 #include "cifs_ioctl.h"
 #include "cached_dir.h"
+#include <trace/events/netfs.h>
+
+static int cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush);
+
+/*
+ * Prepare a subrequest to upload to the server.  We need to allocate credits
+ * so that we know the maximum amount of data that we can include in it.
+ */
+static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
+{
+	struct cifs_io_subrequest *wdata =
+		container_of(subreq, struct cifs_io_subrequest, subreq);
+	struct cifs_io_request *req = wdata->req;
+	struct TCP_Server_Info *server;
+	struct cifsFileInfo *open_file = req->cfile;
+	size_t wsize = req->rreq.wsize;
+	int rc;
+
+	if (!wdata->have_xid) {
+		wdata->xid = get_xid();
+		wdata->have_xid = true;
+	}
+
+	server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
+	wdata->server = server;
+
+retry:
+	if (open_file->invalidHandle) {
+		rc = cifs_reopen_file(open_file, false);
+		if (rc < 0) {
+			if (rc == -EAGAIN)
+				goto retry;
+			subreq->error = rc;
+			return netfs_prepare_write_failed(subreq);
+		}
+	}
+
+	rc = server->ops->wait_mtu_credits(server, wsize, &wdata->subreq.max_len,
+					   &wdata->credits);
+	if (rc < 0) {
+		subreq->error = rc;
+		return netfs_prepare_write_failed(subreq);
+	}
+
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	if (server->smbd_conn)
+		subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+#endif
+}
+
+/*
+ * Issue a subrequest to upload to the server.
+ */
+static void cifs_issue_write(struct netfs_io_subrequest *subreq)
+{
+	struct cifs_io_subrequest *wdata =
+		container_of(subreq, struct cifs_io_subrequest, subreq);
+	struct cifs_sb_info *sbi = CIFS_SB(subreq->rreq->inode->i_sb);
+	int rc;
+
+	if (cifs_forced_shutdown(sbi)) {
+		rc = -EIO;
+		goto fail;
+	}
+
+	rc = adjust_credits(wdata->server, &wdata->credits, wdata->subreq.len);
+	if (rc)
+		goto fail;
+
+	rc = -EAGAIN;
+	if (wdata->req->cfile->invalidHandle)
+		goto fail;
+
+	wdata->server->ops->async_writev(wdata);
+out:
+	return;
+
+fail:
+	if (rc == -EAGAIN)
+		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+	else
+		trace_netfs_sreq(subreq, netfs_sreq_trace_fail);
+	add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
+	netfs_write_subrequest_terminated(wdata, rc, false);
+	goto out;
+}
+
+/*
+ * Split the read up according to how many credits we can get for each piece.
+ * It's okay to sleep here if we need to wait for more credit to become
+ * available.
+ *
+ * We also choose the server and allocate an operation ID to be cleaned up
+ * later.
+ */
+static bool cifs_clamp_length(struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *rreq = subreq->rreq;
+	struct TCP_Server_Info *server;
+	struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
+	struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+	size_t rsize = 0;
+	int rc;
+
+	rdata->xid = get_xid();
+	rdata->have_xid = true;
+
+	server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
+	rdata->server = server;
+
+	if (cifs_sb->ctx->rsize == 0)
+		cifs_sb->ctx->rsize =
+			server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink),
+						     cifs_sb->ctx);
+
+
+	rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &rsize,
+					   &rdata->credits);
+	if (rc) {
+		subreq->error = rc;
+		return false;
+	}
+
+	subreq->len = min_t(size_t, subreq->len, rsize);
+#ifdef CONFIG_CIFS_SMB_DIRECT
+	if (server->smbd_conn)
+		subreq->max_nr_segs = server->smbd_conn->max_frmr_depth;
+#endif
+	return true;
+}
+
+/*
+ * Issue a read operation on behalf of the netfs helper functions.  We're asked
+ * to make a read of a certain size at a point in the file.  We are permitted
+ * to only read a portion of that, but as long as we read something, the netfs
+ * helper will call us again so that we can issue another read.
+ */
+static void cifs_req_issue_read(struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *rreq = subreq->rreq;
+	struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
+	struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+	pid_t pid;
+	int rc = 0;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = req->cfile->pid;
+	else
+		pid = current->tgid; // Ummm...  This may be a workqueue
+
+	cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
+		 __func__, rreq->debug_id, subreq->debug_index, rreq->mapping,
+		 subreq->transferred, subreq->len);
+
+	if (req->cfile->invalidHandle) {
+		do {
+			rc = cifs_reopen_file(req->cfile, true);
+		} while (rc == -EAGAIN);
+		if (rc)
+			goto out;
+	}
+
+	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+	rdata->pid = pid;
+
+	rc = adjust_credits(rdata->server, &rdata->credits, rdata->subreq.len);
+	if (!rc) {
+		if (rdata->req->cfile->invalidHandle)
+			rc = -EAGAIN;
+		else
+			rc = rdata->server->ops->async_readv(rdata);
+	}
+
+out:
+	if (rc)
+		netfs_subreq_terminated(subreq, rc, false);
+}
+
+/*
+ * Writeback calls this when it finds a folio that needs uploading.  This isn't
+ * called if writeback only has copy-to-cache to deal with.
+ */
+static void cifs_begin_writeback(struct netfs_io_request *wreq)
+{
+	struct cifs_io_request *req = container_of(wreq, struct cifs_io_request, rreq);
+	int ret;
+
+	ret = cifs_get_writable_file(CIFS_I(wreq->inode), FIND_WR_ANY, &req->cfile);
+	if (ret) {
+		cifs_dbg(VFS, "No writable handle in writepages ret=%d\n", ret);
+		return;
+	}
+
+	wreq->io_streams[0].avail = true;
+}
+
+/*
+ * Initialise a request.
+ */
+static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
+{
+	struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
+	struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
+	struct cifsFileInfo *open_file = NULL;
+
+	rreq->rsize = cifs_sb->ctx->rsize;
+	rreq->wsize = cifs_sb->ctx->wsize;
+
+	if (file) {
+		open_file = file->private_data;
+		rreq->netfs_priv = file->private_data;
+		req->cfile = cifsFileInfo_get(open_file);
+	} else if (rreq->origin != NETFS_WRITEBACK) {
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * Expand the size of a readahead to the size of the rsize, if at least as
+ * large as a page, allowing for the possibility that rsize is not pow-2
+ * aligned.
+ */
+static void cifs_expand_readahead(struct netfs_io_request *rreq)
+{
+	unsigned int rsize = rreq->rsize;
+	loff_t misalignment, i_size = i_size_read(rreq->inode);
+
+	if (rsize < PAGE_SIZE)
+		return;
+
+	if (rsize < INT_MAX)
+		rsize = roundup_pow_of_two(rsize);
+	else
+		rsize = ((unsigned int)INT_MAX + 1) / 2;
+
+	misalignment = rreq->start & (rsize - 1);
+	if (misalignment) {
+		rreq->start -= misalignment;
+		rreq->len += misalignment;
+	}
+
+	rreq->len = round_up(rreq->len, rsize);
+	if (rreq->start < i_size && rreq->len > i_size - rreq->start)
+		rreq->len = i_size - rreq->start;
+}
+
+/*
+ * Completion of a request operation.
+ */
+static void cifs_rreq_done(struct netfs_io_request *rreq)
+{
+	struct timespec64 atime, mtime;
+	struct inode *inode = rreq->inode;
+
+	/* we do not want atime to be less than mtime, it broke some apps */
+	atime = inode_set_atime_to_ts(inode, current_time(inode));
+	mtime = inode_get_mtime(inode);
+	if (timespec64_compare(&atime, &mtime))
+		inode_set_atime_to_ts(inode, inode_get_mtime(inode));
+}
+
+static void cifs_post_modify(struct inode *inode)
+{
+	/* Indication to update ctime and mtime as close is deferred */
+	set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags);
+}
+
+static void cifs_free_request(struct netfs_io_request *rreq)
+{
+	struct cifs_io_request *req = container_of(rreq, struct cifs_io_request, rreq);
+
+	if (req->cfile)
+		cifsFileInfo_put(req->cfile);
+}
+
+static void cifs_free_subrequest(struct netfs_io_subrequest *subreq)
+{
+	struct cifs_io_subrequest *rdata =
+		container_of(subreq, struct cifs_io_subrequest, subreq);
+	int rc = subreq->error;
+
+	if (rdata->subreq.source == NETFS_DOWNLOAD_FROM_SERVER) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+		if (rdata->mr) {
+			smbd_deregister_mr(rdata->mr);
+			rdata->mr = NULL;
+		}
+#endif
+	}
+
+	add_credits_and_wake_if(rdata->server, &rdata->credits, 0);
+	if (rdata->have_xid)
+		free_xid(rdata->xid);
+}
+
+const struct netfs_request_ops cifs_req_ops = {
+	.request_pool		= &cifs_io_request_pool,
+	.subrequest_pool	= &cifs_io_subrequest_pool,
+	.init_request		= cifs_init_request,
+	.free_request		= cifs_free_request,
+	.free_subrequest	= cifs_free_subrequest,
+	.expand_readahead	= cifs_expand_readahead,
+	.clamp_length		= cifs_clamp_length,
+	.issue_read		= cifs_req_issue_read,
+	.done			= cifs_rreq_done,
+	.post_modify		= cifs_post_modify,
+	.begin_writeback	= cifs_begin_writeback,
+	.prepare_write		= cifs_prepare_write,
+	.issue_write		= cifs_issue_write,
+};
 
 /*
  * Remove the dirty flags from a span of pages.
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 298552f5122c..f45d06284f2f 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -302,6 +302,7 @@ struct netfs_request_ops {
 
 	/* Modification handling */
 	void (*update_i_size)(struct inode *inode, loff_t i_size);
+	void (*post_modify)(struct inode *inode);
 
 	/* Write request handling */
 	void (*begin_writeback)(struct netfs_io_request *wreq);
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 4ba553a6d71b..da23484268df 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -112,6 +112,7 @@
 #define netfs_sreq_ref_traces					\
 	EM(netfs_sreq_trace_get_copy_to_cache,	"GET COPY2C ")	\
 	EM(netfs_sreq_trace_get_resubmit,	"GET RESUBMIT")	\
+	EM(netfs_sreq_trace_get_submit,		"GET SUBMIT")	\
 	EM(netfs_sreq_trace_get_short_read,	"GET SHORTRD")	\
 	EM(netfs_sreq_trace_new,		"NEW        ")	\
 	EM(netfs_sreq_trace_put_cancel,		"PUT CANCEL ")	\
-- 
cgit v1.2.3


From 74df22453c51392476117d7330bf02cee6e987cf Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 1 May 2024 16:29:48 -0700
Subject: kunit/fortify: Fix replaced failure path to unbreak __alloc_size

The __alloc_size annotation for kmemdup() was getting disabled under
KUnit testing because the replaced fortify_panic macro implementation
was using "return NULL" as a way to survive the sanity checking. But
having the chance to return NULL invalidated __alloc_size, so kmemdup
was not passing the __builtin_dynamic_object_size() tests any more:

[23:26:18] [PASSED] fortify_test_alloc_size_kmalloc_const
[23:26:19]     # fortify_test_alloc_size_kmalloc_dynamic: EXPECTATION FAILED at lib/fortify_kunit.c:265
[23:26:19]     Expected __builtin_dynamic_object_size(p, 1) == expected, but
[23:26:19]         __builtin_dynamic_object_size(p, 1) == -1 (0xffffffffffffffff)
[23:26:19]         expected == 11 (0xb)
[23:26:19] __alloc_size() not working with __bdos on kmemdup("hello there", len, gfp)
[23:26:19] [FAILED] fortify_test_alloc_size_kmalloc_dynamic

Normal builds were not affected: __alloc_size continued to work there.

Use a zero-sized allocation instead, which allows __alloc_size to
behave.

Fixes: 4ce615e798a7 ("fortify: Provide KUnit counters for failure testing")
Fixes: fa4a3f86d498 ("fortify: Add KUnit tests for runtime overflows")
Link: https://lore.kernel.org/r/20240501232937.work.532-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/fortify-string.h | 3 ++-
 lib/fortify_kunit.c            | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index a0bb13825109..85fc0e6f0f7f 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -738,7 +738,8 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp
 	if (__compiletime_lessthan(p_size, size))
 		__read_overflow();
 	if (p_size < size)
-		fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size, NULL);
+		fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, p_size, size,
+			      __real_kmemdup(p, 0, gfp));
 	return __real_kmemdup(p, size, gfp);
 }
 
diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c
index ef3e4c68b759..306522fd0aa2 100644
--- a/lib/fortify_kunit.c
+++ b/lib/fortify_kunit.c
@@ -1002,19 +1002,19 @@ static void fortify_test_kmemdup(struct kunit *test)
 
 	/* Out of bounds by 1 byte. */
 	copy = kmemdup(src, len + 1, GFP_KERNEL);
-	KUNIT_EXPECT_NULL(test, copy);
+	KUNIT_EXPECT_PTR_EQ(test, copy, ZERO_SIZE_PTR);
 	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 1);
 	kfree(copy);
 
 	/* Way out of bounds. */
 	copy = kmemdup(src, len * 2, GFP_KERNEL);
-	KUNIT_EXPECT_NULL(test, copy);
+	KUNIT_EXPECT_PTR_EQ(test, copy, ZERO_SIZE_PTR);
 	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 2);
 	kfree(copy);
 
 	/* Starting offset causing out of bounds. */
 	copy = kmemdup(src + 1, len, GFP_KERNEL);
-	KUNIT_EXPECT_NULL(test, copy);
+	KUNIT_EXPECT_PTR_EQ(test, copy, ZERO_SIZE_PTR);
 	KUNIT_EXPECT_EQ(test, fortify_read_overflows, 3);
 	kfree(copy);
 }
-- 
cgit v1.2.3


From 0840556e5a3a331b6932ef17dd4bc94445df3297 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Mon, 29 Apr 2024 18:58:12 -0700
Subject: net: Protect dev->name by seqlock.

We will convert ioctl(SIOCGARP) to RCU, and then we need to copy
dev->name which is currently protected by rtnl_lock().

This patch does the following:

  1) Add seqlock netdev_rename_lock to protect dev->name

  2) Add netdev_copy_name() that copies dev->name to buffer
     under netdev_rename_lock

  3) Use netdev_copy_name() in netdev_get_name() and drop
     devnet_rename_sem

Suggested-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/netdev/CANn89iJEWs7AYSJqGCUABeVqOCTkErponfZdT5kV-iD=-SajnQ@mail.gmail.com/
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240430015813.71143-7-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 27 +++++++++++++++++++++++----
 2 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f849e7d110ed..41853424b41d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3136,6 +3136,7 @@ struct net_device *netdev_get_by_name(struct net *net, const char *name,
 				      netdevice_tracker *tracker, gfp_t gfp);
 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
 struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+void netdev_copy_name(struct net_device *dev, char *name);
 
 static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 				  unsigned short type,
diff --git a/net/core/dev.c b/net/core/dev.c
index cd7ba50eac15..e02d2363347e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -940,6 +940,18 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)
 }
 EXPORT_SYMBOL(dev_get_by_napi_id);
 
+static DEFINE_SEQLOCK(netdev_rename_lock);
+
+void netdev_copy_name(struct net_device *dev, char *name)
+{
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&netdev_rename_lock);
+		strscpy(name, dev->name, IFNAMSIZ);
+	} while (read_seqretry(&netdev_rename_lock, seq));
+}
+
 /**
  *	netdev_get_name - get a netdevice name, knowing its ifindex.
  *	@net: network namespace
@@ -951,7 +963,6 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
 	struct net_device *dev;
 	int ret;
 
-	down_read(&devnet_rename_sem);
 	rcu_read_lock();
 
 	dev = dev_get_by_index_rcu(net, ifindex);
@@ -960,12 +971,11 @@ int netdev_get_name(struct net *net, char *name, int ifindex)
 		goto out;
 	}
 
-	strcpy(name, dev->name);
+	netdev_copy_name(dev, name);
 
 	ret = 0;
 out:
 	rcu_read_unlock();
-	up_read(&devnet_rename_sem);
 	return ret;
 }
 
@@ -1217,7 +1227,10 @@ int dev_change_name(struct net_device *dev, const char *newname)
 
 	memcpy(oldname, dev->name, IFNAMSIZ);
 
+	write_seqlock(&netdev_rename_lock);
 	err = dev_get_valid_name(net, dev, newname);
+	write_sequnlock(&netdev_rename_lock);
+
 	if (err < 0) {
 		up_write(&devnet_rename_sem);
 		return err;
@@ -1257,7 +1270,9 @@ rollback:
 		if (err >= 0) {
 			err = ret;
 			down_write(&devnet_rename_sem);
+			write_seqlock(&netdev_rename_lock);
 			memcpy(dev->name, oldname, IFNAMSIZ);
+			write_sequnlock(&netdev_rename_lock);
 			memcpy(oldname, newname, IFNAMSIZ);
 			WRITE_ONCE(dev->name_assign_type, old_assign_type);
 			old_assign_type = NET_NAME_RENAMED;
@@ -11403,8 +11418,12 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
 	dev_net_set(dev, net);
 	dev->ifindex = new_ifindex;
 
-	if (new_name[0]) /* Rename the netdev to prepared name */
+	if (new_name[0]) {
+		/* Rename the netdev to prepared name */
+		write_seqlock(&netdev_rename_lock);
 		strscpy(dev->name, new_name, IFNAMSIZ);
+		write_sequnlock(&netdev_rename_lock);
+	}
 
 	/* Fixup kobjects */
 	dev_set_uevent_suppress(&dev->dev, 1);
-- 
cgit v1.2.3


From ad3e1287c6f354b4ecf77aa67810731ea192c3fb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Tue, 30 Apr 2024 09:48:15 -0700
Subject: net: dsa: Remove fixed_link_update member

We have not had a switch driver use a fixed_link_update callback since
58d56fcc3964f9be0a9ca42fd126bcd9dc7afc90 ("net: dsa: bcm_sf2: Get rid of
PHYLIB functions") remove this callback.

Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/20240430164816.2400606-2-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index a6ef7e4c503f..678f1fd8b189 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -25,7 +25,6 @@
 struct dsa_8021q_context;
 struct tc_action;
 struct phy_device;
-struct fixed_phy_status;
 struct phylink_link_state;
 
 #define DSA_TAG_PROTO_NONE_VALUE		0
@@ -873,9 +872,6 @@ struct dsa_switch_ops {
 	 */
 	void	(*adjust_link)(struct dsa_switch *ds, int port,
 				struct phy_device *phydev);
-	void	(*fixed_link_update)(struct dsa_switch *ds, int port,
-				struct fixed_phy_status *st);
-
 	/*
 	 * PHYLINK integration
 	 */
-- 
cgit v1.2.3


From 8a021a863a7dc9f4d3d00c5571b79e0961eeabc8 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian.fainelli@broadcom.com>
Date: Tue, 30 Apr 2024 09:48:16 -0700
Subject: net: dsa: Remove adjust_link paths

Now that we no longer any drivers using PHYLIB's adjust_link callback,
remove all paths that made use of adjust_link as well as the associated
functions.

Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Link: https://lore.kernel.org/r/20240430164816.2400606-3-florian.fainelli@broadcom.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dsa.h |   7 ---
 net/dsa/dsa.c     |   3 +-
 net/dsa/port.c    | 139 +++++-------------------------------------------------
 3 files changed, 12 insertions(+), 137 deletions(-)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 678f1fd8b189..eef702dbea78 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -24,8 +24,6 @@
 
 struct dsa_8021q_context;
 struct tc_action;
-struct phy_device;
-struct phylink_link_state;
 
 #define DSA_TAG_PROTO_NONE_VALUE		0
 #define DSA_TAG_PROTO_BRCM_VALUE		1
@@ -867,11 +865,6 @@ struct dsa_switch_ops {
 	int	(*phy_write)(struct dsa_switch *ds, int port,
 			     int regnum, u16 val);
 
-	/*
-	 * Link state adjustment (called from libphy)
-	 */
-	void	(*adjust_link)(struct dsa_switch *ds, int port,
-				struct phy_device *phydev);
 	/*
 	 * PHYLINK integration
 	 */
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 2f347cd37316..12521a7d4048 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1511,8 +1511,7 @@ static int dsa_switch_probe(struct dsa_switch *ds)
 		    ds->ops->phylink_mac_config ||
 		    ds->ops->phylink_mac_finish ||
 		    ds->ops->phylink_mac_link_down ||
-		    ds->ops->phylink_mac_link_up ||
-		    ds->ops->adjust_link)
+		    ds->ops->phylink_mac_link_up)
 			return -EINVAL;
 	}
 
diff --git a/net/dsa/port.c b/net/dsa/port.c
index c6febc3d96d9..9a249d4ac3a5 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1535,25 +1535,6 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
 	cpu_dp->tag_ops = tag_ops;
 }
 
-static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
-{
-	struct device_node *phy_dn;
-	struct phy_device *phydev;
-
-	phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
-	if (!phy_dn)
-		return NULL;
-
-	phydev = of_phy_find_device(phy_dn);
-	if (!phydev) {
-		of_node_put(phy_dn);
-		return ERR_PTR(-EPROBE_DEFER);
-	}
-
-	of_node_put(phy_dn);
-	return phydev;
-}
-
 static struct phylink_pcs *
 dsa_port_phylink_mac_select_pcs(struct phylink_config *config,
 				phy_interface_t interface)
@@ -1616,17 +1597,10 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
 					   phy_interface_t interface)
 {
 	struct dsa_port *dp = dsa_phylink_to_port(config);
-	struct phy_device *phydev = NULL;
 	struct dsa_switch *ds = dp->ds;
 
-	if (dsa_port_is_user(dp))
-		phydev = dp->user->phydev;
-
-	if (!ds->ops->phylink_mac_link_down) {
-		if (ds->ops->adjust_link && phydev)
-			ds->ops->adjust_link(ds, dp->index, phydev);
+	if (!ds->ops->phylink_mac_link_down)
 		return;
-	}
 
 	ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
 }
@@ -1641,11 +1615,8 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
 	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct dsa_switch *ds = dp->ds;
 
-	if (!ds->ops->phylink_mac_link_up) {
-		if (ds->ops->adjust_link && phydev)
-			ds->ops->adjust_link(ds, dp->index, phydev);
+	if (!ds->ops->phylink_mac_link_up)
 		return;
-	}
 
 	ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev,
 				     speed, duplex, tx_pause, rx_pause);
@@ -1708,78 +1679,6 @@ void dsa_port_phylink_destroy(struct dsa_port *dp)
 	dp->pl = NULL;
 }
 
-static int dsa_shared_port_setup_phy_of(struct dsa_port *dp, bool enable)
-{
-	struct dsa_switch *ds = dp->ds;
-	struct phy_device *phydev;
-	int port = dp->index;
-	int err = 0;
-
-	phydev = dsa_port_get_phy_device(dp);
-	if (!phydev)
-		return 0;
-
-	if (IS_ERR(phydev))
-		return PTR_ERR(phydev);
-
-	if (enable) {
-		err = genphy_resume(phydev);
-		if (err < 0)
-			goto err_put_dev;
-
-		err = genphy_read_status(phydev);
-		if (err < 0)
-			goto err_put_dev;
-	} else {
-		err = genphy_suspend(phydev);
-		if (err < 0)
-			goto err_put_dev;
-	}
-
-	if (ds->ops->adjust_link)
-		ds->ops->adjust_link(ds, port, phydev);
-
-	dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev));
-
-err_put_dev:
-	put_device(&phydev->mdio.dev);
-	return err;
-}
-
-static int dsa_shared_port_fixed_link_register_of(struct dsa_port *dp)
-{
-	struct device_node *dn = dp->dn;
-	struct dsa_switch *ds = dp->ds;
-	struct phy_device *phydev;
-	int port = dp->index;
-	phy_interface_t mode;
-	int err;
-
-	err = of_phy_register_fixed_link(dn);
-	if (err) {
-		dev_err(ds->dev,
-			"failed to register the fixed PHY of port %d\n",
-			port);
-		return err;
-	}
-
-	phydev = of_phy_find_device(dn);
-
-	err = of_get_phy_mode(dn, &mode);
-	if (err)
-		mode = PHY_INTERFACE_MODE_NA;
-	phydev->interface = mode;
-
-	genphy_read_status(phydev);
-
-	if (ds->ops->adjust_link)
-		ds->ops->adjust_link(ds, port, phydev);
-
-	put_device(&phydev->mdio.dev);
-
-	return 0;
-}
-
 static int dsa_shared_port_phylink_register(struct dsa_port *dp)
 {
 	struct dsa_switch *ds = dp->ds;
@@ -1983,44 +1882,28 @@ int dsa_shared_port_link_register_of(struct dsa_port *dp)
 					dsa_switches_apply_workarounds))
 		return -EINVAL;
 
-	if (!ds->ops->adjust_link) {
-		if (missing_link_description) {
-			dev_warn(ds->dev,
-				 "Skipping phylink registration for %s port %d\n",
-				 dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
-		} else {
-			dsa_shared_port_link_down(dp);
+	if (missing_link_description) {
+		dev_warn(ds->dev,
+			 "Skipping phylink registration for %s port %d\n",
+			 dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+	} else {
+		dsa_shared_port_link_down(dp);
 
-			return dsa_shared_port_phylink_register(dp);
-		}
-		return 0;
+		return dsa_shared_port_phylink_register(dp);
 	}
 
-	dev_warn(ds->dev,
-		 "Using legacy PHYLIB callbacks. Please migrate to PHYLINK!\n");
-
-	if (of_phy_is_fixed_link(dp->dn))
-		return dsa_shared_port_fixed_link_register_of(dp);
-	else
-		return dsa_shared_port_setup_phy_of(dp, true);
+	return 0;
 }
 
 void dsa_shared_port_link_unregister_of(struct dsa_port *dp)
 {
-	struct dsa_switch *ds = dp->ds;
-
-	if (!ds->ops->adjust_link && dp->pl) {
+	if (dp->pl) {
 		rtnl_lock();
 		phylink_disconnect_phy(dp->pl);
 		rtnl_unlock();
 		dsa_port_phylink_destroy(dp);
 		return;
 	}
-
-	if (of_phy_is_fixed_link(dp->dn))
-		of_phy_deregister_fixed_link(dp->dn);
-	else
-		dsa_shared_port_setup_phy_of(dp, false);
 }
 
 int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
-- 
cgit v1.2.3


From 9723cab054d59d770cd1927f92fe315c26cc6a78 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Wed, 1 May 2024 19:51:27 +0200
Subject: ASoC: Use inline function for type safety in
 snd_soc_substream_to_rtd()

A common pattern in sound drivers is getting 'struct snd_soc_pcm_runtime'
from 'struct snd_pcm_substream' opaque pointer private_data field with
snd_soc_substream_to_rtd().  However 'private_data' appears in several
other structures as well, including 'struct snd_compr_stream'.  The
field might not hold the same type for every structure, although seems
the case at least for 'struct snd_compr_stream', so code can easily make
a mistake by using macro for wrong structure passed as argument.

Switch from macro to inline function, so such mistake will be build-time
detectable.

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Link: https://lore.kernel.org/r/20240501175127.34301-1-krzysztof.kozlowski@linaro.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/sound/soc.h b/include/sound/soc.h
index 8aa6ddb7ee1e..33671437ee89 100644
--- a/include/sound/soc.h
+++ b/include/sound/soc.h
@@ -1215,8 +1215,12 @@ struct snd_soc_pcm_runtime {
 /* see soc_new_pcm_runtime()  */
 #define snd_soc_rtd_to_cpu(rtd, n)   (rtd)->dais[n]
 #define snd_soc_rtd_to_codec(rtd, n) (rtd)->dais[n + (rtd)->dai_link->num_cpus]
-#define snd_soc_substream_to_rtd(substream) \
-	(struct snd_soc_pcm_runtime *)snd_pcm_substream_chip(substream)
+
+static inline struct snd_soc_pcm_runtime *
+snd_soc_substream_to_rtd(const struct snd_pcm_substream *substream)
+{
+	return snd_pcm_substream_chip(substream);
+}
 
 #define for_each_rtd_components(rtd, i, component)			\
 	for ((i) = 0, component = NULL;					\
-- 
cgit v1.2.3


From 5ef31ea5d053a8f493a772ebad3f3ce82c35d845 Mon Sep 17 00:00:00 2001
From: Richard Gobert <richardbgobert@gmail.com>
Date: Tue, 30 Apr 2024 16:35:54 +0200
Subject: net: gro: fix udp bad offset in socket lookup by adding
 {inner_}network_offset to napi_gro_cb

Commits a602456 ("udp: Add GRO functions to UDP socket") and 57c67ff ("udp:
additional GRO support") introduce incorrect usage of {ip,ipv6}_hdr in the
complete phase of gro. The functions always return skb->network_header,
which in the case of encapsulated packets at the gro complete phase, is
always set to the innermost L3 of the packet. That means that calling
{ip,ipv6}_hdr for skbs which completed the GRO receive phase (both in
gro_list and *_gro_complete) when parsing an encapsulated packet's _outer_
L3/L4 may return an unexpected value.

This incorrect usage leads to a bug in GRO's UDP socket lookup.
udp{4,6}_lib_lookup_skb functions use ip_hdr/ipv6_hdr respectively. These
*_hdr functions return network_header which will point to the innermost L3,
resulting in the wrong offset being used in __udp{4,6}_lib_lookup with
encapsulated packets.

This patch adds network_offset and inner_network_offset to napi_gro_cb, and
makes sure both are set correctly.

To fix the issue, network_offsets union is used inside napi_gro_cb, in
which both the outer and the inner network offsets are saved.

Reproduction example:

Endpoint configuration example (fou + local address bind)

    # ip fou add port 6666 ipproto 4
    # ip link add name tun1 type ipip remote 2.2.2.1 local 2.2.2.2 encap fou encap-dport 5555 encap-sport 6666 mode ipip
    # ip link set tun1 up
    # ip a add 1.1.1.2/24 dev tun1

Netperf TCP_STREAM result on net-next before patch is applied:

net-next main, GRO enabled:
    $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
    Recv   Send    Send
    Socket Socket  Message  Elapsed
    Size   Size    Size     Time     Throughput
    bytes  bytes   bytes    secs.    10^6bits/sec

    131072  16384  16384    5.28        2.37

net-next main, GRO disabled:
    $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
    Recv   Send    Send
    Socket Socket  Message  Elapsed
    Size   Size    Size     Time     Throughput
    bytes  bytes   bytes    secs.    10^6bits/sec

    131072  16384  16384    5.01     2745.06

patch applied, GRO enabled:
    $ netperf -H 1.1.1.2 -t TCP_STREAM -l 5
    Recv   Send    Send
    Socket Socket  Message  Elapsed
    Size   Size    Size     Time     Throughput
    bytes  bytes   bytes    secs.    10^6bits/sec

    131072  16384  16384    5.01     2877.38

Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket")
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/gro.h      | 9 +++++++++
 net/8021q/vlan_core.c  | 2 ++
 net/core/gro.c         | 1 +
 net/ipv4/af_inet.c     | 1 +
 net/ipv4/udp.c         | 3 ++-
 net/ipv4/udp_offload.c | 3 ++-
 net/ipv6/ip6_offload.c | 1 +
 net/ipv6/udp.c         | 3 ++-
 net/ipv6/udp_offload.c | 3 ++-
 9 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index 50f1e403dbbb..c1d4ca0463a1 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -87,6 +87,15 @@ struct napi_gro_cb {
 
 	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
 	__wsum	csum;
+
+	/* L3 offsets */
+	union {
+		struct {
+			u16 network_offset;
+			u16 inner_network_offset;
+		};
+		u16 network_offsets[2];
+	};
 };
 
 #define NAPI_GRO_CB(skb) ((struct napi_gro_cb *)(skb)->cb)
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index f00158234505..9404dd551dfd 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -478,6 +478,8 @@ static struct sk_buff *vlan_gro_receive(struct list_head *head,
 	if (unlikely(!vhdr))
 		goto out;
 
+	NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;
+
 	type = vhdr->h_vlan_encapsulated_proto;
 
 	ptype = gro_find_receive_by_type(type);
diff --git a/net/core/gro.c b/net/core/gro.c
index 83f35d99a682..c7901253a1a8 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -371,6 +371,7 @@ static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
 	const skb_frag_t *frag0;
 	unsigned int headlen;
 
+	NAPI_GRO_CB(skb)->network_offset = 0;
 	NAPI_GRO_CB(skb)->data_offset = 0;
 	headlen = skb_headlen(skb);
 	NAPI_GRO_CB(skb)->frag0 = skb->data;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 55bd72997b31..fafb123f798b 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1572,6 +1572,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 	/* The above will be needed by the transport layer if there is one
 	 * immediately following this IP hdr.
 	 */
+	NAPI_GRO_CB(skb)->inner_network_offset = off;
 
 	/* Note : No need to call skb_gro_postpull_rcsum() here,
 	 * as we already checked checksum over ipv4 header was 0
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 420905be5f30..b32cf2eeeb41 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -532,7 +532,8 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
 struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
 				 __be16 sport, __be16 dport)
 {
-	const struct iphdr *iph = ip_hdr(skb);
+	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+	const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
 	struct net *net = dev_net(skb->dev);
 	int iif, sdif;
 
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 3498dd1d0694..fd29d21d579c 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -718,7 +718,8 @@ EXPORT_SYMBOL(udp_gro_complete);
 
 INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	const struct iphdr *iph = ip_hdr(skb);
+	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+	const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
 	/* do fraglist only if there is no outer UDP encap (or we already processed it) */
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index b41e35af69ea..c8b909a9904f 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -237,6 +237,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
 		goto out;
 
 	skb_set_network_header(skb, off);
+	NAPI_GRO_CB(skb)->inner_network_offset = off;
 
 	flush += ntohs(iph->payload_len) != skb->len - hlen;
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 1a4cccdd40c9..8f7aa8bac1e7 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -272,7 +272,8 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
 struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
 				 __be16 sport, __be16 dport)
 {
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+	const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
 	struct net *net = dev_net(skb->dev);
 	int iif, sdif;
 
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index bbd347de00b4..b41152dd4246 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -164,7 +164,8 @@ flush:
 
 INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+	const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset);
 	struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
 
 	/* do fraglist only if there is no outer UDP encap (or we already processed it) */
-- 
cgit v1.2.3


From 951bcae6c5a0bfaa55b27c5f16178204988f0379 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 15 Apr 2024 18:20:43 +0200
Subject: kallsyms: Avoid weak references for kallsyms symbols

kallsyms is a directory of all the symbols in the vmlinux binary, and so
creating it is somewhat of a chicken-and-egg problem, as its non-zero
size affects the layout of the binary, and therefore the values of the
symbols.

For this reason, the kernel is linked more than once, and the first pass
does not include any kallsyms data at all. For the linker to accept
this, the symbol declarations describing the kallsyms metadata are
emitted as having weak linkage, so they can remain unsatisfied. During
the subsequent passes, the weak references are satisfied by the kallsyms
metadata that was constructed based on information gathered from the
preceding passes.

Weak references lead to somewhat worse codegen, because taking their
address may need to produce NULL (if the reference was unsatisfied), and
this is not usually supported by RIP or PC relative symbol references.

Given that these references are ultimately always satisfied in the final
link, let's drop the weak annotation, and instead, provide fallback
definitions in the linker script that are only emitted if an unsatisfied
reference exists.

While at it, drop the FRV specific annotation that these symbols reside
in .rodata - FRV is long gone.

Tested-by: Nick Desaulniers <ndesaulniers@google.com> # Boot
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lkml.kernel.org/r/20230504174320.3930345-1-ardb%40kernel.org
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 19 +++++++++++++++++++
 kernel/kallsyms.c                 |  6 ------
 kernel/kallsyms_internal.h        | 30 ++++++++++++------------------
 3 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f7749d0f2562..e8449be62058 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -448,11 +448,30 @@
 #endif
 #endif
 
+/*
+ * Some symbol definitions will not exist yet during the first pass of the
+ * link, but are guaranteed to exist in the final link. Provide preliminary
+ * definitions that will be superseded in the final link to avoid having to
+ * rely on weak external linkage, which requires a GOT when used in position
+ * independent code.
+ */
+#define PRELIMINARY_SYMBOL_DEFINITIONS					\
+	PROVIDE(kallsyms_addresses = .);				\
+	PROVIDE(kallsyms_offsets = .);					\
+	PROVIDE(kallsyms_names = .);					\
+	PROVIDE(kallsyms_num_syms = .);					\
+	PROVIDE(kallsyms_relative_base = .);				\
+	PROVIDE(kallsyms_token_table = .);				\
+	PROVIDE(kallsyms_token_index = .);				\
+	PROVIDE(kallsyms_markers = .);					\
+	PROVIDE(kallsyms_seqs_of_names = .);
+
 /*
  * Read only Data
  */
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
+	PRELIMINARY_SYMBOL_DEFINITIONS					\
 	.rodata           : AT(ADDR(.rodata) - LOAD_OFFSET) {		\
 		__start_rodata = .;					\
 		*(.rodata) *(.rodata.*)					\
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 18edd57b5fe8..22ea19a36e6e 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -325,12 +325,6 @@ static unsigned long get_symbol_pos(unsigned long addr,
 	unsigned long symbol_start = 0, symbol_end = 0;
 	unsigned long i, low, high, mid;
 
-	/* This kernel should never had been booted. */
-	if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
-		BUG_ON(!kallsyms_addresses);
-	else
-		BUG_ON(!kallsyms_offsets);
-
 	/* Do a binary search on the sorted kallsyms_addresses array. */
 	low = 0;
 	high = kallsyms_num_syms;
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 27fabdcc40f5..85480274fc8f 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -5,27 +5,21 @@
 #include <linux/types.h>
 
 /*
- * These will be re-linked against their real values
- * during the second link stage.
+ * These will be re-linked against their real values during the second link
+ * stage. Preliminary values must be provided in the linker script using the
+ * PROVIDE() directive so that the first link stage can complete successfully.
  */
-extern const unsigned long kallsyms_addresses[] __weak;
-extern const int kallsyms_offsets[] __weak;
-extern const u8 kallsyms_names[] __weak;
+extern const unsigned long kallsyms_addresses[];
+extern const int kallsyms_offsets[];
+extern const u8 kallsyms_names[];
 
-/*
- * Tell the compiler that the count isn't in the small data section if the arch
- * has one (eg: FRV).
- */
-extern const unsigned int kallsyms_num_syms
-__section(".rodata") __attribute__((weak));
-
-extern const unsigned long kallsyms_relative_base
-__section(".rodata") __attribute__((weak));
+extern const unsigned int kallsyms_num_syms;
+extern const unsigned long kallsyms_relative_base;
 
-extern const char kallsyms_token_table[] __weak;
-extern const u16 kallsyms_token_index[] __weak;
+extern const char kallsyms_token_table[];
+extern const u16 kallsyms_token_index[];
 
-extern const unsigned int kallsyms_markers[] __weak;
-extern const u8 kallsyms_seqs_of_names[] __weak;
+extern const unsigned int kallsyms_markers[];
+extern const u8 kallsyms_seqs_of_names[];
 
 #endif // LINUX_KALLSYMS_INTERNAL_H_
-- 
cgit v1.2.3


From 45751097aeb386eb239f6a8ed0ccfd7dabce068e Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 22 Apr 2024 12:24:06 +0200
Subject: seq_file: Optimize seq_puts()

Most of seq_puts() usages are done with a string literal. In such cases,
the length of the string car be computed at compile time in order to save
a strlen() call at run-time. seq_putc() or seq_write() can then be used
instead.

This saves a few cycles.

To have an estimation of how often this optimization triggers:
   $ git grep seq_puts.*\" | wc -l
   3436

   $ git grep seq_puts.*\".\" | wc -l
   84

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/a8589bffe4830dafcb9111e22acf06603fea7132.1713781332.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Christian Brauner <brauner@kernel.org>
The output for seq_putc() generation has also be checked and works.
---
 fs/seq_file.c            |  4 ++--
 include/linux/seq_file.h | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/seq_file.c b/fs/seq_file.c
index f5fdaf3b1572..8ef0a07033ca 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -669,7 +669,7 @@ void seq_putc(struct seq_file *m, char c)
 }
 EXPORT_SYMBOL(seq_putc);
 
-void seq_puts(struct seq_file *m, const char *s)
+void __seq_puts(struct seq_file *m, const char *s)
 {
 	int len = strlen(s);
 
@@ -680,7 +680,7 @@ void seq_puts(struct seq_file *m, const char *s)
 	memcpy(m->buf + m->count, s, len);
 	m->count += len;
 }
-EXPORT_SYMBOL(seq_puts);
+EXPORT_SYMBOL(__seq_puts);
 
 /**
  * seq_put_decimal_ull_width - A helper routine for putting decimal numbers
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index 234bcdb1fba4..8bd4fda6e027 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -118,7 +118,18 @@ void seq_vprintf(struct seq_file *m, const char *fmt, va_list args);
 __printf(2, 3)
 void seq_printf(struct seq_file *m, const char *fmt, ...);
 void seq_putc(struct seq_file *m, char c);
-void seq_puts(struct seq_file *m, const char *s);
+void __seq_puts(struct seq_file *m, const char *s);
+
+static __always_inline void seq_puts(struct seq_file *m, const char *s)
+{
+	if (!__builtin_constant_p(*s))
+		__seq_puts(m, s);
+	else if (s[0] && !s[1])
+		seq_putc(m, s[0]);
+	else
+		seq_write(m, s, __builtin_strlen(s));
+}
+
 void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter,
 			       unsigned long long num, unsigned int width);
 void seq_put_decimal_ull(struct seq_file *m, const char *delimiter,
-- 
cgit v1.2.3


From 82e9c84d8712e8d29278a37aaa9aa767d50a16da Mon Sep 17 00:00:00 2001
From: Venkatesh Srinivas <venkateshs@chromium.org>
Date: Thu, 4 Apr 2024 23:26:51 +0000
Subject: KVM: Remove kvm_make_all_cpus_request_except()

Remove kvm_make_all_cpus_request_except() as it effectively has no users,
and arguably should never have been added in the first place.

Commit 54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
added the "except" variation for use in SVM's AVIC update path, which used
it to skip sending a request to the current vCPU (commit 7d611233b016
("KVM: SVM: Disable AVIC before setting V_IRQ")).

But the AVIC usage of kvm_make_all_cpus_request_except() was essentially a
hack-a-fix that simply squashed the most likely scenario of a racy WARN
without addressing the underlying problem(s).  Commit f1577ab21442 ("KVM:
SVM: svm_set_vintr don't warn if AVIC is active but is about to be
deactivated") eventually fixed the WARN itself, and the "except" usage was
subsequently dropped by df63202fe52b ("KVM: x86: APICv: drop immediate
APICv disablement on current vCPU").

That kvm_make_all_cpus_request_except() hasn't gained any users in the
last ~3 years isn't a coincidence.  If a VM-wide broadcast *needs* to skip
the current vCPU, then odds are very good that there is underlying bug
that could be better fixed elsewhere.

Signed-off-by: Venkatesh Srinivas <venkateshs@chromium.org>
Link: https://lore.kernel.org/r/20240404232651.1645176-1-venkateshs@chromium.org
[sean: rewrite changelog with --verbose]
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/kvm_host.h |  2 --
 virt/kvm/kvm_main.c      | 13 ++-----------
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 48f31dcd318a..e1555d75a203 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -193,8 +193,6 @@ static inline bool is_error_page(struct page *page)
 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 				 unsigned long *vcpu_bitmap);
 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);
-bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
-				      struct kvm_vcpu *except);
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3573c8cd9e11..bd04aafcb3c4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -311,8 +311,7 @@ bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
 	return called;
 }
 
-bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
-				      struct kvm_vcpu *except)
+bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
 	struct kvm_vcpu *vcpu;
 	struct cpumask *cpus;
@@ -325,22 +324,14 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
 	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
 	cpumask_clear(cpus);
 
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (vcpu == except)
-			continue;
+	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_make_vcpu_request(vcpu, req, cpus, me);
-	}
 
 	called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
 	put_cpu();
 
 	return called;
 }
-
-bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
-{
-	return kvm_make_all_cpus_request_except(kvm, req, NULL);
-}
 EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
-- 
cgit v1.2.3


From 7d78a77733552092361239b1d8afaf8412f5dffd Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 1 May 2024 16:32:02 -0700
Subject: string: Add additional __realloc_size() annotations for "dup" helpers

Several other "dup"-style interfaces could use the __realloc_size()
attribute. (As a reminder to myself and others: "realloc" is used here
instead of "alloc" because the "alloc_size" attribute implies that the
memory contents are uninitialized. Since we're copying contents into the
resulting allocation, it must use "realloc_size" to avoid confusing the
compiler's optimization passes.)

Add KUnit test coverage where possible. (KUnit still does not have the
ability to manipulate userspace memory.)

Reviewed-by: Andy Shevchenko <andy@kernel.org>
Link: https://lore.kernel.org/r/20240502145218.it.729-kees@kernel.org
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/string.h | 13 ++++++++-----
 lib/fortify_kunit.c    | 26 ++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 86aa6cd35167..10e5177bb49c 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -14,8 +14,8 @@
 #include <uapi/linux/string.h>
 
 extern char *strndup_user(const char __user *, long);
-extern void *memdup_user(const void __user *, size_t);
-extern void *vmemdup_user(const void __user *, size_t);
+extern void *memdup_user(const void __user *, size_t) __realloc_size(2);
+extern void *vmemdup_user(const void __user *, size_t) __realloc_size(2);
 extern void *memdup_user_nul(const void __user *, size_t);
 
 /**
@@ -27,7 +27,8 @@ extern void *memdup_user_nul(const void __user *, size_t);
  * Return: an ERR_PTR() on failure. Result is physically
  * contiguous, to be freed by kfree().
  */
-static inline void *memdup_array_user(const void __user *src, size_t n, size_t size)
+static inline __realloc_size(2, 3)
+void *memdup_array_user(const void __user *src, size_t n, size_t size)
 {
 	size_t nbytes;
 
@@ -46,7 +47,8 @@ static inline void *memdup_array_user(const void __user *src, size_t n, size_t s
  * Return: an ERR_PTR() on failure. Result may be not
  * physically contiguous. Use kvfree() to free.
  */
-static inline void *vmemdup_array_user(const void __user *src, size_t n, size_t size)
+static inline __realloc_size(2, 3)
+void *vmemdup_array_user(const void __user *src, size_t n, size_t size)
 {
 	size_t nbytes;
 
@@ -285,7 +287,8 @@ extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
 extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2);
 extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp);
-extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp);
+extern void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp)
+		__realloc_size(2, 3);
 
 /* lib/argv_split.c */
 extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c
index 306522fd0aa2..d2377e00caab 100644
--- a/lib/fortify_kunit.c
+++ b/lib/fortify_kunit.c
@@ -363,6 +363,31 @@ DEFINE_ALLOC_SIZE_TEST_PAIR(kvmalloc)
 } while (0)
 DEFINE_ALLOC_SIZE_TEST_PAIR(devm_kmalloc)
 
+static const char * const test_strs[] = {
+	"",
+	"Hello there",
+	"A longer string, just for variety",
+};
+
+#define TEST_realloc(checker)	do {					\
+	gfp_t gfp = GFP_KERNEL;						\
+	size_t len;							\
+	int i;								\
+									\
+	for (i = 0; i < ARRAY_SIZE(test_strs); i++) {			\
+		len = strlen(test_strs[i]);				\
+		KUNIT_EXPECT_EQ(test, __builtin_constant_p(len), 0);	\
+		checker(len, kmemdup_array(test_strs[i], len, 1, gfp),	\
+			kfree(p));					\
+		checker(len, kmemdup(test_strs[i], len, gfp),		\
+			kfree(p));					\
+	}								\
+} while (0)
+static void fortify_test_realloc_size(struct kunit *test)
+{
+	TEST_realloc(check_dynamic);
+}
+
 /*
  * We can't have an array at the end of a structure or else
  * builds without -fstrict-flex-arrays=3 will report them as
@@ -1046,6 +1071,7 @@ static struct kunit_case fortify_test_cases[] = {
 	KUNIT_CASE(fortify_test_alloc_size_kvmalloc_dynamic),
 	KUNIT_CASE(fortify_test_alloc_size_devm_kmalloc_const),
 	KUNIT_CASE(fortify_test_alloc_size_devm_kmalloc_dynamic),
+	KUNIT_CASE(fortify_test_realloc_size),
 	KUNIT_CASE(fortify_test_strlen),
 	KUNIT_CASE(fortify_test_strnlen),
 	KUNIT_CASE(fortify_test_strcpy),
-- 
cgit v1.2.3


From 47c82aac10a6954d68f29f10d9758d016e8e5af1 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Date: Thu, 22 Feb 2024 16:17:33 +0000
Subject: media: cec: core: avoid recursive cec_claim_log_addrs

Keep track if cec_claim_log_addrs() is running, and return -EBUSY
if it is when calling CEC_ADAP_S_LOG_ADDRS.

This prevents a case where cec_claim_log_addrs() could be called
while it was still in progress.

Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Reported-by: Yang, Chenyuan <cy54@illinois.edu>
Closes: https://lore.kernel.org/linux-media/PH7PR11MB57688E64ADE4FE82E658D86DA09EA@PH7PR11MB5768.namprd11.prod.outlook.com/
Fixes: ca684386e6e2 ("[media] cec: add HDMI CEC framework (api)")
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---
 drivers/media/cec/core/cec-adap.c | 6 +++++-
 drivers/media/cec/core/cec-api.c  | 2 +-
 include/media/cec.h               | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/media/cec/core/cec-adap.c b/drivers/media/cec/core/cec-adap.c
index 6fc7de744ee9..a493cbce2456 100644
--- a/drivers/media/cec/core/cec-adap.c
+++ b/drivers/media/cec/core/cec-adap.c
@@ -1574,9 +1574,12 @@ unconfigure:
  */
 static void cec_claim_log_addrs(struct cec_adapter *adap, bool block)
 {
-	if (WARN_ON(adap->is_configuring || adap->is_configured))
+	if (WARN_ON(adap->is_claiming_log_addrs ||
+		    adap->is_configuring || adap->is_configured))
 		return;
 
+	adap->is_claiming_log_addrs = true;
+
 	init_completion(&adap->config_completion);
 
 	/* Ready to kick off the thread */
@@ -1591,6 +1594,7 @@ static void cec_claim_log_addrs(struct cec_adapter *adap, bool block)
 		wait_for_completion(&adap->config_completion);
 		mutex_lock(&adap->lock);
 	}
+	adap->is_claiming_log_addrs = false;
 }
 
 /*
diff --git a/drivers/media/cec/core/cec-api.c b/drivers/media/cec/core/cec-api.c
index d64bb716f9c6..3ef915344304 100644
--- a/drivers/media/cec/core/cec-api.c
+++ b/drivers/media/cec/core/cec-api.c
@@ -178,7 +178,7 @@ static long cec_adap_s_log_addrs(struct cec_adapter *adap, struct cec_fh *fh,
 			   CEC_LOG_ADDRS_FL_ALLOW_RC_PASSTHRU |
 			   CEC_LOG_ADDRS_FL_CDC_ONLY;
 	mutex_lock(&adap->lock);
-	if (!adap->is_configuring &&
+	if (!adap->is_claiming_log_addrs && !adap->is_configuring &&
 	    (!log_addrs.num_log_addrs || !adap->is_configured) &&
 	    !cec_is_busy(adap, fh)) {
 		err = __cec_s_log_addrs(adap, &log_addrs, block);
diff --git a/include/media/cec.h b/include/media/cec.h
index c8350258f10a..d3abe84c9c9e 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -258,6 +258,7 @@ struct cec_adapter {
 	u16 phys_addr;
 	bool needs_hpd;
 	bool is_enabled;
+	bool is_claiming_log_addrs;
 	bool is_configuring;
 	bool must_reconfigure;
 	bool is_configured;
-- 
cgit v1.2.3


From 9a87907de3597a339cc129229d1a20bc7365ea5f Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Thu, 2 May 2024 20:35:57 +0200
Subject: ovl: implement tmpfile

Combine inode creation with opening a file.

There are six separate objects that are being set up: the backing inode,
dentry and file, and the overlay inode, dentry and file.  Cleanup in case
of an error is a bit of a challenge and is difficult to test, so careful
review is needed.

All tmpfile testcases except generic/509 now run/pass, and no regressions
are observed with full xfstests.

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/backing-file.c            |  23 +++++++
 fs/internal.h                |   3 +
 fs/namei.c                   |   6 +-
 fs/overlayfs/dir.c           | 149 +++++++++++++++++++++++++++++++++++++------
 fs/overlayfs/file.c          |   3 -
 fs/overlayfs/overlayfs.h     |   3 +
 include/linux/backing-file.h |   3 +
 7 files changed, 165 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/fs/backing-file.c b/fs/backing-file.c
index 740185198db3..afb557446c27 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -52,6 +52,29 @@ struct file *backing_file_open(const struct path *user_path, int flags,
 }
 EXPORT_SYMBOL_GPL(backing_file_open);
 
+struct file *backing_tmpfile_open(const struct path *user_path, int flags,
+				  const struct path *real_parentpath,
+				  umode_t mode, const struct cred *cred)
+{
+	struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt);
+	struct file *f;
+	int error;
+
+	f = alloc_empty_backing_file(flags, cred);
+	if (IS_ERR(f))
+		return f;
+
+	path_get(user_path);
+	*backing_file_user_path(f) = *user_path;
+	error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
+	if (error) {
+		fput(f);
+		f = ERR_PTR(error);
+	}
+	return f;
+}
+EXPORT_SYMBOL(backing_tmpfile_open);
+
 struct backing_aio {
 	struct kiocb iocb;
 	refcount_t ref;
diff --git a/fs/internal.h b/fs/internal.h
index 7ca738904e34..ab2225136f60 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -62,6 +62,9 @@ int do_mkdirat(int dfd, struct filename *name, umode_t mode);
 int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
 int do_linkat(int olddfd, struct filename *old, int newdfd,
 			struct filename *new, int flags);
+int vfs_tmpfile(struct mnt_idmap *idmap,
+		const struct path *parentpath,
+		struct file *file, umode_t mode);
 
 /*
  * namespace.c
diff --git a/fs/namei.c b/fs/namei.c
index c5b2a25be7d0..13e50b0a49d2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3668,9 +3668,9 @@ static int do_open(struct nameidata *nd,
  * On non-idmapped mounts or if permission checking is to be performed on the
  * raw inode simply pass @nop_mnt_idmap.
  */
-static int vfs_tmpfile(struct mnt_idmap *idmap,
-		       const struct path *parentpath,
-		       struct file *file, umode_t mode)
+int vfs_tmpfile(struct mnt_idmap *idmap,
+		const struct path *parentpath,
+		struct file *file, umode_t mode)
 {
 	struct dentry *child;
 	struct inode *dir = d_inode(parentpath->dentry);
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 0f8b4a719237..c614e923b143 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -14,6 +14,7 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/atomic.h>
 #include <linux/ratelimit.h>
+#include <linux/backing-file.h>
 #include "overlayfs.h"
 
 static unsigned short ovl_redirect_max = 256;
@@ -260,14 +261,13 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry)
  * may not use to instantiate the new dentry.
  */
 static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
-			   struct dentry *newdentry, bool hardlink)
+			   struct dentry *newdentry, bool hardlink, struct file *tmpfile)
 {
 	struct ovl_inode_params oip = {
 		.upperdentry = newdentry,
 		.newinode = inode,
 	};
 
-	ovl_dir_modified(dentry->d_parent, false);
 	ovl_dentry_set_upper_alias(dentry);
 	ovl_dentry_init_reval(dentry, newdentry, NULL);
 
@@ -295,6 +295,9 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
 		inc_nlink(inode);
 	}
 
+	if (tmpfile)
+		d_mark_tmpfile(tmpfile, inode);
+
 	d_instantiate(dentry, inode);
 	if (inode != oip.newinode) {
 		pr_warn_ratelimited("newly created inode found in cache (%pd2)\n",
@@ -345,7 +348,8 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 		ovl_set_opaque(dentry, newdentry);
 	}
 
-	err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink);
+	ovl_dir_modified(dentry->d_parent, false);
+	err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL);
 	if (err)
 		goto out_cleanup;
 out_unlock:
@@ -529,7 +533,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
 		if (err)
 			goto out_cleanup;
 	}
-	err = ovl_instantiate(dentry, inode, newdentry, hardlink);
+	ovl_dir_modified(dentry->d_parent, false);
+	err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL);
 	if (err) {
 		ovl_cleanup(ofs, udir, newdentry);
 		dput(newdentry);
@@ -551,12 +556,35 @@ out_cleanup:
 	goto out_dput;
 }
 
+static int ovl_setup_cred_for_create(struct dentry *dentry, struct inode *inode,
+				     umode_t mode, const struct cred *old_cred)
+{
+	int err;
+	struct cred *override_cred;
+
+	override_cred = prepare_creds();
+	if (!override_cred)
+		return -ENOMEM;
+
+	override_cred->fsuid = inode->i_uid;
+	override_cred->fsgid = inode->i_gid;
+	err = security_dentry_create_files_as(dentry, mode, &dentry->d_name,
+					      old_cred, override_cred);
+	if (err) {
+		put_cred(override_cred);
+		return err;
+	}
+	put_cred(override_creds(override_cred));
+	put_cred(override_cred);
+
+	return 0;
+}
+
 static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
 			      struct ovl_cattr *attr, bool origin)
 {
 	int err;
 	const struct cred *old_cred;
-	struct cred *override_cred;
 	struct dentry *parent = dentry->d_parent;
 
 	old_cred = ovl_override_creds(dentry->d_sb);
@@ -572,10 +600,6 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
 	}
 
 	if (!attr->hardlink) {
-		err = -ENOMEM;
-		override_cred = prepare_creds();
-		if (!override_cred)
-			goto out_revert_creds;
 		/*
 		 * In the creation cases(create, mkdir, mknod, symlink),
 		 * ovl should transfer current's fs{u,g}id to underlying
@@ -589,17 +613,9 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
 		 * create a new inode, so just use the ovl mounter's
 		 * fs{u,g}id.
 		 */
-		override_cred->fsuid = inode->i_uid;
-		override_cred->fsgid = inode->i_gid;
-		err = security_dentry_create_files_as(dentry,
-				attr->mode, &dentry->d_name, old_cred,
-				override_cred);
-		if (err) {
-			put_cred(override_cred);
+		err = ovl_setup_cred_for_create(dentry, inode, attr->mode, old_cred);
+		if (err)
 			goto out_revert_creds;
-		}
-		put_cred(override_creds(override_cred));
-		put_cred(override_cred);
 	}
 
 	if (!ovl_dentry_is_whiteout(dentry))
@@ -1290,6 +1306,100 @@ out:
 	return err;
 }
 
+static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
+			      struct inode *inode, umode_t mode)
+{
+	const struct cred *old_cred;
+	struct path realparentpath;
+	struct file *realfile;
+	struct dentry *newdentry;
+	/* It's okay to set O_NOATIME, since the owner will be current fsuid */
+	int flags = file->f_flags | OVL_OPEN_FLAGS;
+	int err;
+
+	err = ovl_copy_up(dentry->d_parent);
+	if (err)
+		return err;
+
+	old_cred = ovl_override_creds(dentry->d_sb);
+	err = ovl_setup_cred_for_create(dentry, inode, mode, old_cred);
+	if (err)
+		goto out_revert_creds;
+
+	ovl_path_upper(dentry->d_parent, &realparentpath);
+	realfile = backing_tmpfile_open(&file->f_path, flags, &realparentpath,
+					mode, current_cred());
+	err = PTR_ERR_OR_ZERO(realfile);
+	pr_debug("tmpfile/open(%pd2, 0%o) = %i\n", realparentpath.dentry, mode, err);
+	if (err)
+		goto out_revert_creds;
+
+	/* ovl_instantiate() consumes the newdentry reference on success */
+	newdentry = dget(realfile->f_path.dentry);
+	err = ovl_instantiate(dentry, inode, newdentry, false, file);
+	if (!err) {
+		file->private_data = realfile;
+	} else {
+		dput(newdentry);
+		fput(realfile);
+	}
+out_revert_creds:
+	revert_creds(old_cred);
+	return err;
+}
+
+static int ovl_dummy_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int ovl_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
+		       struct file *file, umode_t mode)
+{
+	int err;
+	struct dentry *dentry = file->f_path.dentry;
+	struct inode *inode;
+
+	if (!OVL_FS(dentry->d_sb)->tmpfile)
+		return -EOPNOTSUPP;
+
+	err = ovl_want_write(dentry);
+	if (err)
+		return err;
+
+	err = -ENOMEM;
+	inode = ovl_new_inode(dentry->d_sb, mode, 0);
+	if (!inode)
+		goto drop_write;
+
+	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+	err = ovl_create_tmpfile(file, dentry, inode, inode->i_mode);
+	if (err)
+		goto put_inode;
+
+	/*
+	 * Check if the preallocated inode was actually used.  Having something
+	 * else assigned to the dentry shouldn't happen as that would indicate
+	 * that the backing tmpfile "leaked" out of overlayfs.
+	 */
+	err = -EIO;
+	if (WARN_ON(inode != d_inode(dentry)))
+		goto put_realfile;
+
+	/* inode reference was transferred to dentry */
+	inode = NULL;
+	err = finish_open(file, dentry, ovl_dummy_open);
+put_realfile:
+	/* Without FMODE_OPENED ->release() won't be called on @file */
+	if (!(file->f_mode & FMODE_OPENED))
+		fput(file->private_data);
+put_inode:
+	iput(inode);
+drop_write:
+	ovl_drop_write(dentry);
+	return err;
+}
+
 const struct inode_operations ovl_dir_inode_operations = {
 	.lookup		= ovl_lookup,
 	.mkdir		= ovl_mkdir,
@@ -1310,4 +1420,5 @@ const struct inode_operations ovl_dir_inode_operations = {
 	.update_time	= ovl_update_time,
 	.fileattr_get	= ovl_fileattr_get,
 	.fileattr_set	= ovl_fileattr_set,
+	.tmpfile	= ovl_tmpfile,
 };
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 05536964d37f..1a411cae57ed 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -24,9 +24,6 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
 		return 'm';
 }
 
-/* No atime modification on underlying */
-#define OVL_OPEN_FLAGS (O_NOATIME)
-
 static struct file *ovl_open_realfile(const struct file *file,
 				      const struct path *realpath)
 {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index ee949f3e7c77..0bfe35da4b7b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -175,6 +175,9 @@ static inline int ovl_metadata_digest_size(const struct ovl_metacopy *metacopy)
 	return (int)metacopy->len - OVL_METACOPY_MIN_SIZE;
 }
 
+/* No atime modification on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME)
+
 extern const char *const ovl_xattr_table[][2];
 static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox)
 {
diff --git a/include/linux/backing-file.h b/include/linux/backing-file.h
index 3f1fe1774f1b..4b61b0e57720 100644
--- a/include/linux/backing-file.h
+++ b/include/linux/backing-file.h
@@ -22,6 +22,9 @@ struct backing_file_ctx {
 struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
+struct file *backing_tmpfile_open(const struct path *user_path, int flags,
+				  const struct path *real_parentpath,
+				  umode_t mode, const struct cred *cred);
 ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
 			       struct kiocb *iocb, int flags,
 			       struct backing_file_ctx *ctx);
-- 
cgit v1.2.3


From 55111470b4416a6225abfdb4e1f0b8a8c8c52fdc Mon Sep 17 00:00:00 2001
From: Ira Weiny <ira.weiny@intel.com>
Date: Wed, 1 May 2024 11:40:23 -0700
Subject: cxl/cper: Fix non-ACPI-APEI-GHES build

If ACPI_APEI_GHES is not configured the [un]register work functions are
not properly declared.

0day notices that the cxl_cper_register_work() declaration in the
CONFIG_ACPI_APEI_GHES=n is broken, fix it to be typical nop stub.

Reported-by: kernel test robot <lkp@intel.com>
Closes: http://lore.kernel.org/r/202405012230.6kXItWen-lkp@intel.com
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Link: https://lore.kernel.org/r/20240501-cper-fix-0day-v1-1-c0b0056eafbc@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 include/linux/cxl-event.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/cxl-event.h b/include/linux/cxl-event.h
index 855908792ef8..60b25020281f 100644
--- a/include/linux/cxl-event.h
+++ b/include/linux/cxl-event.h
@@ -164,12 +164,12 @@ int cxl_cper_register_work(struct work_struct *work);
 int cxl_cper_unregister_work(struct work_struct *work);
 int cxl_cper_kfifo_get(struct cxl_cper_work_data *wd);
 #else
-static inline int cxl_cper_register_work(struct work_struct *work);
+static inline int cxl_cper_register_work(struct work_struct *work)
 {
 	return 0;
 }
 
-static inline int cxl_cper_unregister_work(struct work_struct *work);
+static inline int cxl_cper_unregister_work(struct work_struct *work)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 798cb7f9aec35460c383eab57b9fa474d999a2eb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 17 Apr 2024 18:26:54 -0400
Subject: swapon(2)/swapoff(2): don't bother with block size

once upon a time that used to matter; these days we do swap IO for
swap devices at the level that doesn't give a damn about block size,
buffer_head or anything of that sort - just attach the page to
bio, set the location and size (the latter to PAGE_SIZE) and feed
into queue.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/swap.h |  1 -
 mm/swapfile.c        | 12 +-----------
 2 files changed, 1 insertion(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index f53d608daa01..a5b640cca459 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -301,7 +301,6 @@ struct swap_info_struct {
 	struct file *bdev_file;		/* open handle of the bdev */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
-	unsigned int old_block_size;	/* seldom referenced */
 	struct completion comp;		/* seldom referenced */
 	spinlock_t lock;		/*
 					 * protect map scan related fields like
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4919423cce76..304f74d039f3 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2417,7 +2417,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	struct inode *inode;
 	struct filename *pathname;
 	int err, found = 0;
-	unsigned int old_block_size;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2529,7 +2528,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	}
 
 	swap_file = p->swap_file;
-	old_block_size = p->old_block_size;
 	p->swap_file = NULL;
 	p->max = 0;
 	swap_map = p->swap_map;
@@ -2553,7 +2551,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	inode = mapping->host;
 	if (p->bdev_file) {
-		set_blocksize(p->bdev, old_block_size);
 		fput(p->bdev_file);
 		p->bdev_file = NULL;
 	}
@@ -2782,21 +2779,15 @@ static struct swap_info_struct *alloc_swap_info(void)
 
 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
 {
-	int error;
-
 	if (S_ISBLK(inode->i_mode)) {
 		p->bdev_file = bdev_file_open_by_dev(inode->i_rdev,
 				BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
 		if (IS_ERR(p->bdev_file)) {
-			error = PTR_ERR(p->bdev_file);
+			int error = PTR_ERR(p->bdev_file);
 			p->bdev_file = NULL;
 			return error;
 		}
 		p->bdev = file_bdev(p->bdev_file);
-		p->old_block_size = block_size(p->bdev);
-		error = set_blocksize(p->bdev, PAGE_SIZE);
-		if (error < 0)
-			return error;
 		/*
 		 * Zoned block devices contain zones that have a sequential
 		 * write only restriction.  Hence zoned block devices are not
@@ -3235,7 +3226,6 @@ bad_swap:
 	free_percpu(p->cluster_next_cpu);
 	p->cluster_next_cpu = NULL;
 	if (p->bdev_file) {
-		set_blocksize(p->bdev, p->old_block_size);
 		fput(p->bdev_file);
 		p->bdev_file = NULL;
 	}
-- 
cgit v1.2.3


From 51d908b3db0e588aeb2d06df37e4df3fb1754bb5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 17 Apr 2024 18:33:34 -0400
Subject: swapon(2): open swap with O_EXCL

... eliminating the need to reopen block devices so they could be
exclusively held.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/swap.h |  1 -
 mm/swapfile.c        | 19 ++-----------------
 2 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index a5b640cca459..7e61a4aef2fc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -298,7 +298,6 @@ struct swap_info_struct {
 	unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
 	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
-	struct file *bdev_file;		/* open handle of the bdev */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
 	struct file *swap_file;		/* seldom referenced */
 	struct completion comp;		/* seldom referenced */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 304f74d039f3..7bba0b0d4924 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2550,10 +2550,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	exit_swap_address_space(p->type);
 
 	inode = mapping->host;
-	if (p->bdev_file) {
-		fput(p->bdev_file);
-		p->bdev_file = NULL;
-	}
 
 	inode_lock(inode);
 	inode->i_flags &= ~S_SWAPFILE;
@@ -2780,14 +2776,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
 {
 	if (S_ISBLK(inode->i_mode)) {
-		p->bdev_file = bdev_file_open_by_dev(inode->i_rdev,
-				BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
-		if (IS_ERR(p->bdev_file)) {
-			int error = PTR_ERR(p->bdev_file);
-			p->bdev_file = NULL;
-			return error;
-		}
-		p->bdev = file_bdev(p->bdev_file);
+		p->bdev = I_BDEV(inode);
 		/*
 		 * Zoned block devices contain zones that have a sequential
 		 * write only restriction.  Hence zoned block devices are not
@@ -3028,7 +3017,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		name = NULL;
 		goto bad_swap;
 	}
-	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
+	swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
 	if (IS_ERR(swap_file)) {
 		error = PTR_ERR(swap_file);
 		swap_file = NULL;
@@ -3225,10 +3214,6 @@ bad_swap:
 	p->percpu_cluster = NULL;
 	free_percpu(p->cluster_next_cpu);
 	p->cluster_next_cpu = NULL;
-	if (p->bdev_file) {
-		fput(p->bdev_file);
-		p->bdev_file = NULL;
-	}
 	inode = NULL;
 	destroy_swap_extents(p);
 	swap_cgroup_swapoff(p->type);
-- 
cgit v1.2.3


From ead083aeeed9df44fab9227e47688f7305c3a233 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 18 Apr 2024 00:34:31 -0400
Subject: set_blocksize(): switch to passing struct file *

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bdev.c            | 11 +++++++----
 block/ioctl.c           | 21 ++++++++++++---------
 drivers/block/pktcdvd.c |  2 +-
 fs/btrfs/dev-replace.c  |  2 +-
 fs/btrfs/volumes.c      |  4 ++--
 fs/ext4/super.c         |  2 +-
 fs/reiserfs/journal.c   |  5 ++---
 fs/xfs/xfs_buf.c        |  2 +-
 include/linux/blkdev.h  |  2 +-
 9 files changed, 28 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index b8e32d933a63..a329ff9be11d 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -144,8 +144,11 @@ static void set_init_blocksize(struct block_device *bdev)
 	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 }
 
-int set_blocksize(struct block_device *bdev, int size)
+int set_blocksize(struct file *file, int size)
 {
+	struct inode *inode = file->f_mapping->host;
+	struct block_device *bdev = I_BDEV(inode);
+
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -155,9 +158,9 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Don't change the size if it is same as current */
-	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
+	if (inode->i_blkbits != blksize_bits(size)) {
 		sync_blockdev(bdev);
-		bdev->bd_inode->i_blkbits = blksize_bits(size);
+		inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
 	return 0;
@@ -167,7 +170,7 @@ EXPORT_SYMBOL(set_blocksize);
 
 int sb_set_blocksize(struct super_block *sb, int size)
 {
-	if (set_blocksize(sb->s_bdev, size))
+	if (set_blocksize(sb->s_bdev_file, size))
 		return 0;
 	/* If we get here, we know size is power of two
 	 * and it's value is between 512 and PAGE_SIZE */
diff --git a/block/ioctl.c b/block/ioctl.c
index a9028a2c2db5..1c800364bc70 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -473,11 +473,14 @@ static int compat_hdio_getgeo(struct block_device *bdev,
 #endif
 
 /* set the logical block size */
-static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
+static int blkdev_bszset(struct file *file, blk_mode_t mode,
 		int __user *argp)
 {
+	// this one might be file_inode(file)->i_rdev - a rare valid
+	// use of file_inode() for those.
+	dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev;
+	struct file *excl_file;
 	int ret, n;
-	struct file *file;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -487,13 +490,13 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
 		return -EFAULT;
 
 	if (mode & BLK_OPEN_EXCL)
-		return set_blocksize(bdev, n);
+		return set_blocksize(file, n);
 
-	file = bdev_file_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
-	if (IS_ERR(file))
+	excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL);
+	if (IS_ERR(excl_file))
 		return -EBUSY;
-	ret = set_blocksize(bdev, n);
-	fput(file);
+	ret = set_blocksize(excl_file, n);
+	fput(excl_file);
 	return ret;
 }
 
@@ -621,7 +624,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
 		return put_int(argp, block_size(bdev));
 	case BLKBSZSET:
-		return blkdev_bszset(bdev, mode, argp);
+		return blkdev_bszset(file, mode, argp);
 	case BLKGETSIZE64:
 		return put_u64(argp, bdev_nr_bytes(bdev));
 
@@ -681,7 +684,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
 		return put_int(argp, bdev_logical_block_size(bdev));
 	case BLKBSZSET_32:
-		return blkdev_bszset(bdev, mode, argp);
+		return blkdev_bszset(file, mode, argp);
 	case BLKGETSIZE64_32:
 		return put_u64(argp, bdev_nr_bytes(bdev));
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 05933f25b397..8a2ce8070010 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2215,7 +2215,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
 		}
 		dev_info(ddev, "%lukB available on disc\n", lba << 1);
 	}
-	set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE);
+	set_blocksize(bdev_file, CD_FRAMESIZE);
 
 	return 0;
 
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 7696beec4c21..7130040d92ab 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,7 +316,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 	device->dev_stats_valid = 1;
-	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+	set_blocksize(bdev_file, BTRFS_BDEV_BLOCKSIZE);
 	device->fs_devices = fs_devices;
 
 	ret = btrfs_get_dev_zone_info(device, false);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 43af5a9fb547..65c03ddecc59 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -483,7 +483,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	if (flush)
 		sync_blockdev(bdev);
 	if (holder) {
-		ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
+		ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
 		if (ret) {
 			fput(*bdev_file);
 			goto error;
@@ -2717,7 +2717,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 	device->dev_stats_valid = 1;
-	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
+	set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
 
 	if (seeding_dev) {
 		btrfs_clear_sb_rdonly(sb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 044135796f2b..9988b3a40b42 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5873,7 +5873,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 
 	sb_block = EXT4_MIN_BLOCK_SIZE / blocksize;
 	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
-	set_blocksize(bdev, blocksize);
+	set_blocksize(bdev_file, blocksize);
 	bh = __bread(bdev, sb_block, blocksize);
 	if (!bh) {
 		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e539ccd39e1e..e477ee0ff35d 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2626,8 +2626,7 @@ static int journal_init_dev(struct super_block *super,
 					 MAJOR(jdev), MINOR(jdev), result);
 			return result;
 		} else if (jdev != super->s_dev)
-			set_blocksize(file_bdev(journal->j_bdev_file),
-				      super->s_blocksize);
+			set_blocksize(journal->j_bdev_file, super->s_blocksize);
 
 		return 0;
 	}
@@ -2643,7 +2642,7 @@ static int journal_init_dev(struct super_block *super,
 		return result;
 	}
 
-	set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize);
+	set_blocksize(journal->j_bdev_file, super->s_blocksize);
 	reiserfs_info(super,
 		      "journal_init_dev: journal device: %pg\n",
 		      file_bdev(journal->j_bdev_file));
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index f0fa02264eda..2dc0eacb0999 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2043,7 +2043,7 @@ xfs_setsize_buftarg(
 	btp->bt_meta_sectorsize = sectorsize;
 	btp->bt_meta_sectormask = sectorsize - 1;
 
-	if (set_blocksize(btp->bt_bdev, sectorsize)) {
+	if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
 		xfs_warn(btp->bt_mount,
 			"Cannot set_blocksize to %u on device %pg",
 			sectorsize, btp->bt_bdev);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 172c91879999..20c749b2ebc2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1474,7 +1474,7 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
 }
 
 int bdev_read_only(struct block_device *bdev);
-int set_blocksize(struct block_device *bdev, int size);
+int set_blocksize(struct file *file, int size);
 
 int lookup_bdev(const char *pathname, dev_t *dev);
 
-- 
cgit v1.2.3


From 3f9b8fb46e5d20eac314f56747e24e1a4e74539d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 00:54:19 -0400
Subject: Use bdev_is_paritition() instead of open-coding it

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/blk-core.c          | 5 +++--
 block/blk-mq.c            | 2 +-
 include/linux/part_stat.h | 2 +-
 lib/vsprintf.c            | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index a16b5abdbbf5..20322abc6082 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -759,7 +759,8 @@ void submit_bio_noacct(struct bio *bio)
 	if (!bio_flagged(bio, BIO_REMAPPED)) {
 		if (unlikely(bio_check_eod(bio)))
 			goto end_io;
-		if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
+		if (bdev_is_partition(bdev) &&
+		    unlikely(blk_partition_remap(bio)))
 			goto end_io;
 	}
 
@@ -989,7 +990,7 @@ again:
 		if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
 			__part_stat_add(part, io_ticks, end ? now - stamp : 1);
 	}
-	if (part->bd_partno) {
+	if (bdev_is_partition(part)) {
 		part = bdev_whole(part);
 		goto again;
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32afb87efbd0..43bb8f50a07c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -92,7 +92,7 @@ static bool blk_mq_check_inflight(struct request *rq, void *priv)
 	struct mq_inflight *mi = priv;
 
 	if (rq->part && blk_do_io_stat(rq) &&
-	    (!mi->part->bd_partno || rq->part == mi->part) &&
+	    (!bdev_is_partition(mi->part) || rq->part == mi->part) &&
 	    blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
 		mi->inflight[rq_data_dir(rq)]++;
 
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index abeba356bc3f..ac8c44dd8237 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -59,7 +59,7 @@ static inline void part_stat_set_all(struct block_device *part, int value)
 
 #define part_stat_add(part, field, addnd)	do {			\
 	__part_stat_add((part), field, addnd);				\
-	if ((part)->bd_partno)						\
+	if (bdev_is_partition(part))					\
 		__part_stat_add(bdev_whole(part), field, addnd);	\
 } while (0)
 
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 552738f14275..3f9f1b959ef0 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -966,7 +966,7 @@ char *bdev_name(char *buf, char *end, struct block_device *bdev,
 
 	hd = bdev->bd_disk;
 	buf = string(buf, end, hd->disk_name, spec);
-	if (bdev->bd_partno) {
+	if (bdev_is_partition(bdev)) {
 		if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) {
 			if (buf < end)
 				*buf = 'p';
-- 
cgit v1.2.3


From b8c873edbf35570b93edfeddad9e85da54defa52 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 01:01:36 -0400
Subject: wrapper for access to ->bd_partno

On the next step it's going to get folded into a field where flags will go.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/early-lookup.c    |  2 +-
 block/partitions/core.c | 12 ++++++------
 include/linux/blkdev.h  |  7 ++++++-
 lib/vsprintf.c          |  2 +-
 4 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/block/early-lookup.c b/block/early-lookup.c
index 3effbd0d35e9..3fb57f7d2b12 100644
--- a/block/early-lookup.c
+++ b/block/early-lookup.c
@@ -78,7 +78,7 @@ static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt)
 		 * to the partition number found by UUID.
 		 */
 		*devt = part_devt(dev_to_disk(dev),
-				  dev_to_bdev(dev)->bd_partno + offset);
+				  bdev_partno(dev_to_bdev(dev)) + offset);
 	} else {
 		*devt = dev->devt;
 	}
diff --git a/block/partitions/core.c b/block/partitions/core.c
index b11e88c82c8c..edd5309dc4ba 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -173,7 +173,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 static ssize_t part_partition_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno);
+	return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
 }
 
 static ssize_t part_start_show(struct device *dev,
@@ -250,7 +250,7 @@ static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
 {
 	const struct block_device *part = dev_to_bdev(dev);
 
-	add_uevent_var(env, "PARTN=%u", part->bd_partno);
+	add_uevent_var(env, "PARTN=%u", bdev_partno(part));
 	if (part->bd_meta_info && part->bd_meta_info->volname[0])
 		add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname);
 	return 0;
@@ -267,7 +267,7 @@ void drop_partition(struct block_device *part)
 {
 	lockdep_assert_held(&part->bd_disk->open_mutex);
 
-	xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
+	xa_erase(&part->bd_disk->part_tbl, bdev_partno(part));
 	kobject_put(part->bd_holder_dir);
 
 	device_del(&part->bd_device);
@@ -338,8 +338,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
 	pdev->parent = ddev;
 
 	/* in consecutive minor range? */
-	if (bdev->bd_partno < disk->minors) {
-		devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
+	if (bdev_partno(bdev) < disk->minors) {
+		devt = MKDEV(disk->major, disk->first_minor + bdev_partno(bdev));
 	} else {
 		err = blk_alloc_ext_minor();
 		if (err < 0)
@@ -404,7 +404,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start,
 
 	rcu_read_lock();
 	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
-		if (part->bd_partno != skip_partno &&
+		if (bdev_partno(part) != skip_partno &&
 		    start < part->bd_start_sect + bdev_nr_sectors(part) &&
 		    start + length > part->bd_start_sect) {
 			overlap = true;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c3e8f7cf96be..32549d675955 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -720,6 +720,11 @@ void invalidate_disk(struct gendisk *disk);
 void set_disk_ro(struct gendisk *disk, bool read_only);
 void disk_uevent(struct gendisk *disk, enum kobject_action action);
 
+static inline u8 bdev_partno(const struct block_device *bdev)
+{
+	return bdev->bd_partno;
+}
+
 static inline int get_disk_ro(struct gendisk *disk)
 {
 	return disk->part0->bd_read_only ||
@@ -1095,7 +1100,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 
 static inline bool bdev_is_partition(struct block_device *bdev)
 {
-	return bdev->bd_partno;
+	return bdev_partno(bdev) != 0;
 }
 
 enum blk_default_limits {
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3f9f1b959ef0..cdd4e2314bfc 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -972,7 +972,7 @@ char *bdev_name(char *buf, char *end, struct block_device *bdev,
 				*buf = 'p';
 			buf++;
 		}
-		buf = number(buf, end, bdev->bd_partno, spec);
+		buf = number(buf, end, bdev_partno(bdev), spec);
 	}
 	return buf;
 }
-- 
cgit v1.2.3


From b478e162f227d17d6a29f40ffd464af358334971 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Fri, 22 Mar 2024 14:39:51 +0200
Subject: PCI/ASPM: Consolidate link state defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The linux/pci.h and aspm.c files define their own sets of link state
related defines which are almost the same.

Consolidate the use of defines into those defined by linux/pci.h and expand
PCIE_LINK_STATE_L0S to match earlier ASPM_STATE_L0S that includes both
upstream and downstream bits. Rename also the defines that are internal to
aspm.c to start with PCIE_LINK_STATE for consistency.

While the PCIE_LINK_STATE_L0S BIT(0) -> (BIT(0) | BIT(1)) transformation is
not 1:1, in practice aspm.c already used ASPM_STATE_L0S that has both bits
enabled except during mapping.

While at it, place the PCIE_LINK_STATE_CLKPM define last to have more
logical grouping.

Use static_assert() to ensure PCIE_LINK_STATE_L0S is strictly equal to the
combination of PCIE_LINK_STATE_L0S_UP/DW.

Link: https://lore.kernel.org/r/20240322123952.6384-2-ilpo.jarvinen@linux.intel.com
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pcie/aspm.c | 155 ++++++++++++++++++++++++------------------------
 include/linux/pci.h     |  24 ++++----
 2 files changed, 91 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 2428d278e015..994b283b6a0f 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -8,6 +8,8 @@
  */
 
 #include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/build_bug.h>
 #include <linux/kernel.h>
 #include <linux/limits.h>
 #include <linux/math.h>
@@ -189,21 +191,18 @@ void pci_restore_aspm_l1ss_state(struct pci_dev *pdev)
 #endif
 #define MODULE_PARAM_PREFIX "pcie_aspm."
 
-/* Note: those are not register definitions */
-#define ASPM_STATE_L0S_UP	(1)	/* Upstream direction L0s state */
-#define ASPM_STATE_L0S_DW	(2)	/* Downstream direction L0s state */
-#define ASPM_STATE_L1		(4)	/* L1 state */
-#define ASPM_STATE_L1_1		(8)	/* ASPM L1.1 state */
-#define ASPM_STATE_L1_2		(0x10)	/* ASPM L1.2 state */
-#define ASPM_STATE_L1_1_PCIPM	(0x20)	/* PCI PM L1.1 state */
-#define ASPM_STATE_L1_2_PCIPM	(0x40)	/* PCI PM L1.2 state */
-#define ASPM_STATE_L1_SS_PCIPM	(ASPM_STATE_L1_1_PCIPM | ASPM_STATE_L1_2_PCIPM)
-#define ASPM_STATE_L1_2_MASK	(ASPM_STATE_L1_2 | ASPM_STATE_L1_2_PCIPM)
-#define ASPM_STATE_L1SS		(ASPM_STATE_L1_1 | ASPM_STATE_L1_1_PCIPM |\
-				 ASPM_STATE_L1_2_MASK)
-#define ASPM_STATE_L0S		(ASPM_STATE_L0S_UP | ASPM_STATE_L0S_DW)
-#define ASPM_STATE_ALL		(ASPM_STATE_L0S | ASPM_STATE_L1 |	\
-				 ASPM_STATE_L1SS)
+/* Note: these are not register definitions */
+#define PCIE_LINK_STATE_L0S_UP	BIT(0)	/* Upstream direction L0s state */
+#define PCIE_LINK_STATE_L0S_DW	BIT(1)	/* Downstream direction L0s state */
+static_assert(PCIE_LINK_STATE_L0S == (PCIE_LINK_STATE_L0S_UP | PCIE_LINK_STATE_L0S_DW));
+
+#define PCIE_LINK_STATE_L1_SS_PCIPM	(PCIE_LINK_STATE_L1_1_PCIPM |\
+					 PCIE_LINK_STATE_L1_2_PCIPM)
+#define PCIE_LINK_STATE_L1_2_MASK	(PCIE_LINK_STATE_L1_2 |\
+					 PCIE_LINK_STATE_L1_2_PCIPM)
+#define PCIE_LINK_STATE_L1SS		(PCIE_LINK_STATE_L1_1 |\
+					 PCIE_LINK_STATE_L1_1_PCIPM |\
+					 PCIE_LINK_STATE_L1_2_MASK)
 
 struct pcie_link_state {
 	struct pci_dev *pdev;		/* Upstream component of the Link */
@@ -275,10 +274,10 @@ static int policy_to_aspm_state(struct pcie_link_state *link)
 		return 0;
 	case POLICY_POWERSAVE:
 		/* Enable ASPM L0s/L1 */
-		return (ASPM_STATE_L0S | ASPM_STATE_L1);
+		return PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1;
 	case POLICY_POWER_SUPERSAVE:
 		/* Enable Everything */
-		return ASPM_STATE_ALL;
+		return PCIE_LINK_STATE_ASPM_ALL;
 	case POLICY_DEFAULT:
 		return link->aspm_default;
 	}
@@ -581,14 +580,14 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 		latency_dw_l1 = calc_l1_latency(lnkcap_dw);
 
 		/* Check upstream direction L0s latency */
-		if ((link->aspm_capable & ASPM_STATE_L0S_UP) &&
+		if ((link->aspm_capable & PCIE_LINK_STATE_L0S_UP) &&
 		    (latency_up_l0s > acceptable_l0s))
-			link->aspm_capable &= ~ASPM_STATE_L0S_UP;
+			link->aspm_capable &= ~PCIE_LINK_STATE_L0S_UP;
 
 		/* Check downstream direction L0s latency */
-		if ((link->aspm_capable & ASPM_STATE_L0S_DW) &&
+		if ((link->aspm_capable & PCIE_LINK_STATE_L0S_DW) &&
 		    (latency_dw_l0s > acceptable_l0s))
-			link->aspm_capable &= ~ASPM_STATE_L0S_DW;
+			link->aspm_capable &= ~PCIE_LINK_STATE_L0S_DW;
 		/*
 		 * Check L1 latency.
 		 * Every switch on the path to root complex need 1
@@ -603,9 +602,9 @@ static void pcie_aspm_check_latency(struct pci_dev *endpoint)
 		 * substate latencies (and hence do not do any check).
 		 */
 		latency = max_t(u32, latency_up_l1, latency_dw_l1);
-		if ((link->aspm_capable & ASPM_STATE_L1) &&
+		if ((link->aspm_capable & PCIE_LINK_STATE_L1) &&
 		    (latency + l1_switch_latency > acceptable_l1))
-			link->aspm_capable &= ~ASPM_STATE_L1;
+			link->aspm_capable &= ~PCIE_LINK_STATE_L1;
 		l1_switch_latency += NSEC_PER_USEC;
 
 		link = link->parent;
@@ -741,13 +740,13 @@ static void aspm_l1ss_init(struct pcie_link_state *link)
 		child_l1ss_cap &= ~PCI_L1SS_CAP_ASPM_L1_2;
 
 	if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_ASPM_L1_1)
-		link->aspm_support |= ASPM_STATE_L1_1;
+		link->aspm_support |= PCIE_LINK_STATE_L1_1;
 	if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_ASPM_L1_2)
-		link->aspm_support |= ASPM_STATE_L1_2;
+		link->aspm_support |= PCIE_LINK_STATE_L1_2;
 	if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_1)
-		link->aspm_support |= ASPM_STATE_L1_1_PCIPM;
+		link->aspm_support |= PCIE_LINK_STATE_L1_1_PCIPM;
 	if (parent_l1ss_cap & child_l1ss_cap & PCI_L1SS_CAP_PCIPM_L1_2)
-		link->aspm_support |= ASPM_STATE_L1_2_PCIPM;
+		link->aspm_support |= PCIE_LINK_STATE_L1_2_PCIPM;
 
 	if (parent_l1ss_cap)
 		pci_read_config_dword(parent, parent->l1ss + PCI_L1SS_CTL1,
@@ -757,15 +756,15 @@ static void aspm_l1ss_init(struct pcie_link_state *link)
 				      &child_l1ss_ctl1);
 
 	if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_1)
-		link->aspm_enabled |= ASPM_STATE_L1_1;
+		link->aspm_enabled |= PCIE_LINK_STATE_L1_1;
 	if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_ASPM_L1_2)
-		link->aspm_enabled |= ASPM_STATE_L1_2;
+		link->aspm_enabled |= PCIE_LINK_STATE_L1_2;
 	if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_1)
-		link->aspm_enabled |= ASPM_STATE_L1_1_PCIPM;
+		link->aspm_enabled |= PCIE_LINK_STATE_L1_1_PCIPM;
 	if (parent_l1ss_ctl1 & child_l1ss_ctl1 & PCI_L1SS_CTL1_PCIPM_L1_2)
-		link->aspm_enabled |= ASPM_STATE_L1_2_PCIPM;
+		link->aspm_enabled |= PCIE_LINK_STATE_L1_2_PCIPM;
 
-	if (link->aspm_support & ASPM_STATE_L1_2_MASK)
+	if (link->aspm_support & PCIE_LINK_STATE_L1_2_MASK)
 		aspm_calc_l12_info(link, parent_l1ss_cap, child_l1ss_cap);
 }
 
@@ -778,8 +777,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
 
 	if (blacklist) {
 		/* Set enabled/disable so that we will disable ASPM later */
-		link->aspm_enabled = ASPM_STATE_ALL;
-		link->aspm_disable = ASPM_STATE_ALL;
+		link->aspm_enabled = PCIE_LINK_STATE_ASPM_ALL;
+		link->aspm_disable = PCIE_LINK_STATE_ASPM_ALL;
 		return;
 	}
 
@@ -814,19 +813,19 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
 	 * support L0s.
 	 */
 	if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L0S)
-		link->aspm_support |= ASPM_STATE_L0S;
+		link->aspm_support |= PCIE_LINK_STATE_L0S;
 
 	if (child_lnkctl & PCI_EXP_LNKCTL_ASPM_L0S)
-		link->aspm_enabled |= ASPM_STATE_L0S_UP;
+		link->aspm_enabled |= PCIE_LINK_STATE_L0S_UP;
 	if (parent_lnkctl & PCI_EXP_LNKCTL_ASPM_L0S)
-		link->aspm_enabled |= ASPM_STATE_L0S_DW;
+		link->aspm_enabled |= PCIE_LINK_STATE_L0S_DW;
 
 	/* Setup L1 state */
 	if (parent_lnkcap & child_lnkcap & PCI_EXP_LNKCAP_ASPM_L1)
-		link->aspm_support |= ASPM_STATE_L1;
+		link->aspm_support |= PCIE_LINK_STATE_L1;
 
 	if (parent_lnkctl & child_lnkctl & PCI_EXP_LNKCTL_ASPM_L1)
-		link->aspm_enabled |= ASPM_STATE_L1;
+		link->aspm_enabled |= PCIE_LINK_STATE_L1;
 
 	aspm_l1ss_init(link);
 
@@ -876,7 +875,7 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state)
 	 * If needed, disable L1, and it gets enabled later
 	 * in pcie_config_aspm_link().
 	 */
-	if (enable_req & (ASPM_STATE_L1_1 | ASPM_STATE_L1_2)) {
+	if (enable_req & (PCIE_LINK_STATE_L1_1 | PCIE_LINK_STATE_L1_2)) {
 		pcie_capability_clear_word(child, PCI_EXP_LNKCTL,
 					   PCI_EXP_LNKCTL_ASPM_L1);
 		pcie_capability_clear_word(parent, PCI_EXP_LNKCTL,
@@ -884,13 +883,13 @@ static void pcie_config_aspm_l1ss(struct pcie_link_state *link, u32 state)
 	}
 
 	val = 0;
-	if (state & ASPM_STATE_L1_1)
+	if (state & PCIE_LINK_STATE_L1_1)
 		val |= PCI_L1SS_CTL1_ASPM_L1_1;
-	if (state & ASPM_STATE_L1_2)
+	if (state & PCIE_LINK_STATE_L1_2)
 		val |= PCI_L1SS_CTL1_ASPM_L1_2;
-	if (state & ASPM_STATE_L1_1_PCIPM)
+	if (state & PCIE_LINK_STATE_L1_1_PCIPM)
 		val |= PCI_L1SS_CTL1_PCIPM_L1_1;
-	if (state & ASPM_STATE_L1_2_PCIPM)
+	if (state & PCIE_LINK_STATE_L1_2_PCIPM)
 		val |= PCI_L1SS_CTL1_PCIPM_L1_2;
 
 	/* Enable what we need to enable */
@@ -916,29 +915,29 @@ static void pcie_config_aspm_link(struct pcie_link_state *link, u32 state)
 	state &= (link->aspm_capable & ~link->aspm_disable);
 
 	/* Can't enable any substates if L1 is not enabled */
-	if (!(state & ASPM_STATE_L1))
-		state &= ~ASPM_STATE_L1SS;
+	if (!(state & PCIE_LINK_STATE_L1))
+		state &= ~PCIE_LINK_STATE_L1SS;
 
 	/* Spec says both ports must be in D0 before enabling PCI PM substates*/
 	if (parent->current_state != PCI_D0 || child->current_state != PCI_D0) {
-		state &= ~ASPM_STATE_L1_SS_PCIPM;
-		state |= (link->aspm_enabled & ASPM_STATE_L1_SS_PCIPM);
+		state &= ~PCIE_LINK_STATE_L1_SS_PCIPM;
+		state |= (link->aspm_enabled & PCIE_LINK_STATE_L1_SS_PCIPM);
 	}
 
 	/* Nothing to do if the link is already in the requested state */
 	if (link->aspm_enabled == state)
 		return;
 	/* Convert ASPM state to upstream/downstream ASPM register state */
-	if (state & ASPM_STATE_L0S_UP)
+	if (state & PCIE_LINK_STATE_L0S_UP)
 		dwstream |= PCI_EXP_LNKCTL_ASPM_L0S;
-	if (state & ASPM_STATE_L0S_DW)
+	if (state & PCIE_LINK_STATE_L0S_DW)
 		upstream |= PCI_EXP_LNKCTL_ASPM_L0S;
-	if (state & ASPM_STATE_L1) {
+	if (state & PCIE_LINK_STATE_L1) {
 		upstream |= PCI_EXP_LNKCTL_ASPM_L1;
 		dwstream |= PCI_EXP_LNKCTL_ASPM_L1;
 	}
 
-	if (link->aspm_capable & ASPM_STATE_L1SS)
+	if (link->aspm_capable & PCIE_LINK_STATE_L1SS)
 		pcie_config_aspm_l1ss(link, state);
 
 	/*
@@ -947,11 +946,11 @@ static void pcie_config_aspm_link(struct pcie_link_state *link, u32 state)
 	 * upstream component first and then downstream, and vice
 	 * versa for disabling ASPM L1. Spec doesn't mention L0S.
 	 */
-	if (state & ASPM_STATE_L1)
+	if (state & PCIE_LINK_STATE_L1)
 		pcie_config_aspm_dev(parent, upstream);
 	list_for_each_entry(child, &linkbus->devices, bus_list)
 		pcie_config_aspm_dev(child, dwstream);
-	if (!(state & ASPM_STATE_L1))
+	if (!(state & PCIE_LINK_STATE_L1))
 		pcie_config_aspm_dev(parent, upstream);
 
 	link->aspm_enabled = state;
@@ -1347,18 +1346,18 @@ static int __pci_disable_link_state(struct pci_dev *pdev, int state, bool locked
 		down_read(&pci_bus_sem);
 	mutex_lock(&aspm_lock);
 	if (state & PCIE_LINK_STATE_L0S)
-		link->aspm_disable |= ASPM_STATE_L0S;
+		link->aspm_disable |= PCIE_LINK_STATE_L0S;
 	if (state & PCIE_LINK_STATE_L1)
 		/* L1 PM substates require L1 */
-		link->aspm_disable |= ASPM_STATE_L1 | ASPM_STATE_L1SS;
+		link->aspm_disable |= PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_L1SS;
 	if (state & PCIE_LINK_STATE_L1_1)
-		link->aspm_disable |= ASPM_STATE_L1_1;
+		link->aspm_disable |= PCIE_LINK_STATE_L1_1;
 	if (state & PCIE_LINK_STATE_L1_2)
-		link->aspm_disable |= ASPM_STATE_L1_2;
+		link->aspm_disable |= PCIE_LINK_STATE_L1_2;
 	if (state & PCIE_LINK_STATE_L1_1_PCIPM)
-		link->aspm_disable |= ASPM_STATE_L1_1_PCIPM;
+		link->aspm_disable |= PCIE_LINK_STATE_L1_1_PCIPM;
 	if (state & PCIE_LINK_STATE_L1_2_PCIPM)
-		link->aspm_disable |= ASPM_STATE_L1_2_PCIPM;
+		link->aspm_disable |= PCIE_LINK_STATE_L1_2_PCIPM;
 	pcie_config_aspm_link(link, policy_to_aspm_state(link));
 
 	if (state & PCIE_LINK_STATE_CLKPM)
@@ -1416,18 +1415,18 @@ static int __pci_enable_link_state(struct pci_dev *pdev, int state, bool locked)
 	mutex_lock(&aspm_lock);
 	link->aspm_default = 0;
 	if (state & PCIE_LINK_STATE_L0S)
-		link->aspm_default |= ASPM_STATE_L0S;
+		link->aspm_default |= PCIE_LINK_STATE_L0S;
 	if (state & PCIE_LINK_STATE_L1)
-		link->aspm_default |= ASPM_STATE_L1;
+		link->aspm_default |= PCIE_LINK_STATE_L1;
 	/* L1 PM substates require L1 */
 	if (state & PCIE_LINK_STATE_L1_1)
-		link->aspm_default |= ASPM_STATE_L1_1 | ASPM_STATE_L1;
+		link->aspm_default |= PCIE_LINK_STATE_L1_1 | PCIE_LINK_STATE_L1;
 	if (state & PCIE_LINK_STATE_L1_2)
-		link->aspm_default |= ASPM_STATE_L1_2 | ASPM_STATE_L1;
+		link->aspm_default |= PCIE_LINK_STATE_L1_2 | PCIE_LINK_STATE_L1;
 	if (state & PCIE_LINK_STATE_L1_1_PCIPM)
-		link->aspm_default |= ASPM_STATE_L1_1_PCIPM | ASPM_STATE_L1;
+		link->aspm_default |= PCIE_LINK_STATE_L1_1_PCIPM | PCIE_LINK_STATE_L1;
 	if (state & PCIE_LINK_STATE_L1_2_PCIPM)
-		link->aspm_default |= ASPM_STATE_L1_2_PCIPM | ASPM_STATE_L1;
+		link->aspm_default |= PCIE_LINK_STATE_L1_2_PCIPM | PCIE_LINK_STATE_L1;
 	pcie_config_aspm_link(link, policy_to_aspm_state(link));
 
 	link->clkpm_default = (state & PCIE_LINK_STATE_CLKPM) ? 1 : 0;
@@ -1563,12 +1562,12 @@ static ssize_t aspm_attr_store_common(struct device *dev,
 	if (state_enable) {
 		link->aspm_disable &= ~state;
 		/* need to enable L1 for substates */
-		if (state & ASPM_STATE_L1SS)
-			link->aspm_disable &= ~ASPM_STATE_L1;
+		if (state & PCIE_LINK_STATE_L1SS)
+			link->aspm_disable &= ~PCIE_LINK_STATE_L1;
 	} else {
 		link->aspm_disable |= state;
-		if (state & ASPM_STATE_L1)
-			link->aspm_disable |= ASPM_STATE_L1SS;
+		if (state & PCIE_LINK_STATE_L1)
+			link->aspm_disable |= PCIE_LINK_STATE_L1SS;
 	}
 
 	pcie_config_aspm_link(link, policy_to_aspm_state(link));
@@ -1582,12 +1581,12 @@ static ssize_t aspm_attr_store_common(struct device *dev,
 #define ASPM_ATTR(_f, _s)						\
 static ssize_t _f##_show(struct device *dev,				\
 			 struct device_attribute *attr, char *buf)	\
-{ return aspm_attr_show_common(dev, attr, buf, ASPM_STATE_##_s); }	\
+{ return aspm_attr_show_common(dev, attr, buf, PCIE_LINK_STATE_##_s); }	\
 									\
 static ssize_t _f##_store(struct device *dev,				\
 			  struct device_attribute *attr,		\
 			  const char *buf, size_t len)			\
-{ return aspm_attr_store_common(dev, attr, buf, len, ASPM_STATE_##_s); }
+{ return aspm_attr_store_common(dev, attr, buf, len, PCIE_LINK_STATE_##_s); }
 
 ASPM_ATTR(l0s_aspm, L0S)
 ASPM_ATTR(l1_aspm, L1)
@@ -1654,12 +1653,12 @@ static umode_t aspm_ctrl_attrs_are_visible(struct kobject *kobj,
 	struct pci_dev *pdev = to_pci_dev(dev);
 	struct pcie_link_state *link = pcie_aspm_get_link(pdev);
 	static const u8 aspm_state_map[] = {
-		ASPM_STATE_L0S,
-		ASPM_STATE_L1,
-		ASPM_STATE_L1_1,
-		ASPM_STATE_L1_2,
-		ASPM_STATE_L1_1_PCIPM,
-		ASPM_STATE_L1_2_PCIPM,
+		PCIE_LINK_STATE_L0S,
+		PCIE_LINK_STATE_L1,
+		PCIE_LINK_STATE_L1_1,
+		PCIE_LINK_STATE_L1_2,
+		PCIE_LINK_STATE_L1_1_PCIPM,
+		PCIE_LINK_STATE_L1_2_PCIPM,
 	};
 
 	if (aspm_disabled || !link)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 16493426a04f..08c68d80c01a 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1821,17 +1821,21 @@ extern bool pcie_ports_native;
 #define pcie_ports_native	false
 #endif
 
-#define PCIE_LINK_STATE_L0S		BIT(0)
-#define PCIE_LINK_STATE_L1		BIT(1)
-#define PCIE_LINK_STATE_CLKPM		BIT(2)
-#define PCIE_LINK_STATE_L1_1		BIT(3)
-#define PCIE_LINK_STATE_L1_2		BIT(4)
-#define PCIE_LINK_STATE_L1_1_PCIPM	BIT(5)
-#define PCIE_LINK_STATE_L1_2_PCIPM	BIT(6)
-#define PCIE_LINK_STATE_ALL		(PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1 |\
-					 PCIE_LINK_STATE_CLKPM | PCIE_LINK_STATE_L1_1 |\
-					 PCIE_LINK_STATE_L1_2 | PCIE_LINK_STATE_L1_1_PCIPM |\
+#define PCIE_LINK_STATE_L0S		(BIT(0) | BIT(1)) /* Upstr/dwnstr L0s */
+#define PCIE_LINK_STATE_L1		BIT(2)	/* L1 state */
+#define PCIE_LINK_STATE_L1_1		BIT(3)	/* ASPM L1.1 state */
+#define PCIE_LINK_STATE_L1_2		BIT(4)	/* ASPM L1.2 state */
+#define PCIE_LINK_STATE_L1_1_PCIPM	BIT(5)	/* PCI-PM L1.1 state */
+#define PCIE_LINK_STATE_L1_2_PCIPM	BIT(6)	/* PCI-PM L1.2 state */
+#define PCIE_LINK_STATE_ASPM_ALL	(PCIE_LINK_STATE_L0S		|\
+					 PCIE_LINK_STATE_L1		|\
+					 PCIE_LINK_STATE_L1_1		|\
+					 PCIE_LINK_STATE_L1_2		|\
+					 PCIE_LINK_STATE_L1_1_PCIPM	|\
 					 PCIE_LINK_STATE_L1_2_PCIPM)
+#define PCIE_LINK_STATE_CLKPM		BIT(7)
+#define PCIE_LINK_STATE_ALL		(PCIE_LINK_STATE_ASPM_ALL	|\
+					 PCIE_LINK_STATE_CLKPM)
 
 #ifdef CONFIG_PCIEASPM
 int pci_disable_link_state(struct pci_dev *pdev, int state);
-- 
cgit v1.2.3


From 57bfc7605ca5b102ba336779ae9adbc5bbba1d96 Mon Sep 17 00:00:00 2001
From: Miao Xu <miaxu@meta.com>
Date: Wed, 1 May 2024 21:23:16 -0700
Subject: tcp: Add new args for cong_control in tcp_congestion_ops

This patch adds two new arguments for cong_control of struct
tcp_congestion_ops:
 - ack
 - flag
These two arguments are inherited from the caller tcp_cong_control in
tcp_intput.c. One use case of them is to update cwnd and pacing rate
inside cong_control based on the info they provide. For example, the
flag can be used to decide if it is the right time to raise or reduce a
sender's cwnd.

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Miao Xu <miaxu@meta.com>
Link: https://lore.kernel.org/r/20240502042318.801932-2-miaxu@meta.com
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 include/net/tcp.h                                | 2 +-
 net/ipv4/bpf_tcp_ca.c                            | 3 ++-
 net/ipv4/tcp_bbr.c                               | 2 +-
 net/ipv4/tcp_input.c                             | 2 +-
 tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c | 6 +++---
 5 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index fe98fb01879b..7294da8fb780 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1172,7 +1172,7 @@ struct tcp_congestion_ops {
 	/* call when packets are delivered to update cwnd and pacing rate,
 	 * after all the ca_state processing. (optional)
 	 */
-	void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+	void (*cong_control)(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs);
 
 
 	/* new value of cwnd after loss (required) */
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 7f518ea5f4ac..6bd7f8db189a 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -307,7 +307,8 @@ static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
 	return 0;
 }
 
-static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs)
+static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
+				    const struct rate_sample *rs)
 {
 }
 
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 7e52ab24e40a..760941e55153 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -1024,7 +1024,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 	bbr_update_gains(sk);
 }
 
-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
 {
 	struct bbr *bbr = inet_csk_ca(sk);
 	u32 bw;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53e1150f706f..23ccfc7b1d3c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3541,7 +3541,7 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
 	if (icsk->icsk_ca_ops->cong_control) {
-		icsk->icsk_ca_ops->cong_control(sk, rs);
+		icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
 		return;
 	}
 
diff --git a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
index fcfbfe0336b4..52b610357309 100644
--- a/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
+++ b/tools/testing/selftests/bpf/progs/tcp_ca_kfunc.c
@@ -5,7 +5,7 @@
 #include <bpf/bpf_tracing.h>
 
 extern void bbr_init(struct sock *sk) __ksym;
-extern void bbr_main(struct sock *sk, const struct rate_sample *rs) __ksym;
+extern void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) __ksym;
 extern u32 bbr_sndbuf_expand(struct sock *sk) __ksym;
 extern u32 bbr_undo_cwnd(struct sock *sk) __ksym;
 extern void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) __ksym;
@@ -42,9 +42,9 @@ void BPF_PROG(in_ack_event, struct sock *sk, u32 flags)
 }
 
 SEC("struct_ops/cong_control")
-void BPF_PROG(cong_control, struct sock *sk, const struct rate_sample *rs)
+void BPF_PROG(cong_control, struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
 {
-	bbr_main(sk, rs);
+	bbr_main(sk, ack, flag, rs);
 }
 
 SEC("struct_ops/cong_avoid")
-- 
cgit v1.2.3


From 1116b9fa15c09748ae05d2365a305fa22671eb1e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 01:07:29 -0400
Subject: bdev: infrastructure for flags

Replace bd_partno with a 32bit field (__bd_flags).  The lower 8 bits
contain the partition number, the upper 24 are for flags.

Helpers: bdev_{test,set,clear}_flag(bdev, flag), with atomic_or()
and atomic_andnot() used to set/clear.

NOTE: this commit does not actually move any flags over there - they
are still bool fields.  As the result, it shifts the fields wrt
cacheline boundaries; that's going to be restored once the first
3 flags are dealt with.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bdev.c              |  2 +-
 include/linux/blk_types.h |  3 ++-
 include/linux/blkdev.h    | 17 ++++++++++++++++-
 3 files changed, 19 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 7a5f611c3d2e..2ec223315500 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -411,7 +411,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	mutex_init(&bdev->bd_fsfreeze_mutex);
 	spin_lock_init(&bdev->bd_size_lock);
 	mutex_init(&bdev->bd_holder_lock);
-	bdev->bd_partno = partno;
+	atomic_set(&bdev->__bd_flags, partno);
 	bdev->bd_inode = inode;
 	bdev->bd_queue = disk->queue;
 	if (partno)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cb1526ec44b5..04f92737ab08 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -45,8 +45,9 @@ struct block_device {
 	struct request_queue *	bd_queue;
 	struct disk_stats __percpu *bd_stats;
 	unsigned long		bd_stamp;
+	atomic_t		__bd_flags;	// partition number + flags
+#define BD_PARTNO		255	// lower 8 bits; assign-once
 	bool			bd_read_only;	/* read-only policy */
-	u8			bd_partno;
 	bool			bd_write_holder;
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32549d675955..99917e5860fd 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -722,7 +722,22 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action);
 
 static inline u8 bdev_partno(const struct block_device *bdev)
 {
-	return bdev->bd_partno;
+	return atomic_read(&bdev->__bd_flags) & BD_PARTNO;
+}
+
+static inline bool bdev_test_flag(const struct block_device *bdev, unsigned flag)
+{
+	return atomic_read(&bdev->__bd_flags) & flag;
+}
+
+static inline void bdev_set_flag(struct block_device *bdev, unsigned flag)
+{
+	atomic_or(flag, &bdev->__bd_flags);
+}
+
+static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag)
+{
+	atomic_andnot(flag, &bdev->__bd_flags);
 }
 
 static inline int get_disk_ro(struct gendisk *disk)
-- 
cgit v1.2.3


From 01e198f01d55880b79d8f5ac73064ff30a89d98a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 01:15:55 -0400
Subject: bdev: move ->bd_read_only to ->__bd_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/ioctl.c             | 5 ++++-
 include/linux/blk_types.h | 2 +-
 include/linux/blkdev.h    | 4 ++--
 3 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/block/ioctl.c b/block/ioctl.c
index 0c76137adcaa..be173e4ff43d 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -402,7 +402,10 @@ static int blkdev_roset(struct block_device *bdev, unsigned cmd,
 		if (ret)
 			return ret;
 	}
-	bdev->bd_read_only = n;
+	if (n)
+		bdev_set_flag(bdev, BD_READ_ONLY);
+	else
+		bdev_clear_flag(bdev, BD_READ_ONLY);
 	return 0;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 04f92737ab08..f70dd31cbcd1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -47,7 +47,7 @@ struct block_device {
 	unsigned long		bd_stamp;
 	atomic_t		__bd_flags;	// partition number + flags
 #define BD_PARTNO		255	// lower 8 bits; assign-once
-	bool			bd_read_only;	/* read-only policy */
+#define BD_READ_ONLY		(1u<<8) // read-only policy
 	bool			bd_write_holder;
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 99917e5860fd..1fe91231f85b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -742,13 +742,13 @@ static inline void bdev_clear_flag(struct block_device *bdev, unsigned flag)
 
 static inline int get_disk_ro(struct gendisk *disk)
 {
-	return disk->part0->bd_read_only ||
+	return bdev_test_flag(disk->part0, BD_READ_ONLY) ||
 		test_bit(GD_READ_ONLY, &disk->state);
 }
 
 static inline int bdev_read_only(struct block_device *bdev)
 {
-	return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
+	return bdev_test_flag(bdev, BD_READ_ONLY) || get_disk_ro(bdev->bd_disk);
 }
 
 bool set_capacity_and_notify(struct gendisk *disk, sector_t size);
-- 
cgit v1.2.3


From 4c80105e3909b3aff634a40daa9c29059ce03d6a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 01:18:24 -0400
Subject: bdev: move ->bd_write_holder into ->__bd_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bdev.c              | 9 +++++----
 include/linux/blk_types.h | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 2ec223315500..9df9a59f0900 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -624,7 +624,7 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
 		bdev->bd_holder = NULL;
 		bdev->bd_holder_ops = NULL;
 		mutex_unlock(&bdev->bd_holder_lock);
-		if (bdev->bd_write_holder)
+		if (bdev_test_flag(bdev, BD_WRITE_HOLDER))
 			unblock = true;
 	}
 	if (!whole->bd_holders)
@@ -640,7 +640,7 @@ static void bd_end_claim(struct block_device *bdev, void *holder)
 	 */
 	if (unblock) {
 		disk_unblock_events(bdev->bd_disk);
-		bdev->bd_write_holder = false;
+		bdev_clear_flag(bdev, BD_WRITE_HOLDER);
 	}
 }
 
@@ -892,9 +892,10 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
 		 * writeable reference is too fragile given the way @mode is
 		 * used in blkdev_get/put().
 		 */
-		if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
+		if ((mode & BLK_OPEN_WRITE) &&
+		    !bdev_test_flag(bdev, BD_WRITE_HOLDER) &&
 		    (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
-			bdev->bd_write_holder = true;
+			bdev_set_flag(bdev, BD_WRITE_HOLDER);
 			unblock_events = false;
 		}
 	}
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f70dd31cbcd1..e45a490d488e 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -48,7 +48,7 @@ struct block_device {
 	atomic_t		__bd_flags;	// partition number + flags
 #define BD_PARTNO		255	// lower 8 bits; assign-once
 #define BD_READ_ONLY		(1u<<8) // read-only policy
-	bool			bd_write_holder;
+#define BD_WRITE_HOLDER		(1u<<9)
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
 	struct inode		*bd_inode;	/* will die */
-- 
cgit v1.2.3


From ac2b6f9dee8f41d5e9162959a8bbd2c8047ee382 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 01:21:45 -0400
Subject: bdev: move ->bd_has_subit_bio to ->__bd_flags

In bdev_alloc() we have all flags initialized to false, so
assignment to ->bh_has_submit_bio n there is a no-op unless
we have partno != 0 and flag already set on entire device.

In device_add_disk() we have just allocated the block_device
in question and it had been a full-device one, so the flag
is guaranteed to be still clear when we get to assignment.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bdev.c              | 6 ++----
 block/blk-core.c          | 4 ++--
 block/genhd.c             | 3 ++-
 include/linux/blk_types.h | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 9df9a59f0900..fae30eae7741 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -414,10 +414,8 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	atomic_set(&bdev->__bd_flags, partno);
 	bdev->bd_inode = inode;
 	bdev->bd_queue = disk->queue;
-	if (partno)
-		bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
-	else
-		bdev->bd_has_submit_bio = false;
+	if (partno && bdev_test_flag(disk->part0, BD_HAS_SUBMIT_BIO))
+		bdev_set_flag(bdev, BD_HAS_SUBMIT_BIO);
 	bdev->bd_stats = alloc_percpu(struct disk_stats);
 	if (!bdev->bd_stats) {
 		iput(inode);
diff --git a/block/blk-core.c b/block/blk-core.c
index 20322abc6082..f61460b65408 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -615,7 +615,7 @@ static void __submit_bio(struct bio *bio)
 	if (unlikely(!blk_crypto_bio_prep(&bio)))
 		return;
 
-	if (!bio->bi_bdev->bd_has_submit_bio) {
+	if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
 		blk_mq_submit_bio(bio);
 	} else if (likely(bio_queue_enter(bio) == 0)) {
 		struct gendisk *disk = bio->bi_bdev->bd_disk;
@@ -723,7 +723,7 @@ void submit_bio_noacct_nocheck(struct bio *bio)
 	 */
 	if (current->bio_list)
 		bio_list_add(&current->bio_list[0], bio);
-	else if (!bio->bi_bdev->bd_has_submit_bio)
+	else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
 		__submit_bio_noacct_mq(bio);
 	else
 		__submit_bio_noacct(bio);
diff --git a/block/genhd.c b/block/genhd.c
index bb29a68e1d67..19cd1a31fa80 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -413,7 +413,8 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 	elevator_init_mq(disk->queue);
 
 	/* Mark bdev as having a submit_bio, if needed */
-	disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL;
+	if (disk->fops->submit_bio)
+		bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO);
 
 	/*
 	 * If the driver provides an explicit major number it also must provide
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e45a490d488e..11b9e8eeb79f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -49,7 +49,7 @@ struct block_device {
 #define BD_PARTNO		255	// lower 8 bits; assign-once
 #define BD_READ_ONLY		(1u<<8) // read-only policy
 #define BD_WRITE_HOLDER		(1u<<9)
-	bool			bd_has_submit_bio;
+#define BD_HAS_SUBMIT_BIO	(1u<<10)
 	dev_t			bd_dev;
 	struct inode		*bd_inode;	/* will die */
 
-- 
cgit v1.2.3


From 49a43dae93c8b0ee5c9797f7f407d1244dea8213 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 12 Apr 2024 01:24:27 -0400
Subject: bdev: move ->bd_ro_warned to ->__bd_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/blk-core.c          | 5 +++--
 include/linux/blk_types.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index f61460b65408..1be49be9fac4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -514,10 +514,11 @@ static inline void bio_check_ro(struct bio *bio)
 		if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
 			return;
 
-		if (bio->bi_bdev->bd_ro_warned)
+		if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED))
 			return;
 
-		bio->bi_bdev->bd_ro_warned = true;
+		bdev_set_flag(bio->bi_bdev, BD_RO_WARNED);
+
 		/*
 		 * Use ioctl to set underlying disk of raid/dm to read-only
 		 * will trigger this.
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 11b9e8eeb79f..4e0c8785090c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -50,6 +50,7 @@ struct block_device {
 #define BD_READ_ONLY		(1u<<8) // read-only policy
 #define BD_WRITE_HOLDER		(1u<<9)
 #define BD_HAS_SUBMIT_BIO	(1u<<10)
+#define BD_RO_WARNED		(1u<<11)
 	dev_t			bd_dev;
 	struct inode		*bd_inode;	/* will die */
 
@@ -69,7 +70,6 @@ struct block_device {
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	bool			bd_make_it_fail;
 #endif
-	bool			bd_ro_warned;
 	int			bd_writers;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
-- 
cgit v1.2.3


From 811ba89a8838e7c43ff46b6210ba1878bfe4437e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 28 Apr 2024 00:15:07 -0400
Subject: bdev: move ->bd_make_it_fail to ->__bd_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/blk-core.c          |  3 ++-
 block/genhd.c             | 12 ++++++++----
 include/linux/blk_types.h |  6 +++---
 3 files changed, 13 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1be49be9fac4..1076336dd620 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -494,7 +494,8 @@ __setup("fail_make_request=", setup_fail_make_request);
 
 bool should_fail_request(struct block_device *part, unsigned int bytes)
 {
-	return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
+	return bdev_test_flag(part, BD_MAKE_IT_FAIL) &&
+	       should_fail(&fail_make_request, bytes);
 }
 
 static int __init fail_make_request_debugfs(void)
diff --git a/block/genhd.c b/block/genhd.c
index 19cd1a31fa80..0cce461952f6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1066,7 +1066,8 @@ static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
 ssize_t part_fail_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
+	return sprintf(buf, "%d\n",
+		       bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL));
 }
 
 ssize_t part_fail_store(struct device *dev,
@@ -1075,9 +1076,12 @@ ssize_t part_fail_store(struct device *dev,
 {
 	int i;
 
-	if (count > 0 && sscanf(buf, "%d", &i) > 0)
-		dev_to_bdev(dev)->bd_make_it_fail = i;
-
+	if (count > 0 && sscanf(buf, "%d", &i) > 0) {
+		if (i)
+			bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
+		else
+			bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL);
+	}
 	return count;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4e0c8785090c..5bb7805927ac 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -51,6 +51,9 @@ struct block_device {
 #define BD_WRITE_HOLDER		(1u<<9)
 #define BD_HAS_SUBMIT_BIO	(1u<<10)
 #define BD_RO_WARNED		(1u<<11)
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+#define BD_MAKE_IT_FAIL		(1u<<12)
+#endif
 	dev_t			bd_dev;
 	struct inode		*bd_inode;	/* will die */
 
@@ -67,9 +70,6 @@ struct block_device {
 	struct mutex		bd_fsfreeze_mutex; /* serialize freeze/thaw */
 
 	struct partition_meta_info *bd_meta_info;
-#ifdef CONFIG_FAIL_MAKE_REQUEST
-	bool			bd_make_it_fail;
-#endif
 	int			bd_writers;
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
-- 
cgit v1.2.3


From b250c20b64290808aa4b5cc6d68819a7ee28237f Mon Sep 17 00:00:00 2001
From: David Lechner <dlechner@baylibre.com>
Date: Mon, 29 Apr 2024 18:40:09 -0500
Subject: regulator: devres: add API for reference voltage supplies

A common use case for regulators is to supply a reference voltage to an
analog input or output device. This adds a new devres API to get,
enable, and get the voltage in a single call. This allows eliminating
boilerplate code in drivers that use reference supplies in this way.

Signed-off-by: David Lechner <dlechner@baylibre.com>
Link: https://lore.kernel.org/r/20240429-regulator-get-enable-get-votlage-v2-1-b1f11ab766c1@baylibre.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 Documentation/driver-api/driver-model/devres.rst |  1 +
 drivers/regulator/devres.c                       | 59 ++++++++++++++++++++++++
 include/linux/regulator/consumer.h               |  7 +++
 3 files changed, 67 insertions(+)

(limited to 'include')

diff --git a/Documentation/driver-api/driver-model/devres.rst b/Documentation/driver-api/driver-model/devres.rst
index 7be8b8dd5f00..18caebad7376 100644
--- a/Documentation/driver-api/driver-model/devres.rst
+++ b/Documentation/driver-api/driver-model/devres.rst
@@ -433,6 +433,7 @@ REGULATOR
   devm_regulator_bulk_put()
   devm_regulator_get()
   devm_regulator_get_enable()
+  devm_regulator_get_enable_read_voltage()
   devm_regulator_get_enable_optional()
   devm_regulator_get_exclusive()
   devm_regulator_get_optional()
diff --git a/drivers/regulator/devres.c b/drivers/regulator/devres.c
index 90bb0d178885..4f290b9b559b 100644
--- a/drivers/regulator/devres.c
+++ b/drivers/regulator/devres.c
@@ -145,6 +145,65 @@ struct regulator *devm_regulator_get_optional(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_regulator_get_optional);
 
+/**
+ * devm_regulator_get_enable_read_voltage - Resource managed regulator get and
+ *                                          enable that returns the voltage
+ * @dev: device to supply
+ * @id:  supply name or regulator ID.
+ *
+ * Get and enable regulator for duration of the device life-time.
+ * regulator_disable() and regulator_put() are automatically called on driver
+ * detach. See regulator_get_optional(), regulator_enable(), and
+ * regulator_get_voltage() for more information.
+ *
+ * This is a convenience function for supplies that provide a reference voltage
+ * where the consumer driver just needs to know the voltage and keep the
+ * regulator enabled.
+ *
+ * In cases where the supply is not strictly required, callers can check for
+ * -ENODEV error and handle it accordingly.
+ *
+ * Returns: voltage in microvolts on success, or an error code on failure.
+ */
+int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id)
+{
+	struct regulator *r;
+	int ret;
+
+	/*
+	 * Since we need a real voltage, we use devm_regulator_get_optional()
+	 * rather than getting a dummy regulator with devm_regulator_get() and
+	 * then letting regulator_get_voltage() fail with -EINVAL. This way, the
+	 * caller can handle the -ENODEV error code if needed instead of the
+	 * ambiguous -EINVAL.
+	 */
+	r = devm_regulator_get_optional(dev, id);
+	if (IS_ERR(r))
+		return PTR_ERR(r);
+
+	ret = regulator_enable(r);
+	if (ret)
+		goto err_regulator_put;
+
+	ret = devm_add_action_or_reset(dev, regulator_action_disable, r);
+	if (ret)
+		goto err_regulator_put;
+
+	ret = regulator_get_voltage(r);
+	if (ret < 0)
+		goto err_release_action;
+
+	return 0;
+
+err_release_action:
+	devm_release_action(dev, regulator_action_disable, r);
+err_regulator_put:
+	devm_regulator_put(r);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devm_regulator_get_enable_read_voltage);
+
 static int devm_regulator_match(struct device *dev, void *res, void *data)
 {
 	struct regulator **r = res;
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 4660582a3302..6f41a11c92e7 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -164,6 +164,7 @@ struct regulator *__must_check devm_regulator_get_optional(struct device *dev,
 							   const char *id);
 int devm_regulator_get_enable(struct device *dev, const char *id);
 int devm_regulator_get_enable_optional(struct device *dev, const char *id);
+int devm_regulator_get_enable_read_voltage(struct device *dev, const char *id);
 void regulator_put(struct regulator *regulator);
 void devm_regulator_put(struct regulator *regulator);
 
@@ -329,6 +330,12 @@ static inline int devm_regulator_get_enable_optional(struct device *dev,
 	return -ENODEV;
 }
 
+static inline int devm_regulator_get_enable_read_voltage(struct device *dev,
+							 const char *id)
+{
+	return -ENODEV;
+}
+
 static inline struct regulator *__must_check
 regulator_get_optional(struct device *dev, const char *id)
 {
-- 
cgit v1.2.3


From 2a45166938f145294b73445b0af997b3100f02b4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 17 Apr 2024 13:54:30 +0300
Subject: spi: pxa2xx: Move contents of linux/spi/pxa2xx_spi.h to a local one

There is no user of the linux/spi/pxa2xx_spi.h. Move its contents
to the drivers/spi/spi-pxa2xx.h.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Link: https://lore.kernel.org/r/20240417110334.2671228-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/spi/spi-pxa2xx-dma.c   |  1 -
 drivers/spi/spi-pxa2xx-pci.c   |  4 ++--
 drivers/spi/spi-pxa2xx.c       |  1 -
 drivers/spi/spi-pxa2xx.h       | 36 ++++++++++++++++++++++++++++++-
 include/linux/spi/pxa2xx_spi.h | 48 ------------------------------------------
 5 files changed, 37 insertions(+), 53 deletions(-)
 delete mode 100644 include/linux/spi/pxa2xx_spi.h

(limited to 'include')

diff --git a/drivers/spi/spi-pxa2xx-dma.c b/drivers/spi/spi-pxa2xx-dma.c
index be563f0dd03a..26416ced6505 100644
--- a/drivers/spi/spi-pxa2xx-dma.c
+++ b/drivers/spi/spi-pxa2xx-dma.c
@@ -12,7 +12,6 @@
 #include <linux/scatterlist.h>
 #include <linux/sizes.h>
 
-#include <linux/spi/pxa2xx_spi.h>
 #include <linux/spi/spi.h>
 
 #include "spi-pxa2xx.h"
diff --git a/drivers/spi/spi-pxa2xx-pci.c b/drivers/spi/spi-pxa2xx-pci.c
index 861b21c63504..e11a613bc340 100644
--- a/drivers/spi/spi-pxa2xx-pci.c
+++ b/drivers/spi/spi-pxa2xx-pci.c
@@ -10,11 +10,11 @@
 #include <linux/pci.h>
 #include <linux/platform_device.h>
 
-#include <linux/spi/pxa2xx_spi.h>
-
 #include <linux/dmaengine.h>
 #include <linux/platform_data/dma-dw.h>
 
+#include "spi-pxa2xx.h"
+
 #define PCI_DEVICE_ID_INTEL_QUARK_X1000		0x0935
 #define PCI_DEVICE_ID_INTEL_BYT			0x0f0e
 #define PCI_DEVICE_ID_INTEL_MRFLD		0x1194
diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index ab6fd55237cd..82633682b581 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -24,7 +24,6 @@
 #include <linux/property.h>
 #include <linux/slab.h>
 
-#include <linux/spi/pxa2xx_spi.h>
 #include <linux/spi/spi.h>
 
 #include "spi-pxa2xx.h"
diff --git a/drivers/spi/spi-pxa2xx.h b/drivers/spi/spi-pxa2xx.h
index 45cdbbc71c4b..08296729ea80 100644
--- a/drivers/spi/spi-pxa2xx.h
+++ b/drivers/spi/spi-pxa2xx.h
@@ -7,6 +7,7 @@
 #ifndef SPI_PXA2XX_H
 #define SPI_PXA2XX_H
 
+#include <linux/dmaengine.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/types.h>
@@ -15,7 +16,40 @@
 #include <linux/pxa2xx_ssp.h>
 
 struct gpio_desc;
-struct pxa2xx_spi_controller;
+
+/*
+ * The platform data for SSP controller devices
+ * (resides in device.platform_data).
+ */
+struct pxa2xx_spi_controller {
+	u8 num_chipselect;
+	u8 enable_dma;
+	u8 dma_burst_size;
+	bool is_target;
+
+	/* DMA engine specific config */
+	dma_filter_fn dma_filter;
+	void *tx_param;
+	void *rx_param;
+
+	/* For non-PXA arches */
+	struct ssp_device ssp;
+};
+
+/*
+ * The controller specific data for SPI target devices
+ * (resides in spi_board_info.controller_data),
+ * copied to spi_device.platform_data ... mostly for
+ * DMA tuning.
+ */
+struct pxa2xx_spi_chip {
+	u8 tx_threshold;
+	u8 tx_hi_threshold;
+	u8 rx_threshold;
+	u8 dma_burst_size;
+	u32 timeout;
+};
+
 struct spi_controller;
 struct spi_device;
 struct spi_transfer;
diff --git a/include/linux/spi/pxa2xx_spi.h b/include/linux/spi/pxa2xx_spi.h
deleted file mode 100644
index e5a4a045fb67..000000000000
--- a/include/linux/spi/pxa2xx_spi.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Copyright (C) 2005 Stephen Street / StreetFire Sound Labs
- */
-#ifndef __LINUX_SPI_PXA2XX_SPI_H
-#define __LINUX_SPI_PXA2XX_SPI_H
-
-#include <linux/dmaengine.h>
-#include <linux/types.h>
-
-#include <linux/pxa2xx_ssp.h>
-
-struct dma_chan;
-
-/*
- * The platform data for SSP controller devices
- * (resides in device.platform_data).
- */
-struct pxa2xx_spi_controller {
-	u8 num_chipselect;
-	u8 enable_dma;
-	u8 dma_burst_size;
-	bool is_target;
-
-	/* DMA engine specific config */
-	dma_filter_fn dma_filter;
-	void *tx_param;
-	void *rx_param;
-
-	/* For non-PXA arches */
-	struct ssp_device ssp;
-};
-
-/*
- * The controller specific data for SPI target devices
- * (resides in spi_board_info.controller_data),
- * copied to spi_device.platform_data ... mostly for
- * DMA tuning.
- */
-struct pxa2xx_spi_chip {
-	u8 tx_threshold;
-	u8 tx_hi_threshold;
-	u8 rx_threshold;
-	u8 dma_burst_size;
-	u32 timeout;
-};
-
-#endif	/* __LINUX_SPI_PXA2XX_SPI_H */
-- 
cgit v1.2.3


From 6d0ca4a2a7e25f9ad07c1f335f20b4d9e048cdd5 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Date: Tue, 30 Apr 2024 09:49:11 +0100
Subject: nvmem: layouts: store owner from modules with
 nvmem_layout_driver_register()

Modules registering driver with nvmem_layout_driver_register() might
forget to set .owner field.  The field is used by some of other kernel
parts for reference counting (try_module_get()), so it is expected that
drivers will set it.

Solve the problem by moving this task away from the drivers to the core
code, just like we did for platform_driver in
commit 9447057eaff8 ("platform_device: use a macro instead of
platform_driver_register").

Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Reviewed-by: Michael Walle <mwalle@kernel.org>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20240430084921.33387-2-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/nvmem/layouts.c        | 6 ++++--
 include/linux/nvmem-provider.h | 5 ++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/nvmem/layouts.c b/drivers/nvmem/layouts.c
index 8b5e2de138eb..64dc7013a098 100644
--- a/drivers/nvmem/layouts.c
+++ b/drivers/nvmem/layouts.c
@@ -52,13 +52,15 @@ static const struct bus_type nvmem_layout_bus_type = {
 	.remove		= nvmem_layout_bus_remove,
 };
 
-int nvmem_layout_driver_register(struct nvmem_layout_driver *drv)
+int __nvmem_layout_driver_register(struct nvmem_layout_driver *drv,
+				   struct module *owner)
 {
 	drv->driver.bus = &nvmem_layout_bus_type;
+	drv->driver.owner = owner;
 
 	return driver_register(&drv->driver);
 }
-EXPORT_SYMBOL_GPL(nvmem_layout_driver_register);
+EXPORT_SYMBOL_GPL(__nvmem_layout_driver_register);
 
 void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv)
 {
diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h
index f0ba0e03218f..3ebeaa0ded00 100644
--- a/include/linux/nvmem-provider.h
+++ b/include/linux/nvmem-provider.h
@@ -199,7 +199,10 @@ int nvmem_add_one_cell(struct nvmem_device *nvmem,
 int nvmem_layout_register(struct nvmem_layout *layout);
 void nvmem_layout_unregister(struct nvmem_layout *layout);
 
-int nvmem_layout_driver_register(struct nvmem_layout_driver *drv);
+#define nvmem_layout_driver_register(drv) \
+	__nvmem_layout_driver_register(drv, THIS_MODULE)
+int __nvmem_layout_driver_register(struct nvmem_layout_driver *drv,
+				   struct module *owner);
 void nvmem_layout_driver_unregister(struct nvmem_layout_driver *drv);
 #define module_nvmem_layout_driver(__nvmem_layout_driver)		\
 	module_driver(__nvmem_layout_driver, nvmem_layout_driver_register, \
-- 
cgit v1.2.3


From 186ddac2072a8134798d72635d1ed0f29889369d Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Thu, 11 Apr 2024 15:53:43 +0100
Subject: block: move two helpers into bdev.c

disk_live() and block_size() access bd_inode directly, prepare to remove
the field bd_inode from block_device, and only access bd_inode in block
layer.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20240411145346.2516848-8-viro@zeniv.linux.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c           | 12 ++++++++++++
 include/linux/blkdev.h | 12 ++----------
 2 files changed, 14 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index a89bce368b64..536233ac3e99 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1257,6 +1257,18 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
 	blkdev_put_no_open(bdev);
 }
 
+bool disk_live(struct gendisk *disk)
+{
+	return !inode_unhashed(disk->part0->bd_inode);
+}
+EXPORT_SYMBOL_GPL(disk_live);
+
+unsigned int block_size(struct block_device *bdev)
+{
+	return 1 << bdev->bd_inode->i_blkbits;
+}
+EXPORT_SYMBOL_GPL(block_size);
+
 static int __init setup_bdev_allow_write_mounted(char *str)
 {
 	if (kstrtobool(str, &bdev_allow_write_mounted))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 20c749b2ebc2..99ac98ed9548 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -211,11 +211,6 @@ struct gendisk {
 	struct blk_independent_access_ranges *ia_ranges;
 };
 
-static inline bool disk_live(struct gendisk *disk)
-{
-	return !inode_unhashed(disk->part0->bd_inode);
-}
-
 /**
  * disk_openers - returns how many openers are there for a disk
  * @disk: disk to check
@@ -1364,11 +1359,6 @@ static inline unsigned int blksize_bits(unsigned int size)
 	return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT;
 }
 
-static inline unsigned int block_size(struct block_device *bdev)
-{
-	return 1 << bdev->bd_inode->i_blkbits;
-}
-
 int kblockd_schedule_work(struct work_struct *work);
 int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
 
@@ -1536,6 +1526,8 @@ void blkdev_put_no_open(struct block_device *bdev);
 
 struct block_device *I_BDEV(struct inode *inode);
 struct block_device *file_bdev(struct file *bdev_file);
+bool disk_live(struct gendisk *disk);
+unsigned int block_size(struct block_device *bdev);
 
 #ifdef CONFIG_BLOCK
 void invalidate_bdev(struct block_device *bdev);
-- 
cgit v1.2.3


From e33aef2c58577f51ec22736843a652576ce0ef7a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 11 Apr 2024 15:53:36 +0100
Subject: block_device: add a pointer to struct address_space (page cache of
 bdev)

points to ->i_data of coallocated inode.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20240411145346.2516848-1-viro@zeniv.linux.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c              | 1 +
 include/linux/blk_types.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 28e6f0423857..8e19101cbbb0 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -419,6 +419,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	mutex_init(&bdev->bd_holder_lock);
 	bdev->bd_partno = partno;
 	bdev->bd_inode = inode;
+	bdev->bd_mapping = &inode->i_data;
 	bdev->bd_queue = disk->queue;
 	if (partno)
 		bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cb1526ec44b5..6438c75cbb35 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -51,6 +51,7 @@ struct block_device {
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
 	struct inode		*bd_inode;	/* will die */
+	struct address_space	*bd_mapping;	/* page cache */
 
 	atomic_t		bd_openers;
 	spinlock_t		bd_size_lock; /* for bd_inode->i_size updates */
-- 
cgit v1.2.3


From 224941e8379a0de8652ffec768cc8394f0b1cb95 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 11 Apr 2024 15:53:37 +0100
Subject: use ->bd_mapping instead of ->bd_inode->i_mapping

Just the low-hanging fruit...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20240411145346.2516848-2-viro@zeniv.linux.org.uk
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 block/bdev.c                | 18 +++++++++---------
 block/blk-zoned.c           |  4 ++--
 block/genhd.c               |  2 +-
 block/ioctl.c               |  4 ++--
 block/partitions/core.c     |  2 +-
 drivers/md/bcache/super.c   |  2 +-
 drivers/scsi/scsicam.c      |  2 +-
 fs/btrfs/disk-io.c          |  6 +++---
 fs/btrfs/volumes.c          |  2 +-
 fs/btrfs/zoned.c            |  2 +-
 fs/buffer.c                 |  2 +-
 fs/cramfs/inode.c           |  2 +-
 fs/erofs/data.c             |  2 +-
 fs/ext4/dir.c               |  2 +-
 fs/ext4/ext4_jbd2.c         |  2 +-
 fs/ext4/super.c             |  6 +++---
 fs/jbd2/journal.c           |  2 +-
 include/linux/buffer_head.h |  4 ++--
 include/linux/jbd2.h        |  4 ++--
 19 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index 8e19101cbbb0..00017af92a51 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -76,7 +76,7 @@ static void bdev_write_inode(struct block_device *bdev)
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 static void kill_bdev(struct block_device *bdev)
 {
-	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	struct address_space *mapping = bdev->bd_mapping;
 
 	if (mapping_empty(mapping))
 		return;
@@ -88,7 +88,7 @@ static void kill_bdev(struct block_device *bdev)
 /* Invalidate clean unused buffers and pagecache. */
 void invalidate_bdev(struct block_device *bdev)
 {
-	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	struct address_space *mapping = bdev->bd_mapping;
 
 	if (mapping->nrpages) {
 		invalidate_bh_lrus();
@@ -116,7 +116,7 @@ int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
 			goto invalidate;
 	}
 
-	truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+	truncate_inode_pages_range(bdev->bd_mapping, lstart, lend);
 	if (!(mode & BLK_OPEN_EXCL))
 		bd_abort_claiming(bdev, truncate_bdev_range);
 	return 0;
@@ -126,7 +126,7 @@ invalidate:
 	 * Someone else has handle exclusively open. Try invalidating instead.
 	 * The 'end' argument is inclusive so the rounding is safe.
 	 */
-	return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+	return invalidate_inode_pages2_range(bdev->bd_mapping,
 					     lstart >> PAGE_SHIFT,
 					     lend >> PAGE_SHIFT);
 }
@@ -198,7 +198,7 @@ int sync_blockdev_nowait(struct block_device *bdev)
 {
 	if (!bdev)
 		return 0;
-	return filemap_flush(bdev->bd_inode->i_mapping);
+	return filemap_flush(bdev->bd_mapping);
 }
 EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
 
@@ -210,13 +210,13 @@ int sync_blockdev(struct block_device *bdev)
 {
 	if (!bdev)
 		return 0;
-	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+	return filemap_write_and_wait(bdev->bd_mapping);
 }
 EXPORT_SYMBOL(sync_blockdev);
 
 int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
 {
-	return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
+	return filemap_write_and_wait_range(bdev->bd_mapping,
 			lstart, lend);
 }
 EXPORT_SYMBOL(sync_blockdev_range);
@@ -445,7 +445,7 @@ void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
 void bdev_add(struct block_device *bdev, dev_t dev)
 {
 	if (bdev_stable_writes(bdev))
-		mapping_set_stable_writes(bdev->bd_inode->i_mapping);
+		mapping_set_stable_writes(bdev->bd_mapping);
 	bdev->bd_dev = dev;
 	bdev->bd_inode->i_rdev = dev;
 	bdev->bd_inode->i_ino = dev;
@@ -925,7 +925,7 @@ int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
 		bdev_file->f_mode |= FMODE_NOWAIT;
 	if (mode & BLK_OPEN_RESTRICT_WRITES)
 		bdev_file->f_mode |= FMODE_WRITE_RESTRICTED;
-	bdev_file->f_mapping = bdev->bd_inode->i_mapping;
+	bdev_file->f_mapping = bdev->bd_mapping;
 	bdev_file->f_wb_err = filemap_sample_wb_err(bdev_file->f_mapping);
 	bdev_file->private_data = holder;
 
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index da0f4b2a8fa0..b008bcd4889c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -398,7 +398,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 		op = REQ_OP_ZONE_RESET;
 
 		/* Invalidate the page cache, including dirty pages. */
-		filemap_invalidate_lock(bdev->bd_inode->i_mapping);
+		filemap_invalidate_lock(bdev->bd_mapping);
 		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
 		if (ret)
 			goto fail;
@@ -420,7 +420,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 
 fail:
 	if (cmd == BLKRESETZONE)
-		filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
+		filemap_invalidate_unlock(bdev->bd_mapping);
 
 	return ret;
 }
diff --git a/block/genhd.c b/block/genhd.c
index 93f5118b7d41..2bf05bd0f071 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -745,7 +745,7 @@ void invalidate_disk(struct gendisk *disk)
 	struct block_device *bdev = disk->part0;
 
 	invalidate_bdev(bdev);
-	bdev->bd_inode->i_mapping->wb_err = 0;
+	bdev->bd_mapping->wb_err = 0;
 	set_capacity(disk, 0);
 }
 EXPORT_SYMBOL(invalidate_disk);
diff --git a/block/ioctl.c b/block/ioctl.c
index 1c800364bc70..7c13d8bed453 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -152,12 +152,12 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
 	if (start + len > bdev_nr_bytes(bdev))
 		return -EINVAL;
 
-	filemap_invalidate_lock(bdev->bd_inode->i_mapping);
+	filemap_invalidate_lock(bdev->bd_mapping);
 	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
 	if (!err)
 		err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
 						GFP_KERNEL);
-	filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
+	filemap_invalidate_unlock(bdev->bd_mapping);
 	return err;
 }
 
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 2b75e325c63b..63ee317dfff0 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -704,7 +704,7 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed);
 
 void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
 {
-	struct address_space *mapping = state->disk->part0->bd_inode->i_mapping;
+	struct address_space *mapping = state->disk->part0->bd_mapping;
 	struct folio *folio;
 
 	if (n >= get_capacity(state->disk)) {
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 0ee5e17ae2dd..277ad958cf53 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -171,7 +171,7 @@ static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
 	struct page *page;
 	unsigned int i;
 
-	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+	page = read_cache_page_gfp(bdev->bd_mapping,
 				   SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
 	if (IS_ERR(page))
 		return "IO error";
diff --git a/drivers/scsi/scsicam.c b/drivers/scsi/scsicam.c
index e2c7d8ef205f..dd69342bbe78 100644
--- a/drivers/scsi/scsicam.c
+++ b/drivers/scsi/scsicam.c
@@ -32,7 +32,7 @@
  */
 unsigned char *scsi_bios_ptable(struct block_device *dev)
 {
-	struct address_space *mapping = bdev_whole(dev)->bd_inode->i_mapping;
+	struct address_space *mapping = bdev_whole(dev)->bd_mapping;
 	unsigned char *res = NULL;
 	struct folio *folio;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3df5477d48a8..f10e894b0bf5 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3651,7 +3651,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
 	struct btrfs_super_block *super;
 	struct page *page;
 	u64 bytenr, bytenr_orig;
-	struct address_space *mapping = bdev->bd_inode->i_mapping;
+	struct address_space *mapping = bdev->bd_mapping;
 	int ret;
 
 	bytenr_orig = btrfs_sb_offset(copy_num);
@@ -3738,7 +3738,7 @@ static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb, int max_mirrors)
 {
 	struct btrfs_fs_info *fs_info = device->fs_info;
-	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
+	struct address_space *mapping = device->bdev->bd_mapping;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	int i;
 	int errors = 0;
@@ -3855,7 +3855,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
 		    device->commit_total_bytes)
 			break;
 
-		page = find_get_page(device->bdev->bd_inode->i_mapping,
+		page = find_get_page(device->bdev->bd_mapping,
 				     bytenr >> PAGE_SHIFT);
 		if (!page) {
 			errors++;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 65c03ddecc59..33d357e5f9c5 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1290,7 +1290,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev
 		return ERR_PTR(-EINVAL);
 
 	/* pull in the page with our super */
-	page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
+	page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
 
 	if (IS_ERR(page))
 		return ERR_CAST(page);
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 4cba80b34387..9b43fa493219 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -118,7 +118,7 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
 		return -ENOENT;
 	} else if (full[0] && full[1]) {
 		/* Compare two super blocks */
-		struct address_space *mapping = bdev->bd_inode->i_mapping;
+		struct address_space *mapping = bdev->bd_mapping;
 		struct page *page[BTRFS_NR_SB_LOG_ZONES];
 		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
 		int i;
diff --git a/fs/buffer.c b/fs/buffer.c
index 4f73d23c2c46..d5a0932ae68d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1463,7 +1463,7 @@ __bread_gfp(struct block_device *bdev, sector_t block,
 {
 	struct buffer_head *bh;
 
-	gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
 
 	/*
 	 * Prefer looping in the allocator rather than here, at least that
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 9901057a15ba..460690ca0174 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -183,7 +183,7 @@ static int next_buffer;
 static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset,
 				unsigned int len)
 {
-	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+	struct address_space *mapping = sb->s_bdev->bd_mapping;
 	struct file_ra_state ra = {};
 	struct page *pages[BLKS_PER_BUF];
 	unsigned i, blocknr, buffer;
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index e1a170e45c70..5fc03c1e2757 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -68,7 +68,7 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
 	if (erofs_is_fscache_mode(sb))
 		buf->mapping = EROFS_SB(sb)->s_fscache->inode->i_mapping;
 	else
-		buf->mapping = sb->s_bdev->bd_inode->i_mapping;
+		buf->mapping = sb->s_bdev->bd_mapping;
 }
 
 void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3985f8c33f95..ff4514e4626b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -192,7 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 					(PAGE_SHIFT - inode->i_blkbits);
 			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
-					sb->s_bdev->bd_inode->i_mapping,
+					sb->s_bdev->bd_mapping,
 					&file->f_ra, file,
 					index, 1);
 			file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT;
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 5d8055161acd..da4a82456383 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -206,7 +206,7 @@ static void ext4_journal_abort_handle(const char *caller, unsigned int line,
 
 static void ext4_check_bdev_write_error(struct super_block *sb)
 {
-	struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+	struct address_space *mapping = sb->s_bdev->bd_mapping;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	int err;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b255f798f449..02042eb91806 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -244,7 +244,7 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb,
 struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
 				   blk_opf_t op_flags)
 {
-	gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+	gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
 			~__GFP_FS) | __GFP_MOVABLE;
 
 	return __ext4_sb_bread_gfp(sb, block, op_flags, gfp);
@@ -253,7 +253,7 @@ struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block,
 struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
 					    sector_t block)
 {
-	gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping,
+	gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping,
 			~__GFP_FS);
 
 	return __ext4_sb_bread_gfp(sb, block, 0, gfp);
@@ -5556,7 +5556,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 * used to detect the metadata async write error.
 	 */
 	spin_lock_init(&sbi->s_bdev_wb_lock);
-	errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err,
+	errseq_check_and_advance(&sb->s_bdev->bd_mapping->wb_err,
 				 &sbi->s_bdev_wb_err);
 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
 	ext4_orphan_cleanup(sb, es);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b6c114c11b97..03c4b9214f56 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2009,7 +2009,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
 		byte_count = (block_stop - block_start + 1) *
 				journal->j_blocksize;
 
-		truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
+		truncate_inode_pages_range(journal->j_dev->bd_mapping,
 				byte_start, byte_stop);
 
 		if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index d78454a4dd1f..e58a0d63409a 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -338,7 +338,7 @@ static inline struct buffer_head *getblk_unmovable(struct block_device *bdev,
 {
 	gfp_t gfp;
 
-	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
 	gfp |= __GFP_NOFAIL;
 
 	return bdev_getblk(bdev, block, size, gfp);
@@ -349,7 +349,7 @@ static inline struct buffer_head *__getblk(struct block_device *bdev,
 {
 	gfp_t gfp;
 
-	gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS);
+	gfp = mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
 	gfp |= __GFP_MOVABLE | __GFP_NOFAIL;
 
 	return bdev_getblk(bdev, block, size, gfp);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 971f3e826e15..ac31c37816f7 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1696,7 +1696,7 @@ static inline void jbd2_journal_abort_handle(handle_t *handle)
 
 static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
 {
-	struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;
+	struct address_space *mapping = journal->j_fs_dev->bd_mapping;
 
 	/*
 	 * Save the original wb_err value of client fs's bdev mapping which
@@ -1707,7 +1707,7 @@ static inline void jbd2_init_fs_dev_write_error(journal_t *journal)
 
 static inline int jbd2_check_fs_dev_write_error(journal_t *journal)
 {
-	struct address_space *mapping = journal->j_fs_dev->bd_inode->i_mapping;
+	struct address_space *mapping = journal->j_fs_dev->bd_mapping;
 
 	return errseq_check(&mapping->wb_err,
 			    READ_ONCE(journal->j_fs_dev_wb_err));
-- 
cgit v1.2.3


From 203c1ce0bb063d1620698e39637b64f2d09c1368 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 28 Apr 2024 20:47:14 -0400
Subject: RIP ->bd_inode

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 block/bdev.c              | 1 -
 include/linux/blk_types.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'include')

diff --git a/block/bdev.c b/block/bdev.c
index a8c66cc1d6b8..0849a9cfa2b6 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -423,7 +423,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 	spin_lock_init(&bdev->bd_size_lock);
 	mutex_init(&bdev->bd_holder_lock);
 	bdev->bd_partno = partno;
-	bdev->bd_inode = inode;
 	bdev->bd_mapping = &inode->i_data;
 	bdev->bd_queue = disk->queue;
 	if (partno)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6438c75cbb35..5616d059cb23 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -50,7 +50,6 @@ struct block_device {
 	bool			bd_write_holder;
 	bool			bd_has_submit_bio;
 	dev_t			bd_dev;
-	struct inode		*bd_inode;	/* will die */
 	struct address_space	*bd_mapping;	/* page cache */
 
 	atomic_t		bd_openers;
-- 
cgit v1.2.3


From e9006f81faf8e438ea83626db578610e49f31576 Mon Sep 17 00:00:00 2001
From: Alex Bee <knaerzche@gmail.com>
Date: Tue, 16 Apr 2024 18:12:34 +0200
Subject: mfd: rk8xx: Add RK816 support

This integrates RK816 support in the this existing rk8xx mfd driver.

This version has unaligned interrupt registers, which requires to define a
separate get_irq_reg callback for the regmap. Apart from that the
integration is straightforward and the existing structures can be used as
is. The initialization sequence has been taken from vendor kernel.

Signed-off-by: Alex Bee <knaerzche@gmail.com>
Link: https://lore.kernel.org/r/20240416161237.2500037-3-knaerzche@gmail.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/Kconfig       |   4 +-
 drivers/mfd/rk8xx-core.c  | 104 +++++++++++++++++++++++++++++++++
 drivers/mfd/rk8xx-i2c.c   |  45 ++++++++++++++-
 include/linux/mfd/rk808.h | 144 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 294 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 4b023ee229cf..2e7286cc98e4 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -1225,7 +1225,7 @@ config MFD_RK8XX
 	select MFD_CORE
 
 config MFD_RK8XX_I2C
-	tristate "Rockchip RK805/RK808/RK809/RK817/RK818 Power Management Chip"
+	tristate "Rockchip RK805/RK808/RK809/RK816/RK817/RK818 Power Management Chip"
 	depends on I2C && OF
 	select MFD_CORE
 	select REGMAP_I2C
@@ -1233,7 +1233,7 @@ config MFD_RK8XX_I2C
 	select MFD_RK8XX
 	help
 	  If you say yes here you get support for the RK805, RK808, RK809,
-	  RK817 and RK818 Power Management chips.
+	  RK816, RK817 and RK818 Power Management chips.
 	  This driver provides common support for accessing the device
 	  through I2C interface. The device supports multiple sub-devices
 	  including interrupts, RTC, LDO & DCDC regulators, and onkey.
diff --git a/drivers/mfd/rk8xx-core.c b/drivers/mfd/rk8xx-core.c
index e2261b68b844..5eda3c0dbbdf 100644
--- a/drivers/mfd/rk8xx-core.c
+++ b/drivers/mfd/rk8xx-core.c
@@ -28,6 +28,10 @@ static const struct resource rtc_resources[] = {
 	DEFINE_RES_IRQ(RK808_IRQ_RTC_ALARM),
 };
 
+static const struct resource rk816_rtc_resources[] = {
+	DEFINE_RES_IRQ(RK816_IRQ_RTC_ALARM),
+};
+
 static const struct resource rk817_rtc_resources[] = {
 	DEFINE_RES_IRQ(RK817_IRQ_RTC_ALARM),
 };
@@ -87,6 +91,22 @@ static const struct mfd_cell rk808s[] = {
 	},
 };
 
+static const struct mfd_cell rk816s[] = {
+	{ .name = "rk805-pinctrl", },
+	{ .name = "rk808-clkout", },
+	{ .name = "rk808-regulator", },
+	{
+		.name = "rk805-pwrkey",
+		.num_resources = ARRAY_SIZE(rk805_key_resources),
+		.resources = rk805_key_resources,
+	},
+	{
+		.name = "rk808-rtc",
+		.num_resources = ARRAY_SIZE(rk816_rtc_resources),
+		.resources = rk816_rtc_resources,
+	},
+};
+
 static const struct mfd_cell rk817s[] = {
 	{ .name = "rk808-clkout", },
 	{ .name = "rk808-regulator", },
@@ -148,6 +168,17 @@ static const struct rk808_reg_data rk808_pre_init_reg[] = {
 						    VB_LO_SEL_3500MV },
 };
 
+static const struct rk808_reg_data rk816_pre_init_reg[] = {
+	{ RK818_BUCK1_CONFIG_REG, RK817_RAMP_RATE_MASK,
+				  RK817_RAMP_RATE_12_5MV_PER_US },
+	{ RK818_BUCK2_CONFIG_REG, RK817_RAMP_RATE_MASK,
+				  RK817_RAMP_RATE_12_5MV_PER_US },
+	{ RK818_BUCK4_CONFIG_REG, BUCK_ILMIN_MASK,  BUCK_ILMIN_250MA },
+	{ RK808_THERMAL_REG, TEMP_HOTDIE_MSK, TEMP105C},
+	{ RK808_VB_MON_REG, VBAT_LOW_VOL_MASK | VBAT_LOW_ACT_MASK,
+			    RK808_VBAT_LOW_3V0 | EN_VABT_LOW_SHUT_DOWN },
+};
+
 static const struct rk808_reg_data rk817_pre_init_reg[] = {
 	{RK817_RTC_CTRL_REG, RTC_STOP, RTC_STOP},
 	/* Codec specific registers */
@@ -350,6 +381,59 @@ static const struct regmap_irq rk808_irqs[] = {
 	},
 };
 
+static const unsigned int rk816_irq_status_offsets[] = {
+	RK816_IRQ_STS_OFFSET(RK816_INT_STS_REG1),
+	RK816_IRQ_STS_OFFSET(RK816_INT_STS_REG2),
+	RK816_IRQ_STS_OFFSET(RK816_INT_STS_REG3),
+};
+
+static const unsigned int rk816_irq_mask_offsets[] = {
+	RK816_IRQ_MSK_OFFSET(RK816_INT_STS_MSK_REG1),
+	RK816_IRQ_MSK_OFFSET(RK816_INT_STS_MSK_REG2),
+	RK816_IRQ_MSK_OFFSET(RK816_INT_STS_MSK_REG3),
+};
+
+static unsigned int rk816_get_irq_reg(struct regmap_irq_chip_data *data,
+				      unsigned int base, int index)
+{
+	unsigned int irq_reg = base;
+
+	switch (base) {
+	case RK816_INT_STS_REG1:
+		irq_reg += rk816_irq_status_offsets[index];
+		break;
+	case RK816_INT_STS_MSK_REG1:
+		irq_reg += rk816_irq_mask_offsets[index];
+		break;
+	}
+
+	return irq_reg;
+};
+
+static const struct regmap_irq rk816_irqs[] = {
+	/* INT_STS_REG1 IRQs */
+	REGMAP_IRQ_REG(RK816_IRQ_PWRON_FALL, 0, RK816_INT_STS_PWRON_FALL),
+	REGMAP_IRQ_REG(RK816_IRQ_PWRON_RISE, 0, RK816_INT_STS_PWRON_RISE),
+
+	/* INT_STS_REG2 IRQs  */
+	REGMAP_IRQ_REG(RK816_IRQ_VB_LOW, 1, RK816_INT_STS_VB_LOW),
+	REGMAP_IRQ_REG(RK816_IRQ_PWRON, 1, RK816_INT_STS_PWRON),
+	REGMAP_IRQ_REG(RK816_IRQ_PWRON_LP, 1, RK816_INT_STS_PWRON_LP),
+	REGMAP_IRQ_REG(RK816_IRQ_HOTDIE, 1, RK816_INT_STS_HOTDIE),
+	REGMAP_IRQ_REG(RK816_IRQ_RTC_ALARM, 1, RK816_INT_STS_RTC_ALARM),
+	REGMAP_IRQ_REG(RK816_IRQ_RTC_PERIOD, 1, RK816_INT_STS_RTC_PERIOD),
+	REGMAP_IRQ_REG(RK816_IRQ_USB_OV, 1, RK816_INT_STS_USB_OV),
+
+	/* INT_STS3 IRQs */
+	REGMAP_IRQ_REG(RK816_IRQ_PLUG_IN, 2, RK816_INT_STS_PLUG_IN),
+	REGMAP_IRQ_REG(RK816_IRQ_PLUG_OUT, 2, RK816_INT_STS_PLUG_OUT),
+	REGMAP_IRQ_REG(RK816_IRQ_CHG_OK, 2, RK816_INT_STS_CHG_OK),
+	REGMAP_IRQ_REG(RK816_IRQ_CHG_TE, 2, RK816_INT_STS_CHG_TE),
+	REGMAP_IRQ_REG(RK816_IRQ_CHG_TS, 2, RK816_INT_STS_CHG_TS),
+	REGMAP_IRQ_REG(RK816_IRQ_CHG_CVTLIM, 2, RK816_INT_STS_CHG_CVTLIM),
+	REGMAP_IRQ_REG(RK816_IRQ_DISCHG_ILIM, 2, RK816_INT_STS_DISCHG_ILIM),
+};
+
 static const struct regmap_irq rk818_irqs[] = {
 	/* INT_STS */
 	[RK818_IRQ_VOUT_LO] = {
@@ -482,6 +566,18 @@ static const struct regmap_irq_chip rk808_irq_chip = {
 	.init_ack_masked = true,
 };
 
+static const struct regmap_irq_chip rk816_irq_chip = {
+	.name = "rk816",
+	.irqs = rk816_irqs,
+	.num_irqs = ARRAY_SIZE(rk816_irqs),
+	.num_regs = 3,
+	.get_irq_reg = rk816_get_irq_reg,
+	.status_base = RK816_INT_STS_REG1,
+	.mask_base = RK816_INT_STS_MSK_REG1,
+	.ack_base = RK816_INT_STS_REG1,
+	.init_ack_masked = true,
+};
+
 static struct regmap_irq_chip rk817_irq_chip = {
 	.name = "rk817",
 	.irqs = rk817_irqs,
@@ -530,6 +626,7 @@ static int rk808_power_off(struct sys_off_data *data)
 		reg = RK817_SYS_CFG(3);
 		bit = DEV_OFF;
 		break;
+	case RK816_ID:
 	case RK818_ID:
 		reg = RK818_DEVCTRL_REG;
 		bit = DEV_OFF;
@@ -637,6 +734,13 @@ int rk8xx_probe(struct device *dev, int variant, unsigned int irq, struct regmap
 		cells = rk808s;
 		nr_cells = ARRAY_SIZE(rk808s);
 		break;
+	case RK816_ID:
+		rk808->regmap_irq_chip = &rk816_irq_chip;
+		pre_init_reg = rk816_pre_init_reg;
+		nr_pre_init_regs = ARRAY_SIZE(rk816_pre_init_reg);
+		cells = rk816s;
+		nr_cells = ARRAY_SIZE(rk816s);
+		break;
 	case RK818_ID:
 		rk808->regmap_irq_chip = &rk818_irq_chip;
 		pre_init_reg = rk818_pre_init_reg;
diff --git a/drivers/mfd/rk8xx-i2c.c b/drivers/mfd/rk8xx-i2c.c
index 75b5cf09d5a0..69a6b297d723 100644
--- a/drivers/mfd/rk8xx-i2c.c
+++ b/drivers/mfd/rk8xx-i2c.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Rockchip RK808/RK818 Core (I2C) driver
+ * Rockchip RK805/RK808/RK816/RK817/RK818 Core (I2C) driver
  *
  * Copyright (c) 2014, Fuzhou Rockchip Electronics Co., Ltd
  * Copyright (C) 2016 PHYTEC Messtechnik GmbH
@@ -49,6 +49,35 @@ static bool rk808_is_volatile_reg(struct device *dev, unsigned int reg)
 	return false;
 }
 
+static bool rk816_is_volatile_reg(struct device *dev, unsigned int reg)
+{
+	/*
+	 * Technically the ROUND_30s bit makes RTC_CTRL_REG volatile, but
+	 * we don't use that feature.  It's better to cache.
+	 */
+
+	switch (reg) {
+	case RK808_SECONDS_REG ... RK808_WEEKS_REG:
+	case RK808_RTC_STATUS_REG:
+	case RK808_VB_MON_REG:
+	case RK808_THERMAL_REG:
+	case RK816_DCDC_EN_REG1:
+	case RK816_DCDC_EN_REG2:
+	case RK816_INT_STS_REG1:
+	case RK816_INT_STS_REG2:
+	case RK816_INT_STS_REG3:
+	case RK808_DEVCTRL_REG:
+	case RK816_SUP_STS_REG:
+	case RK816_GGSTS_REG:
+	case RK816_ZERO_CUR_ADC_REGH:
+	case RK816_ZERO_CUR_ADC_REGL:
+	case RK816_GASCNT_REG(0) ... RK816_BAT_VOL_REGL:
+		return true;
+	}
+
+	return false;
+}
+
 static bool rk817_is_volatile_reg(struct device *dev, unsigned int reg)
 {
 	/*
@@ -100,6 +129,14 @@ static const struct regmap_config rk808_regmap_config = {
 	.volatile_reg = rk808_is_volatile_reg,
 };
 
+static const struct regmap_config rk816_regmap_config = {
+	.reg_bits = 8,
+	.val_bits = 8,
+	.max_register = RK816_DATA_REG(18),
+	.cache_type = REGCACHE_MAPLE,
+	.volatile_reg = rk816_is_volatile_reg,
+};
+
 static const struct regmap_config rk817_regmap_config = {
 	.reg_bits = 8,
 	.val_bits = 8,
@@ -123,6 +160,11 @@ static const struct rk8xx_i2c_platform_data rk809_data = {
 	.variant = RK809_ID,
 };
 
+static const struct rk8xx_i2c_platform_data rk816_data = {
+	.regmap_cfg = &rk816_regmap_config,
+	.variant = RK816_ID,
+};
+
 static const struct rk8xx_i2c_platform_data rk817_data = {
 	.regmap_cfg = &rk817_regmap_config,
 	.variant = RK817_ID,
@@ -161,6 +203,7 @@ static const struct of_device_id rk8xx_i2c_of_match[] = {
 	{ .compatible = "rockchip,rk805", .data = &rk805_data },
 	{ .compatible = "rockchip,rk808", .data = &rk808_data },
 	{ .compatible = "rockchip,rk809", .data = &rk809_data },
+	{ .compatible = "rockchip,rk816", .data = &rk816_data },
 	{ .compatible = "rockchip,rk817", .data = &rk817_data },
 	{ .compatible = "rockchip,rk818", .data = &rk818_data },
 	{ },
diff --git a/include/linux/mfd/rk808.h b/include/linux/mfd/rk808.h
index 78e167a92483..69cbea78b430 100644
--- a/include/linux/mfd/rk808.h
+++ b/include/linux/mfd/rk808.h
@@ -113,6 +113,148 @@ enum rk808_reg {
 #define RK808_INT_STS_MSK_REG2	0x4f
 #define RK808_IO_POL_REG	0x50
 
+/* RK816 */
+enum rk816_reg {
+	RK816_ID_DCDC1,
+	RK816_ID_DCDC2,
+	RK816_ID_DCDC3,
+	RK816_ID_DCDC4,
+	RK816_ID_LDO1,
+	RK816_ID_LDO2,
+	RK816_ID_LDO3,
+	RK816_ID_LDO4,
+	RK816_ID_LDO5,
+	RK816_ID_LDO6,
+	RK816_ID_BOOST,
+	RK816_ID_OTG_SW,
+};
+
+enum rk816_irqs {
+	/* INT_STS_REG1 */
+	RK816_IRQ_PWRON_FALL,
+	RK816_IRQ_PWRON_RISE,
+
+	/* INT_STS_REG2 */
+	RK816_IRQ_VB_LOW,
+	RK816_IRQ_PWRON,
+	RK816_IRQ_PWRON_LP,
+	RK816_IRQ_HOTDIE,
+	RK816_IRQ_RTC_ALARM,
+	RK816_IRQ_RTC_PERIOD,
+	RK816_IRQ_USB_OV,
+
+	/* INT_STS_REG3 */
+	RK816_IRQ_PLUG_IN,
+	RK816_IRQ_PLUG_OUT,
+	RK816_IRQ_CHG_OK,
+	RK816_IRQ_CHG_TE,
+	RK816_IRQ_CHG_TS,
+	RK816_IRQ_CHG_CVTLIM,
+	RK816_IRQ_DISCHG_ILIM,
+};
+
+/* power channel registers */
+#define RK816_DCDC_EN_REG1		0x23
+
+#define RK816_DCDC_EN_REG2		0x24
+#define	RK816_BOOST_EN			BIT(1)
+#define RK816_OTG_EN			BIT(2)
+#define	RK816_BOOST_EN_MSK		BIT(5)
+#define RK816_OTG_EN_MSK		BIT(6)
+#define RK816_BUCK_DVS_CONFIRM		BIT(7)
+
+#define RK816_LDO_EN_REG1		0x27
+
+#define RK816_LDO_EN_REG2		0x28
+
+/* interrupt registers and irq definitions */
+#define RK816_INT_STS_REG1		0x49
+#define RK816_INT_STS_MSK_REG1		0x4a
+#define RK816_INT_STS_PWRON_FALL	BIT(5)
+#define RK816_INT_STS_PWRON_RISE	BIT(6)
+
+#define RK816_INT_STS_REG2		0x4c
+#define RK816_INT_STS_MSK_REG2		0x4d
+#define RK816_INT_STS_VB_LOW		BIT(1)
+#define RK816_INT_STS_PWRON		BIT(2)
+#define RK816_INT_STS_PWRON_LP		BIT(3)
+#define RK816_INT_STS_HOTDIE		BIT(4)
+#define RK816_INT_STS_RTC_ALARM		BIT(5)
+#define RK816_INT_STS_RTC_PERIOD	BIT(6)
+#define RK816_INT_STS_USB_OV		BIT(7)
+
+#define RK816_INT_STS_REG3		0x4e
+#define RK816_INT_STS_MSK_REG3		0x4f
+#define RK816_INT_STS_PLUG_IN		BIT(0)
+#define RK816_INT_STS_PLUG_OUT		BIT(1)
+#define RK816_INT_STS_CHG_OK		BIT(2)
+#define RK816_INT_STS_CHG_TE		BIT(3)
+#define RK816_INT_STS_CHG_TS		BIT(4)
+#define RK816_INT_STS_CHG_CVTLIM	BIT(6)
+#define RK816_INT_STS_DISCHG_ILIM	BIT(7)
+
+#define RK816_IRQ_STS_OFFSET(x)		((x) - RK816_INT_STS_REG1)
+#define RK816_IRQ_MSK_OFFSET(x)		((x) - RK816_INT_STS_MSK_REG1)
+
+/* charger, boost and OTG registers */
+#define RK816_OTG_BUCK_LDO_CONFIG_REG	0x2a
+#define RK816_CHRG_CONFIG_REG		0x2b
+#define RK816_BOOST_ON_VESL_REG		0x54
+#define RK816_BOOST_SLP_VSEL_REG	0x55
+#define RK816_CHRG_BOOST_CONFIG_REG	0x9a
+#define RK816_SUP_STS_REG		0xa0
+#define RK816_USB_CTRL_REG		0xa1
+#define RK816_CHRG_CTRL(x)		(0xa3 + (x))
+#define RK816_BAT_CTRL_REG		0xa6
+#define RK816_BAT_HTS_TS_REG		0xa8
+#define RK816_BAT_LTS_TS_REG		0xa9
+
+/* adc and fuel gauge registers */
+#define RK816_TS_CTRL_REG		0xac
+#define RK816_ADC_CTRL_REG		0xad
+#define RK816_GGCON_REG			0xb0
+#define RK816_GGSTS_REG			0xb1
+#define RK816_ZERO_CUR_ADC_REGH		0xb2
+#define RK816_ZERO_CUR_ADC_REGL		0xb3
+#define RK816_GASCNT_CAL_REG(x)		(0xb7 - (x))
+#define RK816_GASCNT_REG(x)		(0xbb - (x))
+#define RK816_BAT_CUR_AVG_REGH		0xbc
+#define RK816_BAT_CUR_AVG_REGL		0xbd
+#define RK816_TS_ADC_REGH		0xbe
+#define RK816_TS_ADC_REGL		0xbf
+#define RK816_USB_ADC_REGH		0xc0
+#define RK816_USB_ADC_REGL		0xc1
+#define RK816_BAT_OCV_REGH		0xc2
+#define RK816_BAT_OCV_REGL		0xc3
+#define RK816_BAT_VOL_REGH		0xc4
+#define RK816_BAT_VOL_REGL		0xc5
+#define RK816_RELAX_ENTRY_THRES_REGH	0xc6
+#define RK816_RELAX_ENTRY_THRES_REGL	0xc7
+#define RK816_RELAX_EXIT_THRES_REGH	0xc8
+#define RK816_RELAX_EXIT_THRES_REGL	0xc9
+#define RK816_RELAX_VOL1_REGH		0xca
+#define RK816_RELAX_VOL1_REGL		0xcb
+#define RK816_RELAX_VOL2_REGH		0xcc
+#define RK816_RELAX_VOL2_REGL		0xcd
+#define RK816_RELAX_CUR1_REGH		0xce
+#define RK816_RELAX_CUR1_REGL		0xcf
+#define RK816_RELAX_CUR2_REGH		0xd0
+#define RK816_RELAX_CUR2_REGL		0xd1
+#define RK816_CAL_OFFSET_REGH		0xd2
+#define RK816_CAL_OFFSET_REGL		0xd3
+#define RK816_NON_ACT_TIMER_CNT_REG	0xd4
+#define RK816_VCALIB0_REGH		0xd5
+#define RK816_VCALIB0_REGL		0xd6
+#define RK816_VCALIB1_REGH		0xd7
+#define RK816_VCALIB1_REGL		0xd8
+#define RK816_FCC_GASCNT_REG(x)		(0xdc - (x))
+#define RK816_IOFFSET_REGH		0xdd
+#define RK816_IOFFSET_REGL		0xde
+#define RK816_SLEEP_CON_SAMP_CUR_REG	0xdf
+
+/* general purpose data registers 0xe0 ~ 0xf2 */
+#define RK816_DATA_REG(x)		(0xe0 + (x))
+
 /* RK818 */
 #define RK818_DCDC1			0
 #define RK818_LDO1			4
@@ -791,6 +933,7 @@ enum rk806_dvs_mode {
 #define VOUT_LO_INT	BIT(0)
 #define CLK32KOUT2_EN	BIT(0)
 
+#define TEMP105C			0x08
 #define TEMP115C			0x0c
 #define TEMP_HOTDIE_MSK			0x0c
 #define SLP_SD_MSK			(0x3 << 2)
@@ -1191,6 +1334,7 @@ enum {
 	RK806_ID = 0x8060,
 	RK808_ID = 0x0000,
 	RK809_ID = 0x8090,
+	RK816_ID = 0x8160,
 	RK817_ID = 0x8170,
 	RK818_ID = 0x8180,
 };
-- 
cgit v1.2.3


From 91d2b6ee137dc709c8012d4e02c28a308f6753a2 Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Mon, 22 Apr 2024 11:04:06 +0530
Subject: wifi: cfg80211: handle color change per link

Currently, during color change, no link id information is passed down.
In order to support color change during Multi Link Operation, it is
required to pass link id as well.
Additionally, update notification APIs to allow drivers/mac80211 to
pass the link ID.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240422053412.2024075-2-quic_adisi@quicinc.com
Link: https://msgid.link/20240422053412.2024075-3-quic_adisi@quicinc.com
[squash, actually only pass 0 from mac80211]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/net/cfg80211.h | 29 ++++++++++++++++++++---------
 net/mac80211/cfg.c     | 10 +++++-----
 net/wireless/nl80211.c | 10 ++++++++--
 net/wireless/trace.h   |  6 ++++--
 4 files changed, 37 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 0112d645a2b4..cbf1664dc569 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -1577,6 +1577,8 @@ struct cfg80211_csa_settings {
  * @beacon_next: beacon data to be used after the color change
  * @count: number of beacons until the color change
  * @color: the color used after the change
+ * @link_id: defines the link on which color change is expected during MLO.
+ *	0 in case of non-MLO.
  */
 struct cfg80211_color_change_settings {
 	struct cfg80211_beacon_data beacon_color_change;
@@ -1585,6 +1587,7 @@ struct cfg80211_color_change_settings {
 	struct cfg80211_beacon_data beacon_next;
 	u8 count;
 	u8 color;
+	u8 link_id;
 };
 
 /**
@@ -9425,70 +9428,78 @@ void cfg80211_bss_flush(struct wiphy *wiphy);
  * @cmd: the actual event we want to notify
  * @count: the number of TBTTs until the color change happens
  * @color_bitmap: representations of the colors that the local BSS is aware of
+ * @link_id: valid link_id in case of MLO or 0 for non-MLO.
  *
  * Return: 0 on success. Non-zero on error.
  */
 int cfg80211_bss_color_notify(struct net_device *dev,
 			      enum nl80211_commands cmd, u8 count,
-			      u64 color_bitmap);
+			      u64 color_bitmap, u8 link_id);
 
 /**
  * cfg80211_obss_color_collision_notify - notify about bss color collision
  * @dev: network device
  * @color_bitmap: representations of the colors that the local BSS is aware of
+ * @link_id: valid link_id in case of MLO or 0 for non-MLO.
  *
  * Return: 0 on success. Non-zero on error.
  */
 static inline int cfg80211_obss_color_collision_notify(struct net_device *dev,
-						       u64 color_bitmap)
+						       u64 color_bitmap,
+						       u8 link_id)
 {
 	return cfg80211_bss_color_notify(dev, NL80211_CMD_OBSS_COLOR_COLLISION,
-					 0, color_bitmap);
+					 0, color_bitmap, link_id);
 }
 
 /**
  * cfg80211_color_change_started_notify - notify color change start
  * @dev: the device on which the color is switched
  * @count: the number of TBTTs until the color change happens
+ * @link_id: valid link_id in case of MLO or 0 for non-MLO.
  *
  * Inform the userspace about the color change that has started.
  *
  * Return: 0 on success. Non-zero on error.
  */
 static inline int cfg80211_color_change_started_notify(struct net_device *dev,
-						       u8 count)
+						       u8 count, u8 link_id)
 {
 	return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_STARTED,
-					 count, 0);
+					 count, 0, link_id);
 }
 
 /**
  * cfg80211_color_change_aborted_notify - notify color change abort
  * @dev: the device on which the color is switched
+ * @link_id: valid link_id in case of MLO or 0 for non-MLO.
  *
  * Inform the userspace about the color change that has aborted.
  *
  * Return: 0 on success. Non-zero on error.
  */
-static inline int cfg80211_color_change_aborted_notify(struct net_device *dev)
+static inline int cfg80211_color_change_aborted_notify(struct net_device *dev,
+						       u8 link_id)
 {
 	return cfg80211_bss_color_notify(dev, NL80211_CMD_COLOR_CHANGE_ABORTED,
-					 0, 0);
+					 0, 0, link_id);
 }
 
 /**
  * cfg80211_color_change_notify - notify color change completion
  * @dev: the device on which the color was switched
+ * @link_id: valid link_id in case of MLO or 0 for non-MLO.
  *
  * Inform the userspace about the color change that has completed.
  *
  * Return: 0 on success. Non-zero on error.
  */
-static inline int cfg80211_color_change_notify(struct net_device *dev)
+static inline int cfg80211_color_change_notify(struct net_device *dev,
+					       u8 link_id)
 {
 	return cfg80211_bss_color_notify(dev,
 					 NL80211_CMD_COLOR_CHANGE_COMPLETED,
-					 0, 0);
+					 0, 0, link_id);
 }
 
 /**
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 44971a62d798..4131ad6c1de3 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3924,7 +3924,7 @@ static void ieee80211_color_change_abort(struct ieee80211_sub_if_data  *sdata)
 
 	ieee80211_free_next_beacon(&sdata->deflink);
 
-	cfg80211_color_change_aborted_notify(sdata->dev);
+	cfg80211_color_change_aborted_notify(sdata->dev, 0);
 }
 
 static int
@@ -4772,14 +4772,14 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
 
 	err = ieee80211_set_after_color_change_beacon(sdata, &changed);
 	if (err) {
-		cfg80211_color_change_aborted_notify(sdata->dev);
+		cfg80211_color_change_aborted_notify(sdata->dev, 0);
 		return err;
 	}
 
 	ieee80211_color_change_bss_config_notify(sdata,
 						 sdata->vif.bss_conf.color_change_color,
 						 1, changed);
-	cfg80211_color_change_notify(sdata->dev);
+	cfg80211_color_change_notify(sdata->dev, 0);
 
 	return 0;
 }
@@ -4812,7 +4812,7 @@ void ieee80211_color_collision_detection_work(struct work_struct *work)
 			     color_collision_detect_work);
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 
-	cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap);
+	cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, 0);
 }
 
 void ieee80211_color_change_finish(struct ieee80211_vif *vif)
@@ -4876,7 +4876,7 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
 	sdata->vif.bss_conf.color_change_active = true;
 	sdata->vif.bss_conf.color_change_color = params->color;
 
-	cfg80211_color_change_started_notify(sdata->dev, params->count);
+	cfg80211_color_change_started_notify(sdata->dev, params->count, 0);
 
 	if (changed)
 		ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f391b4055944..1886f78d9c3c 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -16032,6 +16032,7 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info)
 		params.counter_offset_presp = offset;
 	}
 
+	params.link_id = nl80211_link_id(info->attrs);
 	err = rdev_color_change(rdev, dev, &params);
 
 out:
@@ -17434,7 +17435,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
 		.doit = nl80211_color_change,
 		.flags = GENL_UNS_ADMIN_PERM,
-		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+		.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+					 NL80211_FLAG_MLO_VALID_LINK_ID),
 	},
 	{
 		.cmd = NL80211_CMD_SET_FILS_AAD,
@@ -19456,7 +19458,7 @@ EXPORT_SYMBOL(cfg80211_ch_switch_started_notify);
 
 int cfg80211_bss_color_notify(struct net_device *dev,
 			      enum nl80211_commands cmd, u8 count,
-			      u64 color_bitmap)
+			      u64 color_bitmap, u8 link_id)
 {
 	struct wireless_dev *wdev = dev->ieee80211_ptr;
 	struct wiphy *wiphy = wdev->wiphy;
@@ -19479,6 +19481,10 @@ int cfg80211_bss_color_notify(struct net_device *dev,
 	if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex))
 		goto nla_put_failure;
 
+	if (wdev->valid_links &&
+	    nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))
+		goto nla_put_failure;
+
 	if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED &&
 	    nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count))
 		goto nla_put_failure;
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index df013c98b80d..9bf987519811 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2842,6 +2842,7 @@ TRACE_EVENT(rdev_color_change,
 		__field(u8, count)
 		__field(u16, bcn_ofs)
 		__field(u16, pres_ofs)
+		__field(u8, link_id)
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
@@ -2849,11 +2850,12 @@ TRACE_EVENT(rdev_color_change,
 		__entry->count = params->count;
 		__entry->bcn_ofs = params->counter_offset_beacon;
 		__entry->pres_ofs = params->counter_offset_presp;
+		__entry->link_id = params->link_id;
 	),
 	TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT
-		  ", count: %u",
+		  ", count: %u, link_id: %d",
 		  WIPHY_PR_ARG, NETDEV_PR_ARG,
-		  __entry->count)
+		  __entry->count, __entry->link_id)
 );
 
 TRACE_EVENT(rdev_set_radar_background,
-- 
cgit v1.2.3


From 414e736c3db032733d20d0ada8200af4159d2940 Mon Sep 17 00:00:00 2001
From: Aditya Kumar Singh <quic_adisi@quicinc.com>
Date: Mon, 22 Apr 2024 11:04:08 +0530
Subject: wifi: mac80211: handle color change per link

In order to support color change with MLO, handle the link ID now
passed from cfg80211, adjust the code to do everything per link
and call the notifications to cfg80211 correctly.

Signed-off-by: Aditya Kumar Singh <quic_adisi@quicinc.com>
Link: https://msgid.link/20240422053412.2024075-4-quic_adisi@quicinc.com
Link: https://msgid.link/20240422053412.2024075-5-quic_adisi@quicinc.com
Link: https://msgid.link/20240422053412.2024075-6-quic_adisi@quicinc.com
Link: https://msgid.link/20240422053412.2024075-7-quic_adisi@quicinc.com
[squash, move API call updates to this patch]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/ath11k/mac.c           |   2 +-
 drivers/net/wireless/ath/ath11k/wmi.c           |   3 +-
 drivers/net/wireless/mediatek/mt76/mt7915/mcu.c |   2 +-
 drivers/net/wireless/mediatek/mt76/mt7996/mcu.c |   2 +-
 include/net/mac80211.h                          |   6 +-
 net/mac80211/cfg.c                              | 144 ++++++++++++++++--------
 net/mac80211/rx.c                               |   7 +-
 7 files changed, 111 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c
index 4fdd56cd30a1..7fa760ef674e 100644
--- a/drivers/net/wireless/ath/ath11k/mac.c
+++ b/drivers/net/wireless/ath/ath11k/mac.c
@@ -1659,7 +1659,7 @@ void ath11k_mac_bcn_tx_event(struct ath11k_vif *arvif)
 	if (vif->bss_conf.color_change_active &&
 	    ieee80211_beacon_cntdwn_is_complete(vif, 0)) {
 		arvif->bcca_zero_sent = true;
-		ieee80211_color_change_finish(vif);
+		ieee80211_color_change_finish(vif, 0);
 		return;
 	}
 
diff --git a/drivers/net/wireless/ath/ath11k/wmi.c b/drivers/net/wireless/ath/ath11k/wmi.c
index c74aa3f95658..3761dfce1f64 100644
--- a/drivers/net/wireless/ath/ath11k/wmi.c
+++ b/drivers/net/wireless/ath/ath11k/wmi.c
@@ -4064,7 +4064,8 @@ ath11k_wmi_obss_color_collision_event(struct ath11k_base *ab, struct sk_buff *sk
 
 	switch (ev->evt_type) {
 	case WMI_BSS_COLOR_COLLISION_DETECTION:
-		ieee80211_obss_color_collision_notify(arvif->vif, ev->obss_color_bitmap);
+		ieee80211_obss_color_collision_notify(arvif->vif, ev->obss_color_bitmap,
+						      0);
 		ath11k_dbg(ab, ATH11K_DBG_WMI,
 			   "OBSS color collision detected vdev:%d, event:%d, bitmap:%08llx\n",
 			   ev->vdev_id, ev->evt_type, ev->obss_color_bitmap);
diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
index d90f98c50039..6442e3e5d45b 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c
@@ -331,7 +331,7 @@ mt7915_mcu_cca_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!vif->bss_conf.color_change_active || vif->type == NL80211_IFTYPE_STATION)
 		return;
 
-	ieee80211_color_change_finish(vif);
+	ieee80211_color_change_finish(vif, 0);
 }
 
 static void
diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
index b44abe2acc81..163b822a477c 100644
--- a/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
+++ b/drivers/net/wireless/mediatek/mt76/mt7996/mcu.c
@@ -418,7 +418,7 @@ mt7996_mcu_cca_finish(void *priv, u8 *mac, struct ieee80211_vif *vif)
 	if (!vif->bss_conf.color_change_active || vif->type == NL80211_IFTYPE_STATION)
 		return;
 
-	ieee80211_color_change_finish(vif);
+	ieee80211_color_change_finish(vif, 0);
 }
 
 static void
diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 5be28b45049b..cafc664ee531 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -5615,12 +5615,13 @@ bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif,
 /**
  * ieee80211_color_change_finish - notify mac80211 about color change
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
+ * @link_id: valid link_id during MLO or 0 for non-MLO
  *
  * After a color change announcement was scheduled and the counter in this
  * announcement hits 1, this function must be called by the driver to
  * notify mac80211 that the color can be changed
  */
-void ieee80211_color_change_finish(struct ieee80211_vif *vif);
+void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id);
 
 /**
  * ieee80211_proberesp_get - retrieve a Probe Response template
@@ -7533,6 +7534,7 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
 /**
  * ieee80211_obss_color_collision_notify - notify userland about a BSS color
  * collision.
+ * @link_id: valid link_id during MLO or 0 for non-MLO
  *
  * @vif: &struct ieee80211_vif pointer from the add_interface callback.
  * @color_bitmap: a 64 bit bitmap representing the colors that the local BSS is
@@ -7540,7 +7542,7 @@ ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw,
  */
 void
 ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
-				      u64 color_bitmap);
+				      u64 color_bitmap, u8 link_id);
 
 /**
  * ieee80211_is_tx_data - check if frame is a data frame
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 4131ad6c1de3..b08e5d7687e3 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -3918,13 +3918,13 @@ static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data,
 	return 0;
 }
 
-static void ieee80211_color_change_abort(struct ieee80211_sub_if_data  *sdata)
+static void ieee80211_color_change_abort(struct ieee80211_link_data *link)
 {
-	sdata->vif.bss_conf.color_change_active = false;
+	link->conf->color_change_active = false;
 
-	ieee80211_free_next_beacon(&sdata->deflink);
+	ieee80211_free_next_beacon(link);
 
-	cfg80211_color_change_aborted_notify(sdata->dev, 0);
+	cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id);
 }
 
 static int
@@ -4008,7 +4008,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
 
 	/* if there is a color change in progress, abort it */
 	if (link_conf->color_change_active)
-		ieee80211_color_change_abort(sdata);
+		ieee80211_color_change_abort(link_data);
 
 	err = ieee80211_set_csa_beacon(link_data, params, &changed);
 	if (err) {
@@ -4666,20 +4666,22 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy,
 }
 
 static int
-ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata,
+ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link,
 					u64 *changed)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP: {
 		int ret;
 
-		if (!sdata->deflink.u.ap.next_beacon)
+		if (!link->u.ap.next_beacon)
 			return -EINVAL;
 
-		ret = ieee80211_assign_beacon(sdata, &sdata->deflink,
-					      sdata->deflink.u.ap.next_beacon,
+		ret = ieee80211_assign_beacon(sdata, link,
+					      link->u.ap.next_beacon,
 					      NULL, NULL, changed);
-		ieee80211_free_next_beacon(&sdata->deflink);
+		ieee80211_free_next_beacon(link);
 
 		if (ret < 0)
 			return ret;
@@ -4695,18 +4697,19 @@ ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata,
 }
 
 static int
-ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata,
+ieee80211_set_color_change_beacon(struct ieee80211_link_data *link,
 				  struct cfg80211_color_change_settings *params,
 				  u64 *changed)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_color_change_settings color_change = {};
 	int err;
 
 	switch (sdata->vif.type) {
 	case NL80211_IFTYPE_AP:
-		sdata->deflink.u.ap.next_beacon =
+		link->u.ap.next_beacon =
 			cfg80211_beacon_dup(&params->beacon_next);
-		if (!sdata->deflink.u.ap.next_beacon)
+		if (!link->u.ap.next_beacon)
 			return -ENOMEM;
 
 		if (params->count <= 1)
@@ -4718,11 +4721,11 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata,
 			params->counter_offset_presp;
 		color_change.count = params->count;
 
-		err = ieee80211_assign_beacon(sdata, &sdata->deflink,
+		err = ieee80211_assign_beacon(sdata, link,
 					      &params->beacon_color_change,
 					      NULL, &color_change, changed);
 		if (err < 0) {
-			ieee80211_free_next_beacon(&sdata->deflink);
+			ieee80211_free_next_beacon(link);
 			return err;
 		}
 		break;
@@ -4734,16 +4737,18 @@ ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata,
 }
 
 static void
-ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
+ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link,
 					 u8 color, int enable, u64 changed)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+
 	lockdep_assert_wiphy(sdata->local->hw.wiphy);
 
-	sdata->vif.bss_conf.he_bss_color.color = color;
-	sdata->vif.bss_conf.he_bss_color.enabled = enable;
+	link->conf->he_bss_color.color = color;
+	link->conf->he_bss_color.enabled = enable;
 	changed |= BSS_CHANGED_HE_BSS_COLOR;
 
-	ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed);
+	ieee80211_link_info_change_notify(sdata, link, changed);
 
 	if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) {
 		struct ieee80211_sub_if_data *child;
@@ -4760,26 +4765,27 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata,
 	}
 }
 
-static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
+static int ieee80211_color_change_finalize(struct ieee80211_link_data *link)
 {
+	struct ieee80211_sub_if_data *sdata = link->sdata;
 	struct ieee80211_local *local = sdata->local;
 	u64 changed = 0;
 	int err;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	sdata->vif.bss_conf.color_change_active = false;
+	link->conf->color_change_active = false;
 
-	err = ieee80211_set_after_color_change_beacon(sdata, &changed);
+	err = ieee80211_set_after_color_change_beacon(link, &changed);
 	if (err) {
-		cfg80211_color_change_aborted_notify(sdata->dev, 0);
+		cfg80211_color_change_aborted_notify(sdata->dev, link->link_id);
 		return err;
 	}
 
-	ieee80211_color_change_bss_config_notify(sdata,
-						 sdata->vif.bss_conf.color_change_color,
+	ieee80211_color_change_bss_config_notify(link,
+						 link->conf->color_change_color,
 						 1, changed);
-	cfg80211_color_change_notify(sdata->dev, 0);
+	cfg80211_color_change_notify(sdata->dev, link->link_id);
 
 	return 0;
 }
@@ -4787,21 +4793,23 @@ static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata)
 void ieee80211_color_change_finalize_work(struct wiphy *wiphy,
 					  struct wiphy_work *work)
 {
-	struct ieee80211_sub_if_data *sdata =
-		container_of(work, struct ieee80211_sub_if_data,
-			     deflink.color_change_finalize_work);
+	struct ieee80211_link_data *link =
+		container_of(work, struct ieee80211_link_data,
+			     color_change_finalize_work);
+	struct ieee80211_sub_if_data *sdata = link->sdata;
+	struct ieee80211_bss_conf *link_conf = link->conf;
 	struct ieee80211_local *local = sdata->local;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
 	/* AP might have been stopped while waiting for the lock. */
-	if (!sdata->vif.bss_conf.color_change_active)
+	if (!link_conf->color_change_active)
 		return;
 
 	if (!ieee80211_sdata_running(sdata))
 		return;
 
-	ieee80211_color_change_finalize(sdata);
+	ieee80211_color_change_finalize(link);
 }
 
 void ieee80211_color_collision_detection_work(struct work_struct *work)
@@ -4812,30 +4820,60 @@ void ieee80211_color_collision_detection_work(struct work_struct *work)
 			     color_collision_detect_work);
 	struct ieee80211_sub_if_data *sdata = link->sdata;
 
-	cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap, 0);
+	cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap,
+					     link->link_id);
 }
 
-void ieee80211_color_change_finish(struct ieee80211_vif *vif)
+void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+	struct ieee80211_link_data *link;
+
+	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+		return;
+
+	rcu_read_lock();
+
+	link = rcu_dereference(sdata->link[link_id]);
+	if (WARN_ON(!link)) {
+		rcu_read_unlock();
+		return;
+	}
 
 	wiphy_work_queue(sdata->local->hw.wiphy,
-			 &sdata->deflink.color_change_finalize_work);
+			 &link->color_change_finalize_work);
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ieee80211_color_change_finish);
 
 void
 ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
-				      u64 color_bitmap)
+				      u64 color_bitmap, u8 link_id)
 {
 	struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
-	struct ieee80211_link_data *link = &sdata->deflink;
+	struct ieee80211_link_data *link;
 
-	if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active)
+	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
 		return;
 
-	if (delayed_work_pending(&link->color_collision_detect_work))
+	rcu_read_lock();
+
+	link = rcu_dereference(sdata->link[link_id]);
+	if (WARN_ON(!link)) {
+		rcu_read_unlock();
 		return;
+	}
+
+	if (link->conf->color_change_active || link->conf->csa_active) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (delayed_work_pending(&link->color_collision_detect_work)) {
+		rcu_read_unlock();
+		return;
+	}
 
 	link->color_bitmap = color_bitmap;
 	/* queue the color collision detection event every 500 ms in order to
@@ -4844,6 +4882,8 @@ ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
 	ieee80211_queue_delayed_work(&sdata->local->hw,
 				     &link->color_collision_detect_work,
 				     msecs_to_jiffies(500));
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify);
 
@@ -4853,36 +4893,48 @@ ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
 {
 	struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
 	struct ieee80211_local *local = sdata->local;
+	struct ieee80211_bss_conf *link_conf;
+	struct ieee80211_link_data *link;
+	u8 link_id = params->link_id;
 	u64 changed = 0;
 	int err;
 
 	lockdep_assert_wiphy(local->hw.wiphy);
 
-	if (sdata->vif.bss_conf.nontransmitted)
+	if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+		return -EINVAL;
+
+	link = wiphy_dereference(wiphy, sdata->link[link_id]);
+	if (!link)
+		return -ENOLINK;
+
+	link_conf = link->conf;
+
+	if (link_conf->nontransmitted)
 		return -EINVAL;
 
 	/* don't allow another color change if one is already active or if csa
 	 * is active
 	 */
-	if (sdata->vif.bss_conf.color_change_active || sdata->vif.bss_conf.csa_active) {
+	if (link_conf->color_change_active || link_conf->csa_active) {
 		err = -EBUSY;
 		goto out;
 	}
 
-	err = ieee80211_set_color_change_beacon(sdata, params, &changed);
+	err = ieee80211_set_color_change_beacon(link, params, &changed);
 	if (err)
 		goto out;
 
-	sdata->vif.bss_conf.color_change_active = true;
-	sdata->vif.bss_conf.color_change_color = params->color;
+	link_conf->color_change_active = true;
+	link_conf->color_change_color = params->color;
 
-	cfg80211_color_change_started_notify(sdata->dev, params->count, 0);
+	cfg80211_color_change_started_notify(sdata->dev, params->count, link_id);
 
 	if (changed)
-		ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed);
+		ieee80211_color_change_bss_config_notify(link, 0, 0, changed);
 	else
 		/* if the beacon didn't change, we can finalize immediately */
-		ieee80211_color_change_finalize(sdata);
+		ieee80211_color_change_finalize(link);
 
 out:
 
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 8274dba76206..4914692750e5 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3368,7 +3368,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
 	if (ieee80211_hw_check(&rx->local->hw, DETECTS_COLOR_COLLISION))
 		return;
 
-	if (rx->sdata->vif.bss_conf.csa_active)
+	if (rx->link->conf->csa_active)
 		return;
 
 	baselen = mgmt->u.beacon.variable - rx->skb->data;
@@ -3380,7 +3380,7 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
 				    rx->skb->len - baselen);
 	if (ie && ie->datalen >= sizeof(struct ieee80211_he_operation) &&
 	    ie->datalen >= ieee80211_he_oper_size(ie->data + 1)) {
-		struct ieee80211_bss_conf *bss_conf = &rx->sdata->vif.bss_conf;
+		struct ieee80211_bss_conf *bss_conf = rx->link->conf;
 		const struct ieee80211_he_operation *he_oper;
 		u8 color;
 
@@ -3393,7 +3393,8 @@ ieee80211_rx_check_bss_color_collision(struct ieee80211_rx_data *rx)
 				      IEEE80211_HE_OPERATION_BSS_COLOR_MASK);
 		if (color == bss_conf->he_bss_color.color)
 			ieee80211_obss_color_collision_notify(&rx->sdata->vif,
-							      BIT_ULL(color));
+							      BIT_ULL(color),
+							      bss_conf->link_id);
 	}
 }
 
-- 
cgit v1.2.3


From 84ccfaee29fe46e305244a69c4471e83629ad5d1 Mon Sep 17 00:00:00 2001
From: Nirmala Devi Mal Nadar <m.nirmaladevi@ltts.com>
Date: Tue, 30 Apr 2024 13:16:34 +0000
Subject: mfd: tps6594: Add register definitions for TI TPS65224 PMIC

Extend TPS6594 PMIC register and field definitions to support TPS65224
power management IC.

TPS65224 is software compatible to TPS6594 and can re-use many of the
same definitions, new definitions are added to support additional
controls available on TPS65224.

Signed-off-by: Nirmala Devi Mal Nadar <m.nirmaladevi@ltts.com>
Signed-off-by: Bhargav Raviprakash <bhargav.r@ltts.com>
Link: https://lore.kernel.org/r/0109018f2f265d30-a87711fa-31d9-48db-b8cb-7109d0213e2e-000000@ap-south-1.amazonses.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/tps6594.h | 347 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 335 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/tps6594.h b/include/linux/mfd/tps6594.h
index 3f7c5e23cd4c..e754c01acc64 100644
--- a/include/linux/mfd/tps6594.h
+++ b/include/linux/mfd/tps6594.h
@@ -18,12 +18,13 @@ enum pmic_id {
 	TPS6594,
 	TPS6593,
 	LP8764,
+	TPS65224,
 };
 
 /* Macro to get page index from register address */
 #define TPS6594_REG_TO_PAGE(reg)	((reg) >> 8)
 
-/* Registers for page 0 of TPS6594 */
+/* Registers for page 0 */
 #define TPS6594_REG_DEV_REV				0x01
 
 #define TPS6594_REG_NVM_CODE_1				0x02
@@ -56,9 +57,6 @@ enum pmic_id {
 #define TPS6594_REG_GPIOX_OUT(gpio_inst)		(TPS6594_REG_GPIO_OUT_1 + (gpio_inst) / 8)
 #define TPS6594_REG_GPIOX_IN(gpio_inst)			(TPS6594_REG_GPIO_IN_1 + (gpio_inst) / 8)
 
-#define TPS6594_REG_GPIO_IN_1				0x3f
-#define TPS6594_REG_GPIO_IN_2				0x40
-
 #define TPS6594_REG_RAIL_SEL_1				0x41
 #define TPS6594_REG_RAIL_SEL_2				0x42
 #define TPS6594_REG_RAIL_SEL_3				0x43
@@ -70,13 +68,15 @@ enum pmic_id {
 #define TPS6594_REG_FSM_TRIG_MASK_3			0x48
 
 #define TPS6594_REG_MASK_BUCK1_2			0x49
+#define TPS65224_REG_MASK_BUCKS				0x49
 #define TPS6594_REG_MASK_BUCK3_4			0x4a
 #define TPS6594_REG_MASK_BUCK5				0x4b
 #define TPS6594_REG_MASK_LDO1_2				0x4c
+#define TPS65224_REG_MASK_LDOS				0x4c
 #define TPS6594_REG_MASK_LDO3_4				0x4d
 #define TPS6594_REG_MASK_VMON				0x4e
-#define TPS6594_REG_MASK_GPIO1_8_FALL			0x4f
-#define TPS6594_REG_MASK_GPIO1_8_RISE			0x50
+#define TPS6594_REG_MASK_GPIO_FALL			0x4f
+#define TPS6594_REG_MASK_GPIO_RISE			0x50
 #define TPS6594_REG_MASK_GPIO9_11			0x51
 #define TPS6594_REG_MASK_STARTUP			0x52
 #define TPS6594_REG_MASK_MISC				0x53
@@ -174,6 +174,10 @@ enum pmic_id {
 
 #define TPS6594_REG_REGISTER_LOCK			0xa1
 
+#define TPS65224_REG_SRAM_ACCESS_1			0xa2
+#define TPS65224_REG_SRAM_ACCESS_2			0xa3
+#define TPS65224_REG_SRAM_ADDR_CTRL			0xa4
+#define TPS65224_REG_RECOV_CNT_PFSM_INCR		0xa5
 #define TPS6594_REG_MANUFACTURING_VER			0xa6
 
 #define TPS6594_REG_CUSTOMER_NVM_ID_REG			0xa7
@@ -182,6 +186,9 @@ enum pmic_id {
 
 #define TPS6594_REG_SOFT_REBOOT_REG			0xab
 
+#define TPS65224_REG_ADC_CTRL				0xac
+#define TPS65224_REG_ADC_RESULT_REG_1			0xad
+#define TPS65224_REG_ADC_RESULT_REG_2			0xae
 #define TPS6594_REG_RTC_SECONDS				0xb5
 #define TPS6594_REG_RTC_MINUTES				0xb6
 #define TPS6594_REG_RTC_HOURS				0xb7
@@ -199,6 +206,7 @@ enum pmic_id {
 
 #define TPS6594_REG_RTC_CTRL_1				0xc2
 #define TPS6594_REG_RTC_CTRL_2				0xc3
+#define TPS65224_REG_STARTUP_CTRL			0xc3
 #define TPS6594_REG_RTC_STATUS				0xc4
 #define TPS6594_REG_RTC_INTERRUPTS			0xc5
 #define TPS6594_REG_RTC_COMP_LSB			0xc6
@@ -214,13 +222,17 @@ enum pmic_id {
 #define TPS6594_REG_PFSM_DELAY_REG_2			0xce
 #define TPS6594_REG_PFSM_DELAY_REG_3			0xcf
 #define TPS6594_REG_PFSM_DELAY_REG_4			0xd0
+#define TPS65224_REG_ADC_GAIN_COMP_REG			0xd0
+#define TPS65224_REG_CRC_CALC_CONTROL			0xef
+#define TPS65224_REG_REGMAP_USER_CRC_LOW		0xf0
+#define TPS65224_REG_REGMAP_USER_CRC_HIGH		0xf1
 
-/* Registers for page 1 of TPS6594 */
+/* Registers for page 1 */
 #define TPS6594_REG_SERIAL_IF_CONFIG			0x11a
 #define TPS6594_REG_I2C1_ID				0x122
 #define TPS6594_REG_I2C2_ID				0x123
 
-/* Registers for page 4 of TPS6594 */
+/* Registers for page 4 */
 #define TPS6594_REG_WD_ANSWER_REG			0x401
 #define TPS6594_REG_WD_QUESTION_ANSW_CNT		0x402
 #define TPS6594_REG_WD_WIN1_CFG				0x403
@@ -241,16 +253,26 @@ enum pmic_id {
 #define TPS6594_BIT_BUCK_PLDN				BIT(5)
 #define TPS6594_BIT_BUCK_RV_SEL				BIT(7)
 
-/* BUCKX_CONF register field definition */
+/* TPS6594 BUCKX_CONF register field definition */
 #define TPS6594_MASK_BUCK_SLEW_RATE			GENMASK(2, 0)
 #define TPS6594_MASK_BUCK_ILIM				GENMASK(5, 3)
 
-/* BUCKX_PG_WINDOW register field definition */
+/* TPS65224 BUCKX_CONF register field definition */
+#define TPS65224_MASK_BUCK_SLEW_RATE			GENMASK(1, 0)
+
+/* TPS6594 BUCKX_PG_WINDOW register field definition */
 #define TPS6594_MASK_BUCK_OV_THR			GENMASK(2, 0)
 #define TPS6594_MASK_BUCK_UV_THR			GENMASK(5, 3)
 
-/* BUCKX VSET */
-#define TPS6594_MASK_BUCKS_VSET GENMASK(7, 0)
+/* TPS65224 BUCKX_PG_WINDOW register field definition */
+#define TPS65224_MASK_BUCK_VMON_THR			GENMASK(1, 0)
+
+/* TPS6594 BUCKX_VOUT register field definition */
+#define TPS6594_MASK_BUCKS_VSET				GENMASK(7, 0)
+
+/* TPS65224 BUCKX_VOUT register field definition */
+#define TPS65224_MASK_BUCK1_VSET			GENMASK(7, 0)
+#define TPS65224_MASK_BUCKS_VSET			GENMASK(6, 0)
 
 /* LDOX_CTRL register field definition */
 #define TPS6594_BIT_LDO_EN				BIT(0)
@@ -258,6 +280,7 @@ enum pmic_id {
 #define TPS6594_BIT_LDO_VMON_EN				BIT(4)
 #define TPS6594_MASK_LDO_PLDN				GENMASK(6, 5)
 #define TPS6594_BIT_LDO_RV_SEL				BIT(7)
+#define TPS65224_BIT_LDO_DISCHARGE_EN			BIT(5)
 
 /* LDORTC_CTRL register field definition */
 #define TPS6594_BIT_LDORTC_DIS				BIT(0)
@@ -271,6 +294,9 @@ enum pmic_id {
 #define TPS6594_MASK_LDO_OV_THR				GENMASK(2, 0)
 #define TPS6594_MASK_LDO_UV_THR				GENMASK(5, 3)
 
+/* LDOX_PG_WINDOW register field definition */
+#define TPS65224_MASK_LDO_VMON_THR			GENMASK(1, 0)
+
 /* VCCA_VMON_CTRL register field definition */
 #define TPS6594_BIT_VMON_EN				BIT(0)
 #define TPS6594_BIT_VMON1_EN				BIT(1)
@@ -278,10 +304,12 @@ enum pmic_id {
 #define TPS6594_BIT_VMON2_EN				BIT(3)
 #define TPS6594_BIT_VMON2_RV_SEL			BIT(4)
 #define TPS6594_BIT_VMON_DEGLITCH_SEL			BIT(5)
+#define TPS65224_BIT_VMON_DEGLITCH_SEL			GENMASK(7, 5)
 
 /* VCCA_PG_WINDOW register field definition */
 #define TPS6594_MASK_VCCA_OV_THR			GENMASK(2, 0)
 #define TPS6594_MASK_VCCA_UV_THR			GENMASK(5, 3)
+#define TPS65224_MASK_VCCA_VMON_THR			GENMASK(1, 0)
 #define TPS6594_BIT_VCCA_PG_SET				BIT(6)
 
 /* VMONX_PG_WINDOW register field definition */
@@ -289,6 +317,9 @@ enum pmic_id {
 #define TPS6594_MASK_VMONX_UV_THR			GENMASK(5, 3)
 #define TPS6594_BIT_VMONX_RANGE				BIT(6)
 
+/* VMONX_PG_WINDOW register field definition */
+#define TPS65224_MASK_VMONX_THR				GENMASK(1, 0)
+
 /* GPIOX_CONF register field definition */
 #define TPS6594_BIT_GPIO_DIR				BIT(0)
 #define TPS6594_BIT_GPIO_OD				BIT(1)
@@ -296,6 +327,8 @@ enum pmic_id {
 #define TPS6594_BIT_GPIO_PU_PD_EN			BIT(3)
 #define TPS6594_BIT_GPIO_DEGLITCH_EN			BIT(4)
 #define TPS6594_MASK_GPIO_SEL				GENMASK(7, 5)
+#define TPS65224_MASK_GPIO_SEL				GENMASK(6, 5)
+#define TPS65224_MASK_GPIO_SEL_GPIO6			GENMASK(7, 5)
 
 /* NPWRON_CONF register field definition */
 #define TPS6594_BIT_NRSTOUT_OD				BIT(0)
@@ -305,6 +338,12 @@ enum pmic_id {
 #define TPS6594_BIT_ENABLE_POL				BIT(5)
 #define TPS6594_MASK_NPWRON_SEL				GENMASK(7, 6)
 
+/* POWER_ON_CONFIG register field definition */
+#define TPS65224_BIT_NINT_ENDRV_PU_SEL			BIT(0)
+#define TPS65224_BIT_NINT_ENDRV_SEL			BIT(1)
+#define TPS65224_BIT_EN_PB_DEGL				BIT(5)
+#define TPS65224_MASK_EN_PB_VSENSE_CONFIG		GENMASK(7, 6)
+
 /* GPIO_OUT_X register field definition */
 #define TPS6594_BIT_GPIOX_OUT(gpio_inst)		BIT((gpio_inst) % 8)
 
@@ -312,6 +351,12 @@ enum pmic_id {
 #define TPS6594_BIT_GPIOX_IN(gpio_inst)			BIT((gpio_inst) % 8)
 #define TPS6594_BIT_NPWRON_IN				BIT(3)
 
+/* GPIO_OUT_X register field definition */
+#define TPS65224_BIT_GPIOX_OUT(gpio_inst)		BIT((gpio_inst))
+
+/* GPIO_IN_X register field definition */
+#define TPS65224_BIT_GPIOX_IN(gpio_inst)		BIT((gpio_inst))
+
 /* RAIL_SEL_1 register field definition */
 #define TPS6594_MASK_BUCK1_GRP_SEL			GENMASK(1, 0)
 #define TPS6594_MASK_BUCK2_GRP_SEL			GENMASK(3, 2)
@@ -343,6 +388,9 @@ enum pmic_id {
 #define TPS6594_BIT_GPIOX_FSM_MASK(gpio_inst)		BIT(((gpio_inst) << 1) % 8)
 #define TPS6594_BIT_GPIOX_FSM_MASK_POL(gpio_inst)	BIT(((gpio_inst) << 1) % 8 + 1)
 
+#define TPS65224_BIT_GPIOX_FSM_MASK(gpio_inst)		BIT(((gpio_inst) << 1) % 6)
+#define TPS65224_BIT_GPIOX_FSM_MASK_POL(gpio_inst)	BIT(((gpio_inst) << 1) % 6 + 1)
+
 /* MASK_BUCKX register field definition */
 #define TPS6594_BIT_BUCKX_OV_MASK(buck_inst)		BIT(((buck_inst) << 2) % 8)
 #define TPS6594_BIT_BUCKX_UV_MASK(buck_inst)		BIT(((buck_inst) << 2) % 8 + 1)
@@ -361,22 +409,46 @@ enum pmic_id {
 #define TPS6594_BIT_VMON2_OV_MASK			BIT(5)
 #define TPS6594_BIT_VMON2_UV_MASK			BIT(6)
 
+/* MASK_BUCK Register field definition */
+#define TPS65224_BIT_BUCK1_UVOV_MASK			BIT(0)
+#define TPS65224_BIT_BUCK2_UVOV_MASK			BIT(1)
+#define TPS65224_BIT_BUCK3_UVOV_MASK			BIT(2)
+#define TPS65224_BIT_BUCK4_UVOV_MASK			BIT(4)
+
+/* MASK_LDO_VMON register field definition */
+#define TPS65224_BIT_LDO1_UVOV_MASK			BIT(0)
+#define TPS65224_BIT_LDO2_UVOV_MASK			BIT(1)
+#define TPS65224_BIT_LDO3_UVOV_MASK			BIT(2)
+#define TPS65224_BIT_VCCA_UVOV_MASK			BIT(4)
+#define TPS65224_BIT_VMON1_UVOV_MASK			BIT(5)
+#define TPS65224_BIT_VMON2_UVOV_MASK			BIT(6)
+
 /* MASK_GPIOX register field definition */
 #define TPS6594_BIT_GPIOX_FALL_MASK(gpio_inst)		BIT((gpio_inst) < 8 ? \
 							    (gpio_inst) : (gpio_inst) % 8)
 #define TPS6594_BIT_GPIOX_RISE_MASK(gpio_inst)		BIT((gpio_inst) < 8 ? \
 							    (gpio_inst) : (gpio_inst) % 8 + 3)
+/* MASK_GPIOX register field definition */
+#define TPS65224_BIT_GPIOX_FALL_MASK(gpio_inst)		BIT((gpio_inst))
+#define TPS65224_BIT_GPIOX_RISE_MASK(gpio_inst)		BIT((gpio_inst))
 
 /* MASK_STARTUP register field definition */
 #define TPS6594_BIT_NPWRON_START_MASK			BIT(0)
 #define TPS6594_BIT_ENABLE_MASK				BIT(1)
 #define TPS6594_BIT_FSD_MASK				BIT(4)
 #define TPS6594_BIT_SOFT_REBOOT_MASK			BIT(5)
+#define TPS65224_BIT_VSENSE_MASK			BIT(0)
+#define TPS65224_BIT_PB_SHORT_MASK			BIT(2)
 
 /* MASK_MISC register field definition */
 #define TPS6594_BIT_BIST_PASS_MASK			BIT(0)
 #define TPS6594_BIT_EXT_CLK_MASK			BIT(1)
+#define TPS65224_BIT_REG_UNLOCK_MASK			BIT(2)
 #define TPS6594_BIT_TWARN_MASK				BIT(3)
+#define TPS65224_BIT_PB_LONG_MASK			BIT(4)
+#define TPS65224_BIT_PB_FALL_MASK			BIT(5)
+#define TPS65224_BIT_PB_RISE_MASK			BIT(6)
+#define TPS65224_BIT_ADC_CONV_READY_MASK		BIT(7)
 
 /* MASK_MODERATE_ERR register field definition */
 #define TPS6594_BIT_BIST_FAIL_MASK			BIT(1)
@@ -391,6 +463,8 @@ enum pmic_id {
 #define TPS6594_BIT_ORD_SHUTDOWN_MASK			BIT(1)
 #define TPS6594_BIT_MCU_PWR_ERR_MASK			BIT(2)
 #define TPS6594_BIT_SOC_PWR_ERR_MASK			BIT(3)
+#define TPS65224_BIT_COMM_ERR_MASK			BIT(4)
+#define TPS65224_BIT_I2C2_ERR_MASK			BIT(5)
 
 /* MASK_COMM_ERR register field definition */
 #define TPS6594_BIT_COMM_FRM_ERR_MASK			BIT(0)
@@ -426,6 +500,12 @@ enum pmic_id {
 #define TPS6594_BIT_BUCK3_4_INT				BIT(1)
 #define TPS6594_BIT_BUCK5_INT				BIT(2)
 
+/* INT_BUCK register field definition */
+#define TPS65224_BIT_BUCK1_UVOV_INT			BIT(0)
+#define TPS65224_BIT_BUCK2_UVOV_INT			BIT(1)
+#define TPS65224_BIT_BUCK3_UVOV_INT			BIT(2)
+#define TPS65224_BIT_BUCK4_UVOV_INT			BIT(3)
+
 /* INT_BUCKX register field definition */
 #define TPS6594_BIT_BUCKX_OV_INT(buck_inst)		BIT(((buck_inst) << 2) % 8)
 #define TPS6594_BIT_BUCKX_UV_INT(buck_inst)		BIT(((buck_inst) << 2) % 8 + 1)
@@ -437,6 +517,14 @@ enum pmic_id {
 #define TPS6594_BIT_LDO3_4_INT				BIT(1)
 #define TPS6594_BIT_VCCA_INT				BIT(4)
 
+/* INT_LDO_VMON register field definition */
+#define TPS65224_BIT_LDO1_UVOV_INT			BIT(0)
+#define TPS65224_BIT_LDO2_UVOV_INT			BIT(1)
+#define TPS65224_BIT_LDO3_UVOV_INT			BIT(2)
+#define TPS65224_BIT_VCCA_UVOV_INT			BIT(4)
+#define TPS65224_BIT_VMON1_UVOV_INT			BIT(5)
+#define TPS65224_BIT_VMON2_UVOV_INT			BIT(6)
+
 /* INT_LDOX register field definition */
 #define TPS6594_BIT_LDOX_OV_INT(ldo_inst)		BIT(((ldo_inst) << 2) % 8)
 #define TPS6594_BIT_LDOX_UV_INT(ldo_inst)		BIT(((ldo_inst) << 2) % 8 + 1)
@@ -462,17 +550,32 @@ enum pmic_id {
 /* INT_GPIOX register field definition */
 #define TPS6594_BIT_GPIOX_INT(gpio_inst)		BIT(gpio_inst)
 
+/* INT_GPIO register field definition */
+#define TPS65224_BIT_GPIO1_INT				BIT(0)
+#define TPS65224_BIT_GPIO2_INT				BIT(1)
+#define TPS65224_BIT_GPIO3_INT				BIT(2)
+#define TPS65224_BIT_GPIO4_INT				BIT(3)
+#define TPS65224_BIT_GPIO5_INT				BIT(4)
+#define TPS65224_BIT_GPIO6_INT				BIT(5)
+
 /* INT_STARTUP register field definition */
 #define TPS6594_BIT_NPWRON_START_INT			BIT(0)
+#define TPS65224_BIT_VSENSE_INT				BIT(0)
 #define TPS6594_BIT_ENABLE_INT				BIT(1)
 #define TPS6594_BIT_RTC_INT				BIT(2)
+#define TPS65224_BIT_PB_SHORT_INT			BIT(2)
 #define TPS6594_BIT_FSD_INT				BIT(4)
 #define TPS6594_BIT_SOFT_REBOOT_INT			BIT(5)
 
 /* INT_MISC register field definition */
 #define TPS6594_BIT_BIST_PASS_INT			BIT(0)
 #define TPS6594_BIT_EXT_CLK_INT				BIT(1)
+#define TPS65224_BIT_REG_UNLOCK_INT			BIT(2)
 #define TPS6594_BIT_TWARN_INT				BIT(3)
+#define TPS65224_BIT_PB_LONG_INT			BIT(4)
+#define TPS65224_BIT_PB_FALL_INT			BIT(5)
+#define TPS65224_BIT_PB_RISE_INT			BIT(6)
+#define TPS65224_BIT_ADC_CONV_READY_INT			BIT(7)
 
 /* INT_MODERATE_ERR register field definition */
 #define TPS6594_BIT_TSD_ORD_INT				BIT(0)
@@ -488,6 +591,7 @@ enum pmic_id {
 #define TPS6594_BIT_TSD_IMM_INT				BIT(0)
 #define TPS6594_BIT_VCCA_OVP_INT			BIT(1)
 #define TPS6594_BIT_PFSM_ERR_INT			BIT(2)
+#define TPS65224_BIT_BG_XMON_INT			BIT(3)
 
 /* INT_FSM_ERR register field definition */
 #define TPS6594_BIT_IMM_SHUTDOWN_INT			BIT(0)
@@ -496,6 +600,7 @@ enum pmic_id {
 #define TPS6594_BIT_SOC_PWR_ERR_INT			BIT(3)
 #define TPS6594_BIT_COMM_ERR_INT			BIT(4)
 #define TPS6594_BIT_READBACK_ERR_INT			BIT(5)
+#define TPS65224_BIT_I2C2_ERR_INT			BIT(5)
 #define TPS6594_BIT_ESM_INT				BIT(6)
 #define TPS6594_BIT_WD_INT				BIT(7)
 
@@ -536,8 +641,18 @@ enum pmic_id {
 #define TPS6594_BIT_VMON2_OV_STAT			BIT(5)
 #define TPS6594_BIT_VMON2_UV_STAT			BIT(6)
 
+/* STAT_LDO_VMON register field definition */
+#define TPS65224_BIT_LDO1_UVOV_STAT			BIT(0)
+#define TPS65224_BIT_LDO2_UVOV_STAT			BIT(1)
+#define TPS65224_BIT_LDO3_UVOV_STAT			BIT(2)
+#define TPS65224_BIT_VCCA_UVOV_STAT			BIT(4)
+#define TPS65224_BIT_VMON1_UVOV_STAT			BIT(5)
+#define TPS65224_BIT_VMON2_UVOV_STAT			BIT(6)
+
 /* STAT_STARTUP register field definition */
+#define TPS65224_BIT_VSENSE_STAT			BIT(0)
 #define TPS6594_BIT_ENABLE_STAT				BIT(1)
+#define TPS65224_BIT_PB_LEVEL_STAT			BIT(2)
 
 /* STAT_MISC register field definition */
 #define TPS6594_BIT_EXT_CLK_STAT			BIT(1)
@@ -549,6 +664,7 @@ enum pmic_id {
 /* STAT_SEVERE_ERR register field definition */
 #define TPS6594_BIT_TSD_IMM_STAT			BIT(0)
 #define TPS6594_BIT_VCCA_OVP_STAT			BIT(1)
+#define TPS65224_BIT_BG_XMON_STAT			BIT(3)
 
 /* STAT_READBACK_ERR register field definition */
 #define TPS6594_BIT_EN_DRV_READBACK_STAT		BIT(0)
@@ -597,6 +713,8 @@ enum pmic_id {
 #define TPS6594_BIT_BB_CHARGER_EN			BIT(0)
 #define TPS6594_BIT_BB_ICHR				BIT(1)
 #define TPS6594_MASK_BB_VEOC				GENMASK(3, 2)
+#define TPS65224_BIT_I2C1_SPI_CRC_EN			BIT(4)
+#define TPS65224_BIT_I2C2_CRC_EN			BIT(5)
 #define TPS6594_BB_EOC_RDY				BIT(7)
 
 /* ENABLE_DRV_REG register field definition */
@@ -617,6 +735,7 @@ enum pmic_id {
 #define TPS6594_BIT_NRSTOUT_SOC_IN			BIT(2)
 #define TPS6594_BIT_FORCE_EN_DRV_LOW			BIT(3)
 #define TPS6594_BIT_SPMI_LPM_EN				BIT(4)
+#define TPS65224_BIT_TSD_DISABLE			BIT(5)
 
 /* RECOV_CNT_REG_1 register field definition */
 #define TPS6594_MASK_RECOV_CNT				GENMASK(3, 0)
@@ -671,15 +790,27 @@ enum pmic_id {
 /* ESM_SOC_START_REG register field definition */
 #define TPS6594_BIT_ESM_SOC_START			BIT(0)
 
+/* ESM_MCU_START_REG register field definition */
+#define TPS65224_BIT_ESM_MCU_START			BIT(0)
+
 /* ESM_SOC_MODE_CFG register field definition */
 #define TPS6594_MASK_ESM_SOC_ERR_CNT_TH			GENMASK(3, 0)
 #define TPS6594_BIT_ESM_SOC_ENDRV			BIT(5)
 #define TPS6594_BIT_ESM_SOC_EN				BIT(6)
 #define TPS6594_BIT_ESM_SOC_MODE			BIT(7)
 
+/* ESM_MCU_MODE_CFG register field definition */
+#define TPS65224_MASK_ESM_MCU_ERR_CNT_TH		GENMASK(3, 0)
+#define TPS65224_BIT_ESM_MCU_ENDRV			BIT(5)
+#define TPS65224_BIT_ESM_MCU_EN				BIT(6)
+#define TPS65224_BIT_ESM_MCU_MODE			BIT(7)
+
 /* ESM_SOC_ERR_CNT_REG register field definition */
 #define TPS6594_MASK_ESM_SOC_ERR_CNT			GENMASK(4, 0)
 
+/* ESM_MCU_ERR_CNT_REG register field definition */
+#define TPS6594_MASK_ESM_MCU_ERR_CNT			GENMASK(4, 0)
+
 /* REGISTER_LOCK register field definition */
 #define TPS6594_BIT_REGISTER_LOCK_STATUS		BIT(0)
 
@@ -687,6 +818,29 @@ enum pmic_id {
 #define TPS6594_MASK_VMON1_SLEW_RATE			GENMASK(2, 0)
 #define TPS6594_MASK_VMON2_SLEW_RATE			GENMASK(5, 3)
 
+/* SRAM_ACCESS_1 Register field definition */
+#define TPS65224_MASk_SRAM_UNLOCK_SEQ			GENMASK(7, 0)
+
+/* SRAM_ACCESS_2 Register field definition */
+#define TPS65224_BIT_SRAM_WRITE_MODE			BIT(0)
+#define TPS65224_BIT_OTP_PROG_USER			BIT(1)
+#define TPS65224_BIT_OTP_PROG_PFSM			BIT(2)
+#define TPS65224_BIT_OTP_PROG_STATUS			BIT(3)
+#define TPS65224_BIT_SRAM_UNLOCKED			BIT(6)
+#define TPS65224_USER_PROG_ALLOWED			BIT(7)
+
+/* SRAM_ADDR_CTRL Register field definition */
+#define TPS65224_MASk_SRAM_SEL				GENMASK(1, 0)
+
+/* RECOV_CNT_PFSM_INCR Register field definition */
+#define TPS65224_BIT_INCREMENT_RECOV_CNT		BIT(0)
+
+/* MANUFACTURING_VER Register field definition */
+#define TPS65224_MASK_SILICON_REV			GENMASK(7, 0)
+
+/* CUSTOMER_NVM_ID_REG Register field definition */
+#define TPS65224_MASK_CUSTOMER_NVM_ID			GENMASK(7, 0)
+
 /* SOFT_REBOOT_REG register field definition */
 #define TPS6594_BIT_SOFT_REBOOT				BIT(0)
 
@@ -755,14 +909,83 @@ enum pmic_id {
 #define TPS6594_BIT_I2C2_CRC_EN				BIT(2)
 #define TPS6594_MASK_T_CRC				GENMASK(7, 3)
 
+/* ADC_CTRL Register field definition */
+#define TPS65224_BIT_ADC_START				BIT(0)
+#define TPS65224_BIT_ADC_CONT_CONV			BIT(1)
+#define TPS65224_BIT_ADC_THERMAL_SEL			BIT(2)
+#define TPS65224_BIT_ADC_RDIV_EN			BIT(3)
+#define TPS65224_BIT_ADC_STATUS				BIT(7)
+
+/* ADC_RESULT_REG_1 Register field definition */
+#define TPS65224_MASK_ADC_RESULT_11_4			GENMASK(7, 0)
+
+/* ADC_RESULT_REG_2 Register field definition */
+#define TPS65224_MASK_ADC_RESULT_3_0			GENMASK(7, 4)
+
+/* STARTUP_CTRL Register field definition */
+#define TPS65224_MASK_STARTUP_DEST			GENMASK(6, 5)
+#define TPS65224_BIT_FIRST_STARTUP_DONE			BIT(7)
+
+/* SCRATCH_PAD_REG_1 Register field definition */
+#define TPS6594_MASK_SCRATCH_PAD_1			GENMASK(7, 0)
+
+/* SCRATCH_PAD_REG_2 Register field definition */
+#define TPS6594_MASK_SCRATCH_PAD_2			GENMASK(7, 0)
+
+/* SCRATCH_PAD_REG_3 Register field definition */
+#define TPS6594_MASK_SCRATCH_PAD_3			GENMASK(7, 0)
+
+/* SCRATCH_PAD_REG_4 Register field definition */
+#define TPS6594_MASK_SCRATCH_PAD_4			GENMASK(7, 0)
+
+/* PFSM_DELAY_REG_1 Register field definition */
+#define TPS6594_MASK_PFSM_DELAY1			GENMASK(7, 0)
+
+/* PFSM_DELAY_REG_2 Register field definition */
+#define TPS6594_MASK_PFSM_DELAY2			GENMASK(7, 0)
+
+/* PFSM_DELAY_REG_3 Register field definition */
+#define TPS6594_MASK_PFSM_DELAY3			GENMASK(7, 0)
+
+/* PFSM_DELAY_REG_4 Register field definition */
+#define TPS6594_MASK_PFSM_DELAY4			GENMASK(7, 0)
+
+/* CRC_CALC_CONTROL Register field definition */
+#define TPS65224_BIT_RUN_CRC_BIST			BIT(0)
+#define TPS65224_BIT_RUN_CRC_UPDATE			BIT(1)
+
+/* ADC_GAIN_COMP_REG Register field definition */
+#define TPS65224_MASK_ADC_GAIN_COMP			GENMASK(7, 0)
+
+/* REGMAP_USER_CRC_LOW Register field definition */
+#define TPS65224_MASK_REGMAP_USER_CRC16_LOW		GENMASK(7, 0)
+
+/* REGMAP_USER_CRC_HIGH Register field definition */
+#define TPS65224_MASK_REGMAP_USER_CRC16_HIGH		GENMASK(7, 0)
+
+/* WD_ANSWER_REG Register field definition */
+#define TPS6594_MASK_WD_ANSWER				GENMASK(7, 0)
+
 /* WD_QUESTION_ANSW_CNT register field definition */
 #define TPS6594_MASK_WD_QUESTION			GENMASK(3, 0)
 #define TPS6594_MASK_WD_ANSW_CNT			GENMASK(5, 4)
+#define TPS65224_BIT_INT_TOP_STATUS			BIT(7)
+
+/* WD WIN1_CFG register field definition */
+#define TPS6594_MASK_WD_WIN1_CFG			GENMASK(6, 0)
+
+/* WD WIN2_CFG register field definition */
+#define TPS6594_MASK_WD_WIN2_CFG			GENMASK(6, 0)
+
+/* WD LongWin register field definition */
+#define TPS6594_MASK_WD_LONGWIN_CFG			GENMASK(7, 0)
 
 /* WD_MODE_REG register field definition */
 #define TPS6594_BIT_WD_RETURN_LONGWIN			BIT(0)
 #define TPS6594_BIT_WD_MODE_SELECT			BIT(1)
 #define TPS6594_BIT_WD_PWRHOLD				BIT(2)
+#define TPS65224_BIT_WD_ENDRV_SEL			BIT(6)
+#define TPS65224_BIT_WD_CNT_SEL				BIT(7)
 
 /* WD_QA_CFG register field definition */
 #define TPS6594_MASK_WD_QUESTION_SEED			GENMASK(3, 0)
@@ -993,6 +1216,106 @@ enum tps6594_irqs {
 #define TPS6594_IRQ_NAME_ALARM			"alarm"
 #define TPS6594_IRQ_NAME_POWERUP		"powerup"
 
+/* IRQs */
+enum tps65224_irqs {
+	/* INT_BUCK register */
+	TPS65224_IRQ_BUCK1_UVOV,
+	TPS65224_IRQ_BUCK2_UVOV,
+	TPS65224_IRQ_BUCK3_UVOV,
+	TPS65224_IRQ_BUCK4_UVOV,
+	/* INT_LDO_VMON register */
+	TPS65224_IRQ_LDO1_UVOV,
+	TPS65224_IRQ_LDO2_UVOV,
+	TPS65224_IRQ_LDO3_UVOV,
+	TPS65224_IRQ_VCCA_UVOV,
+	TPS65224_IRQ_VMON1_UVOV,
+	TPS65224_IRQ_VMON2_UVOV,
+	/* INT_GPIO register */
+	TPS65224_IRQ_GPIO1,
+	TPS65224_IRQ_GPIO2,
+	TPS65224_IRQ_GPIO3,
+	TPS65224_IRQ_GPIO4,
+	TPS65224_IRQ_GPIO5,
+	TPS65224_IRQ_GPIO6,
+	/* INT_STARTUP register */
+	TPS65224_IRQ_VSENSE,
+	TPS65224_IRQ_ENABLE,
+	TPS65224_IRQ_PB_SHORT,
+	TPS65224_IRQ_FSD,
+	TPS65224_IRQ_SOFT_REBOOT,
+	/* INT_MISC register */
+	TPS65224_IRQ_BIST_PASS,
+	TPS65224_IRQ_EXT_CLK,
+	TPS65224_IRQ_REG_UNLOCK,
+	TPS65224_IRQ_TWARN,
+	TPS65224_IRQ_PB_LONG,
+	TPS65224_IRQ_PB_FALL,
+	TPS65224_IRQ_PB_RISE,
+	TPS65224_IRQ_ADC_CONV_READY,
+	/* INT_MODERATE_ERR register */
+	TPS65224_IRQ_TSD_ORD,
+	TPS65224_IRQ_BIST_FAIL,
+	TPS65224_IRQ_REG_CRC_ERR,
+	TPS65224_IRQ_RECOV_CNT,
+	/* INT_SEVERE_ERR register */
+	TPS65224_IRQ_TSD_IMM,
+	TPS65224_IRQ_VCCA_OVP,
+	TPS65224_IRQ_PFSM_ERR,
+	TPS65224_IRQ_BG_XMON,
+	/* INT_FSM_ERR register */
+	TPS65224_IRQ_IMM_SHUTDOWN,
+	TPS65224_IRQ_ORD_SHUTDOWN,
+	TPS65224_IRQ_MCU_PWR_ERR,
+	TPS65224_IRQ_SOC_PWR_ERR,
+	TPS65224_IRQ_COMM_ERR,
+	TPS65224_IRQ_I2C2_ERR,
+};
+
+#define TPS65224_IRQ_NAME_BUCK1_UVOV		"buck1_uvov"
+#define TPS65224_IRQ_NAME_BUCK2_UVOV		"buck2_uvov"
+#define TPS65224_IRQ_NAME_BUCK3_UVOV		"buck3_uvov"
+#define TPS65224_IRQ_NAME_BUCK4_UVOV		"buck4_uvov"
+#define TPS65224_IRQ_NAME_LDO1_UVOV		"ldo1_uvov"
+#define TPS65224_IRQ_NAME_LDO2_UVOV		"ldo2_uvov"
+#define TPS65224_IRQ_NAME_LDO3_UVOV		"ldo3_uvov"
+#define TPS65224_IRQ_NAME_VCCA_UVOV		"vcca_uvov"
+#define TPS65224_IRQ_NAME_VMON1_UVOV		"vmon1_uvov"
+#define TPS65224_IRQ_NAME_VMON2_UVOV		"vmon2_uvov"
+#define TPS65224_IRQ_NAME_GPIO1			"gpio1"
+#define TPS65224_IRQ_NAME_GPIO2			"gpio2"
+#define TPS65224_IRQ_NAME_GPIO3			"gpio3"
+#define TPS65224_IRQ_NAME_GPIO4			"gpio4"
+#define TPS65224_IRQ_NAME_GPIO5			"gpio5"
+#define TPS65224_IRQ_NAME_GPIO6			"gpio6"
+#define TPS65224_IRQ_NAME_VSENSE	        "vsense"
+#define TPS65224_IRQ_NAME_ENABLE		"enable"
+#define TPS65224_IRQ_NAME_PB_SHORT		"pb_short"
+#define TPS65224_IRQ_NAME_FSD			"fsd"
+#define TPS65224_IRQ_NAME_SOFT_REBOOT		"soft_reboot"
+#define TPS65224_IRQ_NAME_BIST_PASS		"bist_pass"
+#define TPS65224_IRQ_NAME_EXT_CLK		"ext_clk"
+#define TPS65224_IRQ_NAME_REG_UNLOCK		"reg_unlock"
+#define TPS65224_IRQ_NAME_TWARN			"twarn"
+#define TPS65224_IRQ_NAME_PB_LONG		"pb_long"
+#define TPS65224_IRQ_NAME_PB_FALL		"pb_fall"
+#define TPS65224_IRQ_NAME_PB_RISE		"pb_rise"
+#define TPS65224_IRQ_NAME_ADC_CONV_READY	"adc_conv_ready"
+#define TPS65224_IRQ_NAME_TSD_ORD		"tsd_ord"
+#define TPS65224_IRQ_NAME_BIST_FAIL		"bist_fail"
+#define TPS65224_IRQ_NAME_REG_CRC_ERR		"reg_crc_err"
+#define TPS65224_IRQ_NAME_RECOV_CNT		"recov_cnt"
+#define TPS65224_IRQ_NAME_TSD_IMM		"tsd_imm"
+#define TPS65224_IRQ_NAME_VCCA_OVP		"vcca_ovp"
+#define TPS65224_IRQ_NAME_PFSM_ERR		"pfsm_err"
+#define TPS65224_IRQ_NAME_BG_XMON		"bg_xmon"
+#define TPS65224_IRQ_NAME_IMM_SHUTDOWN		"imm_shutdown"
+#define TPS65224_IRQ_NAME_ORD_SHUTDOWN		"ord_shutdown"
+#define TPS65224_IRQ_NAME_MCU_PWR_ERR		"mcu_pwr_err"
+#define TPS65224_IRQ_NAME_SOC_PWR_ERR		"soc_pwr_err"
+#define TPS65224_IRQ_NAME_COMM_ERR		"comm_err"
+#define TPS65224_IRQ_NAME_I2C2_ERR		"i2c2_err"
+#define TPS65224_IRQ_NAME_POWERUP		"powerup"
+
 /**
  * struct tps6594 - device private data structure
  *
-- 
cgit v1.2.3


From 436250638b6d8e6cf8dceed82cdbbfc90ce3a775 Mon Sep 17 00:00:00 2001
From: Bhargav Raviprakash <bhargav.r@ltts.com>
Date: Tue, 30 Apr 2024 13:16:43 +0000
Subject: mfd: tps6594: Use volatile_table instead of volatile_reg

In regmap_config use volatile_table instead of volatile_reg. This change
makes it easier to add support for TPS65224 PMIC.

Signed-off-by: Bhargav Raviprakash <bhargav.r@ltts.com>
Acked-by: Julien Panis <jpanis@baylibre.com>
Link: https://lore.kernel.org/r/0109018f2f267f6e-3121fa42-4816-45f7-a96d-0d6b4678da5a-000000@ap-south-1.amazonses.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/tps6594-core.c  | 16 ++++++++++------
 drivers/mfd/tps6594-i2c.c   |  2 +-
 drivers/mfd/tps6594-spi.c   |  2 +-
 include/linux/mfd/tps6594.h |  4 +++-
 4 files changed, 15 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/mfd/tps6594-core.c b/drivers/mfd/tps6594-core.c
index 783ee59901e8..089ab8cc8664 100644
--- a/drivers/mfd/tps6594-core.c
+++ b/drivers/mfd/tps6594-core.c
@@ -319,12 +319,16 @@ static struct regmap_irq_chip tps6594_irq_chip = {
 	.handle_post_irq = tps6594_handle_post_irq,
 };
 
-bool tps6594_is_volatile_reg(struct device *dev, unsigned int reg)
-{
-	return (reg >= TPS6594_REG_INT_TOP && reg <= TPS6594_REG_STAT_READBACK_ERR) ||
-	       reg == TPS6594_REG_RTC_STATUS;
-}
-EXPORT_SYMBOL_GPL(tps6594_is_volatile_reg);
+static const struct regmap_range tps6594_volatile_ranges[] = {
+	regmap_reg_range(TPS6594_REG_INT_TOP, TPS6594_REG_STAT_READBACK_ERR),
+	regmap_reg_range(TPS6594_REG_RTC_STATUS, TPS6594_REG_RTC_STATUS),
+};
+
+const struct regmap_access_table tps6594_volatile_table = {
+	.yes_ranges = tps6594_volatile_ranges,
+	.n_yes_ranges = ARRAY_SIZE(tps6594_volatile_ranges),
+};
+EXPORT_SYMBOL_GPL(tps6594_volatile_table);
 
 static int tps6594_check_crc_mode(struct tps6594 *tps, bool primary_pmic)
 {
diff --git a/drivers/mfd/tps6594-i2c.c b/drivers/mfd/tps6594-i2c.c
index 899c88c0fe77..c125b474ba8a 100644
--- a/drivers/mfd/tps6594-i2c.c
+++ b/drivers/mfd/tps6594-i2c.c
@@ -187,7 +187,7 @@ static const struct regmap_config tps6594_i2c_regmap_config = {
 	.reg_bits = 16,
 	.val_bits = 8,
 	.max_register = TPS6594_REG_DWD_FAIL_CNT_REG,
-	.volatile_reg = tps6594_is_volatile_reg,
+	.volatile_table = &tps6594_volatile_table,
 	.read = tps6594_i2c_read,
 	.write = tps6594_i2c_write,
 };
diff --git a/drivers/mfd/tps6594-spi.c b/drivers/mfd/tps6594-spi.c
index 24b72847e3f5..5afb1736f0ae 100644
--- a/drivers/mfd/tps6594-spi.c
+++ b/drivers/mfd/tps6594-spi.c
@@ -70,7 +70,7 @@ static const struct regmap_config tps6594_spi_regmap_config = {
 	.reg_bits = 16,
 	.val_bits = 8,
 	.max_register = TPS6594_REG_DWD_FAIL_CNT_REG,
-	.volatile_reg = tps6594_is_volatile_reg,
+	.volatile_table = &tps6594_volatile_table,
 	.reg_read = tps6594_spi_reg_read,
 	.reg_write = tps6594_spi_reg_write,
 	.use_single_read = true,
diff --git a/include/linux/mfd/tps6594.h b/include/linux/mfd/tps6594.h
index e754c01acc64..16543fd4d83e 100644
--- a/include/linux/mfd/tps6594.h
+++ b/include/linux/mfd/tps6594.h
@@ -1337,7 +1337,9 @@ struct tps6594 {
 	struct regmap_irq_chip_data *irq_data;
 };
 
-bool tps6594_is_volatile_reg(struct device *dev, unsigned int reg);
+extern const struct regmap_access_table tps6594_volatile_table;
+extern const struct regmap_access_table tps65224_volatile_table;
+
 int tps6594_device_init(struct tps6594 *tps, bool enable_crc);
 
 #endif /*  __LINUX_MFD_TPS6594_H */
-- 
cgit v1.2.3


From a2e25c8165f93ca8a2d54bf230e1bb7e0319b46c Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski <krzk@kernel.org>
Date: Wed, 24 Apr 2024 08:33:27 +0200
Subject: backlight: lcd: Constify lcd_ops

'struct lcd_ops' passed in lcd_device_register() is not modified by core
backlight code, so it can be made const for code safety.  This allows
drivers to also define the structure as const.

Signed-off-by: Krzysztof Kozlowski <krzk@kernel.org>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240424-video-backlight-lcd-ops-v2-1-1aaa82b07bc6@kernel.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/lcd.c | 4 ++--
 include/linux/lcd.h           | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 77c5cb2a44e2..ba3e37b5b584 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -188,7 +188,7 @@ ATTRIBUTE_GROUPS(lcd_device);
  * or a pointer to the newly allocated device.
  */
 struct lcd_device *lcd_device_register(const char *name, struct device *parent,
-		void *devdata, struct lcd_ops *ops)
+		void *devdata, const struct lcd_ops *ops)
 {
 	struct lcd_device *new_ld;
 	int rc;
@@ -276,7 +276,7 @@ static int devm_lcd_device_match(struct device *dev, void *res, void *data)
  */
 struct lcd_device *devm_lcd_device_register(struct device *dev,
 		const char *name, struct device *parent,
-		void *devdata, struct lcd_ops *ops)
+		void *devdata, const struct lcd_ops *ops)
 {
 	struct lcd_device **ptr, *lcd;
 
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index 238fb1dfed98..68703a51dc53 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -61,7 +61,7 @@ struct lcd_device {
 	   points to something in the body of that driver, it is also invalid. */
 	struct mutex ops_lock;
 	/* If this is NULL, the backing module is unloaded */
-	struct lcd_ops *ops;
+	const struct lcd_ops *ops;
 	/* Serialise access to set_power method */
 	struct mutex update_lock;
 	/* The framebuffer notifier block */
@@ -102,10 +102,10 @@ static inline void lcd_set_power(struct lcd_device *ld, int power)
 }
 
 extern struct lcd_device *lcd_device_register(const char *name,
-	struct device *parent, void *devdata, struct lcd_ops *ops);
+	struct device *parent, void *devdata, const struct lcd_ops *ops);
 extern struct lcd_device *devm_lcd_device_register(struct device *dev,
 	const char *name, struct device *parent,
-	void *devdata, struct lcd_ops *ops);
+	void *devdata, const struct lcd_ops *ops);
 extern void lcd_device_unregister(struct lcd_device *ld);
 extern void devm_lcd_device_unregister(struct device *dev,
 	struct lcd_device *ld);
-- 
cgit v1.2.3


From 03b3d00a70b55857439511c1b558ca00a99f4126 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Thu, 2 May 2024 16:45:45 +0100
Subject: KVM: arm64: vgic: Allocate private interrupts on demand

Private interrupts are currently part of the CPU interface structure
that is part of each and every vcpu we create.

Currently, we have 32 of them per vcpu, resulting in a per-vcpu array
that is just shy of 4kB. On its own, that's no big deal, but it gets
in the way of other things:

- each vcpu gets mapped at EL2 on nVHE/hVHE configurations. This
  requires memory that is physically contiguous. However, the EL2
  code has no purpose looking at the interrupt structures and
  could do without them being mapped.

- supporting features such as EPPIs, which extend the number of
  private interrupts past the 32 limit would make the array
  even larger, even for VMs that do not use the EPPI feature.

Address these issues by moving the private interrupt array outside
of the vcpu, and replace it with a simple pointer. We take this
opportunity to make it obvious what gets initialised when, as
that path was remarkably opaque, and tighten the locking.

Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20240502154545.3012089-1-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/vgic/vgic-init.c | 82 +++++++++++++++++++++++++++++++----------
 include/kvm/arm_vgic.h          |  2 +-
 2 files changed, 64 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index f20941f83a07..d3787fb09251 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -182,27 +182,22 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
 	return 0;
 }
 
-/**
- * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
- * structures and register VCPU-specific KVM iodevs
- *
- * @vcpu: pointer to the VCPU being created and initialized
- *
- * Only do initialization, but do not actually enable the
- * VGIC CPU interface
- */
-int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int ret = 0;
 	int i;
 
-	vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+	lockdep_assert_held(&vcpu->kvm->arch.config_lock);
 
-	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
-	raw_spin_lock_init(&vgic_cpu->ap_list_lock);
-	atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+	if (vgic_cpu->private_irqs)
+		return 0;
+
+	vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS,
+					 sizeof(struct vgic_irq),
+					 GFP_KERNEL_ACCOUNT);
+
+	if (!vgic_cpu->private_irqs)
+		return -ENOMEM;
 
 	/*
 	 * Enable and configure all SGIs to be edge-triggered and
@@ -227,9 +222,48 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	return 0;
+}
+
+static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu)
+{
+	int ret;
+
+	mutex_lock(&vcpu->kvm->arch.config_lock);
+	ret = vgic_allocate_private_irqs_locked(vcpu);
+	mutex_unlock(&vcpu->kvm->arch.config_lock);
+
+	return ret;
+}
+
+/**
+ * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
+ * structures and register VCPU-specific KVM iodevs
+ *
+ * @vcpu: pointer to the VCPU being created and initialized
+ *
+ * Only do initialization, but do not actually enable the
+ * VGIC CPU interface
+ */
+int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	int ret = 0;
+
+	vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
+
+	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+	raw_spin_lock_init(&vgic_cpu->ap_list_lock);
+	atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);
+
 	if (!irqchip_in_kernel(vcpu->kvm))
 		return 0;
 
+	ret = vgic_allocate_private_irqs(vcpu);
+	if (ret)
+		return ret;
+
 	/*
 	 * If we are creating a VCPU with a GICv3 we must also register the
 	 * KVM io device for the redistributor that belongs to this VCPU.
@@ -285,10 +319,13 @@ int vgic_init(struct kvm *kvm)
 
 	/* Initialize groups on CPUs created before the VGIC type was known */
 	kvm_for_each_vcpu(idx, vcpu, kvm) {
-		struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+		ret = vgic_allocate_private_irqs_locked(vcpu);
+		if (ret)
+			goto out;
 
 		for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
-			struct vgic_irq *irq = &vgic_cpu->private_irqs[i];
+			struct vgic_irq *irq = vgic_get_irq(kvm, vcpu, i);
+
 			switch (dist->vgic_model) {
 			case KVM_DEV_TYPE_ARM_VGIC_V3:
 				irq->group = 1;
@@ -300,8 +337,12 @@ int vgic_init(struct kvm *kvm)
 				break;
 			default:
 				ret = -EINVAL;
-				goto out;
 			}
+
+			vgic_put_irq(kvm, irq);
+
+			if (ret)
+				goto out;
 		}
 	}
 
@@ -381,6 +422,9 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 	vgic_flush_pending_lpis(vcpu);
 
 	INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
+	kfree(vgic_cpu->private_irqs);
+	vgic_cpu->private_irqs = NULL;
+
 	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
 		vgic_unregister_redist_iodev(vcpu);
 		vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 47035946648e..a7397f37f4dd 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -330,7 +330,7 @@ struct vgic_cpu {
 		struct vgic_v3_cpu_if	vgic_v3;
 	};
 
-	struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
+	struct vgic_irq *private_irqs;
 
 	raw_spinlock_t ap_list_lock;	/* Protects the ap_list */
 
-- 
cgit v1.2.3


From bf8367b00c33c64a9391c262bb2e11d274c9f2a4 Mon Sep 17 00:00:00 2001
From: Jean-Baptiste Maneyrol <jean-baptiste.maneyrol@tdk.com>
Date: Fri, 26 Apr 2024 09:48:35 +0000
Subject: iio: invensense: fix timestamp glitches when switching frequency

When a sensor is running and there is a FIFO frequency change due to
another sensor turned on/off, there are glitches on timestamp. Fix that
by using only interrupt timestamp when there is the corresponding sensor
data in the FIFO.

Delete FIFO period handling and simplify internal functions.

Update integration inside inv_mpu6050 and inv_icm42600 drivers.

Fixes: 0ecc363ccea7 ("iio: make invensense timestamp module generic")
Cc: Stable@vger.kernel.org
Signed-off-by: Jean-Baptiste Maneyrol <jean-baptiste.maneyrol@tdk.com>
Link: https://lore.kernel.org/r/20240426094835.138389-1-inv.git-commit@tdk.com
Signed-off-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
---
 .../iio/common/inv_sensors/inv_sensors_timestamp.c | 24 ++++++++++------------
 drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c | 20 ++++++++----------
 drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c         |  2 +-
 include/linux/iio/common/inv_sensors_timestamp.h   |  3 +--
 4 files changed, 21 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c b/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c
index 4b8ec16240b5..fa205f17bd90 100644
--- a/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c
+++ b/drivers/iio/common/inv_sensors/inv_sensors_timestamp.c
@@ -70,13 +70,13 @@ int inv_sensors_timestamp_update_odr(struct inv_sensors_timestamp *ts,
 }
 EXPORT_SYMBOL_NS_GPL(inv_sensors_timestamp_update_odr, IIO_INV_SENSORS_TIMESTAMP);
 
-static bool inv_validate_period(struct inv_sensors_timestamp *ts, uint32_t period, uint32_t mult)
+static bool inv_validate_period(struct inv_sensors_timestamp *ts, uint32_t period)
 {
 	uint32_t period_min, period_max;
 
 	/* check that period is acceptable */
-	period_min = ts->min_period * mult;
-	period_max = ts->max_period * mult;
+	period_min = ts->min_period * ts->mult;
+	period_max = ts->max_period * ts->mult;
 	if (period > period_min && period < period_max)
 		return true;
 	else
@@ -84,15 +84,15 @@ static bool inv_validate_period(struct inv_sensors_timestamp *ts, uint32_t perio
 }
 
 static bool inv_update_chip_period(struct inv_sensors_timestamp *ts,
-				    uint32_t mult, uint32_t period)
+				   uint32_t period)
 {
 	uint32_t new_chip_period;
 
-	if (!inv_validate_period(ts, period, mult))
+	if (!inv_validate_period(ts, period))
 		return false;
 
 	/* update chip internal period estimation */
-	new_chip_period = period / mult;
+	new_chip_period = period / ts->mult;
 	inv_update_acc(&ts->chip_period, new_chip_period);
 	ts->period = ts->mult * ts->chip_period.val;
 
@@ -125,16 +125,14 @@ static void inv_align_timestamp_it(struct inv_sensors_timestamp *ts)
 }
 
 void inv_sensors_timestamp_interrupt(struct inv_sensors_timestamp *ts,
-				      uint32_t fifo_period, size_t fifo_nb,
-				      size_t sensor_nb, int64_t timestamp)
+				     size_t sample_nb, int64_t timestamp)
 {
 	struct inv_sensors_timestamp_interval *it;
 	int64_t delta, interval;
-	const uint32_t fifo_mult = fifo_period / ts->chip.clock_period;
 	uint32_t period;
 	bool valid = false;
 
-	if (fifo_nb == 0)
+	if (sample_nb == 0)
 		return;
 
 	/* update interrupt timestamp and compute chip and sensor periods */
@@ -144,14 +142,14 @@ void inv_sensors_timestamp_interrupt(struct inv_sensors_timestamp *ts,
 	delta = it->up - it->lo;
 	if (it->lo != 0) {
 		/* compute period: delta time divided by number of samples */
-		period = div_s64(delta, fifo_nb);
-		valid = inv_update_chip_period(ts, fifo_mult, period);
+		period = div_s64(delta, sample_nb);
+		valid = inv_update_chip_period(ts, period);
 	}
 
 	/* no previous data, compute theoritical value from interrupt */
 	if (ts->timestamp == 0) {
 		/* elapsed time: sensor period * sensor samples number */
-		interval = (int64_t)ts->period * (int64_t)sensor_nb;
+		interval = (int64_t)ts->period * (int64_t)sample_nb;
 		ts->timestamp = it->up - interval;
 		return;
 	}
diff --git a/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c b/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c
index cfb4a41ab7c1..63b85ec88c13 100644
--- a/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c
+++ b/drivers/iio/imu/inv_icm42600/inv_icm42600_buffer.c
@@ -512,20 +512,20 @@ int inv_icm42600_buffer_fifo_parse(struct inv_icm42600_state *st)
 		return 0;
 
 	/* handle gyroscope timestamp and FIFO data parsing */
-	ts = &gyro_st->ts;
-	inv_sensors_timestamp_interrupt(ts, st->fifo.period, st->fifo.nb.total,
-					st->fifo.nb.gyro, st->timestamp.gyro);
 	if (st->fifo.nb.gyro > 0) {
+		ts = &gyro_st->ts;
+		inv_sensors_timestamp_interrupt(ts, st->fifo.nb.gyro,
+						st->timestamp.gyro);
 		ret = inv_icm42600_gyro_parse_fifo(st->indio_gyro);
 		if (ret)
 			return ret;
 	}
 
 	/* handle accelerometer timestamp and FIFO data parsing */
-	ts = &accel_st->ts;
-	inv_sensors_timestamp_interrupt(ts, st->fifo.period, st->fifo.nb.total,
-					st->fifo.nb.accel, st->timestamp.accel);
 	if (st->fifo.nb.accel > 0) {
+		ts = &accel_st->ts;
+		inv_sensors_timestamp_interrupt(ts, st->fifo.nb.accel,
+						st->timestamp.accel);
 		ret = inv_icm42600_accel_parse_fifo(st->indio_accel);
 		if (ret)
 			return ret;
@@ -555,9 +555,7 @@ int inv_icm42600_buffer_hwfifo_flush(struct inv_icm42600_state *st,
 
 	if (st->fifo.nb.gyro > 0) {
 		ts = &gyro_st->ts;
-		inv_sensors_timestamp_interrupt(ts, st->fifo.period,
-						st->fifo.nb.total, st->fifo.nb.gyro,
-						gyro_ts);
+		inv_sensors_timestamp_interrupt(ts, st->fifo.nb.gyro, gyro_ts);
 		ret = inv_icm42600_gyro_parse_fifo(st->indio_gyro);
 		if (ret)
 			return ret;
@@ -565,9 +563,7 @@ int inv_icm42600_buffer_hwfifo_flush(struct inv_icm42600_state *st,
 
 	if (st->fifo.nb.accel > 0) {
 		ts = &accel_st->ts;
-		inv_sensors_timestamp_interrupt(ts, st->fifo.period,
-						st->fifo.nb.total, st->fifo.nb.accel,
-						accel_ts);
+		inv_sensors_timestamp_interrupt(ts, st->fifo.nb.accel, accel_ts);
 		ret = inv_icm42600_accel_parse_fifo(st->indio_accel);
 		if (ret)
 			return ret;
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c
index 86465226f7e1..0dc0f22a5582 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_ring.c
@@ -100,7 +100,7 @@ irqreturn_t inv_mpu6050_read_fifo(int irq, void *p)
 		goto end_session;
 	/* Each FIFO data contains all sensors, so same number for FIFO and sensor data */
 	fifo_period = NSEC_PER_SEC / INV_MPU6050_DIVIDER_TO_FIFO_RATE(st->chip_config.divider);
-	inv_sensors_timestamp_interrupt(&st->timestamp, fifo_period, nb, nb, pf->timestamp);
+	inv_sensors_timestamp_interrupt(&st->timestamp, nb, pf->timestamp);
 	inv_sensors_timestamp_apply_odr(&st->timestamp, fifo_period, nb, 0);
 
 	/* clear internal data buffer for avoiding kernel data leak */
diff --git a/include/linux/iio/common/inv_sensors_timestamp.h b/include/linux/iio/common/inv_sensors_timestamp.h
index a47d304d1ba7..8d506f1e9df2 100644
--- a/include/linux/iio/common/inv_sensors_timestamp.h
+++ b/include/linux/iio/common/inv_sensors_timestamp.h
@@ -71,8 +71,7 @@ int inv_sensors_timestamp_update_odr(struct inv_sensors_timestamp *ts,
 				     uint32_t period, bool fifo);
 
 void inv_sensors_timestamp_interrupt(struct inv_sensors_timestamp *ts,
-				     uint32_t fifo_period, size_t fifo_nb,
-				     size_t sensor_nb, int64_t timestamp);
+				     size_t sample_nb, int64_t timestamp);
 
 static inline int64_t inv_sensors_timestamp_pop(struct inv_sensors_timestamp *ts)
 {
-- 
cgit v1.2.3


From 78a7b5dbc0609b8c395f754953a45db687cb05e6 Mon Sep 17 00:00:00 2001
From: Joel Granados <j.granados@samsung.com>
Date: Wed, 1 May 2024 11:29:32 +0200
Subject: ax.25: x.25: Remove the now superfluous sentinel elements from
 ctl_table array

This commit comes at the tail end of a greater effort to remove the
empty elements at the end of the ctl_table arrays (sentinels) which will
reduce the overall build time size of the kernel and run time memory
bloat by ~64 bytes per sentinel (further information Link :
https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/)

Avoid a buffer overflow when traversing the ctl_table by ensuring that
AX25_MAX_VALUES is the same as the size of ax25_param_table. This is
done with a BUILD_BUG_ON where ax25_param_table is defined and a
CONFIG_AX25_DAMA_SLAVE guard in the unnamed enum definition as well as
in the ax25_dev_device_up and ax25_ds_set_timer functions.

The overflow happened when the sentinel was removed from
ax25_param_table. The sentinel's data element was changed when
CONFIG_AX25_DAMA_SLAVE was undefined. This had no adverse effects as it
still stopped on the sentinel's null procname but needed to be addressed
once the sentinel was removed.

Signed-off-by: Joel Granados <j.granados@samsung.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ax25.h         | 2 ++
 net/ax25/ax25_dev.c        | 3 +++
 net/ax25/ax25_ds_timer.c   | 1 +
 net/ax25/sysctl_net_ax25.c | 3 +--
 net/x25/sysctl_net_x25.c   | 1 -
 5 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index 0d939e5aee4e..eb9cee8252c8 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -139,7 +139,9 @@ enum {
 	AX25_VALUES_N2,		/* Default N2 value */
 	AX25_VALUES_PACLEN,	/* AX.25 MTU */
 	AX25_VALUES_PROTOCOL,	/* Std AX.25, DAMA Slave, DAMA Master */
+#ifdef CONFIG_AX25_DAMA_SLAVE
 	AX25_VALUES_DS_TIMEOUT,	/* DAMA Slave timeout */
+#endif
 	AX25_MAX_VALUES		/* THIS MUST REMAIN THE LAST ENTRY OF THIS LIST */
 };
 
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 282ec581c072..0bc682ffae9c 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -78,7 +78,10 @@ void ax25_dev_device_up(struct net_device *dev)
 	ax25_dev->values[AX25_VALUES_N2]        = AX25_DEF_N2;
 	ax25_dev->values[AX25_VALUES_PACLEN]	= AX25_DEF_PACLEN;
 	ax25_dev->values[AX25_VALUES_PROTOCOL]  = AX25_DEF_PROTOCOL;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
 	ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT;
+#endif
 
 #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
 	ax25_ds_setup_timer(ax25_dev);
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index c4f8adbf8144..c50a58d9e368 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -55,6 +55,7 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev)
 	ax25_dev->dama.slave_timeout =
 		msecs_to_jiffies(ax25_dev->values[AX25_VALUES_DS_TIMEOUT]) / 10;
 	mod_timer(&ax25_dev->dama.slave_timer, jiffies + HZ);
+	return;
 }
 
 /*
diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c
index e0128dc9def3..68753aa30334 100644
--- a/net/ax25/sysctl_net_ax25.c
+++ b/net/ax25/sysctl_net_ax25.c
@@ -141,8 +141,6 @@ static const struct ctl_table ax25_param_table[] = {
 		.extra2		= &max_ds_timeout
 	},
 #endif
-
-	{ }	/* that's all, folks! */
 };
 
 int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
@@ -155,6 +153,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
 	if (!table)
 		return -ENOMEM;
 
+	BUILD_BUG_ON(ARRAY_SIZE(ax25_param_table) != AX25_MAX_VALUES);
 	for (k = 0; k < AX25_MAX_VALUES; k++)
 		table[k].data = &ax25_dev->values[k];
 
diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c
index e9802afa43d0..643f50874dfe 100644
--- a/net/x25/sysctl_net_x25.c
+++ b/net/x25/sysctl_net_x25.c
@@ -71,7 +71,6 @@ static struct ctl_table x25_table[] = {
 		.mode = 	0644,
 		.proc_handler = proc_dointvec,
 	},
-	{ },
 };
 
 int __init x25_register_sysctl(void)
-- 
cgit v1.2.3


From c56436ef17520c5fb0f9c2fc47aa961a7946895f Mon Sep 17 00:00:00 2001
From: Oreoluwa Babatunde <quic_obabatun@quicinc.com>
Date: Thu, 2 May 2024 12:24:03 -0700
Subject: of: reserved_mem: Remove the use of phandle from the reserved_mem
 APIs

The __find_rmem() function is the only place that references the phandle
field of the reserved_mem struct. __find_rmem() is used to match a
device_node object to its corresponding entry in the reserved_mem array
using its phandle value. But, there is already a function called
of_reserved_mem_lookup() which carries out the same action using the
name of the node.

Using the of_reserved_mem_lookup() function is more reliable because
every node is guaranteed to have a name, but not all nodes will have a
phandle.

Nodes are only assigned a phandle if they are explicitly defined in the
DT using "phandle = <phandle_number>", or if they are referenced by
another node in the DT. Hence, If the phandle field is empty, then
__find_rmem() will return a false negative.

Hence, delete the __find_rmem() function and switch to using the
of_reserved_mem_lookup() function to find the corresponding entry of a
device_node in the reserved_mem array. Since the phandle field of the
reserved_mem struct is now unused, delete that as well.

Signed-off-by: Oreoluwa Babatunde <quic_obabatun@quicinc.com>
Link: https://lore.kernel.org/r/20240502192403.3307277-1-quic_obabatun@quicinc.com
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
---
 drivers/of/of_reserved_mem.c    | 22 +---------------------
 include/linux/of_reserved_mem.h |  1 -
 2 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'include')

diff --git a/drivers/of/of_reserved_mem.c b/drivers/of/of_reserved_mem.c
index 8236ecae2953..46e1c3fbc769 100644
--- a/drivers/of/of_reserved_mem.c
+++ b/drivers/of/of_reserved_mem.c
@@ -437,17 +437,10 @@ void __init fdt_init_reserved_mem(void)
 	for (i = 0; i < reserved_mem_count; i++) {
 		struct reserved_mem *rmem = &reserved_mem[i];
 		unsigned long node = rmem->fdt_node;
-		int len;
-		const __be32 *prop;
 		int err = 0;
 		bool nomap;
 
 		nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
-		prop = of_get_flat_dt_prop(node, "phandle", &len);
-		if (!prop)
-			prop = of_get_flat_dt_prop(node, "linux,phandle", &len);
-		if (prop)
-			rmem->phandle = of_read_number(prop, len/4);
 
 		if (rmem->size == 0)
 			err = __reserved_mem_alloc_size(node, rmem->name,
@@ -477,19 +470,6 @@ void __init fdt_init_reserved_mem(void)
 	}
 }
 
-static inline struct reserved_mem *__find_rmem(struct device_node *node)
-{
-	unsigned int i;
-
-	if (!node->phandle)
-		return NULL;
-
-	for (i = 0; i < reserved_mem_count; i++)
-		if (reserved_mem[i].phandle == node->phandle)
-			return &reserved_mem[i];
-	return NULL;
-}
-
 struct rmem_assigned_device {
 	struct device *dev;
 	struct reserved_mem *rmem;
@@ -534,7 +514,7 @@ int of_reserved_mem_device_init_by_idx(struct device *dev,
 		return 0;
 	}
 
-	rmem = __find_rmem(target);
+	rmem = of_reserved_mem_lookup(target);
 	of_node_put(target);
 
 	if (!rmem || !rmem->ops || !rmem->ops->device_init)
diff --git a/include/linux/of_reserved_mem.h b/include/linux/of_reserved_mem.h
index 4de2a24cadc9..e338282da652 100644
--- a/include/linux/of_reserved_mem.h
+++ b/include/linux/of_reserved_mem.h
@@ -11,7 +11,6 @@ struct reserved_mem_ops;
 struct reserved_mem {
 	const char			*name;
 	unsigned long			fdt_node;
-	unsigned long			phandle;
 	const struct reserved_mem_ops	*ops;
 	phys_addr_t			base;
 	phys_addr_t			size;
-- 
cgit v1.2.3


From 140ce28dd3bee8e53acc27f123ae474d69ef66f0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 2 May 2024 15:00:32 +0200
Subject: block: add a disk_has_partscan helper

Add a helper to check if partition scanning is enabled instead of
open coding the check in a few places.  This now always checks for
the hidden flag even if all but one of the callers are never reachable
for hidden gendisks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240502130033.1958492-2-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c           |  7 ++-----
 block/partitions/core.c |  5 +----
 include/linux/blkdev.h  | 13 +++++++++++++
 3 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/block/genhd.c b/block/genhd.c
index eb893df56d51..4b85963d09db 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -345,9 +345,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
 	struct file *file;
 	int ret = 0;
 
-	if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
-		return -EINVAL;
-	if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+	if (!disk_has_partscan(disk))
 		return -EINVAL;
 	if (disk->open_partitions)
 		return -EBUSY;
@@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
 			goto out_unregister_bdi;
 
 		/* Make sure the first partition scan will be proceed */
-		if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) &&
-		    !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+		if (get_capacity(disk) && disk_has_partscan(disk))
 			set_bit(GD_NEED_PART_SCAN, &disk->state);
 
 		bdev_add(disk->part0, ddev->devt);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index b11e88c82c8c..37b5f92d07fe 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -573,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk)
 	struct parsed_partitions *state;
 	int ret = -EAGAIN, p;
 
-	if (disk->flags & GENHD_FL_NO_PART)
-		return 0;
-
-	if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+	if (!disk_has_partscan(disk))
 		return 0;
 
 	state = check_partition(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 040a22e0eda0..3b18a40a1fc1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -230,6 +230,19 @@ static inline unsigned int disk_openers(struct gendisk *disk)
 	return atomic_read(&disk->part0->bd_openers);
 }
 
+/**
+ * disk_has_partscan - return %true if partition scanning is enabled on a disk
+ * @disk: disk to check
+ *
+ * Returns %true if partitions scanning is enabled for @disk, or %false if
+ * partition scanning is disabled either permanently or temporarily.
+ */
+static inline bool disk_has_partscan(struct gendisk *disk)
+{
+	return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) &&
+		!test_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+}
+
 /*
  * The gendisk is refcounted by the part0 block_device, and the bd_device
  * therein is also used for device model presentation in sysfs.
-- 
cgit v1.2.3


From 9c313ccdfc079f71f16c9b3d3a6c6d60996b9367 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@toblux.com>
Date: Sun, 21 Apr 2024 00:38:37 +0200
Subject: bitops: Change function return types from long to int

Change the return types of bitops functions (ffs, fls, and fns) from
long to int. The expected return values are in the range [0, 64], for
which int is sufficient.

Additionally, int aligns well with the return types of the corresponding
__builtin_* functions, potentially reducing overall type conversions.

Many of the existing bitops functions already return an int and don't
need to be changed. The bitops functions in arch/ should be considered
separately.

Adjust some return variables to match the function return types.

With GCC 13 and defconfig, these changes reduced the size of a test
kernel image by 5,432 bytes on arm64 and by 248 bytes on riscv; there
were no changes in size on x86_64, powerpc, or m68k.

Signed-off-by: Thorsten Blum <thorsten.blum@toblux.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/bitops/__ffs.h         | 4 ++--
 include/asm-generic/bitops/__fls.h         | 4 ++--
 include/asm-generic/bitops/builtin-__ffs.h | 2 +-
 include/asm-generic/bitops/builtin-__fls.h | 2 +-
 include/linux/bitops.h                     | 6 +++---
 tools/include/asm-generic/bitops/__ffs.h   | 4 ++--
 tools/include/asm-generic/bitops/__fls.h   | 4 ++--
 tools/include/linux/bitops.h               | 2 +-
 8 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/bitops/__ffs.h b/include/asm-generic/bitops/__ffs.h
index 446fea6dda78..2d08c750c8a7 100644
--- a/include/asm-generic/bitops/__ffs.h
+++ b/include/asm-generic/bitops/__ffs.h
@@ -10,9 +10,9 @@
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long generic___ffs(unsigned long word)
+static __always_inline unsigned int generic___ffs(unsigned long word)
 {
-	int num = 0;
+	unsigned int num = 0;
 
 #if BITS_PER_LONG == 64
 	if ((word & 0xffffffff) == 0) {
diff --git a/include/asm-generic/bitops/__fls.h b/include/asm-generic/bitops/__fls.h
index 54ccccf96e21..e974ec932ec1 100644
--- a/include/asm-generic/bitops/__fls.h
+++ b/include/asm-generic/bitops/__fls.h
@@ -10,9 +10,9 @@
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long generic___fls(unsigned long word)
+static __always_inline unsigned int generic___fls(unsigned long word)
 {
-	int num = BITS_PER_LONG - 1;
+	unsigned int num = BITS_PER_LONG - 1;
 
 #if BITS_PER_LONG == 64
 	if (!(word & (~0ul << 32))) {
diff --git a/include/asm-generic/bitops/builtin-__ffs.h b/include/asm-generic/bitops/builtin-__ffs.h
index 87024da44d10..cf4b3d33bf96 100644
--- a/include/asm-generic/bitops/builtin-__ffs.h
+++ b/include/asm-generic/bitops/builtin-__ffs.h
@@ -8,7 +8,7 @@
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned int __ffs(unsigned long word)
 {
 	return __builtin_ctzl(word);
 }
diff --git a/include/asm-generic/bitops/builtin-__fls.h b/include/asm-generic/bitops/builtin-__fls.h
index 43a5aa9afbdb..6d72fc8a5259 100644
--- a/include/asm-generic/bitops/builtin-__fls.h
+++ b/include/asm-generic/bitops/builtin-__fls.h
@@ -8,7 +8,7 @@
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned int __fls(unsigned long word)
 {
 	return (sizeof(word) * 8) - 1 - __builtin_clzl(word);
 }
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe..f60220f119e2 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -200,7 +200,7 @@ static __always_inline __s64 sign_extend64(__u64 value, int index)
 	return (__s64)(value << shift) >> shift;
 }
 
-static inline unsigned fls_long(unsigned long l)
+static inline unsigned int fls_long(unsigned long l)
 {
 	if (sizeof(l) == 4)
 		return fls(l);
@@ -236,7 +236,7 @@ static inline int get_count_order_long(unsigned long l)
  * The result is not defined if no bits are set, so check that @word
  * is non-zero before calling this.
  */
-static inline unsigned long __ffs64(u64 word)
+static inline unsigned int __ffs64(u64 word)
 {
 #if BITS_PER_LONG == 32
 	if (((u32)word) == 0UL)
@@ -252,7 +252,7 @@ static inline unsigned long __ffs64(u64 word)
  * @word: The word to search
  * @n: Bit to find
  */
-static inline unsigned long fns(unsigned long word, unsigned int n)
+static inline unsigned int fns(unsigned long word, unsigned int n)
 {
 	unsigned int bit;
 
diff --git a/tools/include/asm-generic/bitops/__ffs.h b/tools/include/asm-generic/bitops/__ffs.h
index 9d1310519497..2d94c1e9b2f3 100644
--- a/tools/include/asm-generic/bitops/__ffs.h
+++ b/tools/include/asm-generic/bitops/__ffs.h
@@ -11,9 +11,9 @@
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned int __ffs(unsigned long word)
 {
-	int num = 0;
+	unsigned int num = 0;
 
 #if __BITS_PER_LONG == 64
 	if ((word & 0xffffffff) == 0) {
diff --git a/tools/include/asm-generic/bitops/__fls.h b/tools/include/asm-generic/bitops/__fls.h
index 54ccccf96e21..e974ec932ec1 100644
--- a/tools/include/asm-generic/bitops/__fls.h
+++ b/tools/include/asm-generic/bitops/__fls.h
@@ -10,9 +10,9 @@
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long generic___fls(unsigned long word)
+static __always_inline unsigned int generic___fls(unsigned long word)
 {
-	int num = BITS_PER_LONG - 1;
+	unsigned int num = BITS_PER_LONG - 1;
 
 #if BITS_PER_LONG == 64
 	if (!(word & (~0ul << 32))) {
diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h
index 7319f6ced108..8e073a111d2a 100644
--- a/tools/include/linux/bitops.h
+++ b/tools/include/linux/bitops.h
@@ -70,7 +70,7 @@ static inline unsigned long hweight_long(unsigned long w)
 	return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
 }
 
-static inline unsigned fls_long(unsigned long l)
+static inline unsigned int fls_long(unsigned long l)
 {
 	if (sizeof(l) == 4)
 		return fls(l);
-- 
cgit v1.2.3


From f178e96de7f0868e1b4d6df687794961f30125f2 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 29 Mar 2024 21:32:11 +0100
Subject: arch: Remove struct fb_info from video helpers

The per-architecture video helpers do not depend on struct fb_info
or anything else from fbdev. Remove it from the interface and replace
fb_is_primary_device() with video_is_primary_device(). The new helper
is similar in functionality, but can operate on non-fbdev devices.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Helge Deller <deller@gmx.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/parisc/include/asm/fb.h     |  8 +++++---
 arch/parisc/video/fbdev.c        |  9 +++++----
 arch/sparc/include/asm/fb.h      |  7 ++++---
 arch/sparc/video/fbdev.c         | 17 ++++++++---------
 arch/x86/include/asm/fb.h        |  8 +++++---
 arch/x86/video/fbdev.c           | 18 +++++++-----------
 drivers/video/fbdev/core/fbcon.c |  2 +-
 include/asm-generic/fb.h         | 11 ++++++-----
 8 files changed, 41 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/arch/parisc/include/asm/fb.h b/arch/parisc/include/asm/fb.h
index 658a8a7dc531..ed2a195a3e76 100644
--- a/arch/parisc/include/asm/fb.h
+++ b/arch/parisc/include/asm/fb.h
@@ -2,11 +2,13 @@
 #ifndef _ASM_FB_H_
 #define _ASM_FB_H_
 
-struct fb_info;
+#include <linux/types.h>
+
+struct device;
 
 #if defined(CONFIG_STI_CORE)
-int fb_is_primary_device(struct fb_info *info);
-#define fb_is_primary_device fb_is_primary_device
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
 #endif
 
 #include <asm-generic/fb.h>
diff --git a/arch/parisc/video/fbdev.c b/arch/parisc/video/fbdev.c
index e4f8ac99fc9e..540fa0c919d5 100644
--- a/arch/parisc/video/fbdev.c
+++ b/arch/parisc/video/fbdev.c
@@ -5,12 +5,13 @@
  * Copyright (C) 2001-2002 Thomas Bogendoerfer <tsbogend@alpha.franken.de>
  */
 
-#include <linux/fb.h>
 #include <linux/module.h>
 
 #include <video/sticore.h>
 
-int fb_is_primary_device(struct fb_info *info)
+#include <asm/fb.h>
+
+bool video_is_primary_device(struct device *dev)
 {
 	struct sti_struct *sti;
 
@@ -21,6 +22,6 @@ int fb_is_primary_device(struct fb_info *info)
 		return true;
 
 	/* return true if it's the default built-in framebuffer driver */
-	return (sti->dev == info->device);
+	return (sti->dev == dev);
 }
-EXPORT_SYMBOL(fb_is_primary_device);
+EXPORT_SYMBOL(video_is_primary_device);
diff --git a/arch/sparc/include/asm/fb.h b/arch/sparc/include/asm/fb.h
index 24440c0fda49..07f0325d6921 100644
--- a/arch/sparc/include/asm/fb.h
+++ b/arch/sparc/include/asm/fb.h
@@ -3,10 +3,11 @@
 #define _SPARC_FB_H_
 
 #include <linux/io.h>
+#include <linux/types.h>
 
 #include <asm/page.h>
 
-struct fb_info;
+struct device;
 
 #ifdef CONFIG_SPARC32
 static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
@@ -18,8 +19,8 @@ static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
 #define pgprot_framebuffer pgprot_framebuffer
 #endif
 
-int fb_is_primary_device(struct fb_info *info);
-#define fb_is_primary_device fb_is_primary_device
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
 
 static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
 {
diff --git a/arch/sparc/video/fbdev.c b/arch/sparc/video/fbdev.c
index bff66dd1909a..e46f0499c277 100644
--- a/arch/sparc/video/fbdev.c
+++ b/arch/sparc/video/fbdev.c
@@ -1,26 +1,25 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/console.h>
-#include <linux/fb.h>
+#include <linux/device.h>
 #include <linux/module.h>
 
+#include <asm/fb.h>
 #include <asm/prom.h>
 
-int fb_is_primary_device(struct fb_info *info)
+bool video_is_primary_device(struct device *dev)
 {
-	struct device *dev = info->device;
-	struct device_node *node;
+	struct device_node *node = dev->of_node;
 
 	if (console_set_on_cmdline)
-		return 0;
+		return false;
 
-	node = dev->of_node;
 	if (node && node == of_console_device)
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
-EXPORT_SYMBOL(fb_is_primary_device);
+EXPORT_SYMBOL(video_is_primary_device);
 
 MODULE_DESCRIPTION("Sparc fbdev helpers");
 MODULE_LICENSE("GPL");
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index c3b9582de7ef..999db3379286 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -2,17 +2,19 @@
 #ifndef _ASM_X86_FB_H
 #define _ASM_X86_FB_H
 
+#include <linux/types.h>
+
 #include <asm/page.h>
 
-struct fb_info;
+struct device;
 
 pgprot_t pgprot_framebuffer(pgprot_t prot,
 			    unsigned long vm_start, unsigned long vm_end,
 			    unsigned long offset);
 #define pgprot_framebuffer pgprot_framebuffer
 
-int fb_is_primary_device(struct fb_info *info);
-#define fb_is_primary_device fb_is_primary_device
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
 
 #include <asm-generic/fb.h>
 
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index 1dd6528cc947..4d87ce8e257f 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -7,7 +7,6 @@
  *
  */
 
-#include <linux/fb.h>
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/vgaarb.h>
@@ -25,20 +24,17 @@ pgprot_t pgprot_framebuffer(pgprot_t prot,
 }
 EXPORT_SYMBOL(pgprot_framebuffer);
 
-int fb_is_primary_device(struct fb_info *info)
+bool video_is_primary_device(struct device *dev)
 {
-	struct device *device = info->device;
-	struct pci_dev *pci_dev;
+	struct pci_dev *pdev;
 
-	if (!device || !dev_is_pci(device))
-		return 0;
+	if (!dev_is_pci(dev))
+		return false;
 
-	pci_dev = to_pci_dev(device);
+	pdev = to_pci_dev(dev);
 
-	if (pci_dev == vga_default_device())
-		return 1;
-	return 0;
+	return (pdev == vga_default_device());
 }
-EXPORT_SYMBOL(fb_is_primary_device);
+EXPORT_SYMBOL(video_is_primary_device);
 
 MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
index fcabc668e9fb..3f7333dca508 100644
--- a/drivers/video/fbdev/core/fbcon.c
+++ b/drivers/video/fbdev/core/fbcon.c
@@ -2907,7 +2907,7 @@ void fbcon_remap_all(struct fb_info *info)
 static void fbcon_select_primary(struct fb_info *info)
 {
 	if (!map_override && primary_device == -1 &&
-	    fb_is_primary_device(info)) {
+	    video_is_primary_device(info->device)) {
 		int i;
 
 		printk(KERN_INFO "fbcon: %s (fb%i) is primary device\n",
diff --git a/include/asm-generic/fb.h b/include/asm-generic/fb.h
index 6ccabb400aa6..4788c1e1c6bc 100644
--- a/include/asm-generic/fb.h
+++ b/include/asm-generic/fb.h
@@ -10,8 +10,9 @@
 #include <linux/io.h>
 #include <linux/mm_types.h>
 #include <linux/pgtable.h>
+#include <linux/types.h>
 
-struct fb_info;
+struct device;
 
 #ifndef pgprot_framebuffer
 #define pgprot_framebuffer pgprot_framebuffer
@@ -23,11 +24,11 @@ static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
 }
 #endif
 
-#ifndef fb_is_primary_device
-#define fb_is_primary_device fb_is_primary_device
-static inline int fb_is_primary_device(struct fb_info *info)
+#ifndef video_is_primary_device
+#define video_is_primary_device video_is_primary_device
+static inline bool video_is_primary_device(struct device *dev)
 {
-	return 0;
+	return false;
 }
 #endif
 
-- 
cgit v1.2.3


From 2fd001cd36005846caa6456fff1008c6f5bae9d4 Mon Sep 17 00:00:00 2001
From: Thomas Zimmermann <tzimmermann@suse.de>
Date: Fri, 29 Mar 2024 21:32:12 +0100
Subject: arch: Rename fbdev header and source files

The per-architecture fbdev code has no dependencies on fbdev and can
be used for any video-related subsystem. Rename the files to 'video'.
Use video-sti.c on parisc as the source file depends on CONFIG_STI_CORE.

On arc, arm, arm64, sh, and um the asm header file is an empty wrapper
around the file in asm-generic. Let Kbuild generate the file. The build
system does this automatically. Only um needs to generate video.h
explicitly, so that it overrides the host architecture's header. The
latter would otherwise interfere with the build.

Further update all includes statements, include guards, and Makefiles.
Also update a few strings and comments to refer to video instead of
fbdev.

v3:
- arc, arm, arm64, sh: generate asm header via build system (Sam,
Helge, Arnd)
- um: rename fb.h to video.h
- fix typo in commit message (Sam)

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
Reviewed-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Vineet Gupta <vgupta@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Helge Deller <deller@gmx.de>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/arc/include/asm/fb.h          |   8 ---
 arch/arm/include/asm/fb.h          |   6 --
 arch/arm64/include/asm/fb.h        |  10 ---
 arch/loongarch/include/asm/fb.h    |  31 ---------
 arch/loongarch/include/asm/video.h |  31 +++++++++
 arch/m68k/include/asm/fb.h         |  32 ---------
 arch/m68k/include/asm/video.h      |  32 +++++++++
 arch/mips/include/asm/fb.h         |  38 -----------
 arch/mips/include/asm/video.h      |  38 +++++++++++
 arch/parisc/include/asm/fb.h       |  16 -----
 arch/parisc/include/asm/video.h    |  16 +++++
 arch/parisc/video/Makefile         |   2 +-
 arch/parisc/video/fbdev.c          |  27 --------
 arch/parisc/video/video-sti.c      |  27 ++++++++
 arch/powerpc/include/asm/fb.h      |  17 -----
 arch/powerpc/include/asm/video.h   |  17 +++++
 arch/powerpc/kernel/pci-common.c   |   2 +-
 arch/sh/include/asm/fb.h           |   7 --
 arch/sparc/include/asm/fb.h        |  45 ------------
 arch/sparc/include/asm/video.h     |  45 ++++++++++++
 arch/sparc/video/Makefile          |   2 +-
 arch/sparc/video/fbdev.c           |  25 -------
 arch/sparc/video/video.c           |  25 +++++++
 arch/um/include/asm/Kbuild         |   2 +-
 arch/x86/include/asm/fb.h          |  21 ------
 arch/x86/include/asm/video.h       |  21 ++++++
 arch/x86/video/Makefile            |   2 +-
 arch/x86/video/fbdev.c             |  40 -----------
 arch/x86/video/video.c             |  41 +++++++++++
 include/asm-generic/Kbuild         |   2 +-
 include/asm-generic/fb.h           | 136 -------------------------------------
 include/asm-generic/video.h        | 136 +++++++++++++++++++++++++++++++++++++
 include/linux/fb.h                 |   2 +-
 33 files changed, 436 insertions(+), 466 deletions(-)
 delete mode 100644 arch/arc/include/asm/fb.h
 delete mode 100644 arch/arm/include/asm/fb.h
 delete mode 100644 arch/arm64/include/asm/fb.h
 delete mode 100644 arch/loongarch/include/asm/fb.h
 create mode 100644 arch/loongarch/include/asm/video.h
 delete mode 100644 arch/m68k/include/asm/fb.h
 create mode 100644 arch/m68k/include/asm/video.h
 delete mode 100644 arch/mips/include/asm/fb.h
 create mode 100644 arch/mips/include/asm/video.h
 delete mode 100644 arch/parisc/include/asm/fb.h
 create mode 100644 arch/parisc/include/asm/video.h
 delete mode 100644 arch/parisc/video/fbdev.c
 create mode 100644 arch/parisc/video/video-sti.c
 delete mode 100644 arch/powerpc/include/asm/fb.h
 create mode 100644 arch/powerpc/include/asm/video.h
 delete mode 100644 arch/sh/include/asm/fb.h
 delete mode 100644 arch/sparc/include/asm/fb.h
 create mode 100644 arch/sparc/include/asm/video.h
 delete mode 100644 arch/sparc/video/fbdev.c
 create mode 100644 arch/sparc/video/video.c
 delete mode 100644 arch/x86/include/asm/fb.h
 create mode 100644 arch/x86/include/asm/video.h
 delete mode 100644 arch/x86/video/fbdev.c
 create mode 100644 arch/x86/video/video.c
 delete mode 100644 include/asm-generic/fb.h
 create mode 100644 include/asm-generic/video.h

(limited to 'include')

diff --git a/arch/arc/include/asm/fb.h b/arch/arc/include/asm/fb.h
deleted file mode 100644
index 9c2383d29cbb..000000000000
--- a/arch/arc/include/asm/fb.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/arm/include/asm/fb.h b/arch/arm/include/asm/fb.h
deleted file mode 100644
index ce20a43c3033..000000000000
--- a/arch/arm/include/asm/fb.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/arm64/include/asm/fb.h b/arch/arm64/include/asm/fb.h
deleted file mode 100644
index 1a495d8fb2ce..000000000000
--- a/arch/arm64/include/asm/fb.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2012 ARM Ltd.
- */
-#ifndef __ASM_FB_H_
-#define __ASM_FB_H_
-
-#include <asm-generic/fb.h>
-
-#endif /* __ASM_FB_H_ */
diff --git a/arch/loongarch/include/asm/fb.h b/arch/loongarch/include/asm/fb.h
deleted file mode 100644
index 0b218b10a9ec..000000000000
--- a/arch/loongarch/include/asm/fb.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
- */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <linux/compiler.h>
-#include <linux/string.h>
-
-static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
-{
-	memcpy(to, (void __force *)from, n);
-}
-#define fb_memcpy_fromio fb_memcpy_fromio
-
-static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
-{
-	memcpy((void __force *)to, from, n);
-}
-#define fb_memcpy_toio fb_memcpy_toio
-
-static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
-{
-	memset((void __force *)addr, c, n);
-}
-#define fb_memset fb_memset_io
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/loongarch/include/asm/video.h b/arch/loongarch/include/asm/video.h
new file mode 100644
index 000000000000..9f76845f2d4f
--- /dev/null
+++ b/arch/loongarch/include/asm/video.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2020-2022 Loongson Technology Corporation Limited
+ */
+#ifndef _ASM_VIDEO_H_
+#define _ASM_VIDEO_H_
+
+#include <linux/compiler.h>
+#include <linux/string.h>
+
+static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
+{
+	memcpy(to, (void __force *)from, n);
+}
+#define fb_memcpy_fromio fb_memcpy_fromio
+
+static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
+{
+	memcpy((void __force *)to, from, n);
+}
+#define fb_memcpy_toio fb_memcpy_toio
+
+static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
+{
+	memset((void __force *)addr, c, n);
+}
+#define fb_memset fb_memset_io
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_VIDEO_H_ */
diff --git a/arch/m68k/include/asm/fb.h b/arch/m68k/include/asm/fb.h
deleted file mode 100644
index 9941b7434b69..000000000000
--- a/arch/m68k/include/asm/fb.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <asm/page.h>
-#include <asm/setup.h>
-
-static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
-					  unsigned long vm_start, unsigned long vm_end,
-					  unsigned long offset)
-{
-#ifdef CONFIG_MMU
-#ifdef CONFIG_SUN3
-	pgprot_val(prot) |= SUN3_PAGE_NOCACHE;
-#else
-	if (CPU_IS_020_OR_030)
-		pgprot_val(prot) |= _PAGE_NOCACHE030;
-	if (CPU_IS_040_OR_060) {
-		pgprot_val(prot) &= _CACHEMASK040;
-		/* Use no-cache mode, serialized */
-		pgprot_val(prot) |= _PAGE_NOCACHE_S;
-	}
-#endif /* CONFIG_SUN3 */
-#endif /* CONFIG_MMU */
-
-	return prot;
-}
-#define pgprot_framebuffer pgprot_framebuffer
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/m68k/include/asm/video.h b/arch/m68k/include/asm/video.h
new file mode 100644
index 000000000000..6cf2194c413d
--- /dev/null
+++ b/arch/m68k/include/asm/video.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_VIDEO_H_
+#define _ASM_VIDEO_H_
+
+#include <asm/page.h>
+#include <asm/setup.h>
+
+static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
+					  unsigned long vm_start, unsigned long vm_end,
+					  unsigned long offset)
+{
+#ifdef CONFIG_MMU
+#ifdef CONFIG_SUN3
+	pgprot_val(prot) |= SUN3_PAGE_NOCACHE;
+#else
+	if (CPU_IS_020_OR_030)
+		pgprot_val(prot) |= _PAGE_NOCACHE030;
+	if (CPU_IS_040_OR_060) {
+		pgprot_val(prot) &= _CACHEMASK040;
+		/* Use no-cache mode, serialized */
+		pgprot_val(prot) |= _PAGE_NOCACHE_S;
+	}
+#endif /* CONFIG_SUN3 */
+#endif /* CONFIG_MMU */
+
+	return prot;
+}
+#define pgprot_framebuffer pgprot_framebuffer
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_VIDEO_H_ */
diff --git a/arch/mips/include/asm/fb.h b/arch/mips/include/asm/fb.h
deleted file mode 100644
index d98d6681d64e..000000000000
--- a/arch/mips/include/asm/fb.h
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <asm/page.h>
-
-static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
-					  unsigned long vm_start, unsigned long vm_end,
-					  unsigned long offset)
-{
-	return pgprot_noncached(prot);
-}
-#define pgprot_framebuffer pgprot_framebuffer
-
-/*
- * MIPS doesn't define __raw_ I/O macros, so the helpers
- * in <asm-generic/fb.h> don't generate fb_readq() and
- * fb_write(). We have to provide them here.
- *
- * TODO: Convert MIPS to generic I/O. The helpers below can
- *       then be removed.
- */
-#ifdef CONFIG_64BIT
-static inline u64 fb_readq(const volatile void __iomem *addr)
-{
-	return __raw_readq(addr);
-}
-#define fb_readq fb_readq
-
-static inline void fb_writeq(u64 b, volatile void __iomem *addr)
-{
-	__raw_writeq(b, addr);
-}
-#define fb_writeq fb_writeq
-#endif
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/mips/include/asm/video.h b/arch/mips/include/asm/video.h
new file mode 100644
index 000000000000..007c106d980f
--- /dev/null
+++ b/arch/mips/include/asm/video.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_VIDEO_H_
+#define _ASM_VIDEO_H_
+
+#include <asm/page.h>
+
+static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
+					  unsigned long vm_start, unsigned long vm_end,
+					  unsigned long offset)
+{
+	return pgprot_noncached(prot);
+}
+#define pgprot_framebuffer pgprot_framebuffer
+
+/*
+ * MIPS doesn't define __raw_ I/O macros, so the helpers
+ * in <asm-generic/video.h> don't generate fb_readq() and
+ * fb_writeq(). We have to provide them here.
+ *
+ * TODO: Convert MIPS to generic I/O. The helpers below can
+ *       then be removed.
+ */
+#ifdef CONFIG_64BIT
+static inline u64 fb_readq(const volatile void __iomem *addr)
+{
+	return __raw_readq(addr);
+}
+#define fb_readq fb_readq
+
+static inline void fb_writeq(u64 b, volatile void __iomem *addr)
+{
+	__raw_writeq(b, addr);
+}
+#define fb_writeq fb_writeq
+#endif
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_VIDEO_H_ */
diff --git a/arch/parisc/include/asm/fb.h b/arch/parisc/include/asm/fb.h
deleted file mode 100644
index ed2a195a3e76..000000000000
--- a/arch/parisc/include/asm/fb.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <linux/types.h>
-
-struct device;
-
-#if defined(CONFIG_STI_CORE)
-bool video_is_primary_device(struct device *dev);
-#define video_is_primary_device video_is_primary_device
-#endif
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/parisc/include/asm/video.h b/arch/parisc/include/asm/video.h
new file mode 100644
index 000000000000..c5dff3223194
--- /dev/null
+++ b/arch/parisc/include/asm/video.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_VIDEO_H_
+#define _ASM_VIDEO_H_
+
+#include <linux/types.h>
+
+struct device;
+
+#if defined(CONFIG_STI_CORE)
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
+#endif
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_VIDEO_H_ */
diff --git a/arch/parisc/video/Makefile b/arch/parisc/video/Makefile
index 16a73cce4661..b5db5b42880f 100644
--- a/arch/parisc/video/Makefile
+++ b/arch/parisc/video/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-obj-$(CONFIG_STI_CORE) += fbdev.o
+obj-$(CONFIG_STI_CORE) += video-sti.o
diff --git a/arch/parisc/video/fbdev.c b/arch/parisc/video/fbdev.c
deleted file mode 100644
index 540fa0c919d5..000000000000
--- a/arch/parisc/video/fbdev.c
+++ /dev/null
@@ -1,27 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2000 Philipp Rumpf <prumpf@tux.org>
- * Copyright (C) 2001-2020 Helge Deller <deller@gmx.de>
- * Copyright (C) 2001-2002 Thomas Bogendoerfer <tsbogend@alpha.franken.de>
- */
-
-#include <linux/module.h>
-
-#include <video/sticore.h>
-
-#include <asm/fb.h>
-
-bool video_is_primary_device(struct device *dev)
-{
-	struct sti_struct *sti;
-
-	sti = sti_get_rom(0);
-
-	/* if no built-in graphics card found, allow any fb driver as default */
-	if (!sti)
-		return true;
-
-	/* return true if it's the default built-in framebuffer driver */
-	return (sti->dev == dev);
-}
-EXPORT_SYMBOL(video_is_primary_device);
diff --git a/arch/parisc/video/video-sti.c b/arch/parisc/video/video-sti.c
new file mode 100644
index 000000000000..564661e87093
--- /dev/null
+++ b/arch/parisc/video/video-sti.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2000 Philipp Rumpf <prumpf@tux.org>
+ * Copyright (C) 2001-2020 Helge Deller <deller@gmx.de>
+ * Copyright (C) 2001-2002 Thomas Bogendoerfer <tsbogend@alpha.franken.de>
+ */
+
+#include <linux/module.h>
+
+#include <video/sticore.h>
+
+#include <asm/video.h>
+
+bool video_is_primary_device(struct device *dev)
+{
+	struct sti_struct *sti;
+
+	sti = sti_get_rom(0);
+
+	/* if no built-in graphics card found, allow any fb driver as default */
+	if (!sti)
+		return true;
+
+	/* return true if it's the default built-in framebuffer driver */
+	return (sti->dev == dev);
+}
+EXPORT_SYMBOL(video_is_primary_device);
diff --git a/arch/powerpc/include/asm/fb.h b/arch/powerpc/include/asm/fb.h
deleted file mode 100644
index c0c5d1df7ad1..000000000000
--- a/arch/powerpc/include/asm/fb.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <asm/page.h>
-
-static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
-					  unsigned long vm_start, unsigned long vm_end,
-					  unsigned long offset)
-{
-	return __phys_mem_access_prot(PHYS_PFN(offset), vm_end - vm_start, prot);
-}
-#define pgprot_framebuffer pgprot_framebuffer
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/powerpc/include/asm/video.h b/arch/powerpc/include/asm/video.h
new file mode 100644
index 000000000000..e1770114ffc3
--- /dev/null
+++ b/arch/powerpc/include/asm/video.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_VIDEO_H_
+#define _ASM_VIDEO_H_
+
+#include <asm/page.h>
+
+static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
+					  unsigned long vm_start, unsigned long vm_end,
+					  unsigned long offset)
+{
+	return __phys_mem_access_prot(PHYS_PFN(offset), vm_end - vm_start, prot);
+}
+#define pgprot_framebuffer pgprot_framebuffer
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_VIDEO_H_ */
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index d95a48eff412..eac84d687b53 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -517,7 +517,7 @@ int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma)
 }
 
 /*
- * This one is used by /dev/mem and fbdev who have no clue about the
+ * This one is used by /dev/mem and video who have no clue about the
  * PCI device, it tries to find the PCI device first and calls the
  * above routine
  */
diff --git a/arch/sh/include/asm/fb.h b/arch/sh/include/asm/fb.h
deleted file mode 100644
index 19df13ee9ca7..000000000000
--- a/arch/sh/include/asm/fb.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_FB_H_
-#define _ASM_FB_H_
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_FB_H_ */
diff --git a/arch/sparc/include/asm/fb.h b/arch/sparc/include/asm/fb.h
deleted file mode 100644
index 07f0325d6921..000000000000
--- a/arch/sparc/include/asm/fb.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _SPARC_FB_H_
-#define _SPARC_FB_H_
-
-#include <linux/io.h>
-#include <linux/types.h>
-
-#include <asm/page.h>
-
-struct device;
-
-#ifdef CONFIG_SPARC32
-static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
-					  unsigned long vm_start, unsigned long vm_end,
-					  unsigned long offset)
-{
-	return prot;
-}
-#define pgprot_framebuffer pgprot_framebuffer
-#endif
-
-bool video_is_primary_device(struct device *dev);
-#define video_is_primary_device video_is_primary_device
-
-static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
-{
-	sbus_memcpy_fromio(to, from, n);
-}
-#define fb_memcpy_fromio fb_memcpy_fromio
-
-static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
-{
-	sbus_memcpy_toio(to, from, n);
-}
-#define fb_memcpy_toio fb_memcpy_toio
-
-static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
-{
-	sbus_memset_io(addr, c, n);
-}
-#define fb_memset fb_memset_io
-
-#include <asm-generic/fb.h>
-
-#endif /* _SPARC_FB_H_ */
diff --git a/arch/sparc/include/asm/video.h b/arch/sparc/include/asm/video.h
new file mode 100644
index 000000000000..a6f48f52db58
--- /dev/null
+++ b/arch/sparc/include/asm/video.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SPARC_VIDEO_H_
+#define _SPARC_VIDEO_H_
+
+#include <linux/io.h>
+#include <linux/types.h>
+
+#include <asm/page.h>
+
+struct device;
+
+#ifdef CONFIG_SPARC32
+static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
+					  unsigned long vm_start, unsigned long vm_end,
+					  unsigned long offset)
+{
+	return prot;
+}
+#define pgprot_framebuffer pgprot_framebuffer
+#endif
+
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
+
+static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
+{
+	sbus_memcpy_fromio(to, from, n);
+}
+#define fb_memcpy_fromio fb_memcpy_fromio
+
+static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
+{
+	sbus_memcpy_toio(to, from, n);
+}
+#define fb_memcpy_toio fb_memcpy_toio
+
+static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
+{
+	sbus_memset_io(addr, c, n);
+}
+#define fb_memset fb_memset_io
+
+#include <asm-generic/video.h>
+
+#endif /* _SPARC_VIDEO_H_ */
diff --git a/arch/sparc/video/Makefile b/arch/sparc/video/Makefile
index 9dd82880a027..fdf83a408d75 100644
--- a/arch/sparc/video/Makefile
+++ b/arch/sparc/video/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-obj-y	+= fbdev.o
+obj-y	+= video.o
diff --git a/arch/sparc/video/fbdev.c b/arch/sparc/video/fbdev.c
deleted file mode 100644
index e46f0499c277..000000000000
--- a/arch/sparc/video/fbdev.c
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/console.h>
-#include <linux/device.h>
-#include <linux/module.h>
-
-#include <asm/fb.h>
-#include <asm/prom.h>
-
-bool video_is_primary_device(struct device *dev)
-{
-	struct device_node *node = dev->of_node;
-
-	if (console_set_on_cmdline)
-		return false;
-
-	if (node && node == of_console_device)
-		return true;
-
-	return false;
-}
-EXPORT_SYMBOL(video_is_primary_device);
-
-MODULE_DESCRIPTION("Sparc fbdev helpers");
-MODULE_LICENSE("GPL");
diff --git a/arch/sparc/video/video.c b/arch/sparc/video/video.c
new file mode 100644
index 000000000000..2414380caadc
--- /dev/null
+++ b/arch/sparc/video/video.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/console.h>
+#include <linux/device.h>
+#include <linux/module.h>
+
+#include <asm/prom.h>
+#include <asm/video.h>
+
+bool video_is_primary_device(struct device *dev)
+{
+	struct device_node *node = dev->of_node;
+
+	if (console_set_on_cmdline)
+		return false;
+
+	if (node && node == of_console_device)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(video_is_primary_device);
+
+MODULE_DESCRIPTION("Sparc video helpers");
+MODULE_LICENSE("GPL");
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b2d834a29f3a..6fe34779291a 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -8,7 +8,6 @@ generic-y += dma-mapping.h
 generic-y += emergency-restart.h
 generic-y += exec.h
 generic-y += extable.h
-generic-y += fb.h
 generic-y += ftrace.h
 generic-y += hw_irq.h
 generic-y += irq_regs.h
@@ -28,3 +27,4 @@ generic-y += trace_clock.h
 generic-y += kprobes.h
 generic-y += mm_hooks.h
 generic-y += vga.h
+generic-y += video.h
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
deleted file mode 100644
index 999db3379286..000000000000
--- a/arch/x86/include/asm/fb.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_FB_H
-#define _ASM_X86_FB_H
-
-#include <linux/types.h>
-
-#include <asm/page.h>
-
-struct device;
-
-pgprot_t pgprot_framebuffer(pgprot_t prot,
-			    unsigned long vm_start, unsigned long vm_end,
-			    unsigned long offset);
-#define pgprot_framebuffer pgprot_framebuffer
-
-bool video_is_primary_device(struct device *dev);
-#define video_is_primary_device video_is_primary_device
-
-#include <asm-generic/fb.h>
-
-#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/video.h b/arch/x86/include/asm/video.h
new file mode 100644
index 000000000000..0950c9535fae
--- /dev/null
+++ b/arch/x86/include/asm/video.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VIDEO_H
+#define _ASM_X86_VIDEO_H
+
+#include <linux/types.h>
+
+#include <asm/page.h>
+
+struct device;
+
+pgprot_t pgprot_framebuffer(pgprot_t prot,
+			    unsigned long vm_start, unsigned long vm_end,
+			    unsigned long offset);
+#define pgprot_framebuffer pgprot_framebuffer
+
+bool video_is_primary_device(struct device *dev);
+#define video_is_primary_device video_is_primary_device
+
+#include <asm-generic/video.h>
+
+#endif /* _ASM_X86_VIDEO_H */
diff --git a/arch/x86/video/Makefile b/arch/x86/video/Makefile
index 9dd82880a027..fdf83a408d75 100644
--- a/arch/x86/video/Makefile
+++ b/arch/x86/video/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
-obj-y	+= fbdev.o
+obj-y	+= video.o
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
deleted file mode 100644
index 4d87ce8e257f..000000000000
--- a/arch/x86/video/fbdev.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file COPYING in the main directory of this archive
- * for more details.
- *
- */
-
-#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/vgaarb.h>
-#include <asm/fb.h>
-
-pgprot_t pgprot_framebuffer(pgprot_t prot,
-			    unsigned long vm_start, unsigned long vm_end,
-			    unsigned long offset)
-{
-	pgprot_val(prot) &= ~_PAGE_CACHE_MASK;
-	if (boot_cpu_data.x86 > 3)
-		pgprot_val(prot) |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS);
-
-	return prot;
-}
-EXPORT_SYMBOL(pgprot_framebuffer);
-
-bool video_is_primary_device(struct device *dev)
-{
-	struct pci_dev *pdev;
-
-	if (!dev_is_pci(dev))
-		return false;
-
-	pdev = to_pci_dev(dev);
-
-	return (pdev == vga_default_device());
-}
-EXPORT_SYMBOL(video_is_primary_device);
-
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/video/video.c b/arch/x86/video/video.c
new file mode 100644
index 000000000000..81fc97a2a837
--- /dev/null
+++ b/arch/x86/video/video.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file COPYING in the main directory of this archive
+ * for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/vgaarb.h>
+
+#include <asm/video.h>
+
+pgprot_t pgprot_framebuffer(pgprot_t prot,
+			    unsigned long vm_start, unsigned long vm_end,
+			    unsigned long offset)
+{
+	pgprot_val(prot) &= ~_PAGE_CACHE_MASK;
+	if (boot_cpu_data.x86 > 3)
+		pgprot_val(prot) |= cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS);
+
+	return prot;
+}
+EXPORT_SYMBOL(pgprot_framebuffer);
+
+bool video_is_primary_device(struct device *dev)
+{
+	struct pci_dev *pdev;
+
+	if (!dev_is_pci(dev))
+		return false;
+
+	pdev = to_pci_dev(dev);
+
+	return (pdev == vga_default_device());
+}
+EXPORT_SYMBOL(video_is_primary_device);
+
+MODULE_LICENSE("GPL");
diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index d436bee4d129..b20fa25a7e8d 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -22,7 +22,6 @@ mandatory-y += dma-mapping.h
 mandatory-y += dma.h
 mandatory-y += emergency-restart.h
 mandatory-y += exec.h
-mandatory-y += fb.h
 mandatory-y += ftrace.h
 mandatory-y += futex.h
 mandatory-y += hardirq.h
@@ -62,5 +61,6 @@ mandatory-y += uaccess.h
 mandatory-y += unaligned.h
 mandatory-y += vermagic.h
 mandatory-y += vga.h
+mandatory-y += video.h
 mandatory-y += word-at-a-time.h
 mandatory-y += xor.h
diff --git a/include/asm-generic/fb.h b/include/asm-generic/fb.h
deleted file mode 100644
index 4788c1e1c6bc..000000000000
--- a/include/asm-generic/fb.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef __ASM_GENERIC_FB_H_
-#define __ASM_GENERIC_FB_H_
-
-/*
- * Only include this header file from your architecture's <asm/fb.h>.
- */
-
-#include <linux/io.h>
-#include <linux/mm_types.h>
-#include <linux/pgtable.h>
-#include <linux/types.h>
-
-struct device;
-
-#ifndef pgprot_framebuffer
-#define pgprot_framebuffer pgprot_framebuffer
-static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
-					  unsigned long vm_start, unsigned long vm_end,
-					  unsigned long offset)
-{
-	return pgprot_writecombine(prot);
-}
-#endif
-
-#ifndef video_is_primary_device
-#define video_is_primary_device video_is_primary_device
-static inline bool video_is_primary_device(struct device *dev)
-{
-	return false;
-}
-#endif
-
-/*
- * I/O helpers for the framebuffer. Prefer these functions over their
- * regular counterparts. The regular I/O functions provide in-order
- * access and swap bytes to/from little-endian ordering. Neither is
- * required for framebuffers. Instead, the helpers read and write
- * raw framebuffer data. Independent operations can be reordered for
- * improved performance.
- */
-
-#ifndef fb_readb
-static inline u8 fb_readb(const volatile void __iomem *addr)
-{
-	return __raw_readb(addr);
-}
-#define fb_readb fb_readb
-#endif
-
-#ifndef fb_readw
-static inline u16 fb_readw(const volatile void __iomem *addr)
-{
-	return __raw_readw(addr);
-}
-#define fb_readw fb_readw
-#endif
-
-#ifndef fb_readl
-static inline u32 fb_readl(const volatile void __iomem *addr)
-{
-	return __raw_readl(addr);
-}
-#define fb_readl fb_readl
-#endif
-
-#ifndef fb_readq
-#if defined(__raw_readq)
-static inline u64 fb_readq(const volatile void __iomem *addr)
-{
-	return __raw_readq(addr);
-}
-#define fb_readq fb_readq
-#endif
-#endif
-
-#ifndef fb_writeb
-static inline void fb_writeb(u8 b, volatile void __iomem *addr)
-{
-	__raw_writeb(b, addr);
-}
-#define fb_writeb fb_writeb
-#endif
-
-#ifndef fb_writew
-static inline void fb_writew(u16 b, volatile void __iomem *addr)
-{
-	__raw_writew(b, addr);
-}
-#define fb_writew fb_writew
-#endif
-
-#ifndef fb_writel
-static inline void fb_writel(u32 b, volatile void __iomem *addr)
-{
-	__raw_writel(b, addr);
-}
-#define fb_writel fb_writel
-#endif
-
-#ifndef fb_writeq
-#if defined(__raw_writeq)
-static inline void fb_writeq(u64 b, volatile void __iomem *addr)
-{
-	__raw_writeq(b, addr);
-}
-#define fb_writeq fb_writeq
-#endif
-#endif
-
-#ifndef fb_memcpy_fromio
-static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
-{
-	memcpy_fromio(to, from, n);
-}
-#define fb_memcpy_fromio fb_memcpy_fromio
-#endif
-
-#ifndef fb_memcpy_toio
-static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
-{
-	memcpy_toio(to, from, n);
-}
-#define fb_memcpy_toio fb_memcpy_toio
-#endif
-
-#ifndef fb_memset
-static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
-{
-	memset_io(addr, c, n);
-}
-#define fb_memset fb_memset_io
-#endif
-
-#endif /* __ASM_GENERIC_FB_H_ */
diff --git a/include/asm-generic/video.h b/include/asm-generic/video.h
new file mode 100644
index 000000000000..b1da2309d943
--- /dev/null
+++ b/include/asm-generic/video.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_GENERIC_VIDEO_H_
+#define __ASM_GENERIC_VIDEO_H_
+
+/*
+ * Only include this header file from your architecture's <asm/fb.h>.
+ */
+
+#include <linux/io.h>
+#include <linux/mm_types.h>
+#include <linux/pgtable.h>
+#include <linux/types.h>
+
+struct device;
+
+#ifndef pgprot_framebuffer
+#define pgprot_framebuffer pgprot_framebuffer
+static inline pgprot_t pgprot_framebuffer(pgprot_t prot,
+					  unsigned long vm_start, unsigned long vm_end,
+					  unsigned long offset)
+{
+	return pgprot_writecombine(prot);
+}
+#endif
+
+#ifndef video_is_primary_device
+#define video_is_primary_device video_is_primary_device
+static inline bool video_is_primary_device(struct device *dev)
+{
+	return false;
+}
+#endif
+
+/*
+ * I/O helpers for the framebuffer. Prefer these functions over their
+ * regular counterparts. The regular I/O functions provide in-order
+ * access and swap bytes to/from little-endian ordering. Neither is
+ * required for framebuffers. Instead, the helpers read and write
+ * raw framebuffer data. Independent operations can be reordered for
+ * improved performance.
+ */
+
+#ifndef fb_readb
+static inline u8 fb_readb(const volatile void __iomem *addr)
+{
+	return __raw_readb(addr);
+}
+#define fb_readb fb_readb
+#endif
+
+#ifndef fb_readw
+static inline u16 fb_readw(const volatile void __iomem *addr)
+{
+	return __raw_readw(addr);
+}
+#define fb_readw fb_readw
+#endif
+
+#ifndef fb_readl
+static inline u32 fb_readl(const volatile void __iomem *addr)
+{
+	return __raw_readl(addr);
+}
+#define fb_readl fb_readl
+#endif
+
+#ifndef fb_readq
+#if defined(__raw_readq)
+static inline u64 fb_readq(const volatile void __iomem *addr)
+{
+	return __raw_readq(addr);
+}
+#define fb_readq fb_readq
+#endif
+#endif
+
+#ifndef fb_writeb
+static inline void fb_writeb(u8 b, volatile void __iomem *addr)
+{
+	__raw_writeb(b, addr);
+}
+#define fb_writeb fb_writeb
+#endif
+
+#ifndef fb_writew
+static inline void fb_writew(u16 b, volatile void __iomem *addr)
+{
+	__raw_writew(b, addr);
+}
+#define fb_writew fb_writew
+#endif
+
+#ifndef fb_writel
+static inline void fb_writel(u32 b, volatile void __iomem *addr)
+{
+	__raw_writel(b, addr);
+}
+#define fb_writel fb_writel
+#endif
+
+#ifndef fb_writeq
+#if defined(__raw_writeq)
+static inline void fb_writeq(u64 b, volatile void __iomem *addr)
+{
+	__raw_writeq(b, addr);
+}
+#define fb_writeq fb_writeq
+#endif
+#endif
+
+#ifndef fb_memcpy_fromio
+static inline void fb_memcpy_fromio(void *to, const volatile void __iomem *from, size_t n)
+{
+	memcpy_fromio(to, from, n);
+}
+#define fb_memcpy_fromio fb_memcpy_fromio
+#endif
+
+#ifndef fb_memcpy_toio
+static inline void fb_memcpy_toio(volatile void __iomem *to, const void *from, size_t n)
+{
+	memcpy_toio(to, from, n);
+}
+#define fb_memcpy_toio fb_memcpy_toio
+#endif
+
+#ifndef fb_memset
+static inline void fb_memset_io(volatile void __iomem *addr, int c, size_t n)
+{
+	memset_io(addr, c, n);
+}
+#define fb_memset fb_memset_io
+#endif
+
+#endif /* __ASM_GENERIC_VIDEO_H_ */
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 0dd27364d56f..bd97240dd272 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -12,7 +12,7 @@
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
-#include <asm/fb.h>
+#include <asm/video.h>
 
 struct backlight_device;
 struct device;
-- 
cgit v1.2.3


From e3dff9c80567e028e52706619dd8f7e72ad72297 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 3 May 2024 17:40:01 +0200
Subject: asm-generic: remove unused asm-generic/page.h

This file was used by c6x and blackfin in the past, but no architecture uses
it any more, and it is only useful for architectures that do not support
an MMU in the first place.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/page.h | 103 ---------------------------------------------
 1 file changed, 103 deletions(-)
 delete mode 100644 include/asm-generic/page.h

(limited to 'include')

diff --git a/include/asm-generic/page.h b/include/asm-generic/page.h
deleted file mode 100644
index 9773582fd96e..000000000000
--- a/include/asm-generic/page.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ASM_GENERIC_PAGE_H
-#define __ASM_GENERIC_PAGE_H
-/*
- * Generic page.h implementation, for NOMMU architectures.
- * This provides the dummy definitions for the memory management.
- */
-
-#ifdef CONFIG_MMU
-#error need to provide a real asm/page.h
-#endif
-
-
-/* PAGE_SHIFT determines the page size */
-
-#define PAGE_SHIFT	12
-#ifdef __ASSEMBLY__
-#define PAGE_SIZE	(1 << PAGE_SHIFT)
-#else
-#define PAGE_SIZE	(1UL << PAGE_SHIFT)
-#endif
-#define PAGE_MASK	(~(PAGE_SIZE-1))
-
-#include <asm/setup.h>
-
-#ifndef __ASSEMBLY__
-
-#define clear_page(page)	memset((page), 0, PAGE_SIZE)
-#define copy_page(to,from)	memcpy((to), (from), PAGE_SIZE)
-
-#define clear_user_page(page, vaddr, pg)	clear_page(page)
-#define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
-
-/*
- * These are used to make use of C type-checking..
- */
-typedef struct {
-	unsigned long pte;
-} pte_t;
-typedef struct {
-	unsigned long pmd[16];
-} pmd_t;
-typedef struct {
-	unsigned long pgd;
-} pgd_t;
-typedef struct {
-	unsigned long pgprot;
-} pgprot_t;
-typedef struct page *pgtable_t;
-
-#define pte_val(x)	((x).pte)
-#define pmd_val(x)	((&x)->pmd[0])
-#define pgd_val(x)	((x).pgd)
-#define pgprot_val(x)	((x).pgprot)
-
-#define __pte(x)	((pte_t) { (x) } )
-#define __pmd(x)	((pmd_t) { (x) } )
-#define __pgd(x)	((pgd_t) { (x) } )
-#define __pgprot(x)	((pgprot_t) { (x) } )
-
-extern unsigned long memory_start;
-extern unsigned long memory_end;
-
-#endif /* !__ASSEMBLY__ */
-
-#define PAGE_OFFSET		(0)
-
-#ifndef ARCH_PFN_OFFSET
-#define ARCH_PFN_OFFSET		(PAGE_OFFSET >> PAGE_SHIFT)
-#endif
-
-#ifndef __ASSEMBLY__
-
-#define __va(x) ((void *)((unsigned long) (x)))
-#define __pa(x) ((unsigned long) (x))
-
-static inline unsigned long virt_to_pfn(const void *kaddr)
-{
-	return __pa(kaddr) >> PAGE_SHIFT;
-}
-#define virt_to_pfn virt_to_pfn
-static inline void *pfn_to_virt(unsigned long pfn)
-{
-	return __va(pfn) << PAGE_SHIFT;
-}
-#define pfn_to_virt pfn_to_virt
-
-#define virt_to_page(addr)	pfn_to_page(virt_to_pfn(addr))
-#define page_to_virt(page)	pfn_to_virt(page_to_pfn(page))
-
-#ifndef page_to_phys
-#define page_to_phys(page)      ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
-#endif
-
-#define	virt_addr_valid(kaddr)	(((void *)(kaddr) >= (void *)PAGE_OFFSET) && \
-				((void *)(kaddr) < (void *)memory_end))
-
-#endif /* __ASSEMBLY__ */
-
-#include <asm-generic/memory_model.h>
-#include <asm-generic/getorder.h>
-
-#endif /* __ASM_GENERIC_PAGE_H */
-- 
cgit v1.2.3


From c1742dcb6bda5fd535fbaa2145f0a180bc329aa6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 2 May 2024 17:39:26 +0000
Subject: net: no longer acquire RTNL in threaded_show()

dev->threaded can be read locklessly, if we add
corresponding READ_ONCE()/WRITE_ONCE() annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/20240502173926.2010646-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/linux/netdevice.h | 2 +-
 net/core/dev.c            | 4 ++--
 net/core/net-sysfs.c      | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 41853424b41d..2814a15eed73 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2370,8 +2370,8 @@ struct net_device {
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
 	bool			proto_down;
+	bool			threaded;
 	unsigned		wol_enabled:1;
-	unsigned		threaded:1;
 
 	struct list_head	net_notifier_list;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index e02d2363347e..d6b24749eb2e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6531,7 +6531,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
 		}
 	}
 
-	dev->threaded = threaded;
+	WRITE_ONCE(dev->threaded, threaded);
 
 	/* Make sure kthread is created before THREADED bit
 	 * is set.
@@ -6622,7 +6622,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 	 * threaded mode will not be enabled in napi_enable().
 	 */
 	if (dev->threaded && napi_kthread_create(napi))
-		dev->threaded = 0;
+		dev->threaded = false;
 	netif_napi_set_irq(napi, -1);
 }
 EXPORT_SYMBOL(netif_napi_add_weight);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1f7f09e56771..4c27a360c294 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -605,13 +605,13 @@ static ssize_t threaded_show(struct device *dev,
 	struct net_device *netdev = to_net_dev(dev);
 	ssize_t ret = -EINVAL;
 
-	if (!rtnl_trylock())
-		return restart_syscall();
+	rcu_read_lock();
 
 	if (dev_isalive(netdev))
-		ret = sysfs_emit(buf, fmt_dec, netdev->threaded);
+		ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded));
+
+	rcu_read_unlock();
 
-	rtnl_unlock();
 	return ret;
 }
 
-- 
cgit v1.2.3


From 173e7622ccb3f46834bd4176ed363f435e142942 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Thu, 2 May 2024 10:54:22 -0700
Subject: Revert "net: mirror skb frag ref/unref helpers"

This reverts commit a580ea994fd37f4105028f5a85c38ff6508a2b25.

This revert is to resolve Dragos's report of page_pool leak here:
https://lore.kernel.org/lkml/20240424165646.1625690-2-dtatulea@nvidia.com/

The reverted patch interacts very badly with commit 2cc3aeb5eccc ("skbuff:
Fix a potential race while recycling page_pool packets"). The reverted
commit hopes that the pp_recycle + is_pp_page variables do not change
between the skb_frag_ref and skb_frag_unref operation. If such a change
occurs, the skb_frag_ref/unref will not operate on the same reference type.
In the case of Dragos's report, the grabbed ref was a pp ref, but the unref
was a page ref, because the pp_recycle setting on the skb was changed.

Attempting to fix this issue on the fly is risky. Lets revert and I hope
to reland this with better understanding and testing to ensure we don't
regress some edge case while streamlining skb reffing.

Fixes: a580ea994fd3 ("net: mirror skb frag ref/unref helpers")
Reported-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Link: https://lore.kernel.org/r/20240502175423.2456544-1-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c      |  2 +-
 drivers/net/ethernet/sun/cassini.c                 |  4 +-
 drivers/net/veth.c                                 |  2 +-
 include/linux/skbuff_ref.h                         | 39 ++----------------
 net/core/skbuff.c                                  | 46 ++++++++++++++++++++--
 net/tls/tls_device_fallback.c                      |  2 +-
 6 files changed, 51 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
index 3832c2e8ea5a..e8e460a92e0e 100644
--- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
+++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c
@@ -1659,7 +1659,7 @@ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb,
 	for (i = 0; i < record->num_frags; i++) {
 		skb_shinfo(nskb)->frags[i] = record->frags[i];
 		/* increase the frag ref count */
-		__skb_frag_ref(&skb_shinfo(nskb)->frags[i], nskb->pp_recycle);
+		__skb_frag_ref(&skb_shinfo(nskb)->frags[i]);
 	}
 
 	skb_shinfo(nskb)->nr_frags = record->num_frags;
diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c
index f058e154a3bc..8f1f43dbb76d 100644
--- a/drivers/net/ethernet/sun/cassini.c
+++ b/drivers/net/ethernet/sun/cassini.c
@@ -2000,7 +2000,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
 		skb->len      += hlen - swivel;
 
 		skb_frag_fill_page_desc(frag, page->buffer, off, hlen - swivel);
-		__skb_frag_ref(frag, skb->pp_recycle);
+		__skb_frag_ref(frag);
 
 		/* any more data? */
 		if ((words[0] & RX_COMP1_SPLIT_PKT) && ((dlen -= hlen) > 0)) {
@@ -2024,7 +2024,7 @@ static int cas_rx_process_pkt(struct cas *cp, struct cas_rx_comp *rxc,
 			frag++;
 
 			skb_frag_fill_page_desc(frag, page->buffer, 0, hlen);
-			__skb_frag_ref(frag, skb->pp_recycle);
+			__skb_frag_ref(frag);
 			RX_USED_ADD(page, hlen + cp->crc_size);
 		}
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 0b0293629329..426e68a95067 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -717,7 +717,7 @@ static void veth_xdp_get(struct xdp_buff *xdp)
 		return;
 
 	for (i = 0; i < sinfo->nr_frags; i++)
-		__skb_frag_ref(&sinfo->frags[i], false);
+		__skb_frag_ref(&sinfo->frags[i]);
 }
 
 static int veth_convert_skb_to_xdp_buff(struct veth_rq *rq,
diff --git a/include/linux/skbuff_ref.h b/include/linux/skbuff_ref.h
index 4dcdbe9fbc5f..11f0a4063403 100644
--- a/include/linux/skbuff_ref.h
+++ b/include/linux/skbuff_ref.h
@@ -8,47 +8,16 @@
 #define _LINUX_SKBUFF_REF_H
 
 #include <linux/skbuff.h>
-#include <net/page_pool/helpers.h>
-
-#ifdef CONFIG_PAGE_POOL
-static inline bool is_pp_page(struct page *page)
-{
-	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
-}
-
-static inline bool napi_pp_get_page(struct page *page)
-{
-	page = compound_head(page);
-
-	if (!is_pp_page(page))
-		return false;
-
-	page_pool_ref_page(page);
-	return true;
-}
-#endif
-
-static inline void skb_page_ref(struct page *page, bool recycle)
-{
-#ifdef CONFIG_PAGE_POOL
-	if (recycle && napi_pp_get_page(page))
-		return;
-#endif
-	get_page(page);
-}
 
 /**
  * __skb_frag_ref - take an addition reference on a paged fragment.
  * @frag: the paged fragment
- * @recycle: skb->pp_recycle param of the parent skb. False if no parent skb.
  *
- * Takes an additional reference on the paged fragment @frag. Obtains the
- * correct reference count depending on whether skb->pp_recycle is set and
- * whether the frag is a page pool frag.
+ * Takes an additional reference on the paged fragment @frag.
  */
-static inline void __skb_frag_ref(skb_frag_t *frag, bool recycle)
+static inline void __skb_frag_ref(skb_frag_t *frag)
 {
-	skb_page_ref(skb_frag_page(frag), recycle);
+	get_page(skb_frag_page(frag));
 }
 
 /**
@@ -60,7 +29,7 @@ static inline void __skb_frag_ref(skb_frag_t *frag, bool recycle)
  */
 static inline void skb_frag_ref(struct sk_buff *skb, int f)
 {
-	__skb_frag_ref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
 }
 
 bool napi_pp_put_page(struct page *page);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 28cd640a6ea9..466999a7515e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -904,6 +904,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 		skb_get(list);
 }
 
+static bool is_pp_page(struct page *page)
+{
+	return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
+}
+
 int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
 		    unsigned int headroom)
 {
@@ -1025,6 +1030,37 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 	return napi_pp_put_page(virt_to_page(data));
 }
 
+/**
+ * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
+ * @skb:	page pool aware skb
+ *
+ * Increase the fragment reference count (pp_ref_count) of a skb. This is
+ * intended to gain fragment references only for page pool aware skbs,
+ * i.e. when skb->pp_recycle is true, and not for fragments in a
+ * non-pp-recycling skb. It has a fallback to increase references on normal
+ * pages, as page pool aware skbs may also have normal page fragments.
+ */
+static int skb_pp_frag_ref(struct sk_buff *skb)
+{
+	struct skb_shared_info *shinfo;
+	struct page *head_page;
+	int i;
+
+	if (!skb->pp_recycle)
+		return -EINVAL;
+
+	shinfo = skb_shinfo(skb);
+
+	for (i = 0; i < shinfo->nr_frags; i++) {
+		head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
+		if (likely(is_pp_page(head_page)))
+			page_pool_ref_page(head_page);
+		else
+			page_ref_inc(head_page);
+	}
+	return 0;
+}
+
 static void skb_kfree_head(void *head, unsigned int end_offset)
 {
 	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
@@ -4160,7 +4196,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
 			to++;
 
 		} else {
-			__skb_frag_ref(fragfrom, skb->pp_recycle);
+			__skb_frag_ref(fragfrom);
 			skb_frag_page_copy(fragto, fragfrom);
 			skb_frag_off_copy(fragto, fragfrom);
 			skb_frag_size_set(fragto, todo);
@@ -4810,7 +4846,7 @@ normal:
 			}
 
 			*nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
-			__skb_frag_ref(nskb_frag, nskb->pp_recycle);
+			__skb_frag_ref(nskb_frag);
 			size = skb_frag_size(nskb_frag);
 
 			if (pos < offset) {
@@ -5941,8 +5977,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
 	/* if the skb is not cloned this does nothing
 	 * since we set nr_frags to 0.
 	 */
-	for (i = 0; i < from_shinfo->nr_frags; i++)
-		__skb_frag_ref(&from_shinfo->frags[i], from->pp_recycle);
+	if (skb_pp_frag_ref(from)) {
+		for (i = 0; i < from_shinfo->nr_frags; i++)
+			__skb_frag_ref(&from_shinfo->frags[i]);
+	}
 
 	to->truesize += delta;
 	to->len += len;
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 9237dded4467..f9e3d3d90dcf 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -278,7 +278,7 @@ static int fill_sg_in(struct scatterlist *sg_in,
 	for (i = 0; remaining > 0; i++) {
 		skb_frag_t *frag = &record->frags[i];
 
-		__skb_frag_ref(frag, false);
+		__skb_frag_ref(frag);
 		sg_set_page(sg_in + i, skb_frag_page(frag),
 			    skb_frag_size(frag), skb_frag_off(frag));
 
-- 
cgit v1.2.3


From e695668af8523b059127dfa8b261c76e7c9cde10 Mon Sep 17 00:00:00 2001
From: Ricardo Ribalda <ribalda@chromium.org>
Date: Fri, 3 May 2024 11:33:20 +0000
Subject: media: cec.h: Fix kerneldoc

is_claiming_log_addrs documentation was missing.

fix this kernel-doc warning:
include/media/cec.h:296: warning: Function parameter or struct member 'is_claiming_log_addrs' not described in 'cec_adapter'

Signed-off-by: Ricardo Ribalda <ribalda@chromium.org>
Reviewed-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
---
 include/media/cec.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/media/cec.h b/include/media/cec.h
index d3abe84c9c9e..d131514032f2 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -187,6 +187,7 @@ struct cec_adap_ops {
  *	in order to transmit or receive CEC messages. This is usually a HW
  *	limitation.
  * @is_enabled:		the CEC adapter is enabled
+ * @is_claiming_log_addrs:  true if cec_claim_log_addrs() is running
  * @is_configuring:	the CEC adapter is configuring (i.e. claiming LAs)
  * @must_reconfigure:	while configuring, the PA changed, so reclaim LAs
  * @is_configured:	the CEC adapter is configured (i.e. has claimed LAs)
-- 
cgit v1.2.3


From b63db58e2fa5d6963db9c45df88e60060f0ff35f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 2 May 2024 09:03:15 -0400
Subject: eventfs/tracing: Add callback for release of an eventfs_inode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Synthetic events create and destroy tracefs files when they are created
and removed. The tracing subsystem has its own file descriptor
representing the state of the events attached to the tracefs files.
There's a race between the eventfs files and this file descriptor of the
tracing system where the following can cause an issue:

With two scripts 'A' and 'B' doing:

  Script 'A':
    echo "hello int aaa" > /sys/kernel/tracing/synthetic_events
    while :
    do
      echo 0 > /sys/kernel/tracing/events/synthetic/hello/enable
    done

  Script 'B':
    echo > /sys/kernel/tracing/synthetic_events

Script 'A' creates a synthetic event "hello" and then just writes zero
into its enable file.

Script 'B' removes all synthetic events (including the newly created
"hello" event).

What happens is that the opening of the "enable" file has:

 {
	struct trace_event_file *file = inode->i_private;
	int ret;

	ret = tracing_check_open_get_tr(file->tr);
 [..]

But deleting the events frees the "file" descriptor, and a "use after
free" happens with the dereference at "file->tr".

The file descriptor does have a reference counter, but there needs to be a
way to decrement it from the eventfs when the eventfs_inode is removed
that represents this file descriptor.

Add an optional "release" callback to the eventfs_entry array structure,
that gets called when the eventfs file is about to be removed. This allows
for the creating on the eventfs file to increment the tracing file
descriptor ref counter. When the eventfs file is deleted, it can call the
release function that will call the put function for the tracing file
descriptor.

This will protect the tracing file from being freed while a eventfs file
that references it is being opened.

Link: https://lore.kernel.org/linux-trace-kernel/20240426073410.17154-1-Tze-nan.Wu@mediatek.com/
Link: https://lore.kernel.org/linux-trace-kernel/20240502090315.448cba46@gandalf.local.home

Cc: stable@vger.kernel.org
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Fixes: 5790b1fb3d672 ("eventfs: Remove eventfs_file and just use eventfs_inode")
Reported-by: Tze-nan wu <Tze-nan.Wu@mediatek.com>
Tested-by: Tze-nan Wu (吳澤南) <Tze-nan.Wu@mediatek.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 fs/tracefs/event_inode.c    | 23 +++++++++++++++++++++--
 include/linux/tracefs.h     |  3 +++
 kernel/trace/trace_events.c | 12 ++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 894c6ca1e500..f5510e26f0f6 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -84,10 +84,17 @@ enum {
 static void release_ei(struct kref *ref)
 {
 	struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+	const struct eventfs_entry *entry;
 	struct eventfs_root_inode *rei;
 
 	WARN_ON_ONCE(!ei->is_freed);
 
+	for (int i = 0; i < ei->nr_entries; i++) {
+		entry = &ei->entries[i];
+		if (entry->release)
+			entry->release(entry->name, ei->data);
+	}
+
 	kfree(ei->entry_attrs);
 	kfree_const(ei->name);
 	if (ei->is_events) {
@@ -112,6 +119,18 @@ static inline void free_ei(struct eventfs_inode *ei)
 	}
 }
 
+/*
+ * Called when creation of an ei fails, do not call release() functions.
+ */
+static inline void cleanup_ei(struct eventfs_inode *ei)
+{
+	if (ei) {
+		/* Set nr_entries to 0 to prevent release() function being called */
+		ei->nr_entries = 0;
+		free_ei(ei);
+	}
+}
+
 static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei)
 {
 	if (ei)
@@ -734,7 +753,7 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
 
 	/* Was the parent freed? */
 	if (list_empty(&ei->list)) {
-		free_ei(ei);
+		cleanup_ei(ei);
 		ei = NULL;
 	}
 	return ei;
@@ -835,7 +854,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	return ei;
 
  fail:
-	free_ei(ei);
+	cleanup_ei(ei);
 	tracefs_failed_creating(dentry);
 	return ERR_PTR(-ENOMEM);
 }
diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h
index 7a5fe17b6bf9..d03f74658716 100644
--- a/include/linux/tracefs.h
+++ b/include/linux/tracefs.h
@@ -62,6 +62,8 @@ struct eventfs_file;
 typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
 				const struct file_operations **fops);
 
+typedef void (*eventfs_release)(const char *name, void *data);
+
 /**
  * struct eventfs_entry - dynamically created eventfs file call back handler
  * @name:	Then name of the dynamic file in an eventfs directory
@@ -72,6 +74,7 @@ typedef int (*eventfs_callback)(const char *name, umode_t *mode, void **data,
 struct eventfs_entry {
 	const char			*name;
 	eventfs_callback		callback;
+	eventfs_release			release;
 };
 
 struct eventfs_inode;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 52f75c36bbca..6ef29eba90ce 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2552,6 +2552,14 @@ static int event_callback(const char *name, umode_t *mode, void **data,
 	return 0;
 }
 
+/* The file is incremented on creation and freeing the enable file decrements it */
+static void event_release(const char *name, void *data)
+{
+	struct trace_event_file *file = data;
+
+	event_file_put(file);
+}
+
 static int
 event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 {
@@ -2566,6 +2574,7 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 		{
 			.name		= "enable",
 			.callback	= event_callback,
+			.release	= event_release,
 		},
 		{
 			.name		= "filter",
@@ -2634,6 +2643,9 @@ event_create_dir(struct eventfs_inode *parent, struct trace_event_file *file)
 		return ret;
 	}
 
+	/* Gets decremented on freeing of the "enable" file */
+	event_file_get(file);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From f6bdc7865ef4a7a4393c90a550c728231ebc853e Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 3 May 2024 13:16:25 -0700
Subject: fs: WARN when f_count resurrection is attempted

It should never happen that get_file() is called on a file with
f_count equal to zero. If this happens, a use-after-free condition
has happened[1], and we need to attempt a best-effort reporting of
the situation to help find the root cause more easily. Additionally,
this serves as a data corruption indicator that system owners using
warn_limit or panic_on_warn would like to have detected.

Link: https://lore.kernel.org/lkml/7c41cf3c-2a71-4dbb-8f34-0337890906fc@gmail.com/ [1]
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20240503201620.work.651-kees@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5f643e97cfd..5b351c1e6f58 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1034,7 +1034,8 @@ struct file_handle {
 
 static inline struct file *get_file(struct file *f)
 {
-	atomic_long_inc(&f->f_count);
+	long prior = atomic_long_fetch_inc_relaxed(&f->f_count);
+	WARN_ONCE(!prior, "struct file::f_count incremented from zero; use-after-free condition present!\n");
 	return f;
 }
 
-- 
cgit v1.2.3


From 3b0b441a297e7fe11baab51439a81cd6a336ed64 Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Mon, 29 Apr 2024 00:43:19 +0000
Subject: soundwire: intel_ace2x: use DOAIS and DODS settings from firmware

Starting with LNL, the recommendation is to use settings read from DSD
properties instead of hard-coding the values.

The DOAIS and DODS values are completely-specific to Intel and are
stored in a vendor-specific property structure.

Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Signed-off-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://lore.kernel.org/r/20240429004321.2399754-3-yung-chuan.liao@linux.intel.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/soundwire/intel.h           |  5 +++++
 drivers/soundwire/intel_ace2x.c     |  7 +++++--
 drivers/soundwire/intel_auxdevice.c | 21 +++++++++++++++++++++
 include/linux/soundwire/sdw.h       |  2 ++
 4 files changed, 33 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/soundwire/intel.h b/drivers/soundwire/intel.h
index 511932c55216..b36d46319f0f 100644
--- a/drivers/soundwire/intel.h
+++ b/drivers/soundwire/intel.h
@@ -58,6 +58,11 @@ struct sdw_intel {
 #endif
 };
 
+struct sdw_intel_prop {
+	u16 doais;
+	u16 dods;
+};
+
 enum intel_pdi_type {
 	INTEL_PDI_IN = 0,
 	INTEL_PDI_OUT = 1,
diff --git a/drivers/soundwire/intel_ace2x.c b/drivers/soundwire/intel_ace2x.c
index 32b538cd6d66..917cc79d4d85 100644
--- a/drivers/soundwire/intel_ace2x.c
+++ b/drivers/soundwire/intel_ace2x.c
@@ -25,12 +25,15 @@
 static void intel_shim_vs_init(struct sdw_intel *sdw)
 {
 	void __iomem *shim_vs = sdw->link_res->shim_vs;
+	struct sdw_bus *bus = &sdw->cdns.bus;
+	struct sdw_intel_prop *intel_prop;
 	u16 doais;
 	u16 dods;
 	u16 act;
 
-	doais = 0x3;
-	dods = 0x1;
+	intel_prop = bus->vendor_specific_prop;
+	doais = intel_prop->doais;
+	dods = intel_prop->dods;
 
 	act = intel_readw(shim_vs, SDW_SHIM2_INTEL_VS_ACTMCTL);
 	u16p_replace_bits(&act, doais, SDW_SHIM2_INTEL_VS_ACTMCTL_DOAIS);
diff --git a/drivers/soundwire/intel_auxdevice.c b/drivers/soundwire/intel_auxdevice.c
index 296a470ce1e0..697d64070794 100644
--- a/drivers/soundwire/intel_auxdevice.c
+++ b/drivers/soundwire/intel_auxdevice.c
@@ -122,6 +122,7 @@ static void generic_new_peripheral_assigned(struct sdw_bus *bus,
 static int sdw_master_read_intel_prop(struct sdw_bus *bus)
 {
 	struct sdw_master_prop *prop = &bus->prop;
+	struct sdw_intel_prop *intel_prop;
 	struct fwnode_handle *link;
 	char name[32];
 	u32 quirk_mask;
@@ -153,6 +154,26 @@ static int sdw_master_read_intel_prop(struct sdw_bus *bus)
 	prop->quirks = SDW_MASTER_QUIRKS_CLEAR_INITIAL_CLASH |
 		SDW_MASTER_QUIRKS_CLEAR_INITIAL_PARITY;
 
+	intel_prop = devm_kzalloc(bus->dev, sizeof(*intel_prop), GFP_KERNEL);
+	if (!intel_prop)
+		return -ENOMEM;
+
+	/* initialize with hardware defaults, in case the properties are not found */
+	intel_prop->doais = 0x3;
+	intel_prop->dods  = 0x1;
+
+	fwnode_property_read_u16(link,
+				 "intel-sdw-doais",
+				 &intel_prop->doais);
+	fwnode_property_read_u16(link,
+				 "intel-sdw-dods",
+				 &intel_prop->dods);
+	bus->vendor_specific_prop = intel_prop;
+
+	dev_dbg(bus->dev, "doais %#x dods %#x\n",
+		intel_prop->doais,
+		intel_prop->dods);
+
 	return 0;
 }
 
diff --git a/include/linux/soundwire/sdw.h b/include/linux/soundwire/sdw.h
index 7bb9119f3069..13e96d8b7423 100644
--- a/include/linux/soundwire/sdw.h
+++ b/include/linux/soundwire/sdw.h
@@ -886,6 +886,7 @@ struct sdw_master_ops {
  * @port_ops: Master port callback ops
  * @params: Current bus parameters
  * @prop: Master properties
+ * @vendor_specific_prop: pointer to non-standard properties
  * @m_rt_list: List of Master instance of all stream(s) running on Bus. This
  * is used to compute and program bus bandwidth, clock, frame shape,
  * transport and port parameters
@@ -920,6 +921,7 @@ struct sdw_bus {
 	const struct sdw_master_port_ops *port_ops;
 	struct sdw_bus_params params;
 	struct sdw_master_prop prop;
+	void *vendor_specific_prop;
 	struct list_head m_rt_list;
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debugfs;
-- 
cgit v1.2.3


From 3cc50d07be10f67aff836f8247a240ec132c6a67 Mon Sep 17 00:00:00 2001
From: Lukas Wunner <lukas@wunner.de>
Date: Sat, 20 Apr 2024 22:00:01 +0200
Subject: driver core: Add device_show_string() helper for sysfs attributes

For drivers wishing to expose an unsigned long, int or bool at a static
memory location in sysfs, the driver core provides ready-made helpers
such as device_show_ulong() to be used as ->show() callback.

Some drivers need to expose a string and so far they all provide their
own ->show() implementation.  arch/powerpc/perf/hv-24x7.c went so far
as to create a device_show_string() helper but kept it private.

Make it public for reuse by other drivers.  The pattern seems to be
sufficiently frequent to merit a public helper.

Add a DEVICE_STRING_ATTR_RO() macro in line with the existing
DEVICE_ULONG_ATTR() and similar macros to ease declaration of string
attributes.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Acked-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/2e3eaaf2600bb55c0415c23ba301e809403a7aa2.1713608122.git.lukas@wunner.de
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/powerpc/perf/hv-24x7.c | 10 ----------
 drivers/base/core.c         |  9 +++++++++
 include/linux/device.h      | 15 +++++++++++++++
 3 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 057ec2e3451d..d400fa391c27 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -425,16 +425,6 @@ static char *memdup_to_str(char *maybe_str, int max_len, gfp_t gfp)
 	return kasprintf(gfp, "%.*s", max_len, maybe_str);
 }
 
-static ssize_t device_show_string(struct device *dev,
-		struct device_attribute *attr, char *buf)
-{
-	struct dev_ext_attribute *d;
-
-	d = container_of(attr, struct dev_ext_attribute, attr);
-
-	return sprintf(buf, "%s\n", (char *)d->var);
-}
-
 static ssize_t cpumask_show(struct device *dev,
 			    struct device_attribute *attr, char *buf)
 {
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4d51928c4088..131d96c6090b 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2538,6 +2538,15 @@ ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
 }
 EXPORT_SYMBOL_GPL(device_show_bool);
 
+ssize_t device_show_string(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct dev_ext_attribute *ea = to_ext_attr(attr);
+
+	return sysfs_emit(buf, "%s\n", (char *)ea->var);
+}
+EXPORT_SYMBOL_GPL(device_show_string);
+
 /**
  * device_release - free device structure.
  * @kobj: device's kobject.
diff --git a/include/linux/device.h b/include/linux/device.h
index 07f4d2af1e3c..b6b6ff881a3f 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -132,6 +132,8 @@ ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
 			char *buf);
 ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
 			 const char *buf, size_t count);
+ssize_t device_show_string(struct device *dev, struct device_attribute *attr,
+			   char *buf);
 
 /**
  * DEVICE_ATTR - Define a device attribute.
@@ -251,6 +253,19 @@ ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
 	struct dev_ext_attribute dev_attr_##_name = \
 		{ __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) }
 
+/**
+ * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string.
+ * @_name: Attribute name.
+ * @_mode: File mode.
+ * @_var: Identifier of string.
+ *
+ * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the
+ * string allocation is unknown, the attribute must be read-only.
+ */
+#define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \
+	struct dev_ext_attribute dev_attr_##_name = \
+		{ __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) }
+
 #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
 	struct device_attribute dev_attr_##_name =		\
 		__ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)
-- 
cgit v1.2.3


From 562be61b35d911a8b45acc3dcf8642876dbb66dd Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 4 May 2024 11:47:05 +0200
Subject: usb: core: Remove the useless struct usb_devmap which is just a
 bitmap

struct usb_devmap is really just a bitmap. No need to have a dedicated
structure for that.

Simplify code and use DECLARE_BITMAP() directly instead.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Link: https://lore.kernel.org/r/1d818575ff7a1e8317674aecf761ee23c89fdc84.1714815990.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c | 4 ++--
 drivers/usb/core/hub.c | 9 ++++-----
 include/linux/usb.h    | 7 +------
 3 files changed, 7 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index c0e005670d67..e3366f4d82b9 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -866,7 +866,7 @@ static int usb_rh_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
  */
 static void usb_bus_init (struct usb_bus *bus)
 {
-	memset (&bus->devmap, 0, sizeof(struct usb_devmap));
+	memset(&bus->devmap, 0, sizeof(bus->devmap));
 
 	bus->devnum_next = 1;
 
@@ -962,7 +962,7 @@ static int register_root_hub(struct usb_hcd *hcd)
 
 	usb_dev->devnum = devnum;
 	usb_dev->bus->devnum_next = devnum + 1;
-	set_bit (devnum, usb_dev->bus->devmap.devicemap);
+	set_bit(devnum, usb_dev->bus->devmap);
 	usb_set_device_state(usb_dev, USB_STATE_ADDRESS);
 
 	mutex_lock(&usb_bus_idr_lock);
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index e9fec90ed6f8..bd2e57f90e71 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2207,13 +2207,12 @@ static void choose_devnum(struct usb_device *udev)
 	mutex_lock(&bus->devnum_next_mutex);
 
 	/* Try to allocate the next devnum beginning at bus->devnum_next. */
-	devnum = find_next_zero_bit(bus->devmap.devicemap, 128,
-			bus->devnum_next);
+	devnum = find_next_zero_bit(bus->devmap, 128, bus->devnum_next);
 	if (devnum >= 128)
-		devnum = find_next_zero_bit(bus->devmap.devicemap, 128, 1);
+		devnum = find_next_zero_bit(bus->devmap, 128, 1);
 	bus->devnum_next = (devnum >= 127 ? 1 : devnum + 1);
 	if (devnum < 128) {
-		set_bit(devnum, bus->devmap.devicemap);
+		set_bit(devnum, bus->devmap);
 		udev->devnum = devnum;
 	}
 	mutex_unlock(&bus->devnum_next_mutex);
@@ -2222,7 +2221,7 @@ static void choose_devnum(struct usb_device *udev)
 static void release_devnum(struct usb_device *udev)
 {
 	if (udev->devnum > 0) {
-		clear_bit(udev->devnum, udev->bus->devmap.devicemap);
+		clear_bit(udev->devnum, udev->bus->devmap);
 		udev->devnum = -1;
 	}
 }
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 9e52179872a5..1913a13833f2 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -440,11 +440,6 @@ int __usb_get_extra_descriptor(char *buffer, unsigned size,
 
 /* ----------------------------------------------------------------------- */
 
-/* USB device number allocation bitmap */
-struct usb_devmap {
-	unsigned long devicemap[128 / (8*sizeof(unsigned long))];
-};
-
 /*
  * Allocated per bus (tree of devices) we have:
  */
@@ -472,7 +467,7 @@ struct usb_bus {
 					 * round-robin allocation */
 	struct mutex devnum_next_mutex; /* devnum_next mutex */
 
-	struct usb_devmap devmap;	/* device address allocation map */
+	DECLARE_BITMAP(devmap, 128);	/* USB device number allocation bitmap */
 	struct usb_device *root_hub;	/* Root hub */
 	struct usb_bus *hs_companion;	/* Companion EHCI bus, if any */
 
-- 
cgit v1.2.3


From 6bd23e0c2bb6c65d4f5754d1456bc9a4427fc59b Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 23 Apr 2024 09:33:39 -0700
Subject: tty: add the option to have a tty reject a new ldisc

... and use it to limit the virtual terminals to just N_TTY.  They are
kind of special, and in particular, the "con_write()" routine violates
the "writes cannot sleep" rule that some ldiscs rely on.

This avoids the

   BUG: sleeping function called from invalid context at kernel/printk/printk.c:2659

when N_GSM has been attached to a virtual console, and gsmld_write()
calls con_write() while holding a spinlock, and con_write() then tries
to get the console lock.

Tested-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Jiri Slaby <jirislaby@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daniel Starke <daniel.starke@siemens.com>
Reported-by: syzbot <syzbot+dbac96d8e73b61aa559c@syzkaller.appspotmail.com>
Closes: https://syzkaller.appspot.com/bug?extid=dbac96d8e73b61aa559c
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20240423163339.59780-1-torvalds@linux-foundation.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/tty/tty_ldisc.c    |  6 ++++++
 drivers/tty/vt/vt.c        | 10 ++++++++++
 include/linux/tty_driver.h |  8 ++++++++
 3 files changed, 24 insertions(+)

(limited to 'include')

diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c
index 3f68e213df1f..d80e9d4c974b 100644
--- a/drivers/tty/tty_ldisc.c
+++ b/drivers/tty/tty_ldisc.c
@@ -545,6 +545,12 @@ int tty_set_ldisc(struct tty_struct *tty, int disc)
 		goto out;
 	}
 
+	if (tty->ops->ldisc_ok) {
+		retval = tty->ops->ldisc_ok(tty, disc);
+		if (retval)
+			goto out;
+	}
+
 	old_ldisc = tty->ldisc;
 
 	/* Shutdown the old discipline. */
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 9b5b98dfc8b4..cd87e3d1291e 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -3576,6 +3576,15 @@ static void con_cleanup(struct tty_struct *tty)
 	tty_port_put(&vc->port);
 }
 
+/*
+ * We can't deal with anything but the N_TTY ldisc,
+ * because we can sleep in our write() routine.
+ */
+static int con_ldisc_ok(struct tty_struct *tty, int ldisc)
+{
+	return ldisc == N_TTY ? 0 : -EINVAL;
+}
+
 static int default_color           = 7; /* white */
 static int default_italic_color    = 2; // green (ASCII)
 static int default_underline_color = 3; // cyan (ASCII)
@@ -3695,6 +3704,7 @@ static const struct tty_operations con_ops = {
 	.resize = vt_resize,
 	.shutdown = con_shutdown,
 	.cleanup = con_cleanup,
+	.ldisc_ok = con_ldisc_ok,
 };
 
 static struct cdev vc0_cdev;
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index 7372124fbf90..dd4b31ce6d5d 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -154,6 +154,13 @@ struct serial_struct;
  *
  *	Optional. Called under the @tty->termios_rwsem. May sleep.
  *
+ * @ldisc_ok: ``int ()(struct tty_struct *tty, int ldisc)``
+ *
+ *	This routine allows the @tty driver to decide if it can deal
+ *	with a particular @ldisc.
+ *
+ *	Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem.
+ *
  * @set_ldisc: ``void ()(struct tty_struct *tty)``
  *
  *	This routine allows the @tty driver to be notified when the device's
@@ -372,6 +379,7 @@ struct tty_operations {
 	void (*hangup)(struct tty_struct *tty);
 	int (*break_ctl)(struct tty_struct *tty, int state);
 	void (*flush_buffer)(struct tty_struct *tty);
+	int (*ldisc_ok)(struct tty_struct *tty, int ldisc);
 	void (*set_ldisc)(struct tty_struct *tty);
 	void (*wait_until_sent)(struct tty_struct *tty, int timeout);
 	void (*send_xchar)(struct tty_struct *tty, u8 ch);
-- 
cgit v1.2.3


From 07cf835689d700d46d88b5bbffc32bc210319f5f Mon Sep 17 00:00:00 2001
From: Mikhail Lappo <miklelappo@gmail.com>
Date: Mon, 29 Apr 2024 16:01:06 +0300
Subject: stm class: Add source type

Currently kernel HW tracing infrastrtucture and specifically its SyS-T
part treats all source data in the same way. Treating and encoding
different trace data sources differently might allow decoding software
to make use of e.g. ftrace event ids by converting them to a SyS-T
message catalog.

The solution is to keep source type stored within stm_source_data
structure to allow different handling by stm output/protocol.
Currently we only differentiate between STM_USER and STM_FTRACE sources.

Signed-off-by: Mikhail Lappo <miklelappo@gmail.com>
Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240429130119.1518073-3-alexander.shishkin@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/hwtracing/stm/console.c   |  1 +
 drivers/hwtracing/stm/ftrace.c    |  1 +
 drivers/hwtracing/stm/heartbeat.c |  1 +
 include/linux/stm.h               | 12 ++++++++++++
 4 files changed, 15 insertions(+)

(limited to 'include')

diff --git a/drivers/hwtracing/stm/console.c b/drivers/hwtracing/stm/console.c
index a00f65e21747..097a00ac43a7 100644
--- a/drivers/hwtracing/stm/console.c
+++ b/drivers/hwtracing/stm/console.c
@@ -22,6 +22,7 @@ static struct stm_console {
 	.data	= {
 		.name		= "console",
 		.nr_chans	= 1,
+		.type		= STM_USER,
 		.link		= stm_console_link,
 		.unlink		= stm_console_unlink,
 	},
diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c
index 3bb606dfa634..a7cea7ea0163 100644
--- a/drivers/hwtracing/stm/ftrace.c
+++ b/drivers/hwtracing/stm/ftrace.c
@@ -23,6 +23,7 @@ static struct stm_ftrace {
 	.data	= {
 		.name		= "ftrace",
 		.nr_chans	= STM_FTRACE_NR_CHANNELS,
+		.type		= STM_FTRACE,
 		.link		= stm_ftrace_link,
 		.unlink		= stm_ftrace_unlink,
 	},
diff --git a/drivers/hwtracing/stm/heartbeat.c b/drivers/hwtracing/stm/heartbeat.c
index 81d7b21d31ec..e9496fe97baa 100644
--- a/drivers/hwtracing/stm/heartbeat.c
+++ b/drivers/hwtracing/stm/heartbeat.c
@@ -78,6 +78,7 @@ static int stm_heartbeat_init(void)
 		}
 
 		stm_heartbeat[i].data.nr_chans	= 1;
+		stm_heartbeat[i].data.type	= STM_USER;
 		stm_heartbeat[i].data.link	= stm_heartbeat_link;
 		stm_heartbeat[i].data.unlink	= stm_heartbeat_unlink;
 		hrtimer_init(&stm_heartbeat[i].hrtimer, CLOCK_MONOTONIC,
diff --git a/include/linux/stm.h b/include/linux/stm.h
index 3b22689512be..2fcbef9608f6 100644
--- a/include/linux/stm.h
+++ b/include/linux/stm.h
@@ -30,6 +30,16 @@ enum stp_packet_flags {
 	STP_PACKET_TIMESTAMPED	= 0x2,
 };
 
+/**
+ * enum stm_source_type - STM source driver
+ * @STM_USER: any STM trace source
+ * @STM_FTRACE: ftrace STM source
+ */
+enum stm_source_type {
+	STM_USER,
+	STM_FTRACE,
+};
+
 struct stp_policy;
 
 struct stm_device;
@@ -106,6 +116,7 @@ struct stm_source_device;
  * @name:	device name, will be used for policy lookup
  * @src:	internal structure, only used by stm class code
  * @nr_chans:	number of channels to allocate
+ * @type:	type of STM source driver represented by stm_source_type
  * @link:	called when this source gets linked to an STM device
  * @unlink:	called when this source is about to get unlinked from its STM
  *
@@ -117,6 +128,7 @@ struct stm_source_data {
 	struct stm_source_device *src;
 	unsigned int		percpu;
 	unsigned int		nr_chans;
+	unsigned int		type;
 	int			(*link)(struct stm_source_data *data);
 	void			(*unlink)(struct stm_source_data *data);
 };
-- 
cgit v1.2.3


From 44b607ad4cdf23ae8f796b95bd14709fa06f7728 Mon Sep 17 00:00:00 2001
From: Konstantin Taranov <kotaranov@microsoft.com>
Date: Fri, 26 Apr 2024 06:12:40 -0700
Subject: RDMA/mana_ib: implement uapi for creation of rnic cq

Enable users to create RNIC CQs using a corresponding flag.
With the previous request size, an ethernet CQ is created.
As a response, return ID of the created CQ.

Signed-off-by: Konstantin Taranov <kotaranov@microsoft.com>
Link: https://lore.kernel.org/r/1714137160-5222-6-git-send-email-kotaranov@linux.microsoft.com
Reviewed-by: Long Li <longli@microsoft.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/mana/cq.c | 55 ++++++++++++++++++++++++++++++++++++++---
 include/uapi/rdma/mana-abi.h    | 12 +++++++++
 2 files changed, 63 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c
index 688ffe61f6b2..c6a3fd57a196 100644
--- a/drivers/infiniband/hw/mana/cq.c
+++ b/drivers/infiniband/hw/mana/cq.c
@@ -9,17 +9,22 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		      struct ib_udata *udata)
 {
 	struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
+	struct mana_ib_create_cq_resp resp = {};
+	struct mana_ib_ucontext *mana_ucontext;
 	struct ib_device *ibdev = ibcq->device;
 	struct mana_ib_create_cq ucmd = {};
 	struct mana_ib_dev *mdev;
+	bool is_rnic_cq;
+	u32 doorbell;
 	int err;
 
 	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
-	if (udata->inlen < sizeof(ucmd))
-		return -EINVAL;
-
 	cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors;
+	cq->cq_handle = INVALID_MANA_HANDLE;
+
+	if (udata->inlen < offsetof(struct mana_ib_create_cq, flags))
+		return -EINVAL;
 
 	err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
 	if (err) {
@@ -28,7 +33,9 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		return err;
 	}
 
-	if (attr->cqe > mdev->adapter_caps.max_qp_wr) {
+	is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ);
+
+	if (!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) {
 		ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
 		return -EINVAL;
 	}
@@ -40,7 +47,41 @@ int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		return err;
 	}
 
+	mana_ucontext = rdma_udata_to_drv_context(udata, struct mana_ib_ucontext,
+						  ibucontext);
+	doorbell = mana_ucontext->doorbell;
+
+	if (is_rnic_cq) {
+		err = mana_ib_gd_create_cq(mdev, cq, doorbell);
+		if (err) {
+			ibdev_dbg(ibdev, "Failed to create RNIC cq, %d\n", err);
+			goto err_destroy_queue;
+		}
+
+		err = mana_ib_install_cq_cb(mdev, cq);
+		if (err) {
+			ibdev_dbg(ibdev, "Failed to install cq callback, %d\n", err);
+			goto err_destroy_rnic_cq;
+		}
+	}
+
+	resp.cqid = cq->queue.id;
+	err = ib_copy_to_udata(udata, &resp, min(sizeof(resp), udata->outlen));
+	if (err) {
+		ibdev_dbg(&mdev->ib_dev, "Failed to copy to udata, %d\n", err);
+		goto err_remove_cq_cb;
+	}
+
 	return 0;
+
+err_remove_cq_cb:
+	mana_ib_remove_cq_cb(mdev, cq);
+err_destroy_rnic_cq:
+	mana_ib_gd_destroy_cq(mdev, cq);
+err_destroy_queue:
+	mana_ib_destroy_queue(mdev, &cq->queue);
+
+	return err;
 }
 
 int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
@@ -52,6 +93,12 @@ int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 	mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
 
 	mana_ib_remove_cq_cb(mdev, cq);
+
+	/* Ignore return code as there is not much we can do about it.
+	 * The error message is printed inside.
+	 */
+	mana_ib_gd_destroy_cq(mdev, cq);
+
 	mana_ib_destroy_queue(mdev, &cq->queue);
 
 	return 0;
diff --git a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h
index 5fcb31b37fb9..2c41cc315218 100644
--- a/include/uapi/rdma/mana-abi.h
+++ b/include/uapi/rdma/mana-abi.h
@@ -16,8 +16,20 @@
 
 #define MANA_IB_UVERBS_ABI_VERSION 1
 
+enum mana_ib_create_cq_flags {
+	MANA_IB_CREATE_RNIC_CQ	= 1 << 0,
+};
+
 struct mana_ib_create_cq {
 	__aligned_u64 buf_addr;
+	__u16	flags;
+	__u16	reserved0;
+	__u32	reserved1;
+};
+
+struct mana_ib_create_cq_resp {
+	__u32 cqid;
+	__u32 reserved;
 };
 
 struct mana_ib_create_qp {
-- 
cgit v1.2.3


From 087b24de5c825c53f15a9481b94f757223c20610 Mon Sep 17 00:00:00 2001
From: Mina Almasry <almasrymina@google.com>
Date: Wed, 1 May 2024 23:25:40 +0000
Subject: queue_api: define queue api

This API enables the net stack to reset the queues used for devmem TCP.

Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: Shailend Chand <shailend@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h   |  3 +++
 include/net/netdev_queues.h | 31 +++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2814a15eed73..cf261fb89d73 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1957,6 +1957,7 @@ enum netdev_reg_state {
  *	@sysfs_rx_queue_group:	Space for optional per-rx queue attributes
  *	@rtnl_link_ops:	Rtnl_link_ops
  *	@stat_ops:	Optional ops for queue-aware statistics
+ *	@queue_mgmt_ops:	Optional ops for queue management
  *
  *	@gso_max_size:	Maximum size of generic segmentation offload
  *	@tso_max_size:	Device (as in HW) limit on the max TSO request size
@@ -2340,6 +2341,8 @@ struct net_device {
 
 	const struct netdev_stat_ops *stat_ops;
 
+	const struct netdev_queue_mgmt_ops *queue_mgmt_ops;
+
 	/* for setting kernel sock attribute on TCP connection setup */
 #define GSO_MAX_SEGS		65535u
 #define GSO_LEGACY_MAX_SIZE	65536u
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index c7ac4539eafc..e7b84f018cee 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -87,6 +87,37 @@ struct netdev_stat_ops {
 			       struct netdev_queue_stats_tx *tx);
 };
 
+/**
+ * struct netdev_queue_mgmt_ops - netdev ops for queue management
+ *
+ * @ndo_queue_mem_size: Size of the struct that describes a queue's memory.
+ *
+ * @ndo_queue_mem_alloc: Allocate memory for an RX queue at the specified index.
+ *			 The new memory is written at the specified address.
+ *
+ * @ndo_queue_mem_free:	Free memory from an RX queue.
+ *
+ * @ndo_queue_start:	Start an RX queue with the specified memory and at the
+ *			specified index.
+ *
+ * @ndo_queue_stop:	Stop the RX queue at the specified index. The stopped
+ *			queue's memory is written at the specified address.
+ */
+struct netdev_queue_mgmt_ops {
+	size_t			ndo_queue_mem_size;
+	int			(*ndo_queue_mem_alloc)(struct net_device *dev,
+						       void *per_queue_mem,
+						       int idx);
+	void			(*ndo_queue_mem_free)(struct net_device *dev,
+						      void *per_queue_mem);
+	int			(*ndo_queue_start)(struct net_device *dev,
+						   void *per_queue_mem,
+						   int idx);
+	int			(*ndo_queue_stop)(struct net_device *dev,
+						  void *per_queue_mem,
+						  int idx);
+};
+
 /**
  * DOC: Lockless queue stopping / waking helpers.
  *
-- 
cgit v1.2.3


From 073e58bf6b1a689da2ffcfc7fc515005f8794018 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Mon, 15 Apr 2024 22:48:42 +0200
Subject: i2c: mux: gpio: remove support for class-based device instantiation

i801 as only user of gpio i2c mux removed support for class-based device
instantiation on muxed busses. Class-based device instantiation is a
legacy mechanism and shouldn't be used in new code, therefore remove
support also here.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Andi Shyti <andi.shyti@kernel.org>
---
 drivers/i2c/muxes/i2c-mux-gpio.c           | 3 +--
 include/linux/platform_data/i2c-mux-gpio.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index 6b979a0a6ab8..0fbb33a3d518 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -206,9 +206,8 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 
 	for (i = 0; i < mux->data.n_values; i++) {
 		u32 nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
-		unsigned int class = mux->data.classes ? mux->data.classes[i] : 0;
 
-		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], class);
+		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], 0);
 		if (ret)
 			goto add_adapter_failed;
 	}
diff --git a/include/linux/platform_data/i2c-mux-gpio.h b/include/linux/platform_data/i2c-mux-gpio.h
index 5e4c2c272a73..816a4cd3ccb5 100644
--- a/include/linux/platform_data/i2c-mux-gpio.h
+++ b/include/linux/platform_data/i2c-mux-gpio.h
@@ -18,7 +18,6 @@
  * @values: Array of bitmasks of GPIO settings (low/high) for each
  *	position
  * @n_values: Number of multiplexer positions (busses to instantiate)
- * @classes: Optional I2C auto-detection classes
  * @idle: Bitmask to write to MUX when idle or GPIO_I2CMUX_NO_IDLE if not used
  */
 struct i2c_mux_gpio_platform_data {
@@ -26,7 +25,6 @@ struct i2c_mux_gpio_platform_data {
 	int base_nr;
 	const unsigned *values;
 	int n_values;
-	const unsigned *classes;
 	unsigned idle;
 };
 
-- 
cgit v1.2.3


From 90d1f14cbb9ddbfc532e2da13bf6e0ed8320e792 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 26 Apr 2024 11:16:22 +0200
Subject: kmsan: compiler_types: declare __no_sanitize_or_inline

It turned out that KMSAN instruments READ_ONCE_NOCHECK(), resulting in
false positive reports, because __no_sanitize_or_inline enforced inlining.

Properly declare __no_sanitize_or_inline under __SANITIZE_MEMORY__, so
that it does not __always_inline the annotated function.

Link: https://lkml.kernel.org/r/20240426091622.3846771-1-glider@google.com
Fixes: 5de0ce85f5a4 ("kmsan: mark noinstr as __no_sanitize_memory")
Signed-off-by: Alexander Potapenko <glider@google.com>
Reported-by: syzbot+355c5bb8c1445c871ee8@syzkaller.appspotmail.com
Link: https://lkml.kernel.org/r/000000000000826ac1061675b0e3@google.com
Cc: <stable@vger.kernel.org>
Reviewed-by: Marco Elver <elver@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/compiler_types.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 2abaa3a825a9..8f8236317d5b 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -278,6 +278,17 @@ struct ftrace_likely_data {
 # define __no_kcsan
 #endif
 
+#ifdef __SANITIZE_MEMORY__
+/*
+ * Similarly to KASAN and KCSAN, KMSAN loses function attributes of inlined
+ * functions, therefore disabling KMSAN checks also requires disabling inlining.
+ *
+ * __no_sanitize_or_inline effectively prevents KMSAN from reporting errors
+ * within the function and marks all its outputs as initialized.
+ */
+# define __no_sanitize_or_inline __no_kmsan_checks notrace __maybe_unused
+#endif
+
 #ifndef __no_sanitize_or_inline
 #define __no_sanitize_or_inline __always_inline
 #endif
-- 
cgit v1.2.3


From a72a30af550c08423de1b9feecb6ceeddc434889 Mon Sep 17 00:00:00 2001
From: "Ho-Ren (Jack) Chuang" <horenchuang@bytedance.com>
Date: Fri, 5 Apr 2024 00:07:05 +0000
Subject: memory tier: dax/kmem: introduce an abstract layer for finding,
 allocating, and putting memory types

Patch series "Improved Memory Tier Creation for CPUless NUMA Nodes", v11.

When a memory device, such as CXL1.1 type3 memory, is emulated as normal
memory (E820_TYPE_RAM), the memory device is indistinguishable from normal
DRAM in terms of memory tiering with the current implementation.  The
current memory tiering assigns all detected normal memory nodes to the
same DRAM tier.  This results in normal memory devices with different
attributions being unable to be assigned to the correct memory tier,
leading to the inability to migrate pages between different types of
memory.
https://lore.kernel.org/linux-mm/PH0PR08MB7955E9F08CCB64F23963B5C3A860A@PH0PR08MB7955.namprd08.prod.outlook.com/T/

This patchset automatically resolves the issues.  It delays the
initialization of memory tiers for CPUless NUMA nodes until they obtain
HMAT information and after all devices are initialized at boot time,
eliminating the need for user intervention.  If no HMAT is specified, it
falls back to using `default_dram_type`.

Example usecase:
We have CXL memory on the host, and we create VMs with a new system memory
device backed by host CXL memory.  We inject CXL memory performance
attributes through QEMU, and the guest now sees memory nodes with
performance attributes in HMAT.  With this change, we enable the guest
kernel to construct the correct memory tiering for the memory nodes.


This patch (of 2):

Since different memory devices require finding, allocating, and putting
memory types, these common steps are abstracted in this patch, enhancing
the scalability and conciseness of the code.

Link: https://lkml.kernel.org/r/20240405000707.2670063-1-horenchuang@bytedance.com
Link: https://lkml.kernel.org/r/20240405000707.2670063-2-horenchuang@bytedance.com
Signed-off-by: Ho-Ren (Jack) Chuang <horenchuang@bytedance.com>
Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawie.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Gregory Price <gourry.memverge@gmail.com>
Cc: Hao Xiang <hao.xiang@bytedance.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Ravi Jonnalagadda <ravis.opensrc@micron.com>
Cc: SeongJae Park <sj@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 drivers/dax/kmem.c           | 30 ++++--------------------------
 include/linux/memory-tiers.h | 13 +++++++++++++
 mm/memory-tiers.c            | 29 +++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 42ee360cf4e3..4fe9d040e375 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -55,36 +55,14 @@ static LIST_HEAD(kmem_memory_types);
 
 static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
 {
-	bool found = false;
-	struct memory_dev_type *mtype;
-
-	mutex_lock(&kmem_memory_type_lock);
-	list_for_each_entry(mtype, &kmem_memory_types, list) {
-		if (mtype->adistance == adist) {
-			found = true;
-			break;
-		}
-	}
-	if (!found) {
-		mtype = alloc_memory_type(adist);
-		if (!IS_ERR(mtype))
-			list_add(&mtype->list, &kmem_memory_types);
-	}
-	mutex_unlock(&kmem_memory_type_lock);
-
-	return mtype;
+	guard(mutex)(&kmem_memory_type_lock);
+	return mt_find_alloc_memory_type(adist, &kmem_memory_types);
 }
 
 static void kmem_put_memory_types(void)
 {
-	struct memory_dev_type *mtype, *mtn;
-
-	mutex_lock(&kmem_memory_type_lock);
-	list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) {
-		list_del(&mtype->list);
-		put_memory_type(mtype);
-	}
-	mutex_unlock(&kmem_memory_type_lock);
+	guard(mutex)(&kmem_memory_type_lock);
+	mt_put_memory_types(&kmem_memory_types);
 }
 
 static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index 69e781900082..0d70788558f4 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
 int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
 			     const char *source);
 int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
+struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+						  struct list_head *memory_types);
+void mt_put_memory_types(struct list_head *memory_types);
 #ifdef CONFIG_MIGRATION
 int next_demotion_node(int node);
 void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
@@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct access_coordinate *perf, int *adis
 {
 	return -EIO;
 }
+
+static inline struct memory_dev_type *mt_find_alloc_memory_type(int adist,
+								struct list_head *memory_types)
+{
+	return NULL;
+}
+
+static inline void mt_put_memory_types(struct list_head *memory_types)
+{
+}
 #endif	/* CONFIG_NUMA */
 #endif  /* _LINUX_MEMORY_TIERS_H */
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 0537664620e5..516b144fd45a 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -623,6 +623,35 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype)
 }
 EXPORT_SYMBOL_GPL(clear_node_memory_type);
 
+struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct list_head *memory_types)
+{
+	struct memory_dev_type *mtype;
+
+	list_for_each_entry(mtype, memory_types, list)
+		if (mtype->adistance == adist)
+			return mtype;
+
+	mtype = alloc_memory_type(adist);
+	if (IS_ERR(mtype))
+		return mtype;
+
+	list_add(&mtype->list, memory_types);
+
+	return mtype;
+}
+EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
+
+void mt_put_memory_types(struct list_head *memory_types)
+{
+	struct memory_dev_type *mtype, *mtn;
+
+	list_for_each_entry_safe(mtype, mtn, memory_types, list) {
+		list_del(&mtype->list);
+		put_memory_type(mtype);
+	}
+}
+EXPORT_SYMBOL_GPL(mt_put_memory_types);
+
 static void dump_hmem_attrs(struct access_coordinate *coord, const char *prefix)
 {
 	pr_info(
-- 
cgit v1.2.3


From 38bc9c28c3780d8d0ca1ed1a1fbfe8c91e20f5f2 Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Sun, 7 Apr 2024 14:26:53 +0800
Subject: mm/mmap: make vma_wants_writenotify return bool

vma_wants_writenotify() should return bool, so change it.

Link: https://lkml.kernel.org/r/20240407062653.803142-1-gehao@kylinos.cn
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/mmap.c          | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cc9410d772dd..0b933f9f4864 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2575,7 +2575,7 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
 					    MM_CP_UFFD_WP_RESOLVE)
 
 bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
 static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
 {
 	/*
diff --git a/mm/mmap.c b/mm/mmap.c
index d3c2ca8efa53..c9367454e19f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1514,32 +1514,32 @@ bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
  * to the private version (using protection_map[] without the
  * VM_SHARED bit).
  */
-int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
 {
 	/* If it was private or non-writable, the write bit is already clear */
 	if (!vma_is_shared_writable(vma))
-		return 0;
+		return false;
 
 	/* The backer wishes to know when pages are first written to? */
 	if (vm_ops_needs_writenotify(vma->vm_ops))
-		return 1;
+		return true;
 
 	/* The open routine did something to the protections that pgprot_modify
 	 * won't preserve? */
 	if (pgprot_val(vm_page_prot) !=
 	    pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
-		return 0;
+		return false;
 
 	/*
 	 * Do we need to track softdirty? hugetlb does not support softdirty
 	 * tracking yet.
 	 */
 	if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
-		return 1;
+		return true;
 
 	/* Do we need write faults for uffd-wp tracking? */
 	if (userfaultfd_wp(vma))
-		return 1;
+		return true;
 
 	/* Can the mapping track the dirty pages? */
 	return vma_fs_can_writeback(vma);
-- 
cgit v1.2.3


From 29ae7d96d166fa08c7232daf8a314ef5ba1efd20 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 10 Apr 2024 17:55:26 +0200
Subject: mm: pass VMA instead of MM to follow_pte()

... and centralize the VM_IO/VM_PFNMAP sanity check in there. We'll
now also perform these sanity checks for direct follow_pte()
invocations.

For generic_access_phys(), we might now check multiple times: nothing to
worry about, really.

Link: https://lkml.kernel.org/r/20240410155527.474777-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Acked-by: Sean Christopherson <seanjc@google.com>	[KVM]
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Fei Li <fei1.li@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Yonghua Huang <yonghua.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/pci/pci_mmio.c        |  4 ++--
 arch/x86/mm/pat/memtype.c       |  5 +----
 drivers/vfio/vfio_iommu_type1.c |  4 ++--
 drivers/virt/acrn/mm.c          |  3 +--
 include/linux/mm.h              |  2 +-
 mm/memory.c                     | 15 ++++++++-------
 virt/kvm/kvm_main.c             |  4 ++--
 7 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index a90499c087f0..5398729bfe1b 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -169,7 +169,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
 	if (!(vma->vm_flags & VM_WRITE))
 		goto out_unlock_mmap;
 
-	ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
+	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
 	if (ret)
 		goto out_unlock_mmap;
 
@@ -308,7 +308,7 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
 	if (!(vma->vm_flags & VM_WRITE))
 		goto out_unlock_mmap;
 
-	ret = follow_pte(vma->vm_mm, mmio_addr, &ptep, &ptl);
+	ret = follow_pte(vma, mmio_addr, &ptep, &ptl);
 	if (ret)
 		goto out_unlock_mmap;
 
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index d01c3b0bd6eb..bdc2a240c2aa 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -954,10 +954,7 @@ static int follow_phys(struct vm_area_struct *vma, unsigned long *prot,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
 
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return -EINVAL;
-
-	if (follow_pte(vma->vm_mm, vma->vm_start, &ptep, &ptl))
+	if (follow_pte(vma, vma->vm_start, &ptep, &ptl))
 		return -EINVAL;
 
 	pte = ptep_get(ptep);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index b5c15fe8f9fc..3a0218171cfa 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -518,7 +518,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 	spinlock_t *ptl;
 	int ret;
 
-	ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+	ret = follow_pte(vma, vaddr, &ptep, &ptl);
 	if (ret) {
 		bool unlocked = false;
 
@@ -532,7 +532,7 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 		if (ret)
 			return ret;
 
-		ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
+		ret = follow_pte(vma, vaddr, &ptep, &ptl);
 		if (ret)
 			return ret;
 	}
diff --git a/drivers/virt/acrn/mm.c b/drivers/virt/acrn/mm.c
index 2d98e1e185c4..db8ff1d0ac23 100644
--- a/drivers/virt/acrn/mm.c
+++ b/drivers/virt/acrn/mm.c
@@ -187,8 +187,7 @@ int acrn_vm_ram_map(struct acrn_vm *vm, struct acrn_vm_memmap *memmap)
 		}
 
 		for (i = 0; i < nr_pages; i++) {
-			ret = follow_pte(vma->vm_mm,
-					 memmap->vma_base + i * PAGE_SIZE,
+			ret = follow_pte(vma, memmap->vma_base + i * PAGE_SIZE,
 					 &ptep, &ptl);
 			if (ret)
 				break;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b933f9f4864..c25ce6992951 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2420,7 +2420,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
 	       pte_t **ptepp, spinlock_t **ptlp);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 			void *buf, int len, int write);
diff --git a/mm/memory.c b/mm/memory.c
index 59c05dc8b18a..1d45d25c1bba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5926,7 +5926,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
 /**
  * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
+ * @vma: the memory mapping
  * @address: user virtual address
  * @ptepp: location to store found PTE
  * @ptlp: location to store the lock for the PTE
@@ -5945,15 +5945,19 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
  *
  * Return: zero on success, -ve otherwise.
  */
-int follow_pte(struct mm_struct *mm, unsigned long address,
+int follow_pte(struct vm_area_struct *vma, unsigned long address,
 	       pte_t **ptepp, spinlock_t **ptlp)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep;
 
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;
+
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		goto out;
@@ -6007,11 +6011,8 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 	int offset = offset_in_page(addr);
 	int ret = -EINVAL;
 
-	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
-		return -EINVAL;
-
 retry:
-	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+	if (follow_pte(vma, addr, &ptep, &ptl))
 		return -EINVAL;
 	pte = ptep_get(ptep);
 	pte_unmap_unlock(ptep, ptl);
@@ -6026,7 +6027,7 @@ retry:
 	if (!maddr)
 		return -ENOMEM;
 
-	if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+	if (follow_pte(vma, addr, &ptep, &ptl))
 		goto out_unmap;
 
 	if (!pte_same(pte, ptep_get(ptep))) {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb49c2a60200..f57dbacb8689 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2902,7 +2902,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	int r;
 
-	r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+	r = follow_pte(vma, addr, &ptep, &ptl);
 	if (r) {
 		/*
 		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
@@ -2917,7 +2917,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
 		if (r)
 			return r;
 
-		r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
+		r = follow_pte(vma, addr, &ptep, &ptl);
 		if (r)
 			return r;
 	}
-- 
cgit v1.2.3


From 02faa73f174c4d1e11cb9a421f9a8eac0dd881f1 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:44 +0200
Subject: mm: allow for detecting underflows with page_mapcount() again

Patch series "mm: mapcount for large folios + page_mapcount() cleanups".

This series tracks the mapcount of large folios in a single value, so it
can be read efficiently and atomically, just like the mapcount of small
folios.

folio_mapcount() is then used in a couple more places, most notably to
reduce false negatives in folio_likely_mapped_shared(), and many users of
page_mapcount() are cleaned up (that's maybe why you got CCed on the full
series, sorry sh+xtensa folks!  :) ).

The remaining s390x user and one KSM user of page_mapcount() are getting
removed separately on the list right now.  I have patches to handle the
other KSM one, the khugepaged one and the kpagecount one; as they are not
as "obvious", I will send them out separately in the future.  Once that is
all in place, I'm planning on moving page_mapcount() into
fs/proc/task_mmu.c, the remaining user for the time being (and we can
discuss at LSF/MM details on that :) ).

I proposed the mapcount for large folios (previously called total
mapcount) originally in part of [1] and I later included it in [2] where
it is a requirement.  In the meantime, I changed the patch a bit so I
dropped all RB's.  During the discussion of [1], Peter Xu correctly raised
that this additional tracking might affect the performance when PMD->PTE
remapping THPs.  In the meantime.  I addressed that by batching RMAP
operations during fork(), unmap/zap and when PMD->PTE remapping THPs.

Running some of my micro-benchmarks [3] (fork,munmap,cow-byte,remap) on 1
GiB of memory backed by folios with the same order, I observe the
following on an Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz tuned for
reproducible results as much as possible:

Standard deviation is mostly < 1%, except for order-9, where it's < 2% for
fork() and munmap().

(1) Small folios are not affected (< 1%) in all 4 microbenchmarks.
(2) Order-4 folios are not affected (< 1%) in all 4 microbenchmarks. A bit
    weird comapred to the other orders ...
(3) PMD->PTE remapping of order-9 THPs is not affected (< 1%)
(4) COW-byte (COWing a single page by writing a single byte) is not
    affected for any order (< 1 %). The page copy_fault overhead dominates
    everything.
(5) fork() is mostly not affected (< 1%), except order-2, where we have
    a slowdown of ~4%. Already for order-3 folios, we're down to a slowdown
    of < 1%.
(6) munmap() sees a slowdown by < 3% for some orders (order-5,
    order-6, order-9), but less for others (< 1% for order-4 and order-8,
    < 2% for order-2, order-3, order-7).

Especially the fork() and munmap() benchmark are sensitive to each added
instruction and other system noise, so I suspect some of the change and
observed weirdness (order-4) is due to code layout changes and other
factors, but not really due to the added atomics.

So in the common case where we can batch, the added atomics don't really
make a big difference, especially in light of the recent improvements for
large folios that we recently gained due to batching.  Surprisingly, for
some cases where we cannot batch (e.g., COW), the added atomics don't seem
to matter, because other overhead dominates.

My fork and munmap micro-benchmarks don't cover cases where we cannot
batch-process bigger parts of large folios.  As this is not the common
case, I'm not worrying about that right now.

Future work is batching RMAP operations during swapout and folio
migration.

[1] https://lore.kernel.org/all/20230809083256.699513-1-david@redhat.com/
[2] https://lore.kernel.org/all/20231124132626.235350-1-david@redhat.com/
[3] https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-folio-benchmarks.c?ref_type=heads


This patch (of 18):

Commit 53277bcf126d ("mm: support page_mapcount() on page_has_type()
pages") made it impossible to detect mapcount underflows by treating any
negative raw mapcount value as a mapcount of 0.

We perform such underflow checks in zap_present_folio_ptes() and
zap_huge_pmd(), which would currently no longer trigger.

Let's check against PAGE_MAPCOUNT_RESERVE instead by using
page_type_has_type(), like page_has_type() would, so we can still catch
some underflows.

[david@redhat.com: make page_mapcount() slighly more efficient]
  Link: https://lkml.kernel.org/r/1af4fd61-7926-47c8-be45-833c0dbec08b@redhat.com
Link: https://lkml.kernel.org/r/20240409192301.907377-1-david@redhat.com
Link: https://lkml.kernel.org/r/20240409192301.907377-2-david@redhat.com
Fixes: 53277bcf126d ("mm: support page_mapcount() on page_has_type() pages")
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c25ce6992951..8fa4d78bcc09 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1232,7 +1232,7 @@ static inline int page_mapcount(struct page *page)
 	int mapcount = atomic_read(&page->_mapcount) + 1;
 
 	/* Handle page_has_type() pages */
-	if (mapcount < 0)
+	if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
 		mapcount = 0;
 	if (unlikely(PageCompound(page)))
 		mapcount += folio_entire_mapcount(page_folio(page));
-- 
cgit v1.2.3


From c2e65ebc02fb60b64a1c5689fe2c6f60d0fc1626 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:45 +0200
Subject: mm/rmap: always inline anon/file rmap duplication of a single PTE

As we grow the code, the compiler might make stupid decisions and
unnecessarily degrade fork() performance.  Let's make sure to always
inline functions that operate on a single PTE so the compiler will always
optimize out the loop and avoid a function call.

This is a preparation for maintining a total mapcount for large folios.

Link: https://lkml.kernel.org/r/20240409192301.907377-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9bf9324214fc..9549d78928bb 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -347,8 +347,12 @@ static inline void folio_dup_file_rmap_ptes(struct folio *folio,
 {
 	__folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
 }
-#define folio_dup_file_rmap_pte(folio, page) \
-	folio_dup_file_rmap_ptes(folio, page, 1)
+
+static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
+		struct page *page)
+{
+	__folio_dup_file_rmap(folio, page, 1, RMAP_LEVEL_PTE);
+}
 
 /**
  * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
@@ -448,8 +452,13 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
 	return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
 					 RMAP_LEVEL_PTE);
 }
-#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
-	folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)
+
+static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
+		struct page *page, struct vm_area_struct *src_vma)
+{
+	return __folio_try_dup_anon_rmap(folio, page, 1, src_vma,
+					 RMAP_LEVEL_PTE);
+}
 
 /**
  * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
-- 
cgit v1.2.3


From 46d62de7ad1286854e0c2944ad26a1c1b1a5f191 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:46 +0200
Subject: mm/rmap: add fast-path for small folios when
 adding/removing/duplicating

Let's add a fast-path for small folios to all relevant rmap functions.
Note that only RMAP_LEVEL_PTE applies.

This is a preparation for tracking the mapcount of large folios in a
single value.

Link: https://lkml.kernel.org/r/20240409192301.907377-4-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h | 13 +++++++++++++
 mm/rmap.c            | 26 ++++++++++++++++----------
 2 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 9549d78928bb..327f1ca5a487 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -322,6 +322,11 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
 
 	switch (level) {
 	case RMAP_LEVEL_PTE:
+		if (!folio_test_large(folio)) {
+			atomic_inc(&page->_mapcount);
+			break;
+		}
+
 		do {
 			atomic_inc(&page->_mapcount);
 		} while (page++, --nr_pages > 0);
@@ -405,6 +410,14 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 				if (PageAnonExclusive(page + i))
 					return -EBUSY;
 		}
+
+		if (!folio_test_large(folio)) {
+			if (PageAnonExclusive(page))
+				ClearPageAnonExclusive(page);
+			atomic_inc(&page->_mapcount);
+			break;
+		}
+
 		do {
 			if (PageAnonExclusive(page))
 				ClearPageAnonExclusive(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 56b313aa2ebf..4bde6d60db6c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1172,15 +1172,18 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
 
 	switch (level) {
 	case RMAP_LEVEL_PTE:
+		if (!folio_test_large(folio)) {
+			nr = atomic_inc_and_test(&page->_mapcount);
+			break;
+		}
+
 		do {
 			first = atomic_inc_and_test(&page->_mapcount);
-			if (first && folio_test_large(folio)) {
+			if (first) {
 				first = atomic_inc_return_relaxed(mapped);
-				first = (first < ENTIRELY_MAPPED);
+				if (first < ENTIRELY_MAPPED)
+					nr++;
 			}
-
-			if (first)
-				nr++;
 		} while (page++, --nr_pages > 0);
 		break;
 	case RMAP_LEVEL_PMD:
@@ -1514,15 +1517,18 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 
 	switch (level) {
 	case RMAP_LEVEL_PTE:
+		if (!folio_test_large(folio)) {
+			nr = atomic_add_negative(-1, &page->_mapcount);
+			break;
+		}
+
 		do {
 			last = atomic_add_negative(-1, &page->_mapcount);
-			if (last && folio_test_large(folio)) {
+			if (last) {
 				last = atomic_dec_return_relaxed(mapped);
-				last = (last < ENTIRELY_MAPPED);
+				if (last < ENTIRELY_MAPPED)
+					nr++;
 			}
-
-			if (last)
-				nr++;
 		} while (page++, --nr_pages > 0);
 		break;
 	case RMAP_LEVEL_PMD:
-- 
cgit v1.2.3


From 05c5323b2a344c19c51cd1b91a4ab9ae90853794 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:47 +0200
Subject: mm: track mapcount of large folios in single value

Let's track the mapcount of large folios in a single value.  The mapcount
of a large folio currently corresponds to the sum of the entire mapcount
and all page mapcounts.

This sum is what we actually want to know in folio_mapcount() and it is
also sufficient for implementing folio_mapped().

With PTE-mapped THP becoming more important and more widely used, we want
to avoid looping over all pages of a folio just to obtain the mapcount of
large folios.  The comment "In the common case, avoid the loop when no
pages mapped by PTE" in folio_total_mapcount() does no longer hold for
mTHP that are always mapped by PTE.

Further, we are planning on using folio_mapcount() more frequently, and
might even want to remove page mapcounts for large folios in some kernel
configs.  Therefore, allow for reading the mapcount of large folios
efficiently and atomically without looping over any pages.

Maintain the mapcount also for hugetlb pages for simplicity.  Use the new
mapcount to implement folio_mapcount() and folio_mapped().  Make
page_mapped() simply call folio_mapped().  We can now get rid of
folio_large_is_mapped().

_nr_pages_mapped is now only used in rmap code and for debugging purposes.
Keep folio_nr_pages_mapped() around, but document that its use should be
limited to rmap internals and debugging purposes.

This change implies one additional atomic add/sub whenever
mapping/unmapping (parts of) a large folio.

As we now batch RMAP operations for PTE-mapped THP during fork(), during
unmap/zap, and when PTE-remapping a PMD-mapped THP, and we adjust the
large mapcount for a PTE batch only once, the added overhead in the common
case is small.  Only when unmapping individual pages of a large folio
(e.g., during COW), the overhead might be bigger in comparison, but it's
essentially one additional atomic operation.

Note that before the new mapcount would overflow, already our refcount
would overflow: each mapping requires a folio reference.  Extend the
focumentation of folio_mapcount().

Link: https://lkml.kernel.org/r/20240409192301.907377-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/transhuge.rst | 12 ++++++------
 include/linux/mm.h             | 44 +++++++++++++++++++-----------------------
 include/linux/mm_types.h       |  5 +++--
 include/linux/rmap.h           | 10 ++++++++++
 mm/debug.c                     |  3 ++-
 mm/hugetlb.c                   |  4 ++--
 mm/internal.h                  |  3 +++
 mm/khugepaged.c                |  2 +-
 mm/page_alloc.c                |  4 ++++
 mm/rmap.c                      | 34 +++++++++++---------------------
 10 files changed, 62 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/Documentation/mm/transhuge.rst b/Documentation/mm/transhuge.rst
index 93c9239b9ebe..1ba0ad63246c 100644
--- a/Documentation/mm/transhuge.rst
+++ b/Documentation/mm/transhuge.rst
@@ -116,14 +116,14 @@ pages:
     succeeds on tail pages.
 
   - map/unmap of a PMD entry for the whole THP increment/decrement
-    folio->_entire_mapcount and also increment/decrement
-    folio->_nr_pages_mapped by ENTIRELY_MAPPED when _entire_mapcount
-    goes from -1 to 0 or 0 to -1.
+    folio->_entire_mapcount, increment/decrement folio->_large_mapcount
+    and also increment/decrement folio->_nr_pages_mapped by ENTIRELY_MAPPED
+    when _entire_mapcount goes from -1 to 0 or 0 to -1.
 
   - map/unmap of individual pages with PTE entry increment/decrement
-    page->_mapcount and also increment/decrement folio->_nr_pages_mapped
-    when page->_mapcount goes from -1 to 0 or 0 to -1 as this counts
-    the number of pages mapped by PTE.
+    page->_mapcount, increment/decrement folio->_large_mapcount and also
+    increment/decrement folio->_nr_pages_mapped when page->_mapcount goes
+    from -1 to 0 or 0 to -1 as this counts the number of pages mapped by PTE.
 
 split_huge_page internally has to distribute the refcounts in the head
 page to the tail pages before clearing all PG_head/tail bits from the page
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8fa4d78bcc09..059477821625 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1240,16 +1240,26 @@ static inline int page_mapcount(struct page *page)
 	return mapcount;
 }
 
-int folio_total_mapcount(const struct folio *folio);
+static inline int folio_large_mapcount(const struct folio *folio)
+{
+	VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
+	return atomic_read(&folio->_large_mapcount) + 1;
+}
 
 /**
- * folio_mapcount() - Calculate the number of mappings of this folio.
+ * folio_mapcount() - Number of mappings of this folio.
  * @folio: The folio.
  *
- * A large folio tracks both how many times the entire folio is mapped,
- * and how many times each individual page in the folio is mapped.
- * This function calculates the total number of times the folio is
- * mapped.
+ * The folio mapcount corresponds to the number of present user page table
+ * entries that reference any part of a folio. Each such present user page
+ * table entry must be paired with exactly on folio reference.
+ *
+ * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts
+ * exactly once.
+ *
+ * For hugetlb folios, each abstracted "hugetlb" user page table entry that
+ * references the entire folio counts exactly once, even when such special
+ * page table entries are comprised of multiple ordinary page table entries.
  *
  * Return: The number of times this folio is mapped.
  */
@@ -1257,17 +1267,7 @@ static inline int folio_mapcount(const struct folio *folio)
 {
 	if (likely(!folio_test_large(folio)))
 		return atomic_read(&folio->_mapcount) + 1;
-	return folio_total_mapcount(folio);
-}
-
-static inline bool folio_large_is_mapped(const struct folio *folio)
-{
-	/*
-	 * Reading _entire_mapcount below could be omitted if hugetlb
-	 * participated in incrementing nr_pages_mapped when compound mapped.
-	 */
-	return atomic_read(&folio->_nr_pages_mapped) > 0 ||
-		atomic_read(&folio->_entire_mapcount) >= 0;
+	return folio_large_mapcount(folio);
 }
 
 /**
@@ -1276,11 +1276,9 @@ static inline bool folio_large_is_mapped(const struct folio *folio)
  *
  * Return: True if any page in this folio is referenced by user page tables.
  */
-static inline bool folio_mapped(struct folio *folio)
+static inline bool folio_mapped(const struct folio *folio)
 {
-	if (likely(!folio_test_large(folio)))
-		return atomic_read(&folio->_mapcount) >= 0;
-	return folio_large_is_mapped(folio);
+	return folio_mapcount(folio) >= 1;
 }
 
 /*
@@ -1290,9 +1288,7 @@ static inline bool folio_mapped(struct folio *folio)
  */
 static inline bool page_mapped(const struct page *page)
 {
-	if (likely(!PageCompound(page)))
-		return atomic_read(&page->_mapcount) >= 0;
-	return folio_large_is_mapped(page_folio(page));
+	return folio_mapped(page_folio(page));
 }
 
 static inline struct page *virt_to_head_page(const void *x)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fa0d6995706f..db0adf5721cc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -289,7 +289,8 @@ typedef struct {
  * @virtual: Virtual address in the kernel direct map.
  * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
  * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
- * @_nr_pages_mapped: Do not use directly, call folio_mapcount().
+ * @_large_mapcount: Do not use directly, call folio_mapcount().
+ * @_nr_pages_mapped: Do not use outside of rmap and debug code.
  * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
  * @_folio_nr_pages: Do not use directly, call folio_nr_pages().
  * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
@@ -348,8 +349,8 @@ struct folio {
 		struct {
 			unsigned long _flags_1;
 			unsigned long _head_1;
-			unsigned long _folio_avail;
 	/* public: */
+			atomic_t _large_mapcount;
 			atomic_t _entire_mapcount;
 			atomic_t _nr_pages_mapped;
 			atomic_t _pincount;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 327f1ca5a487..0f906dc6d280 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -273,6 +273,7 @@ static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
 		ClearPageAnonExclusive(&folio->page);
 	}
 	atomic_inc(&folio->_entire_mapcount);
+	atomic_inc(&folio->_large_mapcount);
 	return 0;
 }
 
@@ -306,6 +307,7 @@ static inline void hugetlb_add_file_rmap(struct folio *folio)
 	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
 
 	atomic_inc(&folio->_entire_mapcount);
+	atomic_inc(&folio->_large_mapcount);
 }
 
 static inline void hugetlb_remove_rmap(struct folio *folio)
@@ -313,11 +315,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio)
 	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
 
 	atomic_dec(&folio->_entire_mapcount);
+	atomic_dec(&folio->_large_mapcount);
 }
 
 static __always_inline void __folio_dup_file_rmap(struct folio *folio,
 		struct page *page, int nr_pages, enum rmap_level level)
 {
+	const int orig_nr_pages = nr_pages;
+
 	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
 
 	switch (level) {
@@ -330,9 +335,11 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
 		do {
 			atomic_inc(&page->_mapcount);
 		} while (page++, --nr_pages > 0);
+		atomic_add(orig_nr_pages, &folio->_large_mapcount);
 		break;
 	case RMAP_LEVEL_PMD:
 		atomic_inc(&folio->_entire_mapcount);
+		atomic_inc(&folio->_large_mapcount);
 		break;
 	}
 }
@@ -382,6 +389,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 		struct page *page, int nr_pages, struct vm_area_struct *src_vma,
 		enum rmap_level level)
 {
+	const int orig_nr_pages = nr_pages;
 	bool maybe_pinned;
 	int i;
 
@@ -423,6 +431,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 				ClearPageAnonExclusive(page);
 			atomic_inc(&page->_mapcount);
 		} while (page++, --nr_pages > 0);
+		atomic_add(orig_nr_pages, &folio->_large_mapcount);
 		break;
 	case RMAP_LEVEL_PMD:
 		if (PageAnonExclusive(page)) {
@@ -431,6 +440,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 			ClearPageAnonExclusive(page);
 		}
 		atomic_inc(&folio->_entire_mapcount);
+		atomic_inc(&folio->_large_mapcount);
 		break;
 	}
 	return 0;
diff --git a/mm/debug.c b/mm/debug.c
index b71186f1fb0b..d064db42af54 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -68,8 +68,9 @@ static void __dump_folio(struct folio *folio, struct page *page,
 			folio_ref_count(folio), mapcount, mapping,
 			folio->index + idx, pfn);
 	if (folio_test_large(folio)) {
-		pr_warn("head: order:%u entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
+		pr_warn("head: order:%u mapcount:%d entire_mapcount:%d nr_pages_mapped:%d pincount:%d\n",
 				folio_order(folio),
+				folio_mapcount(folio),
 				folio_entire_mapcount(folio),
 				folio_nr_pages_mapped(folio),
 				atomic_read(&folio->_pincount));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5dc3f5ea3a2e..d74289d3f30d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1517,7 +1517,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
 	struct page *p;
 
 	atomic_set(&folio->_entire_mapcount, 0);
-	atomic_set(&folio->_nr_pages_mapped, 0);
+	atomic_set(&folio->_large_mapcount, 0);
 	atomic_set(&folio->_pincount, 0);
 
 	for (i = 1; i < nr_pages; i++) {
@@ -2120,7 +2120,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
 	/* we rely on prep_new_hugetlb_folio to set the hugetlb flag */
 	folio_set_order(folio, order);
 	atomic_set(&folio->_entire_mapcount, -1);
-	atomic_set(&folio->_nr_pages_mapped, 0);
+	atomic_set(&folio->_large_mapcount, -1);
 	atomic_set(&folio->_pincount, 0);
 	return true;
 
diff --git a/mm/internal.h b/mm/internal.h
index 22152e0c8494..2adc3f616b71 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -72,6 +72,8 @@ void page_writeback_init(void);
 /*
  * How many individual pages have an elevated _mapcount.  Excludes
  * the folio's entire_mapcount.
+ *
+ * Don't use this function outside of debugging code.
  */
 static inline int folio_nr_pages_mapped(const struct folio *folio)
 {
@@ -611,6 +613,7 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
 	struct folio *folio = (struct folio *)page;
 
 	folio_set_order(folio, order);
+	atomic_set(&folio->_large_mapcount, -1);
 	atomic_set(&folio->_entire_mapcount, -1);
 	atomic_set(&folio->_nr_pages_mapped, 0);
 	atomic_set(&folio->_pincount, 0);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 89e2624fb3ff..2f73d2aa9ae8 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1358,7 +1358,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 		 * Check if the page has any GUP (or other external) pins.
 		 *
 		 * Here the check may be racy:
-		 * it may see total_mapcount > refcount in some cases?
+		 * it may see folio_mapcount() > folio_ref_count().
 		 * But such case is ephemeral we could always retry collapse
 		 * later.  However it may report false positive if the page
 		 * has excessive GUP pins (i.e. 512).  Anyway the same check
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22e8b9f1d710..dd4265c760ff 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -935,6 +935,10 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
 			bad_page(page, "nonzero entire_mapcount");
 			goto out;
 		}
+		if (unlikely(folio_large_mapcount(folio))) {
+			bad_page(page, "nonzero large_mapcount");
+			goto out;
+		}
 		if (unlikely(atomic_read(&folio->_nr_pages_mapped))) {
 			bad_page(page, "nonzero nr_pages_mapped");
 			goto out;
diff --git a/mm/rmap.c b/mm/rmap.c
index 4bde6d60db6c..2608c40dffad 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1138,34 +1138,12 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 	return page_vma_mkclean_one(&pvmw);
 }
 
-int folio_total_mapcount(const struct folio *folio)
-{
-	int mapcount = folio_entire_mapcount(folio);
-	int nr_pages;
-	int i;
-
-	/* In the common case, avoid the loop when no pages mapped by PTE */
-	if (folio_nr_pages_mapped(folio) == 0)
-		return mapcount;
-	/*
-	 * Add all the PTE mappings of those pages mapped by PTE.
-	 * Limit the loop to folio_nr_pages_mapped()?
-	 * Perhaps: given all the raciness, that may be a good or a bad idea.
-	 */
-	nr_pages = folio_nr_pages(folio);
-	for (i = 0; i < nr_pages; i++)
-		mapcount += atomic_read(&folio_page(folio, i)->_mapcount);
-
-	/* But each of those _mapcounts was based on -1 */
-	mapcount += nr_pages;
-	return mapcount;
-}
-
 static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
 		struct page *page, int nr_pages, enum rmap_level level,
 		int *nr_pmdmapped)
 {
 	atomic_t *mapped = &folio->_nr_pages_mapped;
+	const int orig_nr_pages = nr_pages;
 	int first, nr = 0;
 
 	__folio_rmap_sanity_checks(folio, page, nr_pages, level);
@@ -1185,6 +1163,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
 					nr++;
 			}
 		} while (page++, --nr_pages > 0);
+		atomic_add(orig_nr_pages, &folio->_large_mapcount);
 		break;
 	case RMAP_LEVEL_PMD:
 		first = atomic_inc_and_test(&folio->_entire_mapcount);
@@ -1201,6 +1180,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
 				nr = 0;
 			}
 		}
+		atomic_inc(&folio->_large_mapcount);
 		break;
 	}
 	return nr;
@@ -1436,10 +1416,14 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 			SetPageAnonExclusive(page);
 		}
 
+		/* increment count (starts at -1) */
+		atomic_set(&folio->_large_mapcount, nr - 1);
 		atomic_set(&folio->_nr_pages_mapped, nr);
 	} else {
 		/* increment count (starts at -1) */
 		atomic_set(&folio->_entire_mapcount, 0);
+		/* increment count (starts at -1) */
+		atomic_set(&folio->_large_mapcount, 0);
 		atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
 		SetPageAnonExclusive(&folio->page);
 		__lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr);
@@ -1522,6 +1506,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 			break;
 		}
 
+		atomic_sub(nr_pages, &folio->_large_mapcount);
 		do {
 			last = atomic_add_negative(-1, &page->_mapcount);
 			if (last) {
@@ -1532,6 +1517,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
 		} while (page++, --nr_pages > 0);
 		break;
 	case RMAP_LEVEL_PMD:
+		atomic_dec(&folio->_large_mapcount);
 		last = atomic_add_negative(-1, &folio->_entire_mapcount);
 		if (last) {
 			nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
@@ -2714,6 +2700,7 @@ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
 
 	atomic_inc(&folio->_entire_mapcount);
+	atomic_inc(&folio->_large_mapcount);
 	if (flags & RMAP_EXCLUSIVE)
 		SetPageAnonExclusive(&folio->page);
 	VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
@@ -2728,6 +2715,7 @@ void hugetlb_add_new_anon_rmap(struct folio *folio,
 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	/* increment count (starts at -1) */
 	atomic_set(&folio->_entire_mapcount, 0);
+	atomic_set(&folio->_large_mapcount, 0);
 	folio_clear_hugetlb_restore_reserve(folio);
 	__folio_set_anon(folio, vma, address, true);
 	SetPageAnonExclusive(&folio->page);
-- 
cgit v1.2.3


From eefb9b2725e395d58119a92eb8eaea8f2504b5eb Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:48 +0200
Subject: mm: improve folio_likely_mapped_shared() using the mapcount of large
 folios

We can now read the mapcount of large folios very efficiently.  Use it to
improve our handling of partially-mappable folios, falling back to making
a guess only in case the folio is not "obviously mapped shared".

We can now better detect partially-mappable folios where the first page is
not mapped as "mapped shared", reducing "false negatives"; but false
negatives are still possible.

While at it, fixup a wrong comment (false positive vs.  false negative)
for KSM folios.

Link: https://lkml.kernel.org/r/20240409192301.907377-6-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yin Fengwei <fengwei.yin@intel.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 059477821625..c7470da5797f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2184,7 +2184,7 @@ static inline size_t folio_size(struct folio *folio)
  *       indicate "mapped shared" (false positive) when two VMAs in the same MM
  *       cover the same file range.
  *    #. For (small) KSM folios, the return value can wrongly indicate "mapped
- *       shared" (false negative), when the folio is mapped multiple times into
+ *       shared" (false positive), when the folio is mapped multiple times into
  *       the same MM.
  *
  * Further, this function only considers current page table mappings that
@@ -2201,7 +2201,22 @@ static inline size_t folio_size(struct folio *folio)
  */
 static inline bool folio_likely_mapped_shared(struct folio *folio)
 {
-	return page_mapcount(folio_page(folio, 0)) > 1;
+	int mapcount = folio_mapcount(folio);
+
+	/* Only partially-mappable folios require more care. */
+	if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
+		return mapcount > 1;
+
+	/* A single mapping implies "mapped exclusively". */
+	if (mapcount <= 1)
+		return false;
+
+	/* If any page is mapped more than once we treat it "mapped shared". */
+	if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio))
+		return true;
+
+	/* Let's guess based on the first subpage. */
+	return atomic_read(&folio->_mapcount) > 0;
 }
 
 #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE
-- 
cgit v1.2.3


From 4103b93b07bceac30bc83cbce81693bb2ea93c22 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:49 +0200
Subject: mm: make folio_mapcount() return 0 for small typed folios

We already handle it properly for large folios.  Let's also return "0" for
small typed folios, like page_mapcount() currently would.

Consequently, folio_mapcount() will never return negative values for typed
folios, but may return negative values for underflows.

[david@redhat.com: make folio_mapcount() slightly more efficient]
  Link: https://lkml.kernel.org/r/c30fcda1-ed87-46f5-8297-cdedbddac009@redhat.com
Link: https://lkml.kernel.org/r/20240409192301.907377-7-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c7470da5797f..78e583b50e42 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1261,12 +1261,22 @@ static inline int folio_large_mapcount(const struct folio *folio)
  * references the entire folio counts exactly once, even when such special
  * page table entries are comprised of multiple ordinary page table entries.
  *
+ * Will report 0 for pages which cannot be mapped into userspace, such as
+ * slab, page tables and similar.
+ *
  * Return: The number of times this folio is mapped.
  */
 static inline int folio_mapcount(const struct folio *folio)
 {
-	if (likely(!folio_test_large(folio)))
-		return atomic_read(&folio->_mapcount) + 1;
+	int mapcount;
+
+	if (likely(!folio_test_large(folio))) {
+		mapcount = atomic_read(&folio->_mapcount) + 1;
+		/* Handle page_has_type() pages */
+		if (mapcount < PAGE_MAPCOUNT_RESERVE + 1)
+			mapcount = 0;
+		return mapcount;
+	}
 	return folio_large_mapcount(folio);
 }
 
-- 
cgit v1.2.3


From 6eca32567455db2db38b1126e0d6ad8f0e5c3ed9 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 9 Apr 2024 21:22:58 +0200
Subject: trace/events/page_ref: trace the raw page mapcount value

We want to limit the use of page_mapcount() to the places where it is
absolutely necessary.  We already trace raw page->refcount, raw
page->flags and raw page->mapping, and don't involve any folios.  Let's
also trace the raw mapcount value that does not consider the entire
mapcount of large folios, and we don't add "1" to it.

When dealing with typed folios, this makes a lot more sense.  ...  and
it's for debugging purposes only either way.

Link: https://lkml.kernel.org/r/20240409192301.907377-16-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Chris Zankel <chris@zankel.net>
Cc: Hugh Dickins <hughd@google.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Richard Chang <richardycc@google.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/page_ref.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h
index 8a99c1cd417b..fe33a255b7d0 100644
--- a/include/trace/events/page_ref.h
+++ b/include/trace/events/page_ref.h
@@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template,
 		__entry->pfn = page_to_pfn(page);
 		__entry->flags = page->flags;
 		__entry->count = page_ref_count(page);
-		__entry->mapcount = page_mapcount(page);
+		__entry->mapcount = atomic_read(&page->_mapcount);
 		__entry->mapping = page->mapping;
 		__entry->mt = get_pageblock_migratetype(page);
 		__entry->val = v;
@@ -79,7 +79,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template,
 		__entry->pfn = page_to_pfn(page);
 		__entry->flags = page->flags;
 		__entry->count = page_ref_count(page);
-		__entry->mapcount = page_mapcount(page);
+		__entry->mapcount = atomic_read(&page->_mapcount);
 		__entry->mapping = page->mapping;
 		__entry->mt = get_pageblock_migratetype(page);
 		__entry->val = v;
-- 
cgit v1.2.3


From 54fa49b2e0ef3af944bbeacb7ed2ba0b4f02facc Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Thu, 11 Apr 2024 09:47:56 -0700
Subject: mm/hugetlb: convert dissolve_free_huge_pages() to folios

Allows us to rename dissolve_free_huge_pages() to
dissolve_free_hugetlb_folio(). Convert one caller to pass in a folio
directly and use page_folio() to convert the caller in mm/memory-failure.

[sidhartha.kumar@oracle.com: remove unneeded `extern']
  Link: https://lkml.kernel.org/r/71760ed4-e80d-493a-95ea-2545414b1aba@oracle.com
[sidhartha.kumar@oracle.com: v2]
  Link: https://lkml.kernel.org/r/20240412182139.120871-1-sidhartha.kumar@oracle.com
Link: https://lkml.kernel.org/r/20240411164756.261178-1-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |  4 ++--
 mm/hugetlb.c            | 17 ++++++++---------
 mm/memory-failure.c     |  8 ++++----
 3 files changed, 14 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f3e62880279..8968e8a3a205 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -861,7 +861,7 @@ static inline int hstate_index(struct hstate *h)
 	return h - hstates;
 }
 
-extern int dissolve_free_huge_page(struct page *page);
+int dissolve_free_hugetlb_folio(struct folio *folio);
 extern int dissolve_free_huge_pages(unsigned long start_pfn,
 				    unsigned long end_pfn);
 
@@ -1148,7 +1148,7 @@ static inline int hstate_index(struct hstate *h)
 	return 0;
 }
 
-static inline int dissolve_free_huge_page(struct page *page)
+static inline int dissolve_free_hugetlb_folio(struct folio *folio)
 {
 	return 0;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d74289d3f30d..40bcaf2bad55 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2377,8 +2377,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
 }
 
 /*
- * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use hugepages and non-hugepages.
+ * Dissolve a given free hugetlb folio into free buddy pages. This function
+ * does nothing for in-use hugetlb folios and non-hugetlb folios.
  * This function returns values like below:
  *
  *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
@@ -2390,10 +2390,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
  *       0:  successfully dissolved free hugepages or the page is not a
  *           hugepage (considered as already dissolved)
  */
-int dissolve_free_huge_page(struct page *page)
+int dissolve_free_hugetlb_folio(struct folio *folio)
 {
 	int rc = -EBUSY;
-	struct folio *folio = page_folio(page);
 
 retry:
 	/* Not to disrupt normal path by vainly holding hugetlb_lock */
@@ -2470,13 +2469,13 @@ out:
  * make specified memory blocks removable from the system.
  * Note that this will dissolve a free gigantic hugepage completely, if any
  * part of it lies within the given range.
- * Also note that if dissolve_free_huge_page() returns with an error, all
- * free hugepages that were dissolved before that error are lost.
+ * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
+ * free hugetlb folios that were dissolved before that error are lost.
  */
 int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
-	struct page *page;
+	struct folio *folio;
 	int rc = 0;
 	unsigned int order;
 	struct hstate *h;
@@ -2489,8 +2488,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 		order = min(order, huge_page_order(h));
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
-		page = pfn_to_page(pfn);
-		rc = dissolve_free_huge_page(page);
+		folio = pfn_folio(pfn);
+		rc = dissolve_free_hugetlb_folio(folio);
 		if (rc)
 			break;
 	}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index db8d770ce801..68e1fe1c0b72 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -155,7 +155,7 @@ static int __page_handle_poison(struct page *page)
 
 	/*
 	 * zone_pcp_disable() can't be used here. It will
-	 * hold pcp_batch_high_lock and dissolve_free_huge_page() might hold
+	 * hold pcp_batch_high_lock and dissolve_free_hugetlb_folio() might hold
 	 * cpu_hotplug_lock via static_key_slow_dec() when hugetlb vmemmap
 	 * optimization is enabled. This will break current lock dependency
 	 * chain and leads to deadlock.
@@ -165,7 +165,7 @@ static int __page_handle_poison(struct page *page)
 	 * but nothing guarantees that those pages do not get back to a PCP
 	 * queue if we need to refill those.
 	 */
-	ret = dissolve_free_huge_page(page);
+	ret = dissolve_free_hugetlb_folio(page_folio(page));
 	if (!ret) {
 		drain_all_pages(page_zone(page));
 		ret = take_page_off_buddy(page);
@@ -178,8 +178,8 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo
 {
 	if (hugepage_or_freepage) {
 		/*
-		 * Doing this check for free pages is also fine since dissolve_free_huge_page
-		 * returns 0 for non-hugetlb pages as well.
+		 * Doing this check for free pages is also fine since
+		 * dissolve_free_hugetlb_folio() returns 0 for non-hugetlb folios as well.
 		 */
 		if (__page_handle_poison(page) <= 0)
 			/*
-- 
cgit v1.2.3


From d199483c2b972064be5fce846228ceb773cd9d17 Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Date: Fri, 12 Apr 2024 11:21:39 -0700
Subject: mm/hugetlb: rename dissolve_free_huge_pages() to
 dissolve_free_hugetlb_folios()

dissolve_free_huge_pages() only uses folios internally, rename it to
dissolve_free_hugetlb_folios() and change the comments which reference it.

[akpm@linux-foundation.org: remove unneeded `extern']
Link: https://lkml.kernel.org/r/20240412182139.120871-2-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Reviewed-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 4 ++--
 mm/hugetlb.c            | 2 +-
 mm/memory_hotplug.c     | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8968e8a3a205..1bc93e7e315b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -862,7 +862,7 @@ static inline int hstate_index(struct hstate *h)
 }
 
 int dissolve_free_hugetlb_folio(struct folio *folio);
-extern int dissolve_free_huge_pages(unsigned long start_pfn,
+int dissolve_free_hugetlb_folios(unsigned long start_pfn,
 				    unsigned long end_pfn);
 
 #ifdef CONFIG_MEMORY_FAILURE
@@ -1153,7 +1153,7 @@ static inline int dissolve_free_hugetlb_folio(struct folio *folio)
 	return 0;
 }
 
-static inline int dissolve_free_huge_pages(unsigned long start_pfn,
+static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn,
 					   unsigned long end_pfn)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 40bcaf2bad55..3b7d5ddc32ad 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2472,7 +2472,7 @@ out:
  * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
  * free hugetlb folios that were dissolved before that error are lost.
  */
-int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
 {
 	unsigned long pfn;
 	struct folio *folio;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b79ba36e09e0..431b1f6753c0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2051,11 +2051,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		}
 
 		/*
-		 * Dissolve free hugepages in the memory block before doing
+		 * Dissolve free hugetlb folios in the memory block before doing
 		 * offlining actually in order to make hugetlbfs's object
 		 * counting consistent.
 		 */
-		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+		ret = dissolve_free_hugetlb_folios(start_pfn, end_pfn);
 		if (ret) {
 			reason = "failure to dissolve huge pages";
 			goto failed_removal_isolated;
-- 
cgit v1.2.3


From ec33687c674934dfefd782a8ffd58370b080b503 Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Fri, 12 Apr 2024 23:48:55 +1200
Subject: mm: add per-order mTHP anon_fault_alloc and anon_fault_fallback
 counters

Patch series "mm: add per-order mTHP alloc and swpout counters", v6.

The patchset introduces a framework to facilitate mTHP counters, starting
with the allocation and swap-out counters.  Currently, only four new nodes
are appended to the stats directory for each mTHP size.

/sys/kernel/mm/transparent_hugepage/hugepages-<size>/stats
	anon_fault_alloc
	anon_fault_fallback
	anon_fault_fallback_charge
	anon_swpout
	anon_swpout_fallback

These nodes are crucial for us to monitor the fragmentation levels of both
the buddy system and the swap partitions.  In the future, we may consider
adding additional nodes for further insights.


This patch (of 4):

Profiling a system blindly with mTHP has become challenging due to the
lack of visibility into its operations.  Presenting the success rate of
mTHP allocations appears to be pressing need.

Recently, I've been experiencing significant difficulty debugging
performance improvements and regressions without these figures.  It's
crucial for us to understand the true effectiveness of mTHP in real-world
scenarios, especially in systems with fragmented memory.

This patch establishes the framework for per-order mTHP counters.  It
begins by introducing the anon_fault_alloc and anon_fault_fallback
counters.  Additionally, to maintain consistency with
thp_fault_fallback_charge in /proc/vmstat, this patch also tracks
anon_fault_fallback_charge when mem_cgroup_charge fails for mTHP.
Incorporating additional counters should now be straightforward as well.

Link: https://lkml.kernel.org/r/20240412114858.407208-1-21cnbao@gmail.com
Link: https://lkml.kernel.org/r/20240412114858.407208-2-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 21 ++++++++++++++++++++
 mm/huge_memory.c        | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 mm/memory.c             |  5 +++++
 3 files changed, 78 insertions(+)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e896ca4760f6..d4fdb2641070 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -264,6 +264,27 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 					  enforce_sysfs, orders);
 }
 
+enum mthp_stat_item {
+	MTHP_STAT_ANON_FAULT_ALLOC,
+	MTHP_STAT_ANON_FAULT_FALLBACK,
+	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	__MTHP_STAT_COUNT
+};
+
+struct mthp_stat {
+	unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
+};
+
+DECLARE_PER_CPU(struct mthp_stat, mthp_stats);
+
+static inline void count_mthp_stat(int order, enum mthp_stat_item item)
+{
+	if (order <= 0 || order > PMD_ORDER)
+		return;
+
+	this_cpu_inc(mthp_stats.stats[order][item]);
+}
+
 #define transparent_hugepage_use_zero_page()				\
 	(transparent_hugepage_flags &					\
 	 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3ad76f23fadd..01f01d6a50a1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -526,6 +526,48 @@ static const struct kobj_type thpsize_ktype = {
 	.sysfs_ops = &kobj_sysfs_ops,
 };
 
+DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};
+
+static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
+{
+	unsigned long sum = 0;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct mthp_stat *this = &per_cpu(mthp_stats, cpu);
+
+		sum += this->stats[order][item];
+	}
+
+	return sum;
+}
+
+#define DEFINE_MTHP_STAT_ATTR(_name, _index)				\
+static ssize_t _name##_show(struct kobject *kobj,			\
+			struct kobj_attribute *attr, char *buf)		\
+{									\
+	int order = to_thpsize(kobj)->order;				\
+									\
+	return sysfs_emit(buf, "%lu\n", sum_mthp_stat(order, _index));	\
+}									\
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
+DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+
+static struct attribute *stats_attrs[] = {
+	&anon_fault_alloc_attr.attr,
+	&anon_fault_fallback_attr.attr,
+	&anon_fault_fallback_charge_attr.attr,
+	NULL,
+};
+
+static struct attribute_group stats_attr_group = {
+	.name = "stats",
+	.attrs = stats_attrs,
+};
+
 static struct thpsize *thpsize_create(int order, struct kobject *parent)
 {
 	unsigned long size = (PAGE_SIZE << order) / SZ_1K;
@@ -549,6 +591,12 @@ static struct thpsize *thpsize_create(int order, struct kobject *parent)
 		return ERR_PTR(ret);
 	}
 
+	ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
+	if (ret) {
+		kobject_put(&thpsize->kobj);
+		return ERR_PTR(ret);
+	}
+
 	thpsize->order = order;
 	return thpsize;
 }
@@ -880,6 +928,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		folio_put(folio);
 		count_vm_event(THP_FAULT_FALLBACK);
 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 		return VM_FAULT_FALLBACK;
 	}
 	folio_throttle_swaprate(folio, gfp);
@@ -929,6 +979,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
 		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 	}
 
@@ -1050,6 +1101,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
 	if (unlikely(!folio)) {
 		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
 	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
diff --git a/mm/memory.c b/mm/memory.c
index d592128d7f7d..b71d01110596 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4368,6 +4368,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 		folio = vma_alloc_folio(gfp, order, vma, addr, true);
 		if (folio) {
 			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
 				folio_put(folio);
 				goto next;
 			}
@@ -4376,6 +4377,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 			return folio;
 		}
 next:
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
 		order = next_order(&orders, order);
 	}
 
@@ -4485,6 +4487,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
 	folio_ref_add(folio, nr_pages - 1);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
+#endif
 	folio_add_new_anon_rmap(folio, vma, addr);
 	folio_add_lru_vma(folio, vma);
 setpte:
-- 
cgit v1.2.3


From d0f048ac39f6a71566d3f49a5922dfd7fa0d585b Mon Sep 17 00:00:00 2001
From: Barry Song <v-songbaohua@oppo.com>
Date: Fri, 12 Apr 2024 23:48:56 +1200
Subject: mm: add per-order mTHP anon_swpout and anon_swpout_fallback counters

This helps to display the fragmentation situation of the swapfile, knowing
the proportion of how much we haven't split large folios.  So far, we only
support non-split swapout for anon memory, with the possibility of
expanding to shmem in the future.  So, we add the "anon" prefix to the
counter names.

Link: https://lkml.kernel.org/r/20240412114858.407208-3-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Cc: Kairui Song <kasong@tencent.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 2 ++
 mm/huge_memory.c        | 4 ++++
 mm/page_io.c            | 1 +
 mm/vmscan.c             | 3 +++
 4 files changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d4fdb2641070..7cd07b83a3d0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -268,6 +268,8 @@ enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_ALLOC,
 	MTHP_STAT_ANON_FAULT_FALLBACK,
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	MTHP_STAT_ANON_SWPOUT,
+	MTHP_STAT_ANON_SWPOUT_FALLBACK,
 	__MTHP_STAT_COUNT
 };
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 01f01d6a50a1..264e09043f09 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -555,11 +555,15 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(anon_swpout, MTHP_STAT_ANON_SWPOUT);
+DEFINE_MTHP_STAT_ATTR(anon_swpout_fallback, MTHP_STAT_ANON_SWPOUT_FALLBACK);
 
 static struct attribute *stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
 	&anon_fault_fallback_attr.attr,
 	&anon_fault_fallback_charge_attr.attr,
+	&anon_swpout_attr.attr,
+	&anon_swpout_fallback_attr.attr,
 	NULL,
 };
 
diff --git a/mm/page_io.c b/mm/page_io.c
index a9a7c236aecc..46c603dddf04 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -217,6 +217,7 @@ static inline void count_swpout_vm_event(struct folio *folio)
 		count_memcg_folio_events(folio, THP_SWPOUT, 1);
 		count_vm_event(THP_SWPOUT);
 	}
+	count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_SWPOUT);
 #endif
 	count_vm_events(PSWPOUT, folio_nr_pages(folio));
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index da93213af981..d194e7240df4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1214,6 +1214,8 @@ retry:
 						goto activate_locked;
 				}
 				if (!add_to_swap(folio)) {
+					int __maybe_unused order = folio_order(folio);
+
 					if (!folio_test_large(folio))
 						goto activate_locked_split;
 					/* Fallback to swap normal pages */
@@ -1225,6 +1227,7 @@ retry:
 							THP_SWPOUT_FALLBACK, 1);
 						count_vm_event(THP_SWPOUT_FALLBACK);
 					}
+					count_mthp_stat(order, MTHP_STAT_ANON_SWPOUT_FALLBACK);
 #endif
 					if (!add_to_swap(folio))
 						goto activate_locked_split;
-- 
cgit v1.2.3


From 324ecaee46f86c1eaf083fd82eaf997335e70163 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Apr 2024 04:17:48 +0100
Subject: buffer: fix __bread and __bread_gfp kernel-doc

The extra indentation confused the kernel-doc parser, so remove it.  Fix
some other wording while I'm here, and advise the user they need to call
brelse() on this buffer.

__bread_gfp() isn't used directly by filesystems, but the other wrappers
for it don't have documentation, so document it accordingly.

Link: https://lkml.kernel.org/r/20240416031754.4076917-5-willy@infradead.org
Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 35 ++++++++++++++++++++++-------------
 include/linux/buffer_head.h | 22 +++++++++++++---------
 2 files changed, 35 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 0466ed7ed95a..32ab3eddc44f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1453,20 +1453,29 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
 EXPORT_SYMBOL(__breadahead);
 
 /**
- *  __bread_gfp() - reads a specified block and returns the bh
- *  @bdev: the block_device to read from
- *  @block: number of block
- *  @size: size (in bytes) to read
- *  @gfp: page allocation flag
- *
- *  Reads a specified block, and returns buffer head that contains it.
- *  The page cache can be allocated from non-movable area
- *  not to prevent page migration if you set gfp to zero.
- *  It returns NULL if the block was unreadable.
+ * __bread_gfp() - Read a block.
+ * @bdev: The block device to read from.
+ * @block: Block number in units of block size.
+ * @size: The block size of this device in bytes.
+ * @gfp: Not page allocation flags; see below.
+ *
+ * You are not expected to call this function.  You should use one of
+ * sb_bread(), sb_bread_unmovable() or __bread().
+ *
+ * Read a specified block, and return the buffer head that refers to it.
+ * If @gfp is 0, the memory will be allocated using the block device's
+ * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
+ * allocated from a movable area.  Do not pass in a complete set of
+ * GFP flags.
+ *
+ * The returned buffer head has its refcount increased.  The caller should
+ * call brelse() when it has finished with the buffer.
+ *
+ * Context: May sleep waiting for I/O.
+ * Return: NULL if the block was unreadable.
  */
-struct buffer_head *
-__bread_gfp(struct block_device *bdev, sector_t block,
-		   unsigned size, gfp_t gfp)
+struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
+		unsigned size, gfp_t gfp)
 {
 	struct buffer_head *bh;
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index d78454a4dd1f..56a1e9c1e71e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -437,17 +437,21 @@ static inline void bh_readahead_batch(int nr, struct buffer_head *bhs[],
 }
 
 /**
- *  __bread() - reads a specified block and returns the bh
- *  @bdev: the block_device to read from
- *  @block: number of block
- *  @size: size (in bytes) to read
+ * __bread() - Read a block.
+ * @bdev: The block device to read from.
+ * @block: Block number in units of block size.
+ * @size: The block size of this device in bytes.
  *
- *  Reads a specified block, and returns buffer head that contains it.
- *  The page cache is allocated from movable area so that it can be migrated.
- *  It returns NULL if the block was unreadable.
+ * Read a specified block, and return the buffer head that refers
+ * to it.  The memory is allocated from the movable area so that it can
+ * be migrated.  The returned buffer head has its refcount increased.
+ * The caller should call brelse() when it has finished with the buffer.
+ *
+ * Context: May sleep waiting for I/O.
+ * Return: NULL if the block was unreadable.
  */
-static inline struct buffer_head *
-__bread(struct block_device *bdev, sector_t block, unsigned size)
+static inline struct buffer_head *__bread(struct block_device *bdev,
+		sector_t block, unsigned size)
 {
 	return __bread_gfp(bdev, block, size, __GFP_MOVABLE);
 }
-- 
cgit v1.2.3


From 66924fdaf835f5fc6fe78d92a79afcca7d4db7ec Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Apr 2024 04:17:49 +0100
Subject: buffer: add kernel-doc for brelse() and __brelse()

Move the documentation for __brelse() to brelse(), format it as kernel-doc
and update it from talking about pages to folios.

Link: https://lkml.kernel.org/r/20240416031754.4076917-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 | 17 ++++++++---------
 include/linux/buffer_head.h | 16 ++++++++++++++++
 2 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 32ab3eddc44f..e5beca3868a7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1226,17 +1226,16 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(mark_buffer_write_io_error);
 
-/*
- * Decrement a buffer_head's reference count.  If all buffers against a page
- * have zero reference count, are clean and unlocked, and if the page is clean
- * and unlocked then try_to_free_buffers() may strip the buffers from the page
- * in preparation for freeing it (sometimes, rarely, buffers are removed from
- * a page but it ends up not being freed, and buffers may later be reattached).
+/**
+ * __brelse - Release a buffer.
+ * @bh: The buffer to release.
+ *
+ * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
  */
-void __brelse(struct buffer_head * buf)
+void __brelse(struct buffer_head *bh)
 {
-	if (atomic_read(&buf->b_count)) {
-		put_bh(buf);
+	if (atomic_read(&bh->b_count)) {
+		put_bh(bh);
 		return;
 	}
 	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 56a1e9c1e71e..c145817c6ca0 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -303,6 +303,22 @@ static inline void put_bh(struct buffer_head *bh)
         atomic_dec(&bh->b_count);
 }
 
+/**
+ * brelse - Release a buffer.
+ * @bh: The buffer to release.
+ *
+ * Decrement a buffer_head's reference count.  If @bh is NULL, this
+ * function is a no-op.
+ *
+ * If all buffers on a folio have zero reference count, are clean
+ * and unlocked, and if the folio is unlocked and not under writeback
+ * then try_to_free_buffers() may strip the buffers from the folio in
+ * preparation for freeing it (sometimes, rarely, buffers are removed
+ * from a folio but it ends up not being freed, and buffers may later
+ * be reattached).
+ *
+ * Context: Any context.
+ */
 static inline void brelse(struct buffer_head *bh)
 {
 	if (bh)
-- 
cgit v1.2.3


From b73a936f99914e23fbe236f75ecf257923cb06e7 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Tue, 16 Apr 2024 04:17:50 +0100
Subject: buffer: add kernel-doc for bforget() and __bforget()

Distinguish these functions from brelse() and __brelse().

Link: https://lkml.kernel.org/r/20240416031754.4076917-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/buffer.c                 |  9 ++++++---
 include/linux/buffer_head.h | 10 ++++++++++
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index e5beca3868a7..60829312787a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1242,9 +1242,12 @@ void __brelse(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(__brelse);
 
-/*
- * bforget() is like brelse(), except it discards any
- * potentially dirty data.
+/**
+ * __bforget - Discard any dirty data in a buffer.
+ * @bh: The buffer to forget.
+ *
+ * This variant of bforget() can be called if @bh is guaranteed to not
+ * be NULL.
  */
 void __bforget(struct buffer_head *bh)
 {
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index c145817c6ca0..a1c0bdd0cca6 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -325,6 +325,16 @@ static inline void brelse(struct buffer_head *bh)
 		__brelse(bh);
 }
 
+/**
+ * bforget - Discard any dirty data in a buffer.
+ * @bh: The buffer to forget.
+ *
+ * Call this function instead of brelse() if the data written to a buffer
+ * no longer needs to be written back.  It will clear the buffer's dirty
+ * flag so writeback of this buffer will be skipped.
+ *
+ * Context: Any context.
+ */
 static inline void bforget(struct buffer_head *bh)
 {
 	if (bh)
-- 
cgit v1.2.3


From 1b68112c40395b3b0fed3c8bb648e2d9d0b37ec2 Mon Sep 17 00:00:00 2001
From: Lance Yang <ioworker0@gmail.com>
Date: Thu, 18 Apr 2024 21:44:32 +0800
Subject: mm/madvise: introduce clear_young_dirty_ptes() batch helper

Patch series "mm/madvise: enhance lazyfreeing with mTHP in madvise_free",
v10.

This patchset adds support for lazyfreeing multi-size THP (mTHP) without
needing to first split the large folio via split_folio().  However, we
still need to split a large folio that is not fully mapped within the
target range.

If a large folio is locked or shared, or if we fail to split it, we just
leave it in place and advance to the next PTE in the range.  But note that
the behavior is changed; previously, any failure of this sort would cause
the entire operation to give up.  As large folios become more common,
sticking to the old way could result in wasted opportunities.

Performance Testing
===================

On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
the same size results in the following runtimes for madvise(MADV_FREE) in
seconds (shorter is better):

Folio Size |   Old    |   New    | Change
------------------------------------------
      4KiB | 0.590251 | 0.590259 |    0%
     16KiB | 2.990447 | 0.185655 |  -94%
     32KiB | 2.547831 | 0.104870 |  -95%
     64KiB | 2.457796 | 0.052812 |  -97%
    128KiB | 2.281034 | 0.032777 |  -99%
    256KiB | 2.230387 | 0.017496 |  -99%
    512KiB | 2.189106 | 0.010781 |  -99%
   1024KiB | 2.183949 | 0.007753 |  -99%
   2048KiB | 0.002799 | 0.002804 |    0%


This patch (of 4):

This commit introduces clear_young_dirty_ptes() to replace mkold_ptes().
By doing so, we can use the same function for both use cases
(madvise_pageout and madvise_free), and it also provides the flexibility
to only clear the dirty flag in the future if needed.

Link: https://lkml.kernel.org/r/20240418134435.6092-1-ioworker0@gmail.com
Link: https://lkml.kernel.org/r/20240418134435.6092-2-ioworker0@gmail.com
Signed-off-by: Lance Yang <ioworker0@gmail.com>
Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Cc: Barry Song <21cnbao@gmail.com>
Cc: Jeff Xie <xiehuan09@gmail.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yin Fengwei <fengwei.yin@intel.com>
Cc: Zach O'Keefe <zokeefe@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_types.h |  9 ++++++
 include/linux/pgtable.h  | 74 ++++++++++++++++++++++++++++--------------------
 mm/madvise.c             |  3 +-
 3 files changed, 55 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index db0adf5721cc..24323c7d0bd4 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1368,6 +1368,15 @@ enum fault_flag {
 
 typedef unsigned int __bitwise zap_flags_t;
 
+/* Flags for clear_young_dirty_ptes(). */
+typedef int __bitwise cydp_t;
+
+/* Clear the access bit */
+#define CYDP_CLEAR_YOUNG		((__force cydp_t)BIT(0))
+
+/* Clear the dirty bit */
+#define CYDP_CLEAR_DIRTY		((__force cydp_t)BIT(1))
+
 /*
  * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
  * other. Here is what they mean, and how to use them:
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index e2f45e22a6d1..18019f037bae 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -361,36 +361,6 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 }
 #endif
 
-#ifndef mkold_ptes
-/**
- * mkold_ptes - Mark PTEs that map consecutive pages of the same folio as old.
- * @vma: VMA the pages are mapped into.
- * @addr: Address the first page is mapped at.
- * @ptep: Page table pointer for the first entry.
- * @nr: Number of entries to mark old.
- *
- * May be overridden by the architecture; otherwise, implemented as a simple
- * loop over ptep_test_and_clear_young().
- *
- * Note that PTE bits in the PTE range besides the PFN can differ. For example,
- * some PTEs might be write-protected.
- *
- * Context: The caller holds the page table lock.  The PTEs map consecutive
- * pages that belong to the same folio.  The PTEs are all in the same PMD.
- */
-static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
-		pte_t *ptep, unsigned int nr)
-{
-	for (;;) {
-		ptep_test_and_clear_young(vma, addr, ptep);
-		if (--nr == 0)
-			break;
-		ptep++;
-		addr += PAGE_SIZE;
-	}
-}
-#endif
-
 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
@@ -489,6 +459,50 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 }
 #endif
 
+#ifndef clear_young_dirty_ptes
+/**
+ * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
+ *		same folio as old/clean.
+ * @mm: Address space the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to mark old/clean.
+ * @flags: Flags to modify the PTE batch semantics.
+ *
+ * May be overridden by the architecture; otherwise, implemented by
+ * get_and_clear/modify/set for each pte in the range.
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock.  The PTEs map consecutive
+ * pages that belong to the same folio.  The PTEs are all in the same PMD.
+ */
+static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
+					  unsigned long addr, pte_t *ptep,
+					  unsigned int nr, cydp_t flags)
+{
+	pte_t pte;
+
+	for (;;) {
+		if (flags == CYDP_CLEAR_YOUNG)
+			ptep_test_and_clear_young(vma, addr, ptep);
+		else {
+			pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);
+			if (flags & CYDP_CLEAR_YOUNG)
+				pte = pte_mkold(pte);
+			if (flags & CYDP_CLEAR_DIRTY)
+				pte = pte_mkclean(pte);
+			set_pte_at(vma->vm_mm, addr, ptep, pte);
+		}
+		if (--nr == 0)
+			break;
+		ptep++;
+		addr += PAGE_SIZE;
+	}
+}
+#endif
+
 static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
diff --git a/mm/madvise.c b/mm/madvise.c
index f59169888b8e..edb592adb749 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -507,7 +507,8 @@ restart:
 			continue;
 
 		if (!pageout && pte_young(ptent)) {
-			mkold_ptes(vma, addr, pte, nr);
+			clear_young_dirty_ptes(vma, addr, pte, nr,
+					       CYDP_CLEAR_YOUNG);
 			tlb_remove_tlb_entries(tlb, pte, nr, addr);
 		}
 
-- 
cgit v1.2.3


From 2d8b272cdcad17d9b875c206fbcb0422b791ab3a Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Mon, 22 Apr 2024 11:27:25 +0800
Subject: mm/page-flags: make PageUptodate return bool

Make PageUptodate return bool to align the return values of
folio_test_uptodate function

Link: https://lkml.kernel.org/r/20240422032725.41452-1-gehao@kylinos.cn
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Cc: David Hildenbrand <david@redhat.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Ruihan Li <lrh2000@pku.edu.cn>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f83331684e47..0af2cef02383 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -789,7 +789,7 @@ static inline bool folio_test_uptodate(const struct folio *folio)
 	return ret;
 }
 
-static inline int PageUptodate(const struct page *page)
+static inline bool PageUptodate(const struct page *page)
 {
 	return folio_test_uptodate(page_folio(page));
 }
-- 
cgit v1.2.3


From 91882c1617c15eb2a4e996b0822e0b9cbd854dea Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Sat, 20 Apr 2024 16:25:05 -0700
Subject: memcg: simple cleanup of stats update functions

mod_memcg_lruvec_state() is never called from outside of memcontrol.c and
with always irq disabled.  So, replace it with the irq disabled version
and add an assert that irq is disabled in the caller.

Similarly mod_objcg_state() is not called from outside of memcontrol.c, so
simply make it static and change it's name to __mod_objcg_state().

Link: https://lkml.kernel.org/r/20240420232505.2768428-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: T.J. Mercier <tjmercier@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 17 -----------------
 mm/memcontrol.c            | 31 +++++++++++++++----------------
 mm/slab.h                  |  2 --
 3 files changed, 15 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f332b4ae84c..9aba0d0462ca 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1077,8 +1077,6 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
 
-void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
-			      int val);
 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
 
 static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
@@ -1091,16 +1089,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 	local_irq_restore(flags);
 }
 
-static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
-					  enum node_stat_item idx, int val)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__mod_memcg_lruvec_state(lruvec, idx, val);
-	local_irq_restore(flags);
-}
-
 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 			  unsigned long count);
 
@@ -1594,11 +1582,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
 {
 }
 
-static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
-					    enum node_stat_item idx, int val)
-{
-}
-
 static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
 					   int val)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29fa5b17a1ef..9095ab05d47a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -836,8 +836,9 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 	return x;
 }
 
-void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
-			      int val)
+static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+				     enum node_stat_item idx,
+				     int val)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup *memcg;
@@ -2978,21 +2979,19 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 
-/*
- * mod_objcg_mlstate() may be called with irq enabled, so
- * mod_memcg_lruvec_state() should be used.
- */
-static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
-				     struct pglist_data *pgdat,
-				     enum node_stat_item idx, int nr)
+static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
+				       struct pglist_data *pgdat,
+				       enum node_stat_item idx, int nr)
 {
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
 
+	lockdep_assert_irqs_disabled();
+
 	rcu_read_lock();
 	memcg = obj_cgroup_memcg(objcg);
 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
-	mod_memcg_lruvec_state(lruvec, idx, nr);
+	__mod_memcg_lruvec_state(lruvec, idx, nr);
 	rcu_read_unlock();
 }
 
@@ -3312,7 +3311,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 	obj_cgroup_put(objcg);
 }
 
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 		     enum node_stat_item idx, int nr)
 {
 	struct memcg_stock_pcp *stock;
@@ -3340,12 +3339,12 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 		struct pglist_data *oldpg = stock->cached_pgdat;
 
 		if (stock->nr_slab_reclaimable_b) {
-			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
+			__mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
 					  stock->nr_slab_reclaimable_b);
 			stock->nr_slab_reclaimable_b = 0;
 		}
 		if (stock->nr_slab_unreclaimable_b) {
-			mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
+			__mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
 					  stock->nr_slab_unreclaimable_b);
 			stock->nr_slab_unreclaimable_b = 0;
 		}
@@ -3371,7 +3370,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 		}
 	}
 	if (nr)
-		mod_objcg_mlstate(objcg, pgdat, idx, nr);
+		__mod_objcg_mlstate(objcg, pgdat, idx, nr);
 
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 	obj_cgroup_put(old);
@@ -3437,13 +3436,13 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
 	 */
 	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
 		if (stock->nr_slab_reclaimable_b) {
-			mod_objcg_mlstate(old, stock->cached_pgdat,
+			__mod_objcg_mlstate(old, stock->cached_pgdat,
 					  NR_SLAB_RECLAIMABLE_B,
 					  stock->nr_slab_reclaimable_b);
 			stock->nr_slab_reclaimable_b = 0;
 		}
 		if (stock->nr_slab_unreclaimable_b) {
-			mod_objcg_mlstate(old, stock->cached_pgdat,
+			__mod_objcg_mlstate(old, stock->cached_pgdat,
 					  NR_SLAB_UNRECLAIMABLE_B,
 					  stock->nr_slab_unreclaimable_b);
 			stock->nr_slab_unreclaimable_b = 0;
diff --git a/mm/slab.h b/mm/slab.h
index 411251b9bdd1..d12fb4392e35 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -581,8 +581,6 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 				  gfp_t flags, size_t size, void **p);
 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
 			    void **p, int objects, struct slabobj_ext *obj_exts);
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
-		     enum node_stat_item idx, int nr);
 #endif
 
 size_t __ksize(const void *objp);
-- 
cgit v1.2.3


From ccde70f4d4baea4d2ebc3d646a0a4ab4cfdf683c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 23 Apr 2024 17:20:24 +0300
Subject: xarray: use BITS_PER_LONGS()

Patch series "xarray: Clean up xarray.h".

Main portion of this change is to get rid of kernel.h included into other
globally available headers.  This decreases a dependency hell degree.  The
first patch makes it possible to avoid math.h to be included as bitops.h
is implied by bitmap.h.


This patch (of 2):

Use BITS_PER_LONGS() instead of open coded variant.

Link: https://lkml.kernel.org/r/20240423142204.2408923-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xarray.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index d9d479334c9e..d54a1e98b0ca 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1146,7 +1146,7 @@ static inline void xa_release(struct xarray *xa, unsigned long index)
 #define XA_CHUNK_SIZE		(1UL << XA_CHUNK_SHIFT)
 #define XA_CHUNK_MASK		(XA_CHUNK_SIZE - 1)
 #define XA_MAX_MARKS		3
-#define XA_MARK_LONGS		DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
+#define XA_MARK_LONGS		BITS_TO_LONGS(XA_CHUNK_SIZE)
 
 /*
  * @count is the count of every non-NULL element in the ->slots array
-- 
cgit v1.2.3


From d0aea4dcd23ca8e3e28b54d81156fdbe017fd595 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 23 Apr 2024 17:20:25 +0300
Subject: xarray: don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use)
principle.

Link: https://lkml.kernel.org/r/20240423142204.2408923-3-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/xarray.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index d54a1e98b0ca..d2e4d3624689 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -12,14 +12,18 @@
 #include <linux/bitmap.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
+#include <linux/err.h>
 #include <linux/gfp.h>
 #include <linux/kconfig.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/lockdep.h>
 #include <linux/rcupdate.h>
 #include <linux/sched/mm.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
 
+struct list_lru;
+
 /*
  * The bottom two bits of the entry determine how the XArray interprets
  * the contents:
-- 
cgit v1.2.3


From 37bc2ff506b184411e4cc80f111c638b2b4c83d4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 12 Apr 2024 20:35:00 +0100
Subject: mm: return the address from page_mapped_in_vma()

The only user of this function calls page_address_in_vma() immediately
after page_mapped_in_vma() calculates it and uses it to return true/false.
Return the address instead, allowing memory-failure to skip the call to
page_address_in_vma().

Link: https://lkml.kernel.org/r/20240412193510.2356957-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/rmap.h |  2 +-
 mm/memory-failure.c  | 22 +++++++++++++---------
 mm/page_vma_mapped.c | 16 +++++++++-------
 3 files changed, 23 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 0f906dc6d280..7229b9baf20d 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -730,7 +730,7 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
  * rmap_walk_control: To control rmap traversing for specific needs
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9e1a7d8ca745..12e5d2844cb1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -473,10 +473,11 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 }
 
 static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p,
-				  struct vm_area_struct *vma,
-				  struct list_head *to_kill)
+		struct vm_area_struct *vma, struct list_head *to_kill,
+		unsigned long addr)
 {
-	unsigned long addr = page_address_in_vma(p, vma);
+	if (addr == -EFAULT)
+		return;
 	__add_to_kill(tsk, p, vma, to_kill, addr);
 }
 
@@ -601,7 +602,6 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early)
 static void collect_procs_anon(struct folio *folio, struct page *page,
 		struct list_head *to_kill, int force_early)
 {
-	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 	struct anon_vma *av;
 	pgoff_t pgoff;
@@ -613,8 +613,10 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
 	pgoff = page_to_pgoff(page);
 	rcu_read_lock();
 	for_each_process(tsk) {
+		struct vm_area_struct *vma;
 		struct anon_vma_chain *vmac;
 		struct task_struct *t = task_early_kill(tsk, force_early);
+		unsigned long addr;
 
 		if (!t)
 			continue;
@@ -623,9 +625,8 @@ static void collect_procs_anon(struct folio *folio, struct page *page,
 			vma = vmac->vma;
 			if (vma->vm_mm != t->mm)
 				continue;
-			if (!page_mapped_in_vma(page, vma))
-				continue;
-			add_to_kill_anon_file(t, page, vma, to_kill);
+			addr = page_mapped_in_vma(page, vma);
+			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 		}
 	}
 	rcu_read_unlock();
@@ -648,6 +649,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 	pgoff = page_to_pgoff(page);
 	for_each_process(tsk) {
 		struct task_struct *t = task_early_kill(tsk, force_early);
+		unsigned long addr;
 
 		if (!t)
 			continue;
@@ -660,8 +662,10 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 			 * Assume applications who requested early kill want
 			 * to be informed of all such data corruptions.
 			 */
-			if (vma->vm_mm == t->mm)
-				add_to_kill_anon_file(t, page, vma, to_kill);
+			if (vma->vm_mm != t->mm)
+				continue;
+			addr = page_address_in_vma(page, vma);
+			add_to_kill_anon_file(t, page, vma, to_kill, addr);
 		}
 	}
 	rcu_read_unlock();
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 53b8868ede61..c202eab84936 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -319,11 +319,12 @@ next_pte:
  * @page: the page to test
  * @vma: the VMA to test
  *
- * Returns 1 if the page is mapped into the page tables of the VMA, 0
- * if the page is not mapped into the page tables of this VMA.  Only
- * valid for normal file or anonymous VMAs.
+ * Return: The address the page is mapped at if the page is in the range
+ * covered by the VMA and present in the page table.  If the page is
+ * outside the VMA or not present, returns -EFAULT.
+ * Only valid for normal file or anonymous VMAs.
  */
-int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
+unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 {
 	struct folio *folio = page_folio(page);
 	pgoff_t pgoff = folio->index + folio_page_idx(folio, page);
@@ -336,9 +337,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 
 	pvmw.address = vma_address(vma, pgoff, 1);
 	if (pvmw.address == -EFAULT)
-		return 0;
+		goto out;
 	if (!page_vma_mapped_walk(&pvmw))
-		return 0;
+		return -EFAULT;
 	page_vma_mapped_walk_done(&pvmw);
-	return 1;
+out:
+	return pvmw.address;
 }
-- 
cgit v1.2.3


From fed5348ee2b136c84c5a27d6fceef14066beeb66 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 12 Apr 2024 20:35:02 +0100
Subject: mm/memory-failure: convert shake_page() to shake_folio()

Removes two calls to compound_head().  Move the prototype to internal.h;
we definitely don't want code outside mm using it.

Link: https://lkml.kernel.org/r/20240412193510.2356957-6-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h   |  1 -
 mm/hwpoison-inject.c | 11 ++++++-----
 mm/internal.h        |  1 +
 mm/memory-failure.c  | 15 ++++++++++-----
 4 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 78e583b50e42..b9ac49c9eb00 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4033,7 +4033,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
-extern void shake_page(struct page *p);
 extern atomic_long_t num_poisoned_pages __read_mostly;
 extern int soft_offline_page(unsigned long pfn, int flags);
 #ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index d0548e382b6b..c9d653f51e45 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val)
 {
 	unsigned long pfn = val;
 	struct page *p;
-	struct page *hpage;
+	struct folio *folio;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val)
 		return -ENXIO;
 
 	p = pfn_to_page(pfn);
-	hpage = compound_head(p);
+	folio = page_folio(p);
 
 	if (!hwpoison_filter_enable)
 		goto inject;
 
-	shake_page(hpage);
+	shake_folio(folio);
 	/*
 	 * This implies unable to support non-LRU pages except free page.
 	 */
-	if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p))
+	if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) &&
+	    !is_free_buddy_page(p))
 		return 0;
 
 	/*
@@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val)
 	 * the targeted owner (or on a free page).
 	 * memory_failure() will redo the check reliably inside page lock.
 	 */
-	err = hwpoison_filter(hpage);
+	err = hwpoison_filter(&folio->page);
 	if (err)
 		return 0;
 
diff --git a/mm/internal.h b/mm/internal.h
index 5d5e49b86fe3..6803c7b17c1f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1037,6 +1037,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
 /*
  * mm/memory-failure.c
  */
+void shake_folio(struct folio *folio);
 extern int hwpoison_filter(struct page *p);
 
 extern u32 hwpoison_filter_dev_major;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 12e5d2844cb1..4daf581e3878 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -369,20 +369,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
  * Unknown page type encountered. Try to check whether it can turn PageLRU by
  * lru_add_drain_all.
  */
-void shake_page(struct page *p)
+void shake_folio(struct folio *folio)
 {
-	if (PageHuge(p))
+	if (folio_test_hugetlb(folio))
 		return;
 	/*
 	 * TODO: Could shrink slab caches here if a lightweight range-based
 	 * shrinker will be available.
 	 */
-	if (PageSlab(p))
+	if (folio_test_slab(folio))
 		return;
 
 	lru_add_drain_all();
 }
-EXPORT_SYMBOL_GPL(shake_page);
+EXPORT_SYMBOL_GPL(shake_folio);
+
+static void shake_page(struct page *page)
+{
+	shake_folio(page_folio(page));
+}
 
 static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
 		unsigned long address)
@@ -1639,7 +1644,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * shake_page() again to ensure that it's flushed.
 	 */
 	if (mlocked)
-		shake_page(hpage);
+		shake_folio(folio);
 
 	/*
 	 * Now that the dirty bit has been propagated to the
-- 
cgit v1.2.3


From 6e8cda4c2c87b2a44828e651a10705647a6fd542 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 12 Apr 2024 20:35:03 +0100
Subject: mm: convert hugetlb_page_mapping_lock_write to folio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The page is only used to get the mapping, so the folio will do just as
well.  Both callers already have a folio available, so this saves a call
to compound_head().

Link: https://lkml.kernel.org/r/20240412193510.2356957-7-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jane Chu  <jane.chu@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 +++---
 mm/hugetlb.c            | 6 +++---
 mm/memory-failure.c     | 2 +-
 mm/migrate.c            | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1bc93e7e315b..68244bb3637a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -178,7 +178,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h,
 				 struct vm_area_struct *vma,
 				 unsigned long address);
 
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);
 
 extern int sysctl_hugetlb_shm_group;
 extern struct list_head huge_boot_pages[MAX_NUMNODES];
@@ -297,8 +297,8 @@ static inline unsigned long hugetlb_total_pages(void)
 	return 0;
 }
 
-static inline struct address_space *hugetlb_page_mapping_lock_write(
-							struct page *hpage)
+static inline struct address_space *hugetlb_folio_mapping_lock_write(
+							struct folio *folio)
 {
 	return NULL;
 }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3b7d5ddc32ad..417fc5cdb6ee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2155,13 +2155,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
 /*
  * Find and lock address space (mapping) in write mode.
  *
- * Upon entry, the page is locked which means that page_mapping() is
+ * Upon entry, the folio is locked which means that folio_mapping() is
  * stable.  Due to locking order, we can only trylock_write.  If we can
  * not get the lock, simply return NULL to caller.
  */
-struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
 {
-	struct address_space *mapping = page_mapping(hpage);
+	struct address_space *mapping = folio_mapping(folio);
 
 	if (!mapping)
 		return mapping;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4daf581e3878..1a5f3403dd2a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1624,7 +1624,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		 * TTU_RMAP_LOCKED to indicate we have taken the lock
 		 * at this higher level.
 		 */
-		mapping = hugetlb_page_mapping_lock_write(hpage);
+		mapping = hugetlb_folio_mapping_lock_write(folio);
 		if (mapping) {
 			try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
 			i_mmap_unlock_write(mapping);
diff --git a/mm/migrate.c b/mm/migrate.c
index c7692f303fa7..dd04f578c19c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1425,7 +1425,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
 			 * semaphore in write mode here and set TTU_RMAP_LOCKED
 			 * to let lower levels know we have taken the lock.
 			 */
-			mapping = hugetlb_page_mapping_lock_write(&src->page);
+			mapping = hugetlb_folio_mapping_lock_write(src);
 			if (unlikely(!mapping))
 				goto unlock_put_anon;
 
-- 
cgit v1.2.3


From b650e1d2aefbbb31e7578ad60a0a71bf5e5c5346 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Fri, 12 Apr 2024 20:35:08 +0100
Subject: mm/memory-failure: pass the folio to collect_procs_ksm()

We've already calculated it, so pass it in instead of recalculating it in
collect_procs_ksm().

Link: https://lkml.kernel.org/r/20240412193510.2356957-12-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Oscar Salvador <osalvador@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/ksm.h | 14 +++-----------
 mm/ksm.c            |  5 ++---
 mm/memory-failure.c |  2 +-
 3 files changed, 6 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 358803cfd4d5..52c63a9c5a9c 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -81,15 +81,9 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
 
 void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
 void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
-
-#ifdef CONFIG_MEMORY_FAILURE
-void collect_procs_ksm(struct page *page, struct list_head *to_kill,
-		       int force_early);
-#endif
-
-#ifdef CONFIG_PROC_FS
+void collect_procs_ksm(struct folio *folio, struct page *page,
+		struct list_head *to_kill, int force_early);
 long ksm_process_profit(struct mm_struct *);
-#endif /* CONFIG_PROC_FS */
 
 #else  /* !CONFIG_KSM */
 
@@ -120,12 +114,10 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
 {
 }
 
-#ifdef CONFIG_MEMORY_FAILURE
-static inline void collect_procs_ksm(struct page *page,
+static inline void collect_procs_ksm(struct folio *folio, struct page *page,
 				     struct list_head *to_kill, int force_early)
 {
 }
-#endif
 
 #ifdef CONFIG_MMU
 static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
diff --git a/mm/ksm.c b/mm/ksm.c
index 159604ad4779..e1034bf1c937 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -3178,12 +3178,11 @@ again:
 /*
  * Collect processes when the error hit an ksm page.
  */
-void collect_procs_ksm(struct page *page, struct list_head *to_kill,
-		       int force_early)
+void collect_procs_ksm(struct folio *folio, struct page *page,
+		struct list_head *to_kill, int force_early)
 {
 	struct ksm_stable_node *stable_node;
 	struct ksm_rmap_item *rmap_item;
-	struct folio *folio = page_folio(page);
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 498564f23abb..ec30611ec5ad 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -729,7 +729,7 @@ static void collect_procs(struct folio *folio, struct page *page,
 	if (!folio->mapping)
 		return;
 	if (unlikely(folio_test_ksm(folio)))
-		collect_procs_ksm(page, tokill, force_early);
+		collect_procs_ksm(folio, page, tokill, force_early);
 	else if (folio_test_anon(folio))
 		collect_procs_anon(folio, page, tokill, force_early);
 	else
-- 
cgit v1.2.3


From 3f2ae4ebd53b6e555455f293364e1e0d4b7530e2 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Apr 2024 20:19:06 +0100
Subject: mm: remove page_cache_alloc()

Patch series "More folio compat code removal".

More code removal with bonus kernel-doc addition.


This patch (of 7):

All callers have now been converted to filemap_alloc_folio().

Link: https://lkml.kernel.org/r/20240424191914.361554-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20240424191914.361554-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 65e332df4d76..2a0bc2ff5258 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -558,11 +558,6 @@ static inline struct page *__page_cache_alloc(gfp_t gfp)
 	return &filemap_alloc_folio(gfp, 0)->page;
 }
 
-static inline struct page *page_cache_alloc(struct address_space *x)
-{
-	return __page_cache_alloc(mapping_gfp_mask(x));
-}
-
 static inline gfp_t readahead_gfp_mask(struct address_space *x)
 {
 	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
-- 
cgit v1.2.3


From 6785c54a1b43c44ade9064ce46db4aa4d09e8cef Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Apr 2024 20:19:07 +0100
Subject: mm: remove put_devmap_managed_page()

It only has one caller; convert that caller to use
put_devmap_managed_page_refs() instead.

Link: https://lkml.kernel.org/r/20240424191914.361554-3-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b9ac49c9eb00..deb51f8dc283 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1457,11 +1457,6 @@ static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
 }
 #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
 
-static inline bool put_devmap_managed_page(struct page *page)
-{
-	return put_devmap_managed_page_refs(page, 1);
-}
-
 /* 127: arbitrary random number, small enough to assemble well */
 #define folio_ref_zero_or_close_to_overflow(folio) \
 	((unsigned int) folio_ref_count(folio) + 127u <= 127u)
@@ -1580,7 +1575,7 @@ static inline void put_page(struct page *page)
 	 * For some devmap managed pages we need to catch refcount transition
 	 * from 2 to 1:
 	 */
-	if (put_devmap_managed_page(&folio->page))
+	if (put_devmap_managed_page_refs(&folio->page, 1))
 		return;
 	folio_put(folio);
 }
-- 
cgit v1.2.3


From 53e45c4f6d4f6cd7e62f4cb016018ba31c2ac8b4 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Apr 2024 20:19:08 +0100
Subject: mm: convert put_devmap_managed_page_refs() to
 put_devmap_managed_folio_refs()

All callers have a folio so we can remove this use of
page_ref_sub_return().

Link: https://lkml.kernel.org/r/20240424191914.361554-4-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 12 ++++++------
 mm/gup.c           |  6 +++---
 mm/memremap.c      | 10 +++++-----
 mm/swap.c          |  2 +-
 4 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index deb51f8dc283..9849dfda44d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1441,17 +1441,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf);
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
-bool __put_devmap_managed_page_refs(struct page *page, int refs);
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+bool __put_devmap_managed_folio_refs(struct folio *folio, int refs);
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
 {
 	if (!static_branch_unlikely(&devmap_managed_key))
 		return false;
-	if (!is_zone_device_page(page))
+	if (!folio_is_zone_device(folio))
 		return false;
-	return __put_devmap_managed_page_refs(page, refs);
+	return __put_devmap_managed_folio_refs(folio, refs);
 }
 #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
+static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs)
 {
 	return false;
 }
@@ -1575,7 +1575,7 @@ static inline void put_page(struct page *page)
 	 * For some devmap managed pages we need to catch refcount transition
 	 * from 2 to 1:
 	 */
-	if (put_devmap_managed_page_refs(&folio->page, 1))
+	if (put_devmap_managed_folio_refs(folio, 1))
 		return;
 	folio_put(folio);
 }
diff --git a/mm/gup.c b/mm/gup.c
index 8dcbeae714e2..44c6ceee7e1f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -89,7 +89,7 @@ retry:
 	 * belongs to this folio.
 	 */
 	if (unlikely(page_folio(page) != folio)) {
-		if (!put_devmap_managed_page_refs(&folio->page, refs))
+		if (!put_devmap_managed_folio_refs(folio, refs))
 			folio_put_refs(folio, refs);
 		goto retry;
 	}
@@ -156,7 +156,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
 	 */
 	if (unlikely((flags & FOLL_LONGTERM) &&
 		     !folio_is_longterm_pinnable(folio))) {
-		if (!put_devmap_managed_page_refs(&folio->page, refs))
+		if (!put_devmap_managed_folio_refs(folio, refs))
 			folio_put_refs(folio, refs);
 		return NULL;
 	}
@@ -198,7 +198,7 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
 			refs *= GUP_PIN_COUNTING_BIAS;
 	}
 
-	if (!put_devmap_managed_page_refs(&folio->page, refs))
+	if (!put_devmap_managed_folio_refs(folio, refs))
 		folio_put_refs(folio, refs);
 }
 
diff --git a/mm/memremap.c b/mm/memremap.c
index e1776693e2ea..40d4547ce514 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -512,9 +512,9 @@ void zone_device_page_init(struct page *page)
 EXPORT_SYMBOL_GPL(zone_device_page_init);
 
 #ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page_refs(struct page *page, int refs)
+bool __put_devmap_managed_folio_refs(struct folio *folio, int refs)
 {
-	if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+	if (folio->page.pgmap->type != MEMORY_DEVICE_FS_DAX)
 		return false;
 
 	/*
@@ -522,9 +522,9 @@ bool __put_devmap_managed_page_refs(struct page *page, int refs)
 	 * refcount is 1, then the page is free and the refcount is
 	 * stable because nobody holds a reference on the page.
 	 */
-	if (page_ref_sub_return(page, refs) == 1)
-		wake_up_var(&page->_refcount);
+	if (folio_ref_sub_return(folio, refs) == 1)
+		wake_up_var(&folio->_refcount);
 	return true;
 }
-EXPORT_SYMBOL(__put_devmap_managed_page_refs);
+EXPORT_SYMBOL(__put_devmap_managed_folio_refs);
 #endif /* CONFIG_FS_DAX */
diff --git a/mm/swap.c b/mm/swap.c
index 8ae5cd4ed180..bfce6d53b237 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -980,7 +980,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
 				unlock_page_lruvec_irqrestore(lruvec, flags);
 				lruvec = NULL;
 			}
-			if (put_devmap_managed_page_refs(&folio->page, nr_refs))
+			if (put_devmap_managed_folio_refs(folio, nr_refs))
 				continue;
 			if (folio_ref_sub_and_test(folio, nr_refs))
 				free_zone_device_folio(folio);
-- 
cgit v1.2.3


From 498aefbc69d5719d8e4713b568122c259167f7b9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Apr 2024 20:19:09 +0100
Subject: mm: remove page_ref_sub_return()

With all callers converted to folios, we can act directly on
folio->_refcount.

Link: https://lkml.kernel.org/r/20240424191914.361554-5-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page_ref.h | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index d7c2d33baa7f..1acf5bac7f50 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -139,20 +139,15 @@ static inline void folio_ref_sub(struct folio *folio, int nr)
 	page_ref_sub(&folio->page, nr);
 }
 
-static inline int page_ref_sub_return(struct page *page, int nr)
+static inline int folio_ref_sub_return(struct folio *folio, int nr)
 {
-	int ret = atomic_sub_return(nr, &page->_refcount);
+	int ret = atomic_sub_return(nr, &folio->_refcount);
 
 	if (page_ref_tracepoint_active(page_ref_mod_and_return))
-		__page_ref_mod_and_return(page, -nr, ret);
+		__page_ref_mod_and_return(&folio->page, -nr, ret);
 	return ret;
 }
 
-static inline int folio_ref_sub_return(struct folio *folio, int nr)
-{
-	return page_ref_sub_return(&folio->page, nr);
-}
-
 static inline void page_ref_inc(struct page *page)
 {
 	atomic_inc(&page->_refcount);
-- 
cgit v1.2.3


From 093137ea97bdc6d3617157e42ea58f8d4f376460 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Date: Wed, 24 Apr 2024 20:19:12 +0100
Subject: mm: remove PageReferenced

All callers now use folio_*_referenced() so we can remove the
PageReferenced family of functions.

Link: https://lkml.kernel.org/r/20240424191914.361554-8-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/page-flags.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0af2cef02383..104078afe0b1 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -512,9 +512,9 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
 __PAGEFLAG(Locked, locked, PF_NO_TAIL)
 FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
 PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
-PAGEFLAG(Referenced, referenced, PF_HEAD)
-	TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
-	__SETPAGEFLAG(Referenced, referenced, PF_HEAD)
+FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
+	FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
+	__FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
 PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
 	__CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
 PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
-- 
cgit v1.2.3


From 4b5bbc39d7a637a135fd8b3ef86e60a6a7301e9d Mon Sep 17 00:00:00 2001
From: Kemeng Shi <shikemeng@huaweicloud.com>
Date: Tue, 23 Apr 2024 11:46:40 +0800
Subject: writeback: support retrieving per group debug writeback stats of bdi

Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
of bdi.

Following domain hierarchy is tested:
                global domain (320G)
                /                 \
        cgroup domain1(10G)     cgroup domain2(10G)
                |                 |
bdi            wb1               wb2

/* per wb writeback info of bdi is collected */
cat wb_stats
WbCgIno:                    1
WbWriteback:                0 kB
WbReclaimable:              0 kB
WbDirtyThresh:              0 kB
WbDirtied:                  0 kB
WbWritten:                  0 kB
WbWriteBandwidth:      102400 kBps
b_dirty:                    0
b_io:                       0
b_more_io:                  0
b_dirty_time:               0
state:                      1

WbCgIno:                 4091
WbWriteback:             1792 kB
WbReclaimable:         820512 kB
WbDirtyThresh:        6004692 kB
WbDirtied:            1820448 kB
WbWritten:             999488 kB
WbWriteBandwidth:      169020 kBps
b_dirty:                    0
b_io:                       0
b_more_io:                  1
b_dirty_time:               0
state:                      5

WbCgIno:                 4131
WbWriteback:             1120 kB
WbReclaimable:         820064 kB
WbDirtyThresh:        6004728 kB
WbDirtied:            1822688 kB
WbWritten:            1002400 kB
WbWriteBandwidth:      153520 kBps
b_dirty:                    0
b_io:                       0
b_more_io:                  1
b_dirty_time:               0
state:                      5

[shikemeng@huaweicloud.com: fix build problems]
  Link: https://lkml.kernel.org/r/20240423034643.141219-4-shikemeng@huaweicloud.com
Link: https://lkml.kernel.org/r/20240423034643.141219-3-shikemeng@huaweicloud.com
Signed-off-by: Kemeng Shi <shikemeng@huaweicloud.com>
Cc: Brian Foster <bfoster@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: David Sterba <dsterba@suse.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/writeback.h |  1 +
 mm/backing-dev.c          | 81 +++++++++++++++++++++++++++++++++++++++++++++--
 mm/page-writeback.c       | 19 +++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 9845cb62e40b..112d806ddbe4 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -355,6 +355,7 @@ int dirtytime_interval_handler(struct ctl_table *table, int write,
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
+unsigned long cgwb_calc_thresh(struct bdi_writeback *wb);
 
 void wb_update_bandwidth(struct bdi_writeback *wb);
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 089146feb830..e61bbb1bd622 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -155,19 +155,96 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 }
 DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
 
+static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
+			  struct wb_stats *stats)
+{
+
+	seq_printf(m,
+		   "WbCgIno:           %10lu\n"
+		   "WbWriteback:       %10lu kB\n"
+		   "WbReclaimable:     %10lu kB\n"
+		   "WbDirtyThresh:     %10lu kB\n"
+		   "WbDirtied:         %10lu kB\n"
+		   "WbWritten:         %10lu kB\n"
+		   "WbWriteBandwidth:  %10lu kBps\n"
+		   "b_dirty:           %10lu\n"
+		   "b_io:              %10lu\n"
+		   "b_more_io:         %10lu\n"
+		   "b_dirty_time:      %10lu\n"
+		   "state:             %10lx\n\n",
+#ifdef CONFIG_CGROUP_WRITEBACK
+		   cgroup_ino(wb->memcg_css->cgroup),
+#else
+		   1ul,
+#endif
+		   K(stats->nr_writeback),
+		   K(stats->nr_reclaimable),
+		   K(stats->wb_thresh),
+		   K(stats->nr_dirtied),
+		   K(stats->nr_written),
+		   K(wb->avg_write_bandwidth),
+		   stats->nr_dirty,
+		   stats->nr_io,
+		   stats->nr_more_io,
+		   stats->nr_dirty_time,
+		   wb->state);
+}
+
+static int cgwb_debug_stats_show(struct seq_file *m, void *v)
+{
+	struct backing_dev_info *bdi = m->private;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	struct bdi_writeback *wb;
+
+	global_dirty_limits(&background_thresh, &dirty_thresh);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
+		struct wb_stats stats = { .dirty_thresh = dirty_thresh };
+
+		if (!wb_tryget(wb))
+			continue;
+
+		collect_wb_stats(&stats, wb);
+
+		/*
+		 * Calculate thresh of wb in writeback cgroup which is min of
+		 * thresh in global domain and thresh in cgroup domain. Drop
+		 * rcu lock because cgwb_calc_thresh may sleep in
+		 * cgroup_rstat_flush. We can do so here because we have a ref.
+		 */
+		if (mem_cgroup_wb_domain(wb)) {
+			rcu_read_unlock();
+			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
+			rcu_read_lock();
+		}
+
+		wb_stats_show(m, wb, &stats);
+
+		wb_put(wb);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
+
 static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 {
 	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
 
 	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
 			    &bdi_debug_stats_fops);
+	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
+			    &cgwb_debug_stats_fops);
 }
 
 static void bdi_debug_unregister(struct backing_dev_info *bdi)
 {
 	debugfs_remove_recursive(bdi->debug_dir);
 }
-#else
+#else /* CONFIG_DEBUG_FS */
 static inline void bdi_debug_init(void)
 {
 }
@@ -178,7 +255,7 @@ static inline void bdi_debug_register(struct backing_dev_info *bdi,
 static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
 {
 }
-#endif
+#endif /* CONFIG_DEBUG_FS */
 
 static ssize_t read_ahead_kb_store(struct device *dev,
 				  struct device_attribute *attr,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c1f595d1ddc..c5fb3121469a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -892,6 +892,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
 	return __wb_calc_thresh(&gdtc);
 }
 
+unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
+{
+	struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+	struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
+	unsigned long filepages = 0, headroom = 0, writeback = 0;
+
+	gdtc.avail = global_dirtyable_memory();
+	gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
+		     global_node_page_state(NR_WRITEBACK);
+
+	mem_cgroup_wb_stats(wb, &filepages, &headroom,
+			    &mdtc.dirty, &writeback);
+	mdtc.dirty += writeback;
+	mdtc_calc_avail(&mdtc, filepages, headroom);
+	domain_dirty_limits(&mdtc);
+
+	return __wb_calc_thresh(&mdtc);
+}
+
 /*
  *                           setpoint - dirty 3
  *        f(dirty) := 1.0 + (----------------)
-- 
cgit v1.2.3


From e0ffb29bc54d86b9ab10ebafc66eb1b7229e0cd7 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@infradead.org>
Date: Thu, 25 Apr 2024 05:00:55 +0100
Subject: mm: simplify thp_vma_allowable_order

Combine the three boolean arguments into one flags argument for
readability.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/proc/task_mmu.c      |  4 ++--
 include/linux/huge_mm.h | 29 +++++++++++++++--------------
 mm/huge_memory.c        |  7 +++++--
 mm/khugepaged.c         | 16 +++++++---------
 mm/memory.c             | 10 ++++++----
 5 files changed, 35 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f4259b7edfde..81fbecfe5ff6 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -871,8 +871,8 @@ static int show_smap(struct seq_file *m, void *v)
 	__show_smap(m, &mss, false);
 
 	seq_printf(m, "THPeligible:    %8u\n",
-		   !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
-					      true, THP_ORDERS_ALL));
+		   !!thp_vma_allowable_orders(vma, vma->vm_flags,
+			   TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL));
 
 	if (arch_pkeys_enabled())
 		seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7cd07b83a3d0..c8d3ec116e29 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -81,8 +81,12 @@ extern struct kobj_attribute shmem_enabled_attr;
  */
 #define THP_ORDERS_ALL		(THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE)
 
-#define thp_vma_allowable_order(vma, vm_flags, smaps, in_pf, enforce_sysfs, order) \
-	(!!thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf, enforce_sysfs, BIT(order)))
+#define TVA_SMAPS		(1 << 0)	/* Will be used for procfs */
+#define TVA_IN_PF		(1 << 1)	/* Page fault handler */
+#define TVA_ENFORCE_SYSFS	(1 << 2)	/* Obey sysfs configuration */
+
+#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
+	(!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))
 
 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
 #define HPAGE_PMD_SHIFT PMD_SHIFT
@@ -218,17 +222,15 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 }
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-					 unsigned long vm_flags, bool smaps,
-					 bool in_pf, bool enforce_sysfs,
+					 unsigned long vm_flags,
+					 unsigned long tva_flags,
 					 unsigned long orders);
 
 /**
  * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
  * @vma:  the vm area to check
  * @vm_flags: use these vm_flags instead of vma->vm_flags
- * @smaps: whether answer will be used for smaps file
- * @in_pf: whether answer will be used by page fault handler
- * @enforce_sysfs: whether sysfs config should be taken into account
+ * @tva_flags: Which TVA flags to honour
  * @orders: bitfield of all orders to consider
  *
  * Calculates the intersection of the requested hugepage orders and the allowed
@@ -241,12 +243,12 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
  */
 static inline
 unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-				       unsigned long vm_flags, bool smaps,
-				       bool in_pf, bool enforce_sysfs,
+				       unsigned long vm_flags,
+				       unsigned long tva_flags,
 				       unsigned long orders)
 {
 	/* Optimization to check if required orders are enabled early. */
-	if (enforce_sysfs && vma_is_anonymous(vma)) {
+	if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
 		unsigned long mask = READ_ONCE(huge_anon_orders_always);
 
 		if (vm_flags & VM_HUGEPAGE)
@@ -260,8 +262,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
 			return 0;
 	}
 
-	return __thp_vma_allowable_orders(vma, vm_flags, smaps, in_pf,
-					  enforce_sysfs, orders);
+	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
 }
 
 enum mthp_stat_item {
@@ -428,8 +429,8 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
 }
 
 static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
-					unsigned long vm_flags, bool smaps,
-					bool in_pf, bool enforce_sysfs,
+					unsigned long vm_flags,
+					unsigned long tva_flags,
 					unsigned long orders)
 {
 	return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index eabf088cb444..aaa327bd5155 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -81,10 +81,13 @@ unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
-					 unsigned long vm_flags, bool smaps,
-					 bool in_pf, bool enforce_sysfs,
+					 unsigned long vm_flags,
+					 unsigned long tva_flags,
 					 unsigned long orders)
 {
+	bool smaps = tva_flags & TVA_SMAPS;
+	bool in_pf = tva_flags & TVA_IN_PF;
+	bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
 	/* Check the intersection of requested and supported orders. */
 	orders &= vma_is_anonymous(vma) ?
 			THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cf518fc44098..774a97e6e2da 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -453,7 +453,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
 {
 	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
 	    hugepage_flags_enabled()) {
-		if (thp_vma_allowable_order(vma, vm_flags, false, false, true,
+		if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
 					    PMD_ORDER))
 			__khugepaged_enter(vma->vm_mm);
 	}
@@ -900,6 +900,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 				   struct collapse_control *cc)
 {
 	struct vm_area_struct *vma;
+	unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
 
 	if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
 		return SCAN_ANY_PROCESS;
@@ -910,8 +911,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
 
 	if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
 		return SCAN_ADDRESS_RANGE;
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
-				     cc->is_khugepaged, PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
 		return SCAN_VMA_CHECK;
 	/*
 	 * Anon VMA expected, the address may be unmapped then
@@ -1501,8 +1501,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	 * and map it by a PMD, regardless of sysfs THP settings. As such, let's
 	 * analogously elide sysfs THP settings here.
 	 */
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
-				     PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
 		return SCAN_VMA_CHECK;
 
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -2363,8 +2362,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
 			progress++;
 			break;
 		}
-		if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false,
-					     true, PMD_ORDER)) {
+		if (!thp_vma_allowable_order(vma, vma->vm_flags,
+					TVA_ENFORCE_SYSFS, PMD_ORDER)) {
 skip:
 			progress++;
 			continue;
@@ -2701,8 +2700,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 	*prev = vma;
 
-	if (!thp_vma_allowable_order(vma, vma->vm_flags, false, false, false,
-				     PMD_ORDER))
+	if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
 		return -EINVAL;
 
 	cc = kmalloc(sizeof(*cc), GFP_KERNEL);
diff --git a/mm/memory.c b/mm/memory.c
index c0bd4e0d5e7a..79d851be8ab2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4334,8 +4334,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	 * for this vma. Then filter out the orders that can't be allocated over
 	 * the faulting address and still be fully contained in the vma.
 	 */
-	orders = thp_vma_allowable_orders(vma, vma->vm_flags, false, true, true,
-					  BIT(PMD_ORDER) - 1);
+	orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+			TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
 	orders = thp_vma_suitable_orders(vma, vmf->address, orders);
 
 	if (!orders)
@@ -5438,7 +5438,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 retry_pud:
 	if (pud_none(*vmf.pud) &&
-	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PUD_ORDER)) {
+	    thp_vma_allowable_order(vma, vm_flags,
+				TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
 		ret = create_huge_pud(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
@@ -5472,7 +5473,8 @@ retry_pud:
 		goto retry_pud;
 
 	if (pmd_none(*vmf.pmd) &&
-	    thp_vma_allowable_order(vma, vm_flags, false, true, true, PMD_ORDER)) {
+	    thp_vma_allowable_order(vma, vm_flags,
+				TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
 		ret = create_huge_pmd(&vmf);
 		if (!(ret & VM_FAULT_FALLBACK))
 			return ret;
-- 
cgit v1.2.3


From 2d8b24654feabc62ed9e132911319f1c48ab0256 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Fri, 26 Apr 2024 12:52:42 -0700
Subject: mm/damon: add DAMOS filter type YOUNG

Define yet another DAMOS filter type, YOUNG.  Like anon and memcg, the
type of filter will be applied to each page in the memory region, and see
if the page is accessed since the last check.  Based on the 'matching'
parameter, the page is filtered out or in.

Note that this commit is adding only the type definition.  The
implementation should be made by DAMON operations sets.  A commit for the
implementation on 'paddr' DAMON operations set will follow.

Link: https://lkml.kernel.org/r/20240426195247.100306-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Tested-by: Honggyu Kim <honggyu.kim@sk.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/damon.h    | 2 ++
 mm/damon/sysfs-schemes.c | 1 +
 2 files changed, 3 insertions(+)

(limited to 'include')

diff --git a/include/linux/damon.h b/include/linux/damon.h
index 886d07294f4e..f7da65e1ac04 100644
--- a/include/linux/damon.h
+++ b/include/linux/damon.h
@@ -297,6 +297,7 @@ struct damos_stat {
  * enum damos_filter_type - Type of memory for &struct damos_filter
  * @DAMOS_FILTER_TYPE_ANON:	Anonymous pages.
  * @DAMOS_FILTER_TYPE_MEMCG:	Specific memcg's pages.
+ * @DAMOS_FILTER_TYPE_YOUNG:	Recently accessed pages.
  * @DAMOS_FILTER_TYPE_ADDR:	Address range.
  * @DAMOS_FILTER_TYPE_TARGET:	Data Access Monitoring target.
  * @NR_DAMOS_FILTER_TYPES:	Number of filter types.
@@ -315,6 +316,7 @@ struct damos_stat {
 enum damos_filter_type {
 	DAMOS_FILTER_TYPE_ANON,
 	DAMOS_FILTER_TYPE_MEMCG,
+	DAMOS_FILTER_TYPE_YOUNG,
 	DAMOS_FILTER_TYPE_ADDR,
 	DAMOS_FILTER_TYPE_TARGET,
 	NR_DAMOS_FILTER_TYPES,
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 53a90ac678fb..bea5bc52846a 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -343,6 +343,7 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void)
 static const char * const damon_sysfs_scheme_filter_type_strs[] = {
 	"anon",
 	"memcg",
+	"young",
 	"addr",
 	"target",
 };
-- 
cgit v1.2.3


From 620875560bd65084a8bfcea6899003d6a29ddc57 Mon Sep 17 00:00:00 2001
From: Hao Ge <gehao@kylinos.cn>
Date: Sun, 28 Apr 2024 09:47:11 +0800
Subject: mm/pagemap: make trylock_page return bool

Make trylock_page return bool to align the return values of folio_trylock
function and it also corresponds to its comment.

Link: https://lkml.kernel.org/r/20240428014711.11169-1-gehao@kylinos.cn
Signed-off-by: Hao Ge <gehao@kylinos.cn>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/pagemap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2a0bc2ff5258..850d32057939 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1012,7 +1012,7 @@ static inline bool folio_trylock(struct folio *folio)
 /*
  * Return true if the page was successfully locked
  */
-static inline int trylock_page(struct page *page)
+static inline bool trylock_page(struct page *page)
 {
 	return folio_trylock(page_folio(page));
 }
-- 
cgit v1.2.3


From d3155742db89df3b3c96da383c400e6ff4d23c25 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Mon, 25 Mar 2024 19:56:10 -0600
Subject: firewire: Annotate struct fw_iso_packet with __counted_by()

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time via CONFIG_UBSAN_BOUNDS (for
array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/ZgIrOuR3JI/jzqoH@neat
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/linux/firewire.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/firewire.h b/include/linux/firewire.h
index dd9f2d765e68..00abe0e5d602 100644
--- a/include/linux/firewire.h
+++ b/include/linux/firewire.h
@@ -463,7 +463,8 @@ struct fw_iso_packet {
 	u32 tag:2;		/* tx: Tag in packet header		*/
 	u32 sy:4;		/* tx: Sy in packet header		*/
 	u32 header_length:8;	/* Length of immediate header		*/
-	u32 header[];		/* tx: Top of 1394 isoch. data_block	*/
+				/* tx: Top of 1394 isoch. data_block    */
+	u32 header[] __counted_by(header_length);
 };
 
 #define FW_ISO_CONTEXT_TRANSMIT			0
-- 
cgit v1.2.3


From 57614c28843d4ec34c12ca67dd11411396aab55d Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 29 Apr 2024 13:32:14 +0900
Subject: firewire: core: add support for Linux kernel tracepoints

The Linux Kernel Tracepoints framework is enough useful to trace
packet data inbound to and outbound from core.

This commit adds firewire subsystem to use the framework.

Link: https://lore.kernel.org/r/20240429043218.609398-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/Makefile       |  2 +-
 drivers/firewire/core-trace.c   |  5 +++++
 include/trace/events/firewire.h | 15 +++++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 drivers/firewire/core-trace.c
 create mode 100644 include/trace/events/firewire.h

(limited to 'include')

diff --git a/drivers/firewire/Makefile b/drivers/firewire/Makefile
index bbde29a0fba6..013e1f2641bd 100644
--- a/drivers/firewire/Makefile
+++ b/drivers/firewire/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the Linux IEEE 1394 implementation
 #
 
-firewire-core-y += core-card.o core-cdev.o core-device.o \
+firewire-core-y += core-trace.o core-card.o core-cdev.o core-device.o \
                    core-iso.o core-topology.o core-transaction.o
 firewire-ohci-y += ohci.o
 firewire-sbp2-y += sbp2.o
diff --git a/drivers/firewire/core-trace.c b/drivers/firewire/core-trace.c
new file mode 100644
index 000000000000..96cbd9d384dc
--- /dev/null
+++ b/drivers/firewire/core-trace.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Takashi Sakamoto
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/firewire.h>
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
new file mode 100644
index 000000000000..bc55eaabf695
--- /dev/null
+++ b/include/trace/events/firewire.h
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2024 Takashi Sakamoto
+
+#define TRACE_SYSTEM	firewire
+
+#if !defined(_FIREWIRE_TRACE_EVENT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _FIREWIRE_TRACE_EVENT_H
+
+#include <linux/tracepoint.h>
+
+// Placeholder for future use.
+
+#endif // _FIREWIRE_TRACE_EVENT_H
+
+#include <trace/define_trace.h>
-- 
cgit v1.2.3


From 944b06840a73a23c096b2582b960a0a1e3678d43 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 29 Apr 2024 13:32:15 +0900
Subject: firewire: core: add tracepoints events for asynchronous outbound
 request

In a view of core transaction service, the asynchronous outbound request
consists of two stages; initiation and completion. This commit adds a pair
of event for them.

The following example is for asynchronous lock request with compare_swap
code to offset 0x'ffff'f000'0904 in node 0xffc0.

async_request_outbound_initiate: \
  transaction=0xffff955fc6a07a10 generation=5 scode=2 dst_id=0xffc0 \
  tlabel=54 tcode=9 src_id=0xffc1 offset=0xfffff0000904 \
  header={0xffc0d990,0xffc1ffff,0xf0000904,0x80002}
  data={0x80,0x940181}
async_request_outbound_complete: \
  transaction=0xffff955fc6a07a10 generation=5 scode=2 status=2 \
  timestamp=0xd887

Link: https://lore.kernel.org/r/20240429043218.609398-3-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-transaction.c |   7 +++
 include/trace/events/firewire.h     | 103 +++++++++++++++++++++++++++++++++++-
 2 files changed, 109 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 3ecb0b945083..c9318024386f 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -29,6 +29,7 @@
 #include <asm/byteorder.h>
 
 #include "core.h"
+#include <trace/events/firewire.h>
 #include "packet-header-definitions.h"
 
 #define HEADER_DESTINATION_IS_BROADCAST(header) \
@@ -173,6 +174,9 @@ static void transmit_complete_callback(struct fw_packet *packet,
 	struct fw_transaction *t =
 	    container_of(packet, struct fw_transaction, packet);
 
+	trace_async_request_outbound_complete((uintptr_t)t, packet->generation, packet->speed,
+					      status, packet->timestamp);
+
 	switch (status) {
 	case ACK_COMPLETE:
 		close_transaction(t, card, RCODE_COMPLETE, packet->timestamp);
@@ -394,6 +398,9 @@ void __fw_send_request(struct fw_card *card, struct fw_transaction *t, int tcode
 
 	spin_unlock_irqrestore(&card->lock, flags);
 
+	trace_async_request_outbound_initiate((uintptr_t)t, generation, speed, t->packet.header, payload,
+					      tcode_is_read_request(tcode) ? 0 : length / 4);
+
 	card->driver->send_request(card, &t->packet);
 }
 EXPORT_SYMBOL_GPL(__fw_send_request);
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index bc55eaabf695..063695874cfb 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -7,8 +7,109 @@
 #define _FIREWIRE_TRACE_EVENT_H
 
 #include <linux/tracepoint.h>
+#include <linux/firewire.h>
 
-// Placeholder for future use.
+#include <linux/firewire-constants.h>
+
+#include "../../../drivers/firewire/packet-header-definitions.h"
+
+// The content of TP_printk field is preprocessed, then put to the module binary.
+#define ASYNC_HEADER_GET_DESTINATION(header)	\
+	(((header)[0] & ASYNC_HEADER_Q0_DESTINATION_MASK) >> ASYNC_HEADER_Q0_DESTINATION_SHIFT)
+
+#define ASYNC_HEADER_GET_TLABEL(header)	\
+	(((header)[0] & ASYNC_HEADER_Q0_TLABEL_MASK) >> ASYNC_HEADER_Q0_TLABEL_SHIFT)
+
+#define ASYNC_HEADER_GET_TCODE(header)	\
+	(((header)[0] & ASYNC_HEADER_Q0_TCODE_MASK) >> ASYNC_HEADER_Q0_TCODE_SHIFT)
+
+#define ASYNC_HEADER_GET_SOURCE(header)	\
+	(((header)[1] & ASYNC_HEADER_Q1_SOURCE_MASK) >> ASYNC_HEADER_Q1_SOURCE_SHIFT)
+
+#define ASYNC_HEADER_GET_OFFSET(header)	\
+	((((unsigned long long)((header)[1] & ASYNC_HEADER_Q1_OFFSET_HIGH_MASK)) >> ASYNC_HEADER_Q1_OFFSET_HIGH_SHIFT) << 32)| \
+	(header)[2]
+
+#define QUADLET_SIZE	4
+
+DECLARE_EVENT_CLASS(async_outbound_initiate_template,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count),
+	TP_ARGS(transaction, generation, scode, header, data, data_count),
+	TP_STRUCT__entry(
+		__field(u64, transaction)
+		__field(u8, generation)
+		__field(u8, scode)
+		__array(u32, header, ASYNC_HEADER_QUADLET_COUNT)
+		__dynamic_array(u32, data, data_count)
+	),
+	TP_fast_assign(
+		__entry->transaction = transaction;
+		__entry->generation = generation;
+		__entry->scode = scode;
+		memcpy(__entry->header, header, QUADLET_SIZE * ASYNC_HEADER_QUADLET_COUNT);
+		memcpy(__get_dynamic_array(data), data, __get_dynamic_array_len(data));
+	),
+	// This format is for the request subaction.
+	TP_printk(
+		"transaction=0x%llx generation=%u scode=%u dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x offset=0x%012llx header=%s data=%s",
+		__entry->transaction,
+		__entry->generation,
+		__entry->scode,
+		ASYNC_HEADER_GET_DESTINATION(__entry->header),
+		ASYNC_HEADER_GET_TLABEL(__entry->header),
+		ASYNC_HEADER_GET_TCODE(__entry->header),
+		ASYNC_HEADER_GET_SOURCE(__entry->header),
+		ASYNC_HEADER_GET_OFFSET(__entry->header),
+		__print_array(__entry->header, ASYNC_HEADER_QUADLET_COUNT, QUADLET_SIZE),
+		__print_array(__get_dynamic_array(data),
+			      __get_dynamic_array_len(data) / QUADLET_SIZE, QUADLET_SIZE)
+	)
+);
+
+// The value of status is one of ack codes and rcodes specific to Linux FireWire subsystem.
+DECLARE_EVENT_CLASS(async_outbound_complete_template,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp),
+	TP_ARGS(transaction, generation, scode, status, timestamp),
+	TP_STRUCT__entry(
+		__field(u64, transaction)
+		__field(u8, generation)
+		__field(u8, scode)
+		__field(u8, status)
+		__field(u16, timestamp)
+	),
+	TP_fast_assign(
+		__entry->transaction = transaction;
+		__entry->generation = generation;
+		__entry->scode = scode;
+		__entry->status = status;
+		__entry->timestamp = timestamp;
+	),
+	TP_printk(
+		"transaction=0x%llx generation=%u scode=%u status=%u timestamp=0x%04x",
+		__entry->transaction,
+		__entry->generation,
+		__entry->scode,
+		__entry->status,
+		__entry->timestamp
+	)
+);
+
+DEFINE_EVENT(async_outbound_initiate_template, async_request_outbound_initiate,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count),
+	TP_ARGS(transaction, generation, scode, header, data, data_count)
+);
+
+DEFINE_EVENT(async_outbound_complete_template, async_request_outbound_complete,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp),
+	TP_ARGS(transaction, generation, scode, status, timestamp)
+);
+
+#undef ASYNC_HEADER_GET_DESTINATION
+#undef ASYNC_HEADER_GET_TLABEL
+#undef ASYNC_HEADER_GET_TCODE
+#undef ASYNC_HEADER_GET_SOURCE
+#undef ASYNC_HEADER_GET_OFFSET
+#undef QUADLET_SIZE
 
 #endif // _FIREWIRE_TRACE_EVENT_H
 
-- 
cgit v1.2.3


From 06cc078c078e58402dd42cdffe95872bf4c97c90 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 29 Apr 2024 13:32:16 +0900
Subject: firewire: core: add tracepoints event for asynchronous inbound
 response

In the transaction of IEEE 1394, the node to receive the asynchronous
request transfers any response packet to the requester except for the
unified transaction.

This commit adds an event for the inbound packet. Note that the code to
decode the packet header is moved, against the note about the sanity
check.

The following example is for asynchronous lock response with
compare_and_swap code.

async_response_inbound: \
  transaction=0xffff955fc6a07a10 generation=5 scode=2 status=1 \
  timestamp=0x0089 dst_id=0xffc1 tlabel=54 tcode=11 src_id=0xffc0 \
  rcode=0 header={0xffc1d9b0,0xffc00000,0x0,0x40002} data={0x50800080}

Link: https://lore.kernel.org/r/20240429043218.609398-4-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-transaction.c | 55 +++++++++++++++++++------------------
 include/trace/events/firewire.h     | 50 +++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index c9318024386f..56510d305564 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -1011,32 +1011,10 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p)
 	source = async_header_get_source(p->header);
 	rcode = async_header_get_rcode(p->header);
 
-	spin_lock_irqsave(&card->lock, flags);
-	list_for_each_entry(iter, &card->transaction_list, link) {
-		if (iter->node_id == source && iter->tlabel == tlabel) {
-			if (!try_cancel_split_timeout(iter)) {
-				spin_unlock_irqrestore(&card->lock, flags);
-				goto timed_out;
-			}
-			list_del_init(&iter->link);
-			card->tlabel_mask &= ~(1ULL << iter->tlabel);
-			t = iter;
-			break;
-		}
-	}
-	spin_unlock_irqrestore(&card->lock, flags);
-
-	if (!t) {
- timed_out:
-		fw_notice(card, "unsolicited response (source %x, tlabel %x)\n",
-			  source, tlabel);
-		return;
-	}
-
-	/*
-	 * FIXME: sanity check packet, is length correct, does tcodes
-	 * and addresses match.
-	 */
+	// FIXME: sanity check packet, is length correct, does tcodes
+	// and addresses match to the transaction request queried later.
+	//
+	// For the tracepoints event, let us decode the header here against the concern.
 
 	switch (tcode) {
 	case TCODE_READ_QUADLET_RESPONSE:
@@ -1062,6 +1040,31 @@ void fw_core_handle_response(struct fw_card *card, struct fw_packet *p)
 		break;
 	}
 
+	spin_lock_irqsave(&card->lock, flags);
+	list_for_each_entry(iter, &card->transaction_list, link) {
+		if (iter->node_id == source && iter->tlabel == tlabel) {
+			if (!try_cancel_split_timeout(iter)) {
+				spin_unlock_irqrestore(&card->lock, flags);
+				goto timed_out;
+			}
+			list_del_init(&iter->link);
+			card->tlabel_mask &= ~(1ULL << iter->tlabel);
+			t = iter;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&card->lock, flags);
+
+	trace_async_response_inbound((uintptr_t)t, p->generation, p->speed, p->ack, p->timestamp,
+				     p->header, data, data_length / 4);
+
+	if (!t) {
+ timed_out:
+		fw_notice(card, "unsolicited response (source %x, tlabel %x)\n",
+			  source, tlabel);
+		return;
+	}
+
 	/*
 	 * The response handler may be executed while the request handler
 	 * is still pending.  Cancel the request handler.
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index 063695874cfb..2d5f6b196a22 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -30,6 +30,9 @@
 	((((unsigned long long)((header)[1] & ASYNC_HEADER_Q1_OFFSET_HIGH_MASK)) >> ASYNC_HEADER_Q1_OFFSET_HIGH_SHIFT) << 32)| \
 	(header)[2]
 
+#define ASYNC_HEADER_GET_RCODE(header)	\
+	(((header)[1] & ASYNC_HEADER_Q1_RCODE_MASK) >> ASYNC_HEADER_Q1_RCODE_SHIFT)
+
 #define QUADLET_SIZE	4
 
 DECLARE_EVENT_CLASS(async_outbound_initiate_template,
@@ -94,6 +97,47 @@ DECLARE_EVENT_CLASS(async_outbound_complete_template,
 	)
 );
 
+// The value of status is one of ack codes and rcodes specific to Linux FireWire subsystem.
+DECLARE_EVENT_CLASS(async_inbound_template,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count),
+	TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count),
+	TP_STRUCT__entry(
+		__field(u64, transaction)
+		__field(u8, generation)
+		__field(u8, scode)
+		__field(u8, status)
+		__field(u8, timestamp)
+		__array(u32, header, ASYNC_HEADER_QUADLET_COUNT)
+		__dynamic_array(u32, data, data_count)
+	),
+	TP_fast_assign(
+		__entry->transaction = transaction;
+		__entry->generation = generation;
+		__entry->scode = scode;
+		__entry->status = status;
+		__entry->timestamp = timestamp;
+		memcpy(__entry->header, header, QUADLET_SIZE * ASYNC_HEADER_QUADLET_COUNT);
+		memcpy(__get_dynamic_array(data), data, __get_dynamic_array_len(data));
+	),
+	// This format is for the response subaction.
+	TP_printk(
+		"transaction=0x%llx generation=%u scode=%u status=%u timestamp=0x%04x dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x rcode=%u header=%s data=%s",
+		__entry->transaction,
+		__entry->generation,
+		__entry->scode,
+		__entry->status,
+		__entry->timestamp,
+		ASYNC_HEADER_GET_DESTINATION(__entry->header),
+		ASYNC_HEADER_GET_TLABEL(__entry->header),
+		ASYNC_HEADER_GET_TCODE(__entry->header),
+		ASYNC_HEADER_GET_SOURCE(__entry->header),
+		ASYNC_HEADER_GET_RCODE(__entry->header),
+		__print_array(__entry->header, ASYNC_HEADER_QUADLET_COUNT, QUADLET_SIZE),
+		__print_array(__get_dynamic_array(data),
+			      __get_dynamic_array_len(data) / QUADLET_SIZE, QUADLET_SIZE)
+	)
+);
+
 DEFINE_EVENT(async_outbound_initiate_template, async_request_outbound_initiate,
 	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count),
 	TP_ARGS(transaction, generation, scode, header, data, data_count)
@@ -104,11 +148,17 @@ DEFINE_EVENT(async_outbound_complete_template, async_request_outbound_complete,
 	TP_ARGS(transaction, generation, scode, status, timestamp)
 );
 
+DEFINE_EVENT(async_inbound_template, async_response_inbound,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count),
+	TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count)
+);
+
 #undef ASYNC_HEADER_GET_DESTINATION
 #undef ASYNC_HEADER_GET_TLABEL
 #undef ASYNC_HEADER_GET_TCODE
 #undef ASYNC_HEADER_GET_SOURCE
 #undef ASYNC_HEADER_GET_OFFSET
+#undef ASYNC_HEADER_GET_RCODE
 #undef QUADLET_SIZE
 
 #endif // _FIREWIRE_TRACE_EVENT_H
-- 
cgit v1.2.3


From 2c945b10d7a6ef923e945ef031e9cfa78671fb3f Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 29 Apr 2024 13:32:17 +0900
Subject: firewire: core: add tracepoint event for asynchronous inbound request

This commit adds an event for asynchronous inbound request.

The following example is for asynchronous block write request as IEC
61883-1 FCP request from node 0xffc1.

async_request_inbound: \
  transaction=0xffff89fa08cf16c0 generation=4 scode=2 status=2 \
  timestamp=0x00b3 dst_id=0xffc0 tlabel=19 tcode=1 src_id=0xffc1 \
  offset=0xfffff0000d00 header={0xffc04d10,0xffc1ffff,0xf0000d00,0x80000} \
  data={0x19ff08,0xffff0090}

Link: https://lore.kernel.org/r/20240429043218.609398-5-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-transaction.c |  8 +++++++-
 include/trace/events/firewire.h     | 21 +++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 56510d305564..8c13f996a938 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -973,11 +973,13 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p)
 {
 	struct fw_request *request;
 	unsigned long long offset;
+	unsigned int tcode;
 
 	if (p->ack != ACK_PENDING && p->ack != ACK_COMPLETE)
 		return;
 
-	if (tcode_is_link_internal(async_header_get_tcode(p->header))) {
+	tcode = async_header_get_tcode(p->header);
+	if (tcode_is_link_internal(tcode)) {
 		fw_cdev_handle_phy_packet(card, p);
 		return;
 	}
@@ -988,6 +990,10 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p)
 		return;
 	}
 
+	trace_async_request_inbound((uintptr_t)request, p->generation, p->speed, p->ack,
+				    p->timestamp, p->header, request->data,
+				    tcode_is_read_request(tcode) ? 0 : request->length / 4);
+
 	offset = async_header_get_offset(p->header);
 
 	if (!is_in_fcp_region(offset, request->length))
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index 2d5f6b196a22..a30cebed119a 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -153,6 +153,27 @@ DEFINE_EVENT(async_inbound_template, async_response_inbound,
 	TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count)
 );
 
+DEFINE_EVENT_PRINT(async_inbound_template, async_request_inbound,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp, const u32 *header, const u32 *data, unsigned int data_count),
+	TP_ARGS(transaction, generation, scode, status, timestamp, header, data, data_count),
+	TP_printk(
+		"transaction=0x%llx generation=%u scode=%u status=%u timestamp=0x%04x dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x offset=0x%012llx header=%s data=%s",
+		__entry->transaction,
+		__entry->generation,
+		__entry->scode,
+		__entry->status,
+		__entry->timestamp,
+		ASYNC_HEADER_GET_DESTINATION(__entry->header),
+		ASYNC_HEADER_GET_TLABEL(__entry->header),
+		ASYNC_HEADER_GET_TCODE(__entry->header),
+		ASYNC_HEADER_GET_SOURCE(__entry->header),
+		ASYNC_HEADER_GET_OFFSET(__entry->header),
+		__print_array(__entry->header, ASYNC_HEADER_QUADLET_COUNT, QUADLET_SIZE),
+		__print_array(__get_dynamic_array(data),
+			      __get_dynamic_array_len(data) / QUADLET_SIZE, QUADLET_SIZE)
+	)
+);
+
 #undef ASYNC_HEADER_GET_DESTINATION
 #undef ASYNC_HEADER_GET_TLABEL
 #undef ASYNC_HEADER_GET_TCODE
-- 
cgit v1.2.3


From 624a8535f7036d8c4de1481b72704daa7f7fd43e Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 29 Apr 2024 13:32:18 +0900
Subject: firewire: core: add tracepoints events for asynchronous outbound
 response

In a view of core transaction service, the asynchronous outbound response
consists of two stages; initiation and completion.

This commit adds a pair of events for the asynchronous outbound response.
The following example is for asynchronous write quadlet request as IEC
61883-1 FCP response to node 0xffc1.

async_response_outbound_initiate: \
  transaction=0xffff89fa08cf16c0 generation=4 scode=2 dst_id=0xffc1 \
  tlabel=25 tcode=2 src_id=0xffc0 rcode=0 \
  header={0xffc16420,0xffc00000,0x0,0x0} data={}
async_response_outbound_complete: \
  transaction=0xffff89fa08cf16c0 generation=4 scode=2 status=1 \
  timestamp=0x0000

Link: https://lore.kernel.org/r/20240429043218.609398-6-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-transaction.c | 23 ++++++++++++++++-------
 include/trace/events/firewire.h     | 24 ++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 8c13f996a938..0e49ebf52500 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -668,6 +668,9 @@ static void free_response_callback(struct fw_packet *packet,
 {
 	struct fw_request *request = container_of(packet, struct fw_request, response);
 
+	trace_async_response_outbound_complete((uintptr_t)request, packet->generation,
+					       packet->speed, status, packet->timestamp);
+
 	// Decrease the reference count since not at in-flight.
 	fw_request_put(request);
 
@@ -850,6 +853,9 @@ static struct fw_request *allocate_request(struct fw_card *card,
 void fw_send_response(struct fw_card *card,
 		      struct fw_request *request, int rcode)
 {
+	u32 *data = NULL;
+	unsigned int data_length = 0;
+
 	/* unified transaction or broadcast transaction: don't respond */
 	if (request->ack != ACK_PENDING ||
 	    HEADER_DESTINATION_IS_BROADCAST(request->request_header)) {
@@ -857,17 +863,20 @@ void fw_send_response(struct fw_card *card,
 		return;
 	}
 
-	if (rcode == RCODE_COMPLETE)
-		fw_fill_response(&request->response, request->request_header,
-				 rcode, request->data,
-				 fw_get_response_length(request));
-	else
-		fw_fill_response(&request->response, request->request_header,
-				 rcode, NULL, 0);
+	if (rcode == RCODE_COMPLETE) {
+		data = request->data;
+		data_length = fw_get_response_length(request);
+	}
+
+	fw_fill_response(&request->response, request->request_header, rcode, data, data_length);
 
 	// Increase the reference count so that the object is kept during in-flight.
 	fw_request_get(request);
 
+	trace_async_response_outbound_initiate((uintptr_t)request, request->response.generation,
+					       request->response.speed, request->response.header,
+					       data, data ? data_length / 4 : 0);
+
 	card->driver->send_response(card, &request->response);
 }
 EXPORT_SYMBOL(fw_send_response);
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index a30cebed119a..d4688e341837 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -174,6 +174,30 @@ DEFINE_EVENT_PRINT(async_inbound_template, async_request_inbound,
 	)
 );
 
+DEFINE_EVENT_PRINT(async_outbound_initiate_template, async_response_outbound_initiate,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, const u32 *header, const u32 *data, unsigned int data_count),
+	TP_ARGS(transaction, generation, scode, header, data, data_count),
+	TP_printk(
+		"transaction=0x%llx generation=%u scode=%u dst_id=0x%04x tlabel=%u tcode=%u src_id=0x%04x rcode=%u header=%s data=%s",
+		__entry->transaction,
+		__entry->generation,
+		__entry->scode,
+		ASYNC_HEADER_GET_DESTINATION(__entry->header),
+		ASYNC_HEADER_GET_TLABEL(__entry->header),
+		ASYNC_HEADER_GET_TCODE(__entry->header),
+		ASYNC_HEADER_GET_SOURCE(__entry->header),
+		ASYNC_HEADER_GET_RCODE(__entry->header),
+		__print_array(__entry->header, ASYNC_HEADER_QUADLET_COUNT, QUADLET_SIZE),
+		__print_array(__get_dynamic_array(data),
+			      __get_dynamic_array_len(data) / QUADLET_SIZE, QUADLET_SIZE)
+	)
+);
+
+DEFINE_EVENT(async_outbound_complete_template, async_response_outbound_complete,
+	TP_PROTO(u64 transaction, unsigned int generation, unsigned int scode, unsigned int status, unsigned int timestamp),
+	TP_ARGS(transaction, generation, scode, status, timestamp)
+);
+
 #undef ASYNC_HEADER_GET_DESTINATION
 #undef ASYNC_HEADER_GET_TLABEL
 #undef ASYNC_HEADER_GET_TCODE
-- 
cgit v1.2.3


From 1a4c53cf355326a28daa6cbe00939cb4b8ac2659 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Tue, 30 Apr 2024 09:14:03 +0900
Subject: firewire: core/cdev: add tracepoints events for asynchronous phy
 packet

In IEEE 1394 bus, the type of asynchronous packet without any offset to
node address space is called as phy packet. The destination of packet is
IEEE 1394 phy itself. This type of packet is used for several purposes,
mainly for selfID at the state of bus reset, to force selection of root
node, and to adjust gap count.

This commit adds tracepoints events for the type of asynchronous outbound
packet. Like asynchronous outbound transaction packets, a pair of events
are added to trace initiation and completion of transmission.

In the case that the phy packet is sent by kernel API, the match between
the initiation and completion is not so easy, since the data of
'struct fw_packet' is allocated statically. In the case that it is sent by
userspace applications via cdev, the match is easy, since the data is
allocated per each.

This example is for Remote Access Packet by lsfirewirephy command in
linux-firewire-utils:

async_phy_outbound_initiate: \
  packet=0xffff89fb34e42e78 generation=1 first_quadlet=0x00148200 \
  second_quadlet=0xffeb7dff
async_phy_outbound_complete: \
  packet=0xffff89fb34e42e78 generation=1 status=1 timestamp=0x0619

Link: https://lore.kernel.org/r/20240430001404.734657-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-cdev.c        |  7 ++++++
 drivers/firewire/core-transaction.c |  6 +++++
 include/trace/events/firewire.h     | 48 +++++++++++++++++++++++++++++++++++++
 3 files changed, 61 insertions(+)

(limited to 'include')

diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c
index 6274b86eb943..55993c9e0b90 100644
--- a/drivers/firewire/core-cdev.c
+++ b/drivers/firewire/core-cdev.c
@@ -35,6 +35,7 @@
 
 
 #include "core.h"
+#include <trace/events/firewire.h>
 
 /*
  * ABI version history is documented in linux/firewire-cdev.h.
@@ -1558,6 +1559,9 @@ static void outbound_phy_packet_callback(struct fw_packet *packet,
 	struct client *e_client = e->client;
 	u32 rcode;
 
+	trace_async_phy_outbound_complete((uintptr_t)packet, status, packet->generation,
+					  packet->timestamp);
+
 	switch (status) {
 	// expected:
 	case ACK_COMPLETE:
@@ -1655,6 +1659,9 @@ static int ioctl_send_phy_packet(struct client *client, union ioctl_arg *arg)
 		memcpy(pp->data, a->data, sizeof(a->data));
 	}
 
+	trace_async_phy_outbound_initiate((uintptr_t)&e->p, e->p.generation, e->p.header[1],
+					  e->p.header[2]);
+
 	card->driver->send_request(card, &e->p);
 
 	return 0;
diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index 0e49ebf52500..a828b7167d15 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -463,6 +463,8 @@ static DECLARE_COMPLETION(phy_config_done);
 static void transmit_phy_packet_callback(struct fw_packet *packet,
 					 struct fw_card *card, int status)
 {
+	trace_async_phy_outbound_complete((uintptr_t)packet, packet->generation, status,
+					  packet->timestamp);
 	complete(&phy_config_done);
 }
 
@@ -501,6 +503,10 @@ void fw_send_phy_config(struct fw_card *card,
 	phy_config_packet.generation = generation;
 	reinit_completion(&phy_config_done);
 
+	trace_async_phy_outbound_initiate((uintptr_t)&phy_config_packet,
+					  phy_config_packet.generation, phy_config_packet.header[1],
+					  phy_config_packet.header[2]);
+
 	card->driver->send_request(card, &phy_config_packet);
 	wait_for_completion_timeout(&phy_config_done, timeout);
 
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index d4688e341837..3ade7d4b9268 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -206,6 +206,54 @@ DEFINE_EVENT(async_outbound_complete_template, async_response_outbound_complete,
 #undef ASYNC_HEADER_GET_RCODE
 #undef QUADLET_SIZE
 
+TRACE_EVENT(async_phy_outbound_initiate,
+	TP_PROTO(u64 packet, unsigned int generation, u32 first_quadlet, u32 second_quadlet),
+	TP_ARGS(packet, generation, first_quadlet, second_quadlet),
+	TP_STRUCT__entry(
+		__field(u64, packet)
+		__field(u8, generation)
+		__field(u32, first_quadlet)
+		__field(u32, second_quadlet)
+	),
+	TP_fast_assign(
+		__entry->packet = packet;
+		__entry->generation = generation;
+		__entry->first_quadlet = first_quadlet;
+		__entry->second_quadlet = second_quadlet
+	),
+	TP_printk(
+		"packet=0x%016llx generation=%u first_quadlet=0x%08x second_quadlet=0x%08x",
+		__entry->packet,
+		__entry->generation,
+		__entry->first_quadlet,
+		__entry->second_quadlet
+	)
+);
+
+TRACE_EVENT(async_phy_outbound_complete,
+	TP_PROTO(u64 packet, unsigned int generation, unsigned int status, unsigned int timestamp),
+	TP_ARGS(packet, generation, status, timestamp),
+	TP_STRUCT__entry(
+		__field(u64, packet)
+		__field(u8, generation)
+		__field(u8, status)
+		__field(u16, timestamp)
+	),
+	TP_fast_assign(
+		__entry->packet = packet;
+		__entry->generation = generation;
+		__entry->status = status;
+		__entry->timestamp = timestamp;
+	),
+	TP_printk(
+		"packet=0x%016llx generation=%u status=%u timestamp=0x%04x",
+		__entry->packet,
+		__entry->generation,
+		__entry->status,
+		__entry->timestamp
+	)
+);
+
 #endif // _FIREWIRE_TRACE_EVENT_H
 
 #include <trace/define_trace.h>
-- 
cgit v1.2.3


From eec045c571cbf89cad5c6622e191af385ecf2340 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Tue, 30 Apr 2024 09:14:04 +0900
Subject: firewire: core: add tracepoints event for asynchronous inbound phy
 packet

At the former commit, a pair of tracepoints events is added to trace
asynchronous outbound phy packet. This commit adds a tracepoints event
to trace inbound phy packet. It includes transaction status as well as
the content of phy packet.

This is an example for Remote Reply Packet as a response to Remote Access
Packet sent by lsfirewirephy command in linux-firewire-utils:

async_phy_inbound: \
  packet=0xffff955fc02b4e10 generation=1 status=1 timestamp=0x0619 \
  first_quadlet=0x001c8208 second_quadlet=0xffe37df7

Link: https://lore.kernel.org/r/20240430001404.734657-3-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-transaction.c |  2 ++
 include/trace/events/firewire.h     | 30 ++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/drivers/firewire/core-transaction.c b/drivers/firewire/core-transaction.c
index a828b7167d15..d3eefbf23663 100644
--- a/drivers/firewire/core-transaction.c
+++ b/drivers/firewire/core-transaction.c
@@ -995,6 +995,8 @@ void fw_core_handle_request(struct fw_card *card, struct fw_packet *p)
 
 	tcode = async_header_get_tcode(p->header);
 	if (tcode_is_link_internal(tcode)) {
+		trace_async_phy_inbound((uintptr_t)p, p->generation, p->ack, p->timestamp,
+					 p->header[1], p->header[2]);
 		fw_cdev_handle_phy_packet(card, p);
 		return;
 	}
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index 3ade7d4b9268..db49b9828bd1 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -254,6 +254,36 @@ TRACE_EVENT(async_phy_outbound_complete,
 	)
 );
 
+TRACE_EVENT(async_phy_inbound,
+	TP_PROTO(u64 packet, unsigned int generation, unsigned int status, unsigned int timestamp, u32 first_quadlet, u32 second_quadlet),
+	TP_ARGS(packet, generation, status, timestamp, first_quadlet, second_quadlet),
+	TP_STRUCT__entry(
+		__field(u64, packet)
+		__field(u8, generation)
+		__field(u8, status)
+		__field(u16, timestamp)
+		__field(u32, first_quadlet)
+		__field(u32, second_quadlet)
+	),
+	TP_fast_assign(
+		__entry->packet = packet;
+		__entry->generation = generation;
+		__entry->status = status;
+		__entry->timestamp = timestamp;
+		__entry->first_quadlet = first_quadlet;
+		__entry->second_quadlet = second_quadlet
+	),
+	TP_printk(
+		"packet=0x%016llx generation=%u status=%u timestamp=0x%04x first_quadlet=0x%08x second_quadlet=0x%08x",
+		__entry->packet,
+		__entry->generation,
+		__entry->status,
+		__entry->timestamp,
+		__entry->first_quadlet,
+		__entry->second_quadlet
+	)
+);
+
 #endif // _FIREWIRE_TRACE_EVENT_H
 
 #include <trace/define_trace.h>
-- 
cgit v1.2.3


From 08dd8602aab9fe9dbc3e41727caf7e416d2ccec4 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Wed, 1 May 2024 16:32:36 +0900
Subject: firewire: core: add tracepoints events for initiating bus reset

At a commit 673249124304 ("firewire: core: option to log bus reset
initiation"), some kernel log messages were added to trace initiation of
bus reset. The kernel log messages are really helpful, while nowadays it
is not preferable just for debugging purpose. For the purpose, Linux
kernel tracepoints is more preferable.

This commit adds some alternative tracepoints events.

Link: https://lore.kernel.org/r/20240501073238.72769-4-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-card.c    |  7 +++++++
 include/trace/events/firewire.h | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

(limited to 'include')

diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c
index dac1b0fc7a42..5d43acf45a7d 100644
--- a/drivers/firewire/core-card.c
+++ b/drivers/firewire/core-card.c
@@ -23,6 +23,7 @@
 #include <asm/byteorder.h>
 
 #include "core.h"
+#include <trace/events/firewire.h>
 
 #define define_fw_printk_level(func, kern_level)		\
 void func(const struct fw_card *card, const char *fmt, ...)	\
@@ -221,6 +222,8 @@ static int reset_bus(struct fw_card *card, bool short_reset)
 	int reg = short_reset ? 5 : 1;
 	int bit = short_reset ? PHY_BUS_SHORT_RESET : PHY_BUS_RESET;
 
+	trace_bus_reset_initiate(card->generation, short_reset);
+
 	if (unlikely(fw_core_param_debug & FW_CORE_PARAM_DEBUG_BUSRESETS))
 		fw_notice(card, "initiating %s bus reset\n",
 		          short_reset ? "short" : "long");
@@ -230,6 +233,8 @@ static int reset_bus(struct fw_card *card, bool short_reset)
 
 void fw_schedule_bus_reset(struct fw_card *card, bool delayed, bool short_reset)
 {
+	trace_bus_reset_schedule(card->generation, short_reset);
+
 	if (unlikely(fw_core_param_debug & FW_CORE_PARAM_DEBUG_BUSRESETS))
 		fw_notice(card, "scheduling %s bus reset\n",
 		          short_reset ? "short" : "long");
@@ -252,6 +257,8 @@ static void br_work(struct work_struct *work)
 	/* Delay for 2s after last reset per IEEE 1394 clause 8.2.1. */
 	if (card->reset_jiffies != 0 &&
 	    time_before64(get_jiffies_64(), card->reset_jiffies + 2 * HZ)) {
+		trace_bus_reset_postpone(card->generation, card->br_short);
+
 		if (unlikely(fw_core_param_debug & FW_CORE_PARAM_DEBUG_BUSRESETS))
 			fw_notice(card, "delaying bus reset\n");
 		if (!queue_delayed_work(fw_workqueue, &card->br_work, 2 * HZ))
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index db49b9828bd1..92bcbe69bb42 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -284,6 +284,39 @@ TRACE_EVENT(async_phy_inbound,
 	)
 );
 
+DECLARE_EVENT_CLASS(bus_reset_arrange_template,
+	TP_PROTO(unsigned int generation, bool short_reset),
+	TP_ARGS(generation, short_reset),
+	TP_STRUCT__entry(
+		__field(u8, generation)
+		__field(bool, short_reset)
+	),
+	TP_fast_assign(
+		__entry->generation = generation;
+		__entry->short_reset = short_reset;
+	),
+	TP_printk(
+		"generation=%u short_reset=%s",
+		__entry->generation,
+		__entry->short_reset ? "true" : "false"
+	)
+);
+
+DEFINE_EVENT(bus_reset_arrange_template, bus_reset_initiate,
+	TP_PROTO(unsigned int generation, bool short_reset),
+	TP_ARGS(generation, short_reset)
+);
+
+DEFINE_EVENT(bus_reset_arrange_template, bus_reset_schedule,
+	TP_PROTO(unsigned int generation, bool short_reset),
+	TP_ARGS(generation, short_reset)
+);
+
+DEFINE_EVENT(bus_reset_arrange_template, bus_reset_postpone,
+	TP_PROTO(unsigned int generation, bool short_reset),
+	TP_ARGS(generation, short_reset)
+);
+
 #endif // _FIREWIRE_TRACE_EVENT_H
 
 #include <trace/define_trace.h>
-- 
cgit v1.2.3


From 6b0b708f12d18f9cccfb1c418bea59fcbff8798c Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Wed, 1 May 2024 16:32:38 +0900
Subject: firewire: core: add tracepoint event for handling bus reset

The core function expects hardware drivers to call
fw_core_handle_bus_reset() when changing bus topology. The 1394 OHCI
driver calls it when handling selfID event as a result of any bus-reset.

This commit adds a tracepoints event for it.

Link: https://lore.kernel.org/r/20240501073238.72769-6-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 drivers/firewire/core-topology.c |  3 +++
 include/trace/events/firewire.h  | 28 +++++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c
index f40c81534381..837cc44d8d9f 100644
--- a/drivers/firewire/core-topology.c
+++ b/drivers/firewire/core-topology.c
@@ -20,6 +20,7 @@
 #include <asm/byteorder.h>
 
 #include "core.h"
+#include <trace/events/firewire.h>
 
 #define SELF_ID_PHY_ID(q)		(((q) >> 24) & 0x3f)
 #define SELF_ID_EXTENDED(q)		(((q) >> 23) & 0x01)
@@ -507,6 +508,8 @@ void fw_core_handle_bus_reset(struct fw_card *card, int node_id, int generation,
 	struct fw_node *local_node;
 	unsigned long flags;
 
+	trace_bus_reset_handle(generation, node_id, bm_abdicate, self_ids, self_id_count);
+
 	spin_lock_irqsave(&card->lock, flags);
 
 	/*
diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index 92bcbe69bb42..4163959be57a 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -204,7 +204,6 @@ DEFINE_EVENT(async_outbound_complete_template, async_response_outbound_complete,
 #undef ASYNC_HEADER_GET_SOURCE
 #undef ASYNC_HEADER_GET_OFFSET
 #undef ASYNC_HEADER_GET_RCODE
-#undef QUADLET_SIZE
 
 TRACE_EVENT(async_phy_outbound_initiate,
 	TP_PROTO(u64 packet, unsigned int generation, u32 first_quadlet, u32 second_quadlet),
@@ -317,6 +316,33 @@ DEFINE_EVENT(bus_reset_arrange_template, bus_reset_postpone,
 	TP_ARGS(generation, short_reset)
 );
 
+TRACE_EVENT(bus_reset_handle,
+	TP_PROTO(unsigned int generation, unsigned int node_id, bool bm_abdicate, u32 *self_ids, unsigned int self_id_count),
+	TP_ARGS(generation, node_id, bm_abdicate, self_ids, self_id_count),
+	TP_STRUCT__entry(
+		__field(u8, generation)
+		__field(u8, node_id)
+		__field(bool, bm_abdicate)
+		__dynamic_array(u32, self_ids, self_id_count)
+	),
+	TP_fast_assign(
+		__entry->generation = generation;
+		__entry->node_id = node_id;
+		__entry->bm_abdicate = bm_abdicate;
+		memcpy(__get_dynamic_array(self_ids), self_ids, __get_dynamic_array_len(self_ids));
+	),
+	TP_printk(
+		"generation=%u node_id=0x%04x bm_abdicate=%s self_ids=%s",
+		__entry->generation,
+		__entry->node_id,
+		__entry->bm_abdicate ? "true" : "false",
+		__print_array(__get_dynamic_array(self_ids),
+			      __get_dynamic_array_len(self_ids) / QUADLET_SIZE, QUADLET_SIZE)
+	)
+);
+
+#undef QUADLET_SIZE
+
 #endif // _FIREWIRE_TRACE_EVENT_H
 
 #include <trace/define_trace.h>
-- 
cgit v1.2.3


From 87144bbc9935b3117495c330890184edb925f3c5 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 6 May 2024 17:21:53 +0900
Subject: firewire: core: fix type of timestamp for async_inbound_template
 tracepoints events

The type of time stamp should be u16, instead of u8.

Link: https://lore.kernel.org/r/20240506082154.396077-1-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/trace/events/firewire.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index 4163959be57a..9e79c864fff6 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -106,7 +106,7 @@ DECLARE_EVENT_CLASS(async_inbound_template,
 		__field(u8, generation)
 		__field(u8, scode)
 		__field(u8, status)
-		__field(u8, timestamp)
+		__field(u16, timestamp)
 		__array(u32, header, ASYNC_HEADER_QUADLET_COUNT)
 		__dynamic_array(u32, data, data_count)
 	),
-- 
cgit v1.2.3


From 5a5dc48083dae07e3079e4ded731388214729f64 Mon Sep 17 00:00:00 2001
From: Takashi Sakamoto <o-takashi@sakamocchi.jp>
Date: Mon, 6 May 2024 17:21:54 +0900
Subject: firewire: core: remove flag and width from u64 formats of tracepoints
 events

The pointer to fw_packet structure is passed to ring buffer of tracepoints
framework as the value of u64 type. '0x%016llx' is used for the print
format of value, while the flag and width are useless in the case.

This commit removes them.

Link: https://lore.kernel.org/r/20240506082154.396077-2-o-takashi@sakamocchi.jp
Signed-off-by: Takashi Sakamoto <o-takashi@sakamocchi.jp>
---
 include/trace/events/firewire.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/firewire.h b/include/trace/events/firewire.h
index 9e79c864fff6..d695a560673f 100644
--- a/include/trace/events/firewire.h
+++ b/include/trace/events/firewire.h
@@ -221,7 +221,7 @@ TRACE_EVENT(async_phy_outbound_initiate,
 		__entry->second_quadlet = second_quadlet
 	),
 	TP_printk(
-		"packet=0x%016llx generation=%u first_quadlet=0x%08x second_quadlet=0x%08x",
+		"packet=0x%llx generation=%u first_quadlet=0x%08x second_quadlet=0x%08x",
 		__entry->packet,
 		__entry->generation,
 		__entry->first_quadlet,
@@ -245,7 +245,7 @@ TRACE_EVENT(async_phy_outbound_complete,
 		__entry->timestamp = timestamp;
 	),
 	TP_printk(
-		"packet=0x%016llx generation=%u status=%u timestamp=0x%04x",
+		"packet=0x%llx generation=%u status=%u timestamp=0x%04x",
 		__entry->packet,
 		__entry->generation,
 		__entry->status,
@@ -273,7 +273,7 @@ TRACE_EVENT(async_phy_inbound,
 		__entry->second_quadlet = second_quadlet
 	),
 	TP_printk(
-		"packet=0x%016llx generation=%u status=%u timestamp=0x%04x first_quadlet=0x%08x second_quadlet=0x%08x",
+		"packet=0x%llx generation=%u status=%u timestamp=0x%04x first_quadlet=0x%08x second_quadlet=0x%08x",
 		__entry->packet,
 		__entry->generation,
 		__entry->status,
-- 
cgit v1.2.3


From 4a3540a8bf3c13dc3955f0c0895332b9c653be3f Mon Sep 17 00:00:00 2001
From: Linus Lüssing <linus.luessing@c0d3.blue>
Date: Wed, 6 Mar 2024 15:18:04 +0100
Subject: netfilter: conntrack: fix ct-state for ICMPv6 Multicast Router
 Discovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far Multicast Router Advertisements and Multicast Router
Solicitations from the Multicast Router Discovery protocol (RFC4286)
would be marked as INVALID for IPv6, even if they are in fact intact
and adhering to RFC4286.

This broke MRA reception and by that multicast reception on
IPv6 multicast routers in a Proxmox managed setup, where Proxmox
would install a rule like "-m conntrack --ctstate INVALID -j DROP"
at the top of the FORWARD chain with br-nf-call-ip6tables enabled
by default.

Similar to as it's done for MLDv1, MLDv2 and IPv6 Neighbor Discovery
already, fix this issue by excluding MRD from connection tracking
handling as MRD always uses predefined multicast destinations
for its messages, too. This changes the ct-state for ICMPv6 MRD messages
from INVALID to UNTRACKED.

This issue was found and fixed with the help of the mrdisc tool
(https://github.com/troglobit/mrdisc).

Signed-off-by: Linus Lüssing <linus.luessing@c0d3.blue>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/icmpv6.h               | 1 +
 net/netfilter/nf_conntrack_proto_icmpv6.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index ecaece3af38d..4eaab89e2856 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -112,6 +112,7 @@ struct icmp6hdr {
 #define ICMPV6_MOBILE_PREFIX_ADV	147
 
 #define ICMPV6_MRDISC_ADV		151
+#define ICMPV6_MRDISC_SOL		152
 
 #define ICMPV6_MSG_MAX          255
 
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 1020d67600a9..327b8059025d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = {
 	[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
 	[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
 	[NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
-	[ICMPV6_MLD2_REPORT - 130] = 1
+	[ICMPV6_MLD2_REPORT - 130] = 1,
+	[ICMPV6_MRDISC_ADV - 130] = 1,
+	[ICMPV6_MRDISC_SOL - 130] = 1
 };
 
 bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
-- 
cgit v1.2.3


From 8928756d53d5b99dcd18073dc7738b8ebdbe7d96 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 2 May 2024 10:44:42 +0200
Subject: net: move skb_gro_receive_list from udp to core

This helper function will be used for TCP fraglist GRO support

Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/gro.h      |  1 +
 net/core/gro.c         | 27 +++++++++++++++++++++++++++
 net/ipv4/udp_offload.c | 27 ---------------------------
 3 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index c1d4ca0463a1..5df8bf318197 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -438,6 +438,7 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
 }
 
 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
 
 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
 static inline void gro_normal_list(struct napi_struct *napi)
diff --git a/net/core/gro.c b/net/core/gro.c
index 99a45a5211c9..e2f84947fb74 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -231,6 +231,33 @@ done:
 	return 0;
 }
 
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+	if (unlikely(p->len + skb->len >= 65536))
+		return -E2BIG;
+
+	if (NAPI_GRO_CB(p)->last == p)
+		skb_shinfo(p)->frag_list = skb;
+	else
+		NAPI_GRO_CB(p)->last->next = skb;
+
+	skb_pull(skb, skb_gro_offset(skb));
+
+	NAPI_GRO_CB(p)->last = skb;
+	NAPI_GRO_CB(p)->count++;
+	p->data_len += skb->len;
+
+	/* sk ownership - if any - completely transferred to the aggregated packet */
+	skb->destructor = NULL;
+	skb->sk = NULL;
+	p->truesize += skb->truesize;
+	p->len += skb->len;
+
+	NAPI_GRO_CB(skb)->same_flow = 1;
+
+	return 0;
+}
+
 
 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
 {
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 8721fe5beca2..f6d3c442e260 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -433,33 +433,6 @@ out:
 	return segs;
 }
 
-static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
-{
-	if (unlikely(p->len + skb->len >= 65536))
-		return -E2BIG;
-
-	if (NAPI_GRO_CB(p)->last == p)
-		skb_shinfo(p)->frag_list = skb;
-	else
-		NAPI_GRO_CB(p)->last->next = skb;
-
-	skb_pull(skb, skb_gro_offset(skb));
-
-	NAPI_GRO_CB(p)->last = skb;
-	NAPI_GRO_CB(p)->count++;
-	p->data_len += skb->len;
-
-	/* sk ownership - if any - completely transferred to the aggregated packet */
-	skb->destructor = NULL;
-	skb->sk = NULL;
-	p->truesize += skb->truesize;
-	p->len += skb->len;
-
-	NAPI_GRO_CB(skb)->same_flow = 1;
-
-	return 0;
-}
-
 
 #define UDP_GRO_CNT_MAX 64
 static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
-- 
cgit v1.2.3


From 80e85fbdf19ecc4dfa31ecf639adb55555db02fe Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 2 May 2024 10:44:45 +0200
Subject: net: create tcp_gro_lookup helper function

This pulls the flow port matching out of tcp_gro_receive, so that it can be
reused for the next change, which adds the TCP fraglist GRO heuristic.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp.h      |  1 +
 net/ipv4/tcp_offload.c | 41 +++++++++++++++++++++++++----------------
 2 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0a51e6a45bce..79c4e8947b5d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2191,6 +2191,7 @@ void tcp_v4_destroy_sock(struct sock *sk);
 
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
 struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index aa7508676315..4a194a9d36cd 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -245,6 +245,27 @@ out:
 	return segs;
 }
 
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
+{
+	struct tcphdr *th2;
+	struct sk_buff *p;
+
+	list_for_each_entry(p, head, list) {
+		if (!NAPI_GRO_CB(p)->same_flow)
+			continue;
+
+		th2 = tcp_hdr(p);
+		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
+			NAPI_GRO_CB(p)->same_flow = 0;
+			continue;
+		}
+
+		return p;
+	}
+
+	return NULL;
+}
+
 struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
 	struct sk_buff *pp = NULL;
@@ -282,24 +303,12 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
 	len = skb_gro_len(skb);
 	flags = tcp_flag_word(th);
 
-	list_for_each_entry(p, head, list) {
-		if (!NAPI_GRO_CB(p)->same_flow)
-			continue;
-
-		th2 = tcp_hdr(p);
-
-		if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
-			NAPI_GRO_CB(p)->same_flow = 0;
-			continue;
-		}
-
-		goto found;
-	}
-	p = NULL;
-	goto out_check_final;
+	p = tcp_gro_lookup(head, th);
+	if (!p)
+		goto out_check_final;
 
-found:
 	/* Include the IP ID check below from the inner most IP hdr */
+	th2 = tcp_hdr(p);
 	flush = NAPI_GRO_CB(p)->flush;
 	flush |= (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
-- 
cgit v1.2.3


From 7516b27c555c1711ec17a5d891befb6986e573a3 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 2 May 2024 10:44:46 +0200
Subject: net: create tcp_gro_header_pull helper function

Pull the code out of tcp_gro_receive in order to access the tcp header
from tcp4/6_gro_receive.

Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Reviewed-by: David Ahern <dsahern@kernel.org>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/tcp.h        |  4 +++-
 net/ipv4/tcp_offload.c   | 55 ++++++++++++++++++++++++++++++------------------
 net/ipv6/tcpv6_offload.c | 18 +++++++++++-----
 3 files changed, 50 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 79c4e8947b5d..8f63a163c7de 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2191,8 +2191,10 @@ void tcp_v4_destroy_sock(struct sock *sk);
 
 struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 				netdev_features_t features);
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb);
 struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th);
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb);
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct tcphdr *th);
 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff));
 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb));
 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 4a194a9d36cd..87ae9808e260 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -266,40 +266,46 @@ struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
 	return NULL;
 }
 
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb)
 {
-	struct sk_buff *pp = NULL;
-	struct sk_buff *p;
+	unsigned int thlen, hlen, off;
 	struct tcphdr *th;
-	struct tcphdr *th2;
-	unsigned int len;
-	unsigned int thlen;
-	__be32 flags;
-	unsigned int mss = 1;
-	unsigned int hlen;
-	unsigned int off;
-	int flush = 1;
-	int i;
 
 	off = skb_gro_offset(skb);
 	hlen = off + sizeof(*th);
 	th = skb_gro_header(skb, hlen, off);
 	if (unlikely(!th))
-		goto out;
+		return NULL;
 
 	thlen = th->doff * 4;
 	if (thlen < sizeof(*th))
-		goto out;
+		return NULL;
 
 	hlen = off + thlen;
 	if (!skb_gro_may_pull(skb, hlen)) {
 		th = skb_gro_header_slow(skb, hlen, off);
 		if (unlikely(!th))
-			goto out;
+			return NULL;
 	}
 
 	skb_gro_pull(skb, thlen);
 
+	return th;
+}
+
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+				struct tcphdr *th)
+{
+	unsigned int thlen = th->doff * 4;
+	struct sk_buff *pp = NULL;
+	struct sk_buff *p;
+	struct tcphdr *th2;
+	unsigned int len;
+	__be32 flags;
+	unsigned int mss = 1;
+	int flush = 1;
+	int i;
+
 	len = skb_gro_len(skb);
 	flags = tcp_flag_word(th);
 
@@ -376,7 +382,6 @@ out_check_final:
 	if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
 		pp = p;
 
-out:
 	NAPI_GRO_CB(skb)->flush |= (flush != 0);
 
 	return pp;
@@ -405,15 +410,23 @@ EXPORT_SYMBOL(tcp_gro_complete);
 INDIRECT_CALLABLE_SCOPE
 struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
+	struct tcphdr *th;
+
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
 	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
-				      inet_gro_compute_pseudo)) {
-		NAPI_GRO_CB(skb)->flush = 1;
-		return NULL;
-	}
+				      inet_gro_compute_pseudo))
+		goto flush;
+
+	th = tcp_gro_pull_header(skb);
+	if (!th)
+		goto flush;
 
-	return tcp_gro_receive(head, skb);
+	return tcp_gro_receive(head, skb, th);
+
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 575e2743e331..e73a4f74fd96 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -16,15 +16,23 @@
 INDIRECT_CALLABLE_SCOPE
 struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
 {
+	struct tcphdr *th;
+
 	/* Don't bother verifying checksum if we're going to flush anyway. */
 	if (!NAPI_GRO_CB(skb)->flush &&
 	    skb_gro_checksum_validate(skb, IPPROTO_TCP,
-				      ip6_gro_compute_pseudo)) {
-		NAPI_GRO_CB(skb)->flush = 1;
-		return NULL;
-	}
+				      ip6_gro_compute_pseudo))
+		goto flush;
+
+	th = tcp_gro_pull_header(skb);
+	if (!th)
+		goto flush;
+
+	return tcp_gro_receive(head, skb, th);
 
-	return tcp_gro_receive(head, skb);
+flush:
+	NAPI_GRO_CB(skb)->flush = 1;
+	return NULL;
 }
 
 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
-- 
cgit v1.2.3


From a4184174be36369c3af8d937e165f28a43ef1e02 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 13 Dec 2023 18:00:54 +0100
Subject: alpha: drop pre-EV56 support

All EV4 machines are already gone, and the remaining EV5 based machines
all support the slightly more modern EV56 generation as well.
Debian only supports EV56 and later.

Drop both of these and build kernels optimized for EV56 and higher
when the "generic" options is selected, tuning for an out-of-order
EV6 pipeline, same as Debian userspace.

Since this was the only supported architecture without 8-bit and
16-bit stores, common kernel code no longer has to worry about
aligning struct members, and existing workarounds from the block
and tty layers can be removed.

The alpha memory management code no longer needs an abstraction
for the differences between EV4 and EV5+.

Link: https://lists.debian.org/debian-alpha/2023/05/msg00009.html
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 arch/alpha/Kconfig                     | 21 ++-------
 arch/alpha/Makefile                    |  8 +---
 arch/alpha/include/asm/elf.h           |  4 +-
 arch/alpha/include/asm/machvec.h       |  9 ----
 arch/alpha/include/asm/mmu_context.h   | 45 ++-----------------
 arch/alpha/include/asm/special_insns.h |  5 +--
 arch/alpha/include/asm/tlbflush.h      | 37 +---------------
 arch/alpha/include/asm/uaccess.h       | 80 ----------------------------------
 arch/alpha/include/uapi/asm/compiler.h | 18 --------
 arch/alpha/kernel/machvec_impl.h       | 25 ++---------
 arch/alpha/kernel/traps.c              | 64 ---------------------------
 include/linux/blk_types.h              |  6 ---
 include/linux/tty.h                    | 14 ++----
 13 files changed, 19 insertions(+), 317 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 7987bb548652..50ff06d5b799 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -240,29 +240,14 @@ config ISA_DMA_API
 	bool
 	default y
 
-config ALPHA_EV4
-	bool
-
-config ALPHA_EV5
-	bool
-	default y if ALPHA_RX164 || ALPHA_RAWHIDE || ALPHA_MIATA || ALPHA_LX164 || ALPHA_SX164 || ALPHA_RUFFIAN || ALPHA_SABLE || ALPHA_NORITAKE || ALPHA_MIKASA || ALPHA_PC164 || ALPHA_TAKARA || ALPHA_EB164 || ALPHA_ALCOR
-
 config ALPHA_CIA
 	bool
 	depends on ALPHA_MIATA || ALPHA_LX164 || ALPHA_SX164 || ALPHA_RUFFIAN || ALPHA_NORITAKE || ALPHA_MIKASA || ALPHA_PC164 || ALPHA_TAKARA || ALPHA_ALCOR
 	default y
 
 config ALPHA_EV56
-	bool "EV56 CPU (speed >= 366MHz)?" if ALPHA_ALCOR
-	default y if ALPHA_RX164 || ALPHA_MIATA || ALPHA_LX164 || ALPHA_SX164 || ALPHA_RUFFIAN || ALPHA_PC164 || ALPHA_TAKARA
-
-config ALPHA_EV56
-	prompt "EV56 CPU (speed >= 333MHz)?"
-	depends on ALPHA_NORITAKE || ALPHA_MIKASA
-
-config ALPHA_EV56
-	prompt "EV56 CPU (speed >= 400MHz)?"
-	depends on ALPHA_RAWHIDE
+	bool
+	default y if ALPHA_ALCOR || ALPHA_RX164 || ALPHA_MIATA || ALPHA_LX164 || ALPHA_SX164 || ALPHA_RUFFIAN || ALPHA_PC164 || ALPHA_TAKARA || ALPHA_NORITAKE || ALPHA_MIKASA || ALPHA_RAWHIDE || ALPHA_SABLE
 
 config ALPHA_T2
 	bool
@@ -403,7 +388,7 @@ config ARCH_SPARSEMEM_ENABLE
 config ALPHA_WTINT
 	bool "Use WTINT" if ALPHA_SRM || ALPHA_GENERIC
 	default y if ALPHA_QEMU
-	default n if ALPHA_EV5 || ALPHA_EV56
+	default n if ALPHA_EV56
 	default n if !ALPHA_SRM && !ALPHA_GENERIC
 	default y if SMP
 	help
diff --git a/arch/alpha/Makefile b/arch/alpha/Makefile
index 45158024085e..35445ff2e489 100644
--- a/arch/alpha/Makefile
+++ b/arch/alpha/Makefile
@@ -15,18 +15,14 @@ CHECKFLAGS	+= -D__alpha__
 cflags-y	:= -pipe -mno-fp-regs -ffixed-8
 cflags-y	+= $(call cc-option, -fno-jump-tables)
 
-cpuflags-$(CONFIG_ALPHA_EV4)		:= -mcpu=ev4
-cpuflags-$(CONFIG_ALPHA_EV5)		:= -mcpu=ev5
 cpuflags-$(CONFIG_ALPHA_EV56)		:= -mcpu=ev56
 cpuflags-$(CONFIG_ALPHA_POLARIS)	:= -mcpu=pca56
 cpuflags-$(CONFIG_ALPHA_SX164)		:= -mcpu=pca56
 cpuflags-$(CONFIG_ALPHA_EV6)		:= -mcpu=ev6
 cpuflags-$(CONFIG_ALPHA_EV67)		:= -mcpu=ev67
 # If GENERIC, make sure to turn off any instruction set extensions that
-# the host compiler might have on by default.  Given that EV4 and EV5
-# have the same instruction set, prefer EV5 because an EV5 schedule is
-# more likely to keep an EV4 processor busy than vice-versa.
-cpuflags-$(CONFIG_ALPHA_GENERIC)	:= -mcpu=ev5
+# the host compiler might have on by default.
+cpuflags-$(CONFIG_ALPHA_GENERIC)	:= -mcpu=ev56 -mtune=ev6
 
 cflags-y				+= $(cpuflags-y)
 
diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h
index e6da23f1da83..4d7c46f50382 100644
--- a/arch/alpha/include/asm/elf.h
+++ b/arch/alpha/include/asm/elf.h
@@ -133,9 +133,7 @@ extern int dump_elf_task(elf_greg_t *dest, struct task_struct *task);
 #define ELF_PLATFORM				\
 ({						\
 	enum implver_enum i_ = implver();	\
-	( i_ == IMPLVER_EV4 ? "ev4"		\
-	: i_ == IMPLVER_EV5			\
-	  ? (amask(AMASK_BWX) ? "ev5" : "ev56")	\
+	( i_ == IMPLVER_EV5 ? "ev56"			\
 	: amask (AMASK_CIX) ? "ev6" : "ev67");	\
 })
 
diff --git a/arch/alpha/include/asm/machvec.h b/arch/alpha/include/asm/machvec.h
index 8623f995d34c..490fc880bb3f 100644
--- a/arch/alpha/include/asm/machvec.h
+++ b/arch/alpha/include/asm/machvec.h
@@ -72,15 +72,6 @@ struct alpha_machine_vector
 	int (*mv_is_ioaddr)(unsigned long);
 	int (*mv_is_mmio)(const volatile void __iomem *);
 
-	void (*mv_switch_mm)(struct mm_struct *, struct mm_struct *,
-			     struct task_struct *);
-	void (*mv_activate_mm)(struct mm_struct *, struct mm_struct *);
-
-	void (*mv_flush_tlb_current)(struct mm_struct *);
-	void (*mv_flush_tlb_current_page)(struct mm_struct * mm,
-					  struct vm_area_struct *vma,
-					  unsigned long addr);
-
 	void (*update_irq_hw)(unsigned long, unsigned long, int);
 	void (*ack_irq)(unsigned long);
 	void (*device_interrupt)(unsigned long vector);
diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h
index 29a3e3a1f02b..eee8fe836a59 100644
--- a/arch/alpha/include/asm/mmu_context.h
+++ b/arch/alpha/include/asm/mmu_context.h
@@ -71,9 +71,7 @@ __reload_thread(struct pcb_struct *pcb)
 #ifdef CONFIG_ALPHA_GENERIC
 # define MAX_ASN	(alpha_mv.max_asn)
 #else
-# ifdef CONFIG_ALPHA_EV4
-#  define MAX_ASN	EV4_MAX_ASN
-# elif defined(CONFIG_ALPHA_EV5)
+# if defined(CONFIG_ALPHA_EV56)
 #  define MAX_ASN	EV5_MAX_ASN
 # else
 #  define MAX_ASN	EV6_MAX_ASN
@@ -162,26 +160,6 @@ ev5_switch_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm,
 	task_thread_info(next)->pcb.asn = mmc & HARDWARE_ASN_MASK;
 }
 
-__EXTERN_INLINE void
-ev4_switch_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm,
-	      struct task_struct *next)
-{
-	/* As described, ASN's are broken for TLB usage.  But we can
-	   optimize for switching between threads -- if the mm is
-	   unchanged from current we needn't flush.  */
-	/* ??? May not be needed because EV4 PALcode recognizes that
-	   ASN's are broken and does a tbiap itself on swpctx, under
-	   the "Must set ASN or flush" rule.  At least this is true
-	   for a 1992 SRM, reports Joseph Martin (jmartin@hlo.dec.com).
-	   I'm going to leave this here anyway, just to Be Sure.  -- r~  */
-	if (prev_mm != next_mm)
-		tbiap();
-
-	/* Do continue to allocate ASNs, because we can still use them
-	   to avoid flushing the icache.  */
-	ev5_switch_mm(prev_mm, next_mm, next);
-}
-
 extern void __load_new_mm_context(struct mm_struct *);
 asmlinkage void do_page_fault(unsigned long address, unsigned long mmcsr,
 			      long cause, struct pt_regs *regs);
@@ -209,25 +187,8 @@ ev5_activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm)
 	__load_new_mm_context(next_mm);
 }
 
-__EXTERN_INLINE void
-ev4_activate_mm(struct mm_struct *prev_mm, struct mm_struct *next_mm)
-{
-	__load_new_mm_context(next_mm);
-	tbiap();
-}
-
-#ifdef CONFIG_ALPHA_GENERIC
-# define switch_mm(a,b,c)	alpha_mv.mv_switch_mm((a),(b),(c))
-# define activate_mm(x,y)	alpha_mv.mv_activate_mm((x),(y))
-#else
-# ifdef CONFIG_ALPHA_EV4
-#  define switch_mm(a,b,c)	ev4_switch_mm((a),(b),(c))
-#  define activate_mm(x,y)	ev4_activate_mm((x),(y))
-# else
-#  define switch_mm(a,b,c)	ev5_switch_mm((a),(b),(c))
-#  define activate_mm(x,y)	ev5_activate_mm((x),(y))
-# endif
-#endif
+#define switch_mm(a,b,c)	ev5_switch_mm((a),(b),(c))
+#define activate_mm(x,y)	ev5_activate_mm((x),(y))
 
 #define init_new_context init_new_context
 static inline int
diff --git a/arch/alpha/include/asm/special_insns.h b/arch/alpha/include/asm/special_insns.h
index ca2c5c30b22e..798d0bdb11f9 100644
--- a/arch/alpha/include/asm/special_insns.h
+++ b/arch/alpha/include/asm/special_insns.h
@@ -15,10 +15,7 @@ enum implver_enum {
    (enum implver_enum) __implver; })
 #else
 /* Try to eliminate some dead code.  */
-#ifdef CONFIG_ALPHA_EV4
-#define implver() IMPLVER_EV4
-#endif
-#ifdef CONFIG_ALPHA_EV5
+#ifdef CONFIG_ALPHA_EV56
 #define implver() IMPLVER_EV5
 #endif
 #if defined(CONFIG_ALPHA_EV6)
diff --git a/arch/alpha/include/asm/tlbflush.h b/arch/alpha/include/asm/tlbflush.h
index 94dc37cf873a..ba4b359d6c39 100644
--- a/arch/alpha/include/asm/tlbflush.h
+++ b/arch/alpha/include/asm/tlbflush.h
@@ -14,16 +14,6 @@
 extern void __load_new_mm_context(struct mm_struct *);
 
 
-/* Use a few helper functions to hide the ugly broken ASN
-   numbers on early Alphas (ev4 and ev45).  */
-
-__EXTERN_INLINE void
-ev4_flush_tlb_current(struct mm_struct *mm)
-{
-	__load_new_mm_context(mm);
-	tbiap();
-}
-
 __EXTERN_INLINE void
 ev5_flush_tlb_current(struct mm_struct *mm)
 {
@@ -34,19 +24,6 @@ ev5_flush_tlb_current(struct mm_struct *mm)
    careful about the icache here, there is no way to invalidate a
    specific icache page.  */
 
-__EXTERN_INLINE void
-ev4_flush_tlb_current_page(struct mm_struct * mm,
-			   struct vm_area_struct *vma,
-			   unsigned long addr)
-{
-	int tbi_flag = 2;
-	if (vma->vm_flags & VM_EXEC) {
-		__load_new_mm_context(mm);
-		tbi_flag = 3;
-	}
-	tbi(tbi_flag, addr);
-}
-
 __EXTERN_INLINE void
 ev5_flush_tlb_current_page(struct mm_struct * mm,
 			   struct vm_area_struct *vma,
@@ -59,18 +36,8 @@ ev5_flush_tlb_current_page(struct mm_struct * mm,
 }
 
 
-#ifdef CONFIG_ALPHA_GENERIC
-# define flush_tlb_current		alpha_mv.mv_flush_tlb_current
-# define flush_tlb_current_page		alpha_mv.mv_flush_tlb_current_page
-#else
-# ifdef CONFIG_ALPHA_EV4
-#  define flush_tlb_current		ev4_flush_tlb_current
-#  define flush_tlb_current_page	ev4_flush_tlb_current_page
-# else
-#  define flush_tlb_current		ev5_flush_tlb_current
-#  define flush_tlb_current_page	ev5_flush_tlb_current_page
-# endif
-#endif
+#define flush_tlb_current	ev5_flush_tlb_current
+#define flush_tlb_current_page	ev5_flush_tlb_current_page
 
 #ifdef __MMU_EXTERN_INLINE
 #undef __EXTERN_INLINE
diff --git a/arch/alpha/include/asm/uaccess.h b/arch/alpha/include/asm/uaccess.h
index c32c2584c0b7..ef295cbb797c 100644
--- a/arch/alpha/include/asm/uaccess.h
+++ b/arch/alpha/include/asm/uaccess.h
@@ -96,9 +96,6 @@ struct __large_struct { unsigned long buf[100]; };
 		: "=r"(__gu_val), "=r"(__gu_err)	\
 		: "m"(__m(addr)), "1"(__gu_err))
 
-#ifdef __alpha_bwx__
-/* Those lucky bastards with ev56 and later CPUs can do byte/word moves.  */
-
 #define __get_user_16(addr)				\
 	__asm__("1: ldwu %0,%2\n"			\
 	"2:\n"						\
@@ -112,33 +109,6 @@ struct __large_struct { unsigned long buf[100]; };
 	EXC(1b,2b,%0,%1)				\
 		: "=r"(__gu_val), "=r"(__gu_err)	\
 		: "m"(__m(addr)), "1"(__gu_err))
-#else
-/* Unfortunately, we can't get an unaligned access trap for the sub-word
-   load, so we have to do a general unaligned operation.  */
-
-#define __get_user_16(addr)						\
-{									\
-	long __gu_tmp;							\
-	__asm__("1: ldq_u %0,0(%3)\n"					\
-	"2:	ldq_u %1,1(%3)\n"					\
-	"	extwl %0,%3,%0\n"					\
-	"	extwh %1,%3,%1\n"					\
-	"	or %0,%1,%0\n"						\
-	"3:\n"								\
-	EXC(1b,3b,%0,%2)						\
-	EXC(2b,3b,%0,%2)						\
-		: "=&r"(__gu_val), "=&r"(__gu_tmp), "=r"(__gu_err)	\
-		: "r"(addr), "2"(__gu_err));				\
-}
-
-#define __get_user_8(addr)						\
-	__asm__("1: ldq_u %0,0(%2)\n"					\
-	"	extbl %0,%2,%0\n"					\
-	"2:\n"								\
-	EXC(1b,2b,%0,%1)						\
-		: "=&r"(__gu_val), "=r"(__gu_err)			\
-		: "r"(addr), "1"(__gu_err))
-#endif
 
 extern void __put_user_unknown(void);
 
@@ -192,9 +162,6 @@ __asm__ __volatile__("1: stl %r2,%1\n"				\
 		: "=r"(__pu_err)				\
 		: "m"(__m(addr)), "rJ"(x), "0"(__pu_err))
 
-#ifdef __alpha_bwx__
-/* Those lucky bastards with ev56 and later CPUs can do byte/word moves.  */
-
 #define __put_user_16(x, addr)					\
 __asm__ __volatile__("1: stw %r2,%1\n"				\
 	"2:\n"							\
@@ -208,53 +175,6 @@ __asm__ __volatile__("1: stb %r2,%1\n"				\
 	EXC(1b,2b,$31,%0)					\
 		: "=r"(__pu_err)				\
 		: "m"(__m(addr)), "rJ"(x), "0"(__pu_err))
-#else
-/* Unfortunately, we can't get an unaligned access trap for the sub-word
-   write, so we have to do a general unaligned operation.  */
-
-#define __put_user_16(x, addr)					\
-{								\
-	long __pu_tmp1, __pu_tmp2, __pu_tmp3, __pu_tmp4;	\
-	__asm__ __volatile__(					\
-	"1:	ldq_u %2,1(%5)\n"				\
-	"2:	ldq_u %1,0(%5)\n"				\
-	"	inswh %6,%5,%4\n"				\
-	"	inswl %6,%5,%3\n"				\
-	"	mskwh %2,%5,%2\n"				\
-	"	mskwl %1,%5,%1\n"				\
-	"	or %2,%4,%2\n"					\
-	"	or %1,%3,%1\n"					\
-	"3:	stq_u %2,1(%5)\n"				\
-	"4:	stq_u %1,0(%5)\n"				\
-	"5:\n"							\
-	EXC(1b,5b,$31,%0)					\
-	EXC(2b,5b,$31,%0)					\
-	EXC(3b,5b,$31,%0)					\
-	EXC(4b,5b,$31,%0)					\
-		: "=r"(__pu_err), "=&r"(__pu_tmp1), 		\
-		  "=&r"(__pu_tmp2), "=&r"(__pu_tmp3), 		\
-		  "=&r"(__pu_tmp4)				\
-		: "r"(addr), "r"((unsigned long)(x)), "0"(__pu_err)); \
-}
-
-#define __put_user_8(x, addr)					\
-{								\
-	long __pu_tmp1, __pu_tmp2;				\
-	__asm__ __volatile__(					\
-	"1:	ldq_u %1,0(%4)\n"				\
-	"	insbl %3,%4,%2\n"				\
-	"	mskbl %1,%4,%1\n"				\
-	"	or %1,%2,%1\n"					\
-	"2:	stq_u %1,0(%4)\n"				\
-	"3:\n"							\
-	EXC(1b,3b,$31,%0)					\
-	EXC(2b,3b,$31,%0)					\
-		: "=r"(__pu_err), 				\
-	  	  "=&r"(__pu_tmp1), "=&r"(__pu_tmp2)		\
-		: "r"((unsigned long)(x)), "r"(addr), "0"(__pu_err)); \
-}
-#endif
-
 
 /*
  * Complex access routines
diff --git a/arch/alpha/include/uapi/asm/compiler.h b/arch/alpha/include/uapi/asm/compiler.h
index 0e00c0e13374..8c03740966b4 100644
--- a/arch/alpha/include/uapi/asm/compiler.h
+++ b/arch/alpha/include/uapi/asm/compiler.h
@@ -95,24 +95,6 @@
 #define __kernel_ldwu(mem)	(mem)
 #define __kernel_stb(val,mem)	((mem) = (val))
 #define __kernel_stw(val,mem)	((mem) = (val))
-#else
-#define __kernel_ldbu(mem)				\
-  ({ unsigned char __kir;				\
-     __asm__(".arch ev56;				\
-	      ldbu %0,%1" : "=r"(__kir) : "m"(mem));	\
-     __kir; })
-#define __kernel_ldwu(mem)				\
-  ({ unsigned short __kir;				\
-     __asm__(".arch ev56;				\
-	      ldwu %0,%1" : "=r"(__kir) : "m"(mem));	\
-     __kir; })
-#define __kernel_stb(val,mem)				\
-  __asm__(".arch ev56;					\
-	   stb %1,%0" : "=m"(mem) : "r"(val))
-#define __kernel_stw(val,mem)				\
-  __asm__(".arch ev56;					\
-	   stw %1,%0" : "=m"(mem) : "r"(val))
 #endif
 
-
 #endif /* _UAPI__ALPHA_COMPILER_H */
diff --git a/arch/alpha/kernel/machvec_impl.h b/arch/alpha/kernel/machvec_impl.h
index c2ebcb39e589..129ae36b8e6d 100644
--- a/arch/alpha/kernel/machvec_impl.h
+++ b/arch/alpha/kernel/machvec_impl.h
@@ -44,33 +44,14 @@
 
 #define DO_DEFAULT_RTC			.rtc_port = 0x70
 
-#define DO_EV4_MMU							\
-	.max_asn =			EV4_MAX_ASN,			\
-	.mv_switch_mm =			ev4_switch_mm,			\
-	.mv_activate_mm =		ev4_activate_mm,		\
-	.mv_flush_tlb_current =		ev4_flush_tlb_current,		\
-	.mv_flush_tlb_current_page =	ev4_flush_tlb_current_page
-
 #define DO_EV5_MMU							\
-	.max_asn =			EV5_MAX_ASN,			\
-	.mv_switch_mm =			ev5_switch_mm,			\
-	.mv_activate_mm =		ev5_activate_mm,		\
-	.mv_flush_tlb_current =		ev5_flush_tlb_current,		\
-	.mv_flush_tlb_current_page =	ev5_flush_tlb_current_page
+	.max_asn =			EV5_MAX_ASN			\
 
 #define DO_EV6_MMU							\
-	.max_asn =			EV6_MAX_ASN,			\
-	.mv_switch_mm =			ev5_switch_mm,			\
-	.mv_activate_mm =		ev5_activate_mm,		\
-	.mv_flush_tlb_current =		ev5_flush_tlb_current,		\
-	.mv_flush_tlb_current_page =	ev5_flush_tlb_current_page
+	.max_asn =			EV6_MAX_ASN			\
 
 #define DO_EV7_MMU							\
-	.max_asn =			EV6_MAX_ASN,			\
-	.mv_switch_mm =			ev5_switch_mm,			\
-	.mv_activate_mm =		ev5_activate_mm,		\
-	.mv_flush_tlb_current =		ev5_flush_tlb_current,		\
-	.mv_flush_tlb_current_page =	ev5_flush_tlb_current_page
+	.max_asn =			EV6_MAX_ASN			\
 
 #define IO_LITE(UP,low)							\
 	.hae_register =		(unsigned long *) CAT(UP,_HAE_ADDRESS),	\
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 7fc72aeb7398..6afae65e9a8b 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -30,39 +30,6 @@
 
 #include "proto.h"
 
-/* Work-around for some SRMs which mishandle opDEC faults.  */
-
-static int opDEC_fix;
-
-static void
-opDEC_check(void)
-{
-	__asm__ __volatile__ (
-	/* Load the address of... */
-	"	br	$16, 1f\n"
-	/* A stub instruction fault handler.  Just add 4 to the
-	   pc and continue.  */
-	"	ldq	$16, 8($sp)\n"
-	"	addq	$16, 4, $16\n"
-	"	stq	$16, 8($sp)\n"
-	"	call_pal %[rti]\n"
-	/* Install the instruction fault handler.  */
-	"1:	lda	$17, 3\n"
-	"	call_pal %[wrent]\n"
-	/* With that in place, the fault from the round-to-minf fp
-	   insn will arrive either at the "lda 4" insn (bad) or one
-	   past that (good).  This places the correct fixup in %0.  */
-	"	lda %[fix], 0\n"
-	"	cvttq/svm $f31,$f31\n"
-	"	lda %[fix], 4"
-	: [fix] "=r" (opDEC_fix)
-	: [rti] "n" (PAL_rti), [wrent] "n" (PAL_wrent)
-	: "$0", "$1", "$16", "$17", "$22", "$23", "$24", "$25");
-
-	if (opDEC_fix)
-		printk("opDEC fixup enabled.\n");
-}
-
 void
 dik_show_regs(struct pt_regs *regs, unsigned long *r9_15)
 {
@@ -353,32 +320,6 @@ do_entIF(unsigned long type, struct pt_regs *regs)
 		return;
 
 	      case 4: /* opDEC */
-		if (implver() == IMPLVER_EV4) {
-			long si_code;
-
-			/* The some versions of SRM do not handle
-			   the opDEC properly - they return the PC of the
-			   opDEC fault, not the instruction after as the
-			   Alpha architecture requires.  Here we fix it up.
-			   We do this by intentionally causing an opDEC
-			   fault during the boot sequence and testing if
-			   we get the correct PC.  If not, we set a flag
-			   to correct it every time through.  */
-			regs->pc += opDEC_fix; 
-			
-			/* EV4 does not implement anything except normal
-			   rounding.  Everything else will come here as
-			   an illegal instruction.  Emulate them.  */
-			si_code = alpha_fp_emul(regs->pc - 4);
-			if (si_code == 0)
-				return;
-			if (si_code > 0) {
-				send_sig_fault_trapno(SIGFPE, si_code,
-						      (void __user *) regs->pc,
-						      0, current);
-				return;
-			}
-		}
 		break;
 
 	      case 5: /* illoc */
@@ -979,11 +920,6 @@ trap_init(void)
 	register unsigned long gptr __asm__("$29");
 	wrkgp(gptr);
 
-	/* Hack for Multia (UDB) and JENSEN: some of their SRMs have
-	   a bug in the handling of the opDEC fault.  Fix it up if so.  */
-	if (implver() == IMPLVER_EV4)
-		opDEC_check();
-
 	wrent(entArith, 1);
 	wrent(entMM, 2);
 	wrent(entIF, 3);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index cb1526ec44b5..c3e098b21c16 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -88,15 +88,9 @@ struct block_device {
 
 /*
  * Block error status values.  See block/blk-core:blk_errors for the details.
- * Alpha cannot write a byte atomically, so we need to use 32-bit value.
  */
-#if defined(CONFIG_ALPHA) && !defined(__alpha_bwx__)
-typedef u32 __bitwise blk_status_t;
-typedef u32 blk_short_t;
-#else
 typedef u8 __bitwise blk_status_t;
 typedef u16 blk_short_t;
-#endif
 #define	BLK_STS_OK 0
 #define BLK_STS_NOTSUPP		((__force blk_status_t)1)
 #define BLK_STS_TIMEOUT		((__force blk_status_t)2)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 2b2e6f0a54d6..2372f9357240 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -145,15 +145,12 @@ struct tty_operations;
  * @count: count of open processes, reaching zero cancels all the work for
  *	   this tty and drops a @kref too (but does not free this tty)
  * @winsize: size of the terminal "window" (cf. @winsize_mutex)
- * @flow: flow settings grouped together, see also @flow.unused
+ * @flow: flow settings grouped together
  * @flow.lock: lock for @flow members
  * @flow.stopped: tty stopped/started by stop_tty()/start_tty()
  * @flow.tco_stopped: tty stopped/started by %TCOOFF/%TCOON ioctls (it has
  *		      precedence over @flow.stopped)
- * @flow.unused: alignment for Alpha, so that no members other than @flow.* are
- *		 modified by the same 64b word store. The @flow's __aligned is
- *		 there for the very same reason.
- * @ctrl: control settings grouped together, see also @ctrl.unused
+ * @ctrl: control settings grouped together
  * @ctrl.lock: lock for @ctrl members
  * @ctrl.pgrp: process group of this tty (setpgrp(2))
  * @ctrl.session: session of this tty (setsid(2)). Writes are protected by both
@@ -161,7 +158,6 @@ struct tty_operations;
  *		  them.
  * @ctrl.pktstatus: packet mode status (bitwise OR of %TIOCPKT_ constants)
  * @ctrl.packet: packet mode enabled
- * @ctrl.unused: alignment for Alpha, see @flow.unused for explanation
  * @hw_stopped: not controlled by the tty layer, under @driver's control for CTS
  *		handling
  * @receive_room: bytes permitted to feed to @ldisc without any being lost
@@ -216,8 +212,7 @@ struct tty_struct {
 		spinlock_t lock;
 		bool stopped;
 		bool tco_stopped;
-		unsigned long unused[0];
-	} __aligned(sizeof(unsigned long)) flow;
+	} flow;
 
 	struct {
 		struct pid *pgrp;
@@ -225,8 +220,7 @@ struct tty_struct {
 		spinlock_t lock;
 		unsigned char pktstatus;
 		bool packet;
-		unsigned long unused[0];
-	} __aligned(sizeof(unsigned long)) ctrl;
+	} ctrl;
 
 	bool hw_stopped;
 	bool closing;
-- 
cgit v1.2.3


From 628d701f2de5b9a16d1dd82bea68fd895f56f1a1 Mon Sep 17 00:00:00 2001
From: Benjamin Gray <bgray@linux.ibm.com>
Date: Wed, 17 Apr 2024 21:23:20 +1000
Subject: powerpc/dexcr: Add DEXCR prctl interface

Now that we track a DEXCR on a per-task basis, individual tasks are free
to configure it as they like.

The interface is a pair of getter/setter prctl's that work on a single
aspect at a time (multiple aspects at once is more difficult if there
are different rules applied for each aspect, now or in future). The
getter shows the current state of the process config, and the setter
allows setting/clearing the aspect.

Signed-off-by: Benjamin Gray <bgray@linux.ibm.com>
[mpe: Account for PR_RISCV_SET_ICACHE_FLUSH_CTX, shrink some longs lines]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20240417112325.728010-5-bgray@linux.ibm.com
---
 arch/powerpc/include/asm/processor.h |  10 ++++
 arch/powerpc/kernel/dexcr.c          | 101 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h           |  16 ++++++
 kernel/sys.c                         |  16 ++++++
 4 files changed, 143 insertions(+)

(limited to 'include')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index aad85a24134a..e44cac0da346 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -334,6 +334,16 @@ extern int set_endian(struct task_struct *tsk, unsigned int val);
 extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr);
 extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define PPC_GET_DEXCR_ASPECT(tsk, asp) get_dexcr_prctl((tsk), (asp))
+#define PPC_SET_DEXCR_ASPECT(tsk, asp, val) set_dexcr_prctl((tsk), (asp), (val))
+
+int get_dexcr_prctl(struct task_struct *tsk, unsigned long asp);
+int set_dexcr_prctl(struct task_struct *tsk, unsigned long asp, unsigned long val);
+
+#endif
+
 extern void load_fp_state(struct thread_fp_state *fp);
 extern void store_fp_state(struct thread_fp_state *fp);
 extern void load_vr_state(struct thread_vr_state *vr);
diff --git a/arch/powerpc/kernel/dexcr.c b/arch/powerpc/kernel/dexcr.c
index d5cd77421088..3a0358e91c60 100644
--- a/arch/powerpc/kernel/dexcr.c
+++ b/arch/powerpc/kernel/dexcr.c
@@ -21,3 +21,104 @@ static int __init init_task_dexcr(void)
 	return 0;
 }
 early_initcall(init_task_dexcr)
+
+/* Allow thread local configuration of these by default */
+#define DEXCR_PRCTL_EDITABLE ( \
+	DEXCR_PR_IBRTPD | \
+	DEXCR_PR_SRAPD | \
+	DEXCR_PR_NPHIE)
+
+static int prctl_to_aspect(unsigned long which, unsigned int *aspect)
+{
+	switch (which) {
+	case PR_PPC_DEXCR_SBHE:
+		*aspect = DEXCR_PR_SBHE;
+		break;
+	case PR_PPC_DEXCR_IBRTPD:
+		*aspect = DEXCR_PR_IBRTPD;
+		break;
+	case PR_PPC_DEXCR_SRAPD:
+		*aspect = DEXCR_PR_SRAPD;
+		break;
+	case PR_PPC_DEXCR_NPHIE:
+		*aspect = DEXCR_PR_NPHIE;
+		break;
+	default:
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+int get_dexcr_prctl(struct task_struct *task, unsigned long which)
+{
+	unsigned int aspect;
+	int ret;
+
+	ret = prctl_to_aspect(which, &aspect);
+	if (ret)
+		return ret;
+
+	if (aspect & DEXCR_PRCTL_EDITABLE)
+		ret |= PR_PPC_DEXCR_CTRL_EDITABLE;
+
+	if (aspect & mfspr(SPRN_DEXCR))
+		ret |= PR_PPC_DEXCR_CTRL_SET;
+	else
+		ret |= PR_PPC_DEXCR_CTRL_CLEAR;
+
+	if (aspect & task->thread.dexcr_onexec)
+		ret |= PR_PPC_DEXCR_CTRL_SET_ONEXEC;
+	else
+		ret |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC;
+
+	return ret;
+}
+
+int set_dexcr_prctl(struct task_struct *task, unsigned long which, unsigned long ctrl)
+{
+	unsigned long dexcr;
+	unsigned int aspect;
+	int err = 0;
+
+	err = prctl_to_aspect(which, &aspect);
+	if (err)
+		return err;
+
+	if (!(aspect & DEXCR_PRCTL_EDITABLE))
+		return -EPERM;
+
+	if (ctrl & ~PR_PPC_DEXCR_CTRL_MASK)
+		return -EINVAL;
+
+	if (ctrl & PR_PPC_DEXCR_CTRL_SET && ctrl & PR_PPC_DEXCR_CTRL_CLEAR)
+		return -EINVAL;
+
+	if (ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC && ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC)
+		return -EINVAL;
+
+	/*
+	 * We do not want an unprivileged process being able to disable
+	 * a setuid process's hash check instructions
+	 */
+	if (aspect == DEXCR_PR_NPHIE &&
+	    ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC &&
+	    !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	dexcr = mfspr(SPRN_DEXCR);
+
+	if (ctrl & PR_PPC_DEXCR_CTRL_SET)
+		dexcr |= aspect;
+	else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR)
+		dexcr &= ~aspect;
+
+	if (ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC)
+		task->thread.dexcr_onexec |= aspect;
+	else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC)
+		task->thread.dexcr_onexec &= ~aspect;
+
+	mtspr(SPRN_DEXCR, dexcr);
+
+	return 0;
+}
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 370ed14b1ae0..713d28788df7 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -306,4 +306,20 @@ struct prctl_mm_map {
 # define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK	0xc
 # define PR_RISCV_V_VSTATE_CTRL_MASK		0x1f
 
+/* PowerPC Dynamic Execution Control Register (DEXCR) controls */
+#define PR_PPC_GET_DEXCR		72
+#define PR_PPC_SET_DEXCR		73
+/* DEXCR aspect to act on */
+# define PR_PPC_DEXCR_SBHE		0 /* Speculative branch hint enable */
+# define PR_PPC_DEXCR_IBRTPD		1 /* Indirect branch recurrent target prediction disable */
+# define PR_PPC_DEXCR_SRAPD		2 /* Subroutine return address prediction disable */
+# define PR_PPC_DEXCR_NPHIE		3 /* Non-privileged hash instruction enable */
+/* Action to apply / return */
+# define PR_PPC_DEXCR_CTRL_EDITABLE	 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */
+# define PR_PPC_DEXCR_CTRL_SET		 0x2 /* Set the aspect for this process */
+# define PR_PPC_DEXCR_CTRL_CLEAR	 0x4 /* Clear the aspect for this process */
+# define PR_PPC_DEXCR_CTRL_SET_ONEXEC	 0x8 /* Set the aspect on exec */
+# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
+# define PR_PPC_DEXCR_CTRL_MASK		0x1f
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 8bb106a56b3a..f9c95410278c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -146,6 +146,12 @@
 #ifndef RISCV_V_GET_CONTROL
 # define RISCV_V_GET_CONTROL()		(-EINVAL)
 #endif
+#ifndef PPC_GET_DEXCR_ASPECT
+# define PPC_GET_DEXCR_ASPECT(a, b)	(-EINVAL)
+#endif
+#ifndef PPC_SET_DEXCR_ASPECT
+# define PPC_SET_DEXCR_ASPECT(a, b, c)	(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2726,6 +2732,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_MDWE:
 		error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
 		break;
+	case PR_PPC_GET_DEXCR:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = PPC_GET_DEXCR_ASPECT(me, arg2);
+		break;
+	case PR_PPC_SET_DEXCR:
+		if (arg4 || arg5)
+			return -EINVAL;
+		error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3);
+		break;
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
-- 
cgit v1.2.3


From 33a1e6ea73e5f1defe6706f006c0930a82ebdaaa Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Sat, 23 Mar 2024 09:25:54 -0400
Subject: nfsd: trivial GET_DIR_DELEGATION support

This adds basic infrastructure for handing GET_DIR_DELEGATION calls from
clients, including the decoders and encoders. For now, it always just
returns NFS4_OK + GDD4_UNAVAIL.

Eventually clients may start sending this operation, and it's better if
we can return GDD4_UNAVAIL instead of having to abort the whole compound.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfs4proc.c   | 41 ++++++++++++++++++++++++++++
 fs/nfsd/nfs4xdr.c    | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/nfsd/xdr4.h       | 19 +++++++++++++
 include/linux/nfs4.h |  6 +++++
 4 files changed, 140 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2927b1263f08..3dc173b29803 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2154,6 +2154,29 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status == nfserr_same ? nfs_ok : status;
 }
 
+static __be32
+nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
+			 struct nfsd4_compound_state *cstate,
+			 union nfsd4_op_u *u)
+{
+	struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+
+	/*
+	 * RFC 8881, section 18.39.3 says:
+	 *
+	 * "The server may refuse to grant the delegation. In that case, the
+	 *  server will return NFS4ERR_DIRDELEG_UNAVAIL."
+	 *
+	 * This is sub-optimal, since it means that the server would need to
+	 * abort compound processing just because the delegation wasn't
+	 * available. RFC8881bis should change this to allow the server to
+	 * return NFS4_OK with a non-fatal status of GDD4_UNAVAIL in this
+	 * situation.
+	 */
+	gdd->gddrnf_status = GDD4_UNAVAIL;
+	return nfs_ok;
+}
+
 #ifdef CONFIG_NFSD_PNFS
 static const struct nfsd4_layout_ops *
 nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
@@ -3082,6 +3105,18 @@ static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
 		* sizeof(__be32);
 }
 
+static u32 nfsd4_get_dir_delegation_rsize(const struct svc_rqst *rqstp,
+					  const struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* gddr_status */ +
+		op_encode_verifier_maxsz +
+		op_encode_stateid_maxsz +
+		2 /* gddr_notification */ +
+		2 /* gddr_child_attributes */ +
+		2 /* gddr_dir_attributes */);
+}
+
 #ifdef CONFIG_NFSD_PNFS
 static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
 				     const struct nfsd4_op *op)
@@ -3470,6 +3505,12 @@ static const struct nfsd4_operation nfsd4_ops[] = {
 		.op_get_currentstateid = nfsd4_get_freestateid,
 		.op_rsize_bop = nfsd4_only_status_rsize,
 	},
+	[OP_GET_DIR_DELEGATION] = {
+		.op_func = nfsd4_get_dir_delegation,
+		.op_flags = OP_MODIFIES_SOMETHING,
+		.op_name = "OP_GET_DIR_DELEGATION",
+		.op_rsize_bop = nfsd4_get_dir_delegation_rsize,
+	},
 #ifdef CONFIG_NFSD_PNFS
 	[OP_GETDEVICEINFO] = {
 		.op_func = nfsd4_getdeviceinfo,
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index a644460f3a5e..ea76100e50ca 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1732,6 +1732,35 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
 	return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid);
 }
 
+static __be32
+nfsd4_decode_get_dir_delegation(struct nfsd4_compoundargs *argp,
+		union nfsd4_op_u *u)
+{
+	struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+	__be32 status;
+
+	memset(gdd, 0, sizeof(*gdd));
+
+	if (xdr_stream_decode_bool(argp->xdr, &gdd->gdda_signal_deleg_avail) < 0)
+		return nfserr_bad_xdr;
+	status = nfsd4_decode_bitmap4(argp, gdd->gdda_notification_types,
+				      ARRAY_SIZE(gdd->gdda_notification_types));
+	if (status)
+		return status;
+	status = nfsd4_decode_nfstime4(argp, &gdd->gdda_child_attr_delay);
+	if (status)
+		return status;
+	status = nfsd4_decode_nfstime4(argp, &gdd->gdda_dir_attr_delay);
+	if (status)
+		return status;
+	status = nfsd4_decode_bitmap4(argp, gdd->gdda_child_attributes,
+					ARRAY_SIZE(gdd->gdda_child_attributes));
+	if (status)
+		return status;
+	return nfsd4_decode_bitmap4(argp, gdd->gdda_dir_attributes,
+					ARRAY_SIZE(gdd->gdda_dir_attributes));
+}
+
 #ifdef CONFIG_NFSD_PNFS
 static __be32
 nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
@@ -2370,7 +2399,7 @@ static const nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_CREATE_SESSION]	= nfsd4_decode_create_session,
 	[OP_DESTROY_SESSION]	= nfsd4_decode_destroy_session,
 	[OP_FREE_STATEID]	= nfsd4_decode_free_stateid,
-	[OP_GET_DIR_DELEGATION]	= nfsd4_decode_notsupp,
+	[OP_GET_DIR_DELEGATION]	= nfsd4_decode_get_dir_delegation,
 #ifdef CONFIG_NFSD_PNFS
 	[OP_GETDEVICEINFO]	= nfsd4_decode_getdeviceinfo,
 	[OP_GETDEVICELIST]	= nfsd4_decode_notsupp,
@@ -4963,6 +4992,49 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
 	return nfs_ok;
 }
 
+static __be32
+nfsd4_encode_get_dir_delegation(struct nfsd4_compoundres *resp, __be32 nfserr,
+				union nfsd4_op_u *u)
+{
+	struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+	struct xdr_stream *xdr = resp->xdr;
+	__be32 status = nfserr_resource;
+
+	switch(gdd->gddrnf_status) {
+	case GDD4_OK:
+		if (xdr_stream_encode_u32(xdr, GDD4_OK) != XDR_UNIT)
+			break;
+		status = nfsd4_encode_verifier4(xdr, &gdd->gddr_cookieverf);
+		if (status)
+			break;
+		status = nfsd4_encode_stateid4(xdr, &gdd->gddr_stateid);
+		if (status)
+			break;
+		status = nfsd4_encode_bitmap4(xdr, gdd->gddr_notification[0], 0, 0);
+		if (status)
+			break;
+		status = nfsd4_encode_bitmap4(xdr, gdd->gddr_child_attributes[0],
+						   gdd->gddr_child_attributes[1],
+						   gdd->gddr_child_attributes[2]);
+		if (status)
+			break;
+		status = nfsd4_encode_bitmap4(xdr, gdd->gddr_dir_attributes[0],
+						   gdd->gddr_dir_attributes[1],
+						   gdd->gddr_dir_attributes[2]);
+		break;
+	default:
+		pr_warn("nfsd: bad gddrnf_status (%u)\n", gdd->gddrnf_status);
+		gdd->gddrnf_will_signal_deleg_avail = 0;
+		fallthrough;
+	case GDD4_UNAVAIL:
+		if (xdr_stream_encode_u32(xdr, GDD4_UNAVAIL) != XDR_UNIT)
+			break;
+		status = nfsd4_encode_bool(xdr, gdd->gddrnf_will_signal_deleg_avail);
+		break;
+	}
+	return status;
+}
+
 #ifdef CONFIG_NFSD_PNFS
 static __be32
 nfsd4_encode_device_addr4(struct xdr_stream *xdr,
@@ -5579,7 +5651,7 @@ static const nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_CREATE_SESSION]	= nfsd4_encode_create_session,
 	[OP_DESTROY_SESSION]	= nfsd4_encode_noop,
 	[OP_FREE_STATEID]	= nfsd4_encode_noop,
-	[OP_GET_DIR_DELEGATION]	= nfsd4_encode_noop,
+	[OP_GET_DIR_DELEGATION]	= nfsd4_encode_get_dir_delegation,
 #ifdef CONFIG_NFSD_PNFS
 	[OP_GETDEVICEINFO]	= nfsd4_encode_getdeviceinfo,
 	[OP_GETDEVICELIST]	= nfsd4_encode_noop,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 415516c1b27e..446e72b0385e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -518,6 +518,24 @@ struct nfsd4_free_stateid {
 	stateid_t	fr_stateid;         /* request */
 };
 
+struct nfsd4_get_dir_delegation {
+	/* request */
+	u32			gdda_signal_deleg_avail;
+	u32			gdda_notification_types[1];
+	struct timespec64	gdda_child_attr_delay;
+	struct timespec64	gdda_dir_attr_delay;
+	u32			gdda_child_attributes[3];
+	u32			gdda_dir_attributes[3];
+	/* response */
+	u32			gddrnf_status;
+	nfs4_verifier		gddr_cookieverf;
+	stateid_t		gddr_stateid;
+	u32			gddr_notification[1];
+	u32			gddr_child_attributes[3];
+	u32			gddr_dir_attributes[3];
+	bool			gddrnf_will_signal_deleg_avail;
+};
+
 /* also used for NVERIFY */
 struct nfsd4_verify {
 	u32		ve_bmval[3];        /* request */
@@ -797,6 +815,7 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+		struct nfsd4_get_dir_delegation	get_dir_delegation;
 		struct nfsd4_getdeviceinfo	getdeviceinfo;
 		struct nfsd4_layoutget		layoutget;
 		struct nfsd4_layoutcommit	layoutcommit;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index ef8d2d618d5b..0d896ce296ce 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -701,6 +701,12 @@ enum state_protect_how4 {
 	SP4_SSV		= 2
 };
 
+/* GET_DIR_DELEGATION non-fatal status codes */
+enum gddrnf4_status {
+	GDD4_OK		= 0,
+	GDD4_UNAVAIL	= 1
+};
+
 enum pnfs_layouttype {
 	LAYOUT_NFSV4_1_FILES  = 1,
 	LAYOUT_OSD2_OBJECTS = 2,
-- 
cgit v1.2.3


From 924f4fb003ba114c60b3c07a011dcd86a8956cd1 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 23 Apr 2024 15:25:40 +0200
Subject: NFSD: convert write_threads to netlink command

Introduce write_threads netlink command similar to the one available
through the procfs.

Tested-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Co-developed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 Documentation/netlink/specs/nfsd.yaml |  39 ++++++++++
 fs/nfsd/netlink.c                     |  20 +++++
 fs/nfsd/netlink.h                     |   2 +
 fs/nfsd/nfsctl.c                      | 143 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/nfsd_netlink.h     |  12 +++
 5 files changed, 216 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml
index 05acc73e2e33..703c2d11d0a9 100644
--- a/Documentation/netlink/specs/nfsd.yaml
+++ b/Documentation/netlink/specs/nfsd.yaml
@@ -62,6 +62,22 @@ attribute-sets:
         name: compound-ops
         type: u32
         multi-attr: true
+  -
+    name: server
+    attributes:
+      -
+        name: threads
+        type: u32
+        multi-attr: true
+      -
+        name: gracetime
+        type: u32
+      -
+        name: leasetime
+        type: u32
+      -
+        name: scope
+        type: string
 
 operations:
   list:
@@ -87,3 +103,26 @@ operations:
             - sport
             - dport
             - compound-ops
+    -
+      name: threads-set
+      doc: set the number of running threads
+      attribute-set: server
+      flags: [ admin-perm ]
+      do:
+        request:
+          attributes:
+            - threads
+            - gracetime
+            - leasetime
+            - scope
+    -
+      name: threads-get
+      doc: get the number of running threads
+      attribute-set: server
+      do:
+        reply:
+          attributes:
+            - threads
+            - gracetime
+            - leasetime
+            - scope
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index 0e1d635ec5f9..fe9eef5c7f27 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -10,6 +10,14 @@
 
 #include <uapi/linux/nfsd_netlink.h>
 
+/* NFSD_CMD_THREADS_SET - do */
+static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = {
+	[NFSD_A_SERVER_THREADS] = { .type = NLA_U32, },
+	[NFSD_A_SERVER_GRACETIME] = { .type = NLA_U32, },
+	[NFSD_A_SERVER_LEASETIME] = { .type = NLA_U32, },
+	[NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, },
+};
+
 /* Ops table for nfsd */
 static const struct genl_split_ops nfsd_nl_ops[] = {
 	{
@@ -19,6 +27,18 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
 		.done	= nfsd_nl_rpc_status_get_done,
 		.flags	= GENL_CMD_CAP_DUMP,
 	},
+	{
+		.cmd		= NFSD_CMD_THREADS_SET,
+		.doit		= nfsd_nl_threads_set_doit,
+		.policy		= nfsd_threads_set_nl_policy,
+		.maxattr	= NFSD_A_SERVER_SCOPE,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd	= NFSD_CMD_THREADS_GET,
+		.doit	= nfsd_nl_threads_get_doit,
+		.flags	= GENL_CMD_CAP_DO,
+	},
 };
 
 struct genl_family nfsd_nl_family __ro_after_init = {
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index d83dd6bdee92..4137fac477e4 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -16,6 +16,8 @@ int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
 
 int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
 				  struct netlink_callback *cb);
+int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info);
 
 extern struct genl_family nfsd_nl_family;
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 34148d73b6c1..5bfdebb2e7e9 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -15,6 +15,7 @@
 #include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/gss_api.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/sunrpc/svc.h>
 #include <linux/module.h>
 #include <linux/fsnotify.h>
 
@@ -1653,6 +1654,148 @@ int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb)
 	return 0;
 }
 
+/**
+ * nfsd_nl_threads_set_doit - set the number of running threads
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	int nthreads = 0, count = 0, nrpools, ret = -EOPNOTSUPP, rem;
+	struct net *net = genl_info_net(info);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	const struct nlattr *attr;
+	const char *scope = NULL;
+
+	if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_THREADS))
+		return -EINVAL;
+
+	/* count number of SERVER_THREADS values */
+	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+		if (nla_type(attr) == NFSD_A_SERVER_THREADS)
+			count++;
+	}
+
+	mutex_lock(&nfsd_mutex);
+
+	nrpools = nfsd_nrpools(net);
+	if (nrpools && count > nrpools)
+		count = nrpools;
+
+	/* XXX: make this handle non-global pool-modes */
+	if (count > 1)
+		goto out_unlock;
+
+	nthreads = nla_get_u32(info->attrs[NFSD_A_SERVER_THREADS]);
+	if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
+	    info->attrs[NFSD_A_SERVER_LEASETIME] ||
+	    info->attrs[NFSD_A_SERVER_SCOPE]) {
+		ret = -EBUSY;
+		if (nn->nfsd_serv && nn->nfsd_serv->sv_nrthreads)
+			goto out_unlock;
+
+		ret = -EINVAL;
+		attr = info->attrs[NFSD_A_SERVER_GRACETIME];
+		if (attr) {
+			u32 gracetime = nla_get_u32(attr);
+
+			if (gracetime < 10 || gracetime > 3600)
+				goto out_unlock;
+
+			nn->nfsd4_grace = gracetime;
+		}
+
+		attr = info->attrs[NFSD_A_SERVER_LEASETIME];
+		if (attr) {
+			u32 leasetime = nla_get_u32(attr);
+
+			if (leasetime < 10 || leasetime > 3600)
+				goto out_unlock;
+
+			nn->nfsd4_lease = leasetime;
+		}
+
+		attr = info->attrs[NFSD_A_SERVER_SCOPE];
+		if (attr)
+			scope = nla_data(attr);
+	}
+
+	ret = nfsd_svc(nthreads, net, get_current_cred(), scope);
+
+out_unlock:
+	mutex_unlock(&nfsd_mutex);
+
+	return ret == nthreads ? 0 : ret;
+}
+
+/**
+ * nfsd_nl_threads_get_doit - get the number of running threads
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	void *hdr;
+	int err;
+
+	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(skb, info);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_free_msg;
+	}
+
+	mutex_lock(&nfsd_mutex);
+
+	err = nla_put_u32(skb, NFSD_A_SERVER_GRACETIME,
+			  nn->nfsd4_grace) ||
+	      nla_put_u32(skb, NFSD_A_SERVER_LEASETIME,
+			  nn->nfsd4_lease) ||
+	      nla_put_string(skb, NFSD_A_SERVER_SCOPE,
+			  nn->nfsd_name);
+	if (err)
+		goto err_unlock;
+
+	if (nn->nfsd_serv) {
+		int i;
+
+		for (i = 0; i < nfsd_nrpools(net); ++i) {
+			struct svc_pool *sp = &nn->nfsd_serv->sv_pools[i];
+
+			err = nla_put_u32(skb, NFSD_A_SERVER_THREADS,
+					  atomic_read(&sp->sp_nrthreads));
+			if (err)
+				goto err_unlock;
+		}
+	} else {
+		err = nla_put_u32(skb, NFSD_A_SERVER_THREADS, 0);
+		if (err)
+			goto err_unlock;
+	}
+
+	mutex_unlock(&nfsd_mutex);
+
+	genlmsg_end(skb, hdr);
+
+	return genlmsg_reply(skb, info);
+
+err_unlock:
+	mutex_unlock(&nfsd_mutex);
+err_free_msg:
+	nlmsg_free(skb);
+
+	return err;
+}
+
 /**
  * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
  * @net: a freshly-created network namespace
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
index 3cd044edee5d..4bbccd5db7cd 100644
--- a/include/uapi/linux/nfsd_netlink.h
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -29,8 +29,20 @@ enum {
 	NFSD_A_RPC_STATUS_MAX = (__NFSD_A_RPC_STATUS_MAX - 1)
 };
 
+enum {
+	NFSD_A_SERVER_THREADS = 1,
+	NFSD_A_SERVER_GRACETIME,
+	NFSD_A_SERVER_LEASETIME,
+	NFSD_A_SERVER_SCOPE,
+
+	__NFSD_A_SERVER_MAX,
+	NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1)
+};
+
 enum {
 	NFSD_CMD_RPC_STATUS_GET = 1,
+	NFSD_CMD_THREADS_SET,
+	NFSD_CMD_THREADS_GET,
 
 	__NFSD_CMD_MAX,
 	NFSD_CMD_MAX = (__NFSD_CMD_MAX - 1)
-- 
cgit v1.2.3


From 5a939bea25be9793d9aa5d8494df667dfe625e6b Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 23 Apr 2024 15:25:41 +0200
Subject: NFSD: add write_version to netlink command

Introduce write_version netlink command through a "declarative" interface.
This patch introduces a change in behavior since for version-set userspace
is expected to provide a NFS major/minor version list it wants to enable
while all the other ones will be disabled. (procfs write_version
command implements imperative interface where the admin writes +3/-3 to
enable/disable a single version.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 Documentation/netlink/specs/nfsd.yaml |  37 +++++++++
 fs/nfsd/netlink.c                     |  24 ++++++
 fs/nfsd/netlink.h                     |   5 ++
 fs/nfsd/netns.h                       |   1 +
 fs/nfsd/nfsctl.c                      | 150 ++++++++++++++++++++++++++++++++++
 fs/nfsd/nfssvc.c                      |   3 +-
 include/uapi/linux/nfsd_netlink.h     |  18 ++++
 7 files changed, 236 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml
index 703c2d11d0a9..77adda1425e2 100644
--- a/Documentation/netlink/specs/nfsd.yaml
+++ b/Documentation/netlink/specs/nfsd.yaml
@@ -78,6 +78,26 @@ attribute-sets:
       -
         name: scope
         type: string
+  -
+    name: version
+    attributes:
+      -
+        name: major
+        type: u32
+      -
+        name: minor
+        type: u32
+      -
+        name: enabled
+        type: flag
+  -
+    name: server-proto
+    attributes:
+      -
+        name: version
+        type: nest
+        nested-attributes: version
+        multi-attr: true
 
 operations:
   list:
@@ -126,3 +146,20 @@ operations:
             - gracetime
             - leasetime
             - scope
+    -
+      name: version-set
+      doc: set nfs enabled versions
+      attribute-set: server-proto
+      flags: [ admin-perm ]
+      do:
+        request:
+          attributes:
+            - version
+    -
+      name: version-get
+      doc: get nfs enabled versions
+      attribute-set: server-proto
+      do:
+        reply:
+          attributes:
+            - version
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index fe9eef5c7f27..b23b0b84a59a 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -10,6 +10,13 @@
 
 #include <uapi/linux/nfsd_netlink.h>
 
+/* Common nested types */
+const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = {
+	[NFSD_A_VERSION_MAJOR] = { .type = NLA_U32, },
+	[NFSD_A_VERSION_MINOR] = { .type = NLA_U32, },
+	[NFSD_A_VERSION_ENABLED] = { .type = NLA_FLAG, },
+};
+
 /* NFSD_CMD_THREADS_SET - do */
 static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE + 1] = {
 	[NFSD_A_SERVER_THREADS] = { .type = NLA_U32, },
@@ -18,6 +25,11 @@ static const struct nla_policy nfsd_threads_set_nl_policy[NFSD_A_SERVER_SCOPE +
 	[NFSD_A_SERVER_SCOPE] = { .type = NLA_NUL_STRING, },
 };
 
+/* NFSD_CMD_VERSION_SET - do */
+static const struct nla_policy nfsd_version_set_nl_policy[NFSD_A_SERVER_PROTO_VERSION + 1] = {
+	[NFSD_A_SERVER_PROTO_VERSION] = NLA_POLICY_NESTED(nfsd_version_nl_policy),
+};
+
 /* Ops table for nfsd */
 static const struct genl_split_ops nfsd_nl_ops[] = {
 	{
@@ -39,6 +51,18 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
 		.doit	= nfsd_nl_threads_get_doit,
 		.flags	= GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= NFSD_CMD_VERSION_SET,
+		.doit		= nfsd_nl_version_set_doit,
+		.policy		= nfsd_version_set_nl_policy,
+		.maxattr	= NFSD_A_SERVER_PROTO_VERSION,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd	= NFSD_CMD_VERSION_GET,
+		.doit	= nfsd_nl_version_get_doit,
+		.flags	= GENL_CMD_CAP_DO,
+	},
 };
 
 struct genl_family nfsd_nl_family __ro_after_init = {
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index 4137fac477e4..c7c0da275481 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -11,6 +11,9 @@
 
 #include <uapi/linux/nfsd_netlink.h>
 
+/* Common nested types */
+extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1];
+
 int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
 int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb);
 
@@ -18,6 +21,8 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
 				  struct netlink_callback *cb);
 int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info);
 int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info);
 
 extern struct genl_family nfsd_nl_family;
 
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d4be519b5734..14ec15656320 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -218,6 +218,7 @@ struct nfsd_net {
 /* Simple check to find out if a given net was properly initialized */
 #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
 
+extern bool nfsd_support_version(int vers);
 extern void nfsd_netns_free_versions(struct nfsd_net *nn);
 
 extern unsigned int nfsd_net_id;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5bfdebb2e7e9..b6b405437aa0 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1796,6 +1796,156 @@ err_free_msg:
 	return err;
 }
 
+/**
+ * nfsd_nl_version_set_doit - set the nfs enabled versions
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	const struct nlattr *attr;
+	struct nfsd_net *nn;
+	int i, rem;
+
+	if (GENL_REQ_ATTR_CHECK(info, NFSD_A_SERVER_PROTO_VERSION))
+		return -EINVAL;
+
+	mutex_lock(&nfsd_mutex);
+
+	nn = net_generic(genl_info_net(info), nfsd_net_id);
+	if (nn->nfsd_serv) {
+		mutex_unlock(&nfsd_mutex);
+		return -EBUSY;
+	}
+
+	/* clear current supported versions. */
+	nfsd_vers(nn, 2, NFSD_CLEAR);
+	nfsd_vers(nn, 3, NFSD_CLEAR);
+	for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
+		nfsd_minorversion(nn, i, NFSD_CLEAR);
+
+	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+		struct nlattr *tb[NFSD_A_VERSION_MAX + 1];
+		u32 major, minor = 0;
+		bool enabled;
+
+		if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION)
+			continue;
+
+		if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr,
+				     nfsd_version_nl_policy, info->extack) < 0)
+			continue;
+
+		if (!tb[NFSD_A_VERSION_MAJOR])
+			continue;
+
+		major = nla_get_u32(tb[NFSD_A_VERSION_MAJOR]);
+		if (tb[NFSD_A_VERSION_MINOR])
+			minor = nla_get_u32(tb[NFSD_A_VERSION_MINOR]);
+
+		enabled = nla_get_flag(tb[NFSD_A_VERSION_ENABLED]);
+
+		switch (major) {
+		case 4:
+			nfsd_minorversion(nn, minor, enabled ? NFSD_SET : NFSD_CLEAR);
+			break;
+		case 3:
+		case 2:
+			if (!minor)
+				nfsd_vers(nn, major, enabled ? NFSD_SET : NFSD_CLEAR);
+			break;
+		default:
+			break;
+		}
+	}
+
+	mutex_unlock(&nfsd_mutex);
+
+	return 0;
+}
+
+/**
+ * nfsd_nl_version_get_doit - get the enabled status for all supported nfs versions
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nfsd_net *nn;
+	int i, err;
+	void *hdr;
+
+	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(skb, info);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_free_msg;
+	}
+
+	mutex_lock(&nfsd_mutex);
+	nn = net_generic(genl_info_net(info), nfsd_net_id);
+
+	for (i = 2; i <= 4; i++) {
+		int j;
+
+		for (j = 0; j <= NFSD_SUPPORTED_MINOR_VERSION; j++) {
+			struct nlattr *attr;
+
+			/* Don't record any versions the kernel doesn't have
+			 * compiled in
+			 */
+			if (!nfsd_support_version(i))
+				continue;
+
+			/* NFSv{2,3} does not support minor numbers */
+			if (i < 4 && j)
+				continue;
+
+			attr = nla_nest_start(skb,
+					      NFSD_A_SERVER_PROTO_VERSION);
+			if (!attr) {
+				err = -EINVAL;
+				goto err_nfsd_unlock;
+			}
+
+			if (nla_put_u32(skb, NFSD_A_VERSION_MAJOR, i) ||
+			    nla_put_u32(skb, NFSD_A_VERSION_MINOR, j)) {
+				err = -EINVAL;
+				goto err_nfsd_unlock;
+			}
+
+			/* Set the enabled flag if the version is enabled */
+			if (nfsd_vers(nn, i, NFSD_TEST) &&
+			    (i < 4 || nfsd_minorversion(nn, j, NFSD_TEST)) &&
+			    nla_put_flag(skb, NFSD_A_VERSION_ENABLED)) {
+				err = -EINVAL;
+				goto err_nfsd_unlock;
+			}
+
+			nla_nest_end(skb, attr);
+		}
+	}
+
+	mutex_unlock(&nfsd_mutex);
+	genlmsg_end(skb, hdr);
+
+	return genlmsg_reply(skb, info);
+
+err_nfsd_unlock:
+	mutex_unlock(&nfsd_mutex);
+err_free_msg:
+	nlmsg_free(skb);
+
+	return err;
+}
+
 /**
  * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
  * @net: a freshly-created network namespace
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index e25b9b829749..cd9a6a1a9fc8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -133,8 +133,7 @@ struct svc_program		nfsd_program = {
 	.pg_rpcbind_set		= nfsd_rpcbind_set,
 };
 
-static bool
-nfsd_support_version(int vers)
+bool nfsd_support_version(int vers)
 {
 	if (vers >= NFSD_MINVERS && vers < NFSD_NRVERS)
 		return nfsd_version[vers] != NULL;
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
index 4bbccd5db7cd..7176cd8e6f09 100644
--- a/include/uapi/linux/nfsd_netlink.h
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -39,10 +39,28 @@ enum {
 	NFSD_A_SERVER_MAX = (__NFSD_A_SERVER_MAX - 1)
 };
 
+enum {
+	NFSD_A_VERSION_MAJOR = 1,
+	NFSD_A_VERSION_MINOR,
+	NFSD_A_VERSION_ENABLED,
+
+	__NFSD_A_VERSION_MAX,
+	NFSD_A_VERSION_MAX = (__NFSD_A_VERSION_MAX - 1)
+};
+
+enum {
+	NFSD_A_SERVER_PROTO_VERSION = 1,
+
+	__NFSD_A_SERVER_PROTO_MAX,
+	NFSD_A_SERVER_PROTO_MAX = (__NFSD_A_SERVER_PROTO_MAX - 1)
+};
+
 enum {
 	NFSD_CMD_RPC_STATUS_GET = 1,
 	NFSD_CMD_THREADS_SET,
 	NFSD_CMD_THREADS_GET,
+	NFSD_CMD_VERSION_SET,
+	NFSD_CMD_VERSION_GET,
 
 	__NFSD_CMD_MAX,
 	NFSD_CMD_MAX = (__NFSD_CMD_MAX - 1)
-- 
cgit v1.2.3


From a79ec2aecb1b3d9c59241a6155a5554b236de9b7 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 23 Apr 2024 15:25:42 +0200
Subject: SUNRPC: introduce svc_xprt_create_from_sa utility routine

Add svc_xprt_create_from_sa utility routine and refactor
svc_xprt_create() codebase in order to introduce the capability to
create a svc port from socket address.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Tested-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_xprt.h |   3 +
 net/sunrpc/svc_xprt.c           | 133 ++++++++++++++++++++++------------------
 2 files changed, 78 insertions(+), 58 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 8e20cd60e2e7..0d9b10dbe07d 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -135,6 +135,9 @@ int	svc_reg_xprt_class(struct svc_xprt_class *);
 void	svc_unreg_xprt_class(struct svc_xprt_class *);
 void	svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *,
 		      struct svc_serv *);
+int	svc_xprt_create_from_sa(struct svc_serv *serv, const char *xprt_name,
+				struct net *net, struct sockaddr *sap,
+				int flags, const struct cred *cred);
 int	svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
 			struct net *net, const int family,
 			const unsigned short port, int flags,
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index b4a85a227bd7..463fe544ae28 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -211,51 +211,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl,
 }
 EXPORT_SYMBOL_GPL(svc_xprt_init);
 
-static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
-					 struct svc_serv *serv,
-					 struct net *net,
-					 const int family,
-					 const unsigned short port,
-					 int flags)
-{
-	struct sockaddr_in sin = {
-		.sin_family		= AF_INET,
-		.sin_addr.s_addr	= htonl(INADDR_ANY),
-		.sin_port		= htons(port),
-	};
-#if IS_ENABLED(CONFIG_IPV6)
-	struct sockaddr_in6 sin6 = {
-		.sin6_family		= AF_INET6,
-		.sin6_addr		= IN6ADDR_ANY_INIT,
-		.sin6_port		= htons(port),
-	};
-#endif
-	struct svc_xprt *xprt;
-	struct sockaddr *sap;
-	size_t len;
-
-	switch (family) {
-	case PF_INET:
-		sap = (struct sockaddr *)&sin;
-		len = sizeof(sin);
-		break;
-#if IS_ENABLED(CONFIG_IPV6)
-	case PF_INET6:
-		sap = (struct sockaddr *)&sin6;
-		len = sizeof(sin6);
-		break;
-#endif
-	default:
-		return ERR_PTR(-EAFNOSUPPORT);
-	}
-
-	xprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
-	if (IS_ERR(xprt))
-		trace_svc_xprt_create_err(serv->sv_program->pg_name,
-					  xcl->xcl_name, sap, len, xprt);
-	return xprt;
-}
-
 /**
  * svc_xprt_received - start next receiver thread
  * @xprt: controlling transport
@@ -294,9 +249,8 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
 }
 
 static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
-			    struct net *net, const int family,
-			    const unsigned short port, int flags,
-			    const struct cred *cred)
+			    struct net *net, struct sockaddr *sap,
+			    size_t len, int flags, const struct cred *cred)
 {
 	struct svc_xprt_class *xcl;
 
@@ -312,8 +266,11 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
 			goto err;
 
 		spin_unlock(&svc_xprt_class_lock);
-		newxprt = __svc_xpo_create(xcl, serv, net, family, port, flags);
+		newxprt = xcl->xcl_ops->xpo_create(serv, net, sap, len, flags);
 		if (IS_ERR(newxprt)) {
+			trace_svc_xprt_create_err(serv->sv_program->pg_name,
+						  xcl->xcl_name, sap, len,
+						  newxprt);
 			module_put(xcl->xcl_owner);
 			return PTR_ERR(newxprt);
 		}
@@ -329,6 +286,48 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
 	return -EPROTONOSUPPORT;
 }
 
+/**
+ * svc_xprt_create_from_sa - Add a new listener to @serv from socket address
+ * @serv: target RPC service
+ * @xprt_name: transport class name
+ * @net: network namespace
+ * @sap: socket address pointer
+ * @flags: SVC_SOCK flags
+ * @cred: credential to bind to this transport
+ *
+ * Return local xprt port on success or %-EPROTONOSUPPORT on failure
+ */
+int svc_xprt_create_from_sa(struct svc_serv *serv, const char *xprt_name,
+			    struct net *net, struct sockaddr *sap,
+			    int flags, const struct cred *cred)
+{
+	size_t len;
+	int err;
+
+	switch (sap->sa_family) {
+	case AF_INET:
+		len = sizeof(struct sockaddr_in);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		len = sizeof(struct sockaddr_in6);
+		break;
+#endif
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags, cred);
+	if (err == -EPROTONOSUPPORT) {
+		request_module("svc%s", xprt_name);
+		err = _svc_xprt_create(serv, xprt_name, net, sap, len, flags,
+				       cred);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(svc_xprt_create_from_sa);
+
 /**
  * svc_xprt_create - Add a new listener to @serv
  * @serv: target RPC service
@@ -339,23 +338,41 @@ static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
  * @flags: SVC_SOCK flags
  * @cred: credential to bind to this transport
  *
- * Return values:
- *   %0: New listener added successfully
- *   %-EPROTONOSUPPORT: Requested transport type not supported
+ * Return local xprt port on success or %-EPROTONOSUPPORT on failure
  */
 int svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
 		    struct net *net, const int family,
 		    const unsigned short port, int flags,
 		    const struct cred *cred)
 {
-	int err;
+	struct sockaddr_in sin = {
+		.sin_family		= AF_INET,
+		.sin_addr.s_addr	= htonl(INADDR_ANY),
+		.sin_port		= htons(port),
+	};
+#if IS_ENABLED(CONFIG_IPV6)
+	struct sockaddr_in6 sin6 = {
+		.sin6_family		= AF_INET6,
+		.sin6_addr		= IN6ADDR_ANY_INIT,
+		.sin6_port		= htons(port),
+	};
+#endif
+	struct sockaddr *sap;
 
-	err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
-	if (err == -EPROTONOSUPPORT) {
-		request_module("svc%s", xprt_name);
-		err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
+	switch (family) {
+	case PF_INET:
+		sap = (struct sockaddr *)&sin;
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case PF_INET6:
+		sap = (struct sockaddr *)&sin6;
+		break;
+#endif
+	default:
+		return -EAFNOSUPPORT;
 	}
-	return err;
+
+	return svc_xprt_create_from_sa(serv, xprt_name, net, sap, flags, cred);
 }
 EXPORT_SYMBOL_GPL(svc_xprt_create);
 
-- 
cgit v1.2.3


From cf619507ae8d30b5735e9356d85a8df2bcf2b6ca Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Tue, 23 Apr 2024 15:25:43 +0200
Subject: SUNRPC: add a new svc_find_listener helper

svc_find_listener will return the transport instance pointer for the
endpoint accepting connections/peer traffic from the specified transport
class and matching sockaddr.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/sunrpc/svc_xprt.h |  2 ++
 net/sunrpc/svc_xprt.c           | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 0d9b10dbe07d..0981e35a9fed 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -150,6 +150,8 @@ void	svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
 void	svc_xprt_close(struct svc_xprt *xprt);
 int	svc_port_is_privileged(struct sockaddr *sin);
 int	svc_print_xprts(char *buf, int maxlen);
+struct svc_xprt *svc_find_listener(struct svc_serv *serv, const char *xcl_name,
+				   struct net *net, const struct sockaddr *sa);
 struct	svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
 			struct net *net, const sa_family_t af,
 			const unsigned short port);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 463fe544ae28..34a3626c56b1 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -1276,6 +1276,40 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
 	return dr;
 }
 
+/**
+ * svc_find_listener - find an RPC transport instance
+ * @serv: pointer to svc_serv to search
+ * @xcl_name: C string containing transport's class name
+ * @net: owner net pointer
+ * @sa: sockaddr containing address
+ *
+ * Return the transport instance pointer for the endpoint accepting
+ * connections/peer traffic from the specified transport class,
+ * and matching sockaddr.
+ */
+struct svc_xprt *svc_find_listener(struct svc_serv *serv, const char *xcl_name,
+				   struct net *net, const struct sockaddr *sa)
+{
+	struct svc_xprt *xprt;
+	struct svc_xprt *found = NULL;
+
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+		if (xprt->xpt_net != net)
+			continue;
+		if (strcmp(xprt->xpt_class->xcl_name, xcl_name))
+			continue;
+		if (!rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local))
+			continue;
+		found = xprt;
+		svc_xprt_get(xprt);
+		break;
+	}
+	spin_unlock_bh(&serv->sv_lock);
+	return found;
+}
+EXPORT_SYMBOL_GPL(svc_find_listener);
+
 /**
  * svc_find_xprt - find an RPC transport instance
  * @serv: pointer to svc_serv to search
-- 
cgit v1.2.3


From 16a471177496c8e04a9793812c187a2c1a2192fa Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 23 Apr 2024 15:25:44 +0200
Subject: NFSD: add listener-{set,get} netlink command

Introduce write_ports netlink command. For listener-set, userspace is
expected to provide a NFS listeners list it wants enabled. All other
sockets will be closed.

Reviewed-by: Jeff Layton <jlayton@kernel.org>
Co-developed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 Documentation/netlink/specs/nfsd.yaml |  34 ++++++
 fs/nfsd/netlink.c                     |  22 ++++
 fs/nfsd/netlink.h                     |   3 +
 fs/nfsd/nfsctl.c                      | 220 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/nfsd_netlink.h     |  17 +++
 5 files changed, 296 insertions(+)

(limited to 'include')

diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml
index 77adda1425e2..d21234097167 100644
--- a/Documentation/netlink/specs/nfsd.yaml
+++ b/Documentation/netlink/specs/nfsd.yaml
@@ -98,6 +98,23 @@ attribute-sets:
         type: nest
         nested-attributes: version
         multi-attr: true
+  -
+    name: sock
+    attributes:
+      -
+        name: addr
+        type: binary
+      -
+        name: transport-name
+        type: string
+  -
+    name: server-sock
+    attributes:
+      -
+        name: addr
+        type: nest
+        nested-attributes: sock
+        multi-attr: true
 
 operations:
   list:
@@ -163,3 +180,20 @@ operations:
         reply:
           attributes:
             - version
+    -
+      name: listener-set
+      doc: set nfs running sockets
+      attribute-set: server-sock
+      flags: [ admin-perm ]
+      do:
+        request:
+          attributes:
+            - addr
+    -
+      name: listener-get
+      doc: get nfs running listeners
+      attribute-set: server-sock
+      do:
+        reply:
+          attributes:
+            - addr
diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c
index b23b0b84a59a..62d2586d9902 100644
--- a/fs/nfsd/netlink.c
+++ b/fs/nfsd/netlink.c
@@ -11,6 +11,11 @@
 #include <uapi/linux/nfsd_netlink.h>
 
 /* Common nested types */
+const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1] = {
+	[NFSD_A_SOCK_ADDR] = { .type = NLA_BINARY, },
+	[NFSD_A_SOCK_TRANSPORT_NAME] = { .type = NLA_NUL_STRING, },
+};
+
 const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1] = {
 	[NFSD_A_VERSION_MAJOR] = { .type = NLA_U32, },
 	[NFSD_A_VERSION_MINOR] = { .type = NLA_U32, },
@@ -30,6 +35,11 @@ static const struct nla_policy nfsd_version_set_nl_policy[NFSD_A_SERVER_PROTO_VE
 	[NFSD_A_SERVER_PROTO_VERSION] = NLA_POLICY_NESTED(nfsd_version_nl_policy),
 };
 
+/* NFSD_CMD_LISTENER_SET - do */
+static const struct nla_policy nfsd_listener_set_nl_policy[NFSD_A_SERVER_SOCK_ADDR + 1] = {
+	[NFSD_A_SERVER_SOCK_ADDR] = NLA_POLICY_NESTED(nfsd_sock_nl_policy),
+};
+
 /* Ops table for nfsd */
 static const struct genl_split_ops nfsd_nl_ops[] = {
 	{
@@ -63,6 +73,18 @@ static const struct genl_split_ops nfsd_nl_ops[] = {
 		.doit	= nfsd_nl_version_get_doit,
 		.flags	= GENL_CMD_CAP_DO,
 	},
+	{
+		.cmd		= NFSD_CMD_LISTENER_SET,
+		.doit		= nfsd_nl_listener_set_doit,
+		.policy		= nfsd_listener_set_nl_policy,
+		.maxattr	= NFSD_A_SERVER_SOCK_ADDR,
+		.flags		= GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+	},
+	{
+		.cmd	= NFSD_CMD_LISTENER_GET,
+		.doit	= nfsd_nl_listener_get_doit,
+		.flags	= GENL_CMD_CAP_DO,
+	},
 };
 
 struct genl_family nfsd_nl_family __ro_after_init = {
diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h
index c7c0da275481..e3724637d64d 100644
--- a/fs/nfsd/netlink.h
+++ b/fs/nfsd/netlink.h
@@ -12,6 +12,7 @@
 #include <uapi/linux/nfsd_netlink.h>
 
 /* Common nested types */
+extern const struct nla_policy nfsd_sock_nl_policy[NFSD_A_SOCK_TRANSPORT_NAME + 1];
 extern const struct nla_policy nfsd_version_nl_policy[NFSD_A_VERSION_ENABLED + 1];
 
 int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb);
@@ -23,6 +24,8 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info);
 int nfsd_nl_threads_get_doit(struct sk_buff *skb, struct genl_info *info);
 int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info);
 int nfsd_nl_version_get_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info);
+int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info);
 
 extern struct genl_family nfsd_nl_family;
 
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index b6b405437aa0..202140df8f82 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1946,6 +1946,226 @@ err_free_msg:
 	return err;
 }
 
+/**
+ * nfsd_nl_listener_set_doit - set the nfs running sockets
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct svc_xprt *xprt, *tmp;
+	const struct nlattr *attr;
+	struct svc_serv *serv;
+	LIST_HEAD(permsocks);
+	struct nfsd_net *nn;
+	int err, rem;
+
+	mutex_lock(&nfsd_mutex);
+
+	err = nfsd_create_serv(net);
+	if (err) {
+		mutex_unlock(&nfsd_mutex);
+		return err;
+	}
+
+	nn = net_generic(net, nfsd_net_id);
+	serv = nn->nfsd_serv;
+
+	spin_lock_bh(&serv->sv_lock);
+
+	/* Move all of the old listener sockets to a temp list */
+	list_splice_init(&serv->sv_permsocks, &permsocks);
+
+	/*
+	 * Walk the list of server_socks from userland and move any that match
+	 * back to sv_permsocks
+	 */
+	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+		struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
+		const char *xcl_name;
+		struct sockaddr *sa;
+
+		if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
+			continue;
+
+		if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
+				     nfsd_sock_nl_policy, info->extack) < 0)
+			continue;
+
+		if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME])
+			continue;
+
+		if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa))
+			continue;
+
+		xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]);
+		sa = nla_data(tb[NFSD_A_SOCK_ADDR]);
+
+		/* Put back any matching sockets */
+		list_for_each_entry_safe(xprt, tmp, &permsocks, xpt_list) {
+			/* This shouldn't be possible */
+			if (WARN_ON_ONCE(xprt->xpt_net != net)) {
+				list_move(&xprt->xpt_list, &serv->sv_permsocks);
+				continue;
+			}
+
+			/* If everything matches, put it back */
+			if (!strcmp(xprt->xpt_class->xcl_name, xcl_name) &&
+			    rpc_cmp_addr_port(sa, (struct sockaddr *)&xprt->xpt_local)) {
+				list_move(&xprt->xpt_list, &serv->sv_permsocks);
+				break;
+			}
+		}
+	}
+
+	/* For now, no removing old sockets while server is running */
+	if (serv->sv_nrthreads && !list_empty(&permsocks)) {
+		list_splice_init(&permsocks, &serv->sv_permsocks);
+		spin_unlock_bh(&serv->sv_lock);
+		err = -EBUSY;
+		goto out_unlock_mtx;
+	}
+
+	/* Close the remaining sockets on the permsocks list */
+	while (!list_empty(&permsocks)) {
+		xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list);
+		list_move(&xprt->xpt_list, &serv->sv_permsocks);
+
+		/*
+		 * Newly-created sockets are born with the BUSY bit set. Clear
+		 * it if there are no threads, since nothing can pick it up
+		 * in that case.
+		 */
+		if (!serv->sv_nrthreads)
+			clear_bit(XPT_BUSY, &xprt->xpt_flags);
+
+		set_bit(XPT_CLOSE, &xprt->xpt_flags);
+		spin_unlock_bh(&serv->sv_lock);
+		svc_xprt_close(xprt);
+		spin_lock_bh(&serv->sv_lock);
+	}
+
+	spin_unlock_bh(&serv->sv_lock);
+
+	/* walk list of addrs again, open any that still don't exist */
+	nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+		struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
+		const char *xcl_name;
+		struct sockaddr *sa;
+		int ret;
+
+		if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
+			continue;
+
+		if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
+				     nfsd_sock_nl_policy, info->extack) < 0)
+			continue;
+
+		if (!tb[NFSD_A_SOCK_ADDR] || !tb[NFSD_A_SOCK_TRANSPORT_NAME])
+			continue;
+
+		if (nla_len(tb[NFSD_A_SOCK_ADDR]) < sizeof(*sa))
+			continue;
+
+		xcl_name = nla_data(tb[NFSD_A_SOCK_TRANSPORT_NAME]);
+		sa = nla_data(tb[NFSD_A_SOCK_ADDR]);
+
+		xprt = svc_find_listener(serv, xcl_name, net, sa);
+		if (xprt) {
+			svc_xprt_put(xprt);
+			continue;
+		}
+
+		ret = svc_xprt_create_from_sa(serv, xcl_name, net, sa,
+					      SVC_SOCK_ANONYMOUS,
+					      get_current_cred());
+		/* always save the latest error */
+		if (ret < 0)
+			err = ret;
+	}
+
+	if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks))
+		nfsd_destroy_serv(net);
+
+out_unlock_mtx:
+	mutex_unlock(&nfsd_mutex);
+
+	return err;
+}
+
+/**
+ * nfsd_nl_listener_get_doit - get the nfs running listeners
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int nfsd_nl_listener_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct svc_xprt *xprt;
+	struct svc_serv *serv;
+	struct nfsd_net *nn;
+	void *hdr;
+	int err;
+
+	skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(skb, info);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_free_msg;
+	}
+
+	mutex_lock(&nfsd_mutex);
+	nn = net_generic(genl_info_net(info), nfsd_net_id);
+
+	/* no nfs server? Just send empty socket list */
+	if (!nn->nfsd_serv)
+		goto out_unlock_mtx;
+
+	serv = nn->nfsd_serv;
+	spin_lock_bh(&serv->sv_lock);
+	list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) {
+		struct nlattr *attr;
+
+		attr = nla_nest_start(skb, NFSD_A_SERVER_SOCK_ADDR);
+		if (!attr) {
+			err = -EINVAL;
+			goto err_serv_unlock;
+		}
+
+		if (nla_put_string(skb, NFSD_A_SOCK_TRANSPORT_NAME,
+				   xprt->xpt_class->xcl_name) ||
+		    nla_put(skb, NFSD_A_SOCK_ADDR,
+			    sizeof(struct sockaddr_storage),
+			    &xprt->xpt_local)) {
+			err = -EINVAL;
+			goto err_serv_unlock;
+		}
+
+		nla_nest_end(skb, attr);
+	}
+	spin_unlock_bh(&serv->sv_lock);
+out_unlock_mtx:
+	mutex_unlock(&nfsd_mutex);
+	genlmsg_end(skb, hdr);
+
+	return genlmsg_reply(skb, info);
+
+err_serv_unlock:
+	spin_unlock_bh(&serv->sv_lock);
+	mutex_unlock(&nfsd_mutex);
+err_free_msg:
+	nlmsg_free(skb);
+
+	return err;
+}
+
 /**
  * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace
  * @net: a freshly-created network namespace
diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h
index 7176cd8e6f09..24c86dbc7ed5 100644
--- a/include/uapi/linux/nfsd_netlink.h
+++ b/include/uapi/linux/nfsd_netlink.h
@@ -55,12 +55,29 @@ enum {
 	NFSD_A_SERVER_PROTO_MAX = (__NFSD_A_SERVER_PROTO_MAX - 1)
 };
 
+enum {
+	NFSD_A_SOCK_ADDR = 1,
+	NFSD_A_SOCK_TRANSPORT_NAME,
+
+	__NFSD_A_SOCK_MAX,
+	NFSD_A_SOCK_MAX = (__NFSD_A_SOCK_MAX - 1)
+};
+
+enum {
+	NFSD_A_SERVER_SOCK_ADDR = 1,
+
+	__NFSD_A_SERVER_SOCK_MAX,
+	NFSD_A_SERVER_SOCK_MAX = (__NFSD_A_SERVER_SOCK_MAX - 1)
+};
+
 enum {
 	NFSD_CMD_RPC_STATUS_GET = 1,
 	NFSD_CMD_THREADS_SET,
 	NFSD_CMD_THREADS_GET,
 	NFSD_CMD_VERSION_SET,
 	NFSD_CMD_VERSION_GET,
+	NFSD_CMD_LISTENER_SET,
+	NFSD_CMD_LISTENER_GET,
 
 	__NFSD_CMD_MAX,
 	NFSD_CMD_MAX = (__NFSD_CMD_MAX - 1)
-- 
cgit v1.2.3


From b3e90f375b3c7ab85aef631ebb0ad8ce66cbf3fd Mon Sep 17 00:00:00 2001
From: Yoann Congal <yoann.congal@smile.fr>
Date: Sun, 5 May 2024 10:03:42 +0200
Subject: printk: Change type of CONFIG_BASE_SMALL to bool

CONFIG_BASE_SMALL is currently a type int but is only used as a boolean.

So, change its type to bool and adapt all usages:
CONFIG_BASE_SMALL == 0 becomes !IS_ENABLED(CONFIG_BASE_SMALL) and
CONFIG_BASE_SMALL != 0 becomes  IS_ENABLED(CONFIG_BASE_SMALL).

Reviewed-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Masahiro Yamada <masahiroy@kernel.org>
Signed-off-by: Yoann Congal <yoann.congal@smile.fr>
Link: https://lore.kernel.org/r/20240505080343.1471198-3-yoann.congal@smile.fr
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 arch/x86/include/asm/mpspec.h | 6 +++---
 drivers/tty/vt/vc_screen.c    | 2 +-
 include/linux/threads.h       | 4 ++--
 include/linux/udp.h           | 2 +-
 include/linux/xarray.h        | 2 +-
 init/Kconfig                  | 6 ++----
 kernel/futex/core.c           | 2 +-
 kernel/user.c                 | 2 +-
 8 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c72c7ff78fcd..d593e52e6635 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -16,10 +16,10 @@ extern int pic_mode;
  * Summit or generic (i.e. installer) kernels need lots of bus entries.
  * Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets.
  */
-#if CONFIG_BASE_SMALL == 0
-# define MAX_MP_BUSSES		260
-#else
+#ifdef CONFIG_BASE_SMALL
 # define MAX_MP_BUSSES		32
+#else
+# define MAX_MP_BUSSES		260
 #endif
 
 #define MAX_IRQ_SOURCES		256
diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c
index 67e2cb7c96ee..da33c6c4691c 100644
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -51,7 +51,7 @@
 #include <asm/unaligned.h>
 
 #define HEADER_SIZE	4u
-#define CON_BUF_SIZE (CONFIG_BASE_SMALL ? 256 : PAGE_SIZE)
+#define CON_BUF_SIZE (IS_ENABLED(CONFIG_BASE_SMALL) ? 256 : PAGE_SIZE)
 
 /*
  * Our minor space:
diff --git a/include/linux/threads.h b/include/linux/threads.h
index c34173e6c5f1..1674a471b0b4 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -25,13 +25,13 @@
 /*
  * This controls the default maximum pid allocated to a process
  */
-#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
+#define PID_MAX_DEFAULT (IS_ENABLED(CONFIG_BASE_SMALL) ? 0x1000 : 0x8000)
 
 /*
  * A maximum of 4 million PIDs should be enough for a while.
  * [NOTE: PID/TIDs are limited to 2^30 ~= 1 billion, see FUTEX_TID_MASK.]
  */
-#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
+#define PID_MAX_LIMIT (IS_ENABLED(CONFIG_BASE_SMALL) ? PAGE_SIZE * 8 : \
 	(sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
 
 /*
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 3748e82b627b..2d21cdb41ad8 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -24,7 +24,7 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
 }
 
 #define UDP_HTABLE_SIZE_MIN_PERNET	128
-#define UDP_HTABLE_SIZE_MIN		(CONFIG_BASE_SMALL ? 128 : 256)
+#define UDP_HTABLE_SIZE_MIN		(IS_ENABLED(CONFIG_BASE_SMALL) ? 128 : 256)
 #define UDP_HTABLE_SIZE_MAX		65536
 
 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index cb571dfcf4b1..3f81ee5f9fb9 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1141,7 +1141,7 @@ static inline void xa_release(struct xarray *xa, unsigned long index)
  * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
  */
 #ifndef XA_CHUNK_SHIFT
-#define XA_CHUNK_SHIFT		(CONFIG_BASE_SMALL ? 4 : 6)
+#define XA_CHUNK_SHIFT		(IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
 #endif
 #define XA_CHUNK_SIZE		(1UL << XA_CHUNK_SHIFT)
 #define XA_CHUNK_MASK		(XA_CHUNK_SIZE - 1)
diff --git a/init/Kconfig b/init/Kconfig
index ff5b607c3218..4b517523d566 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -743,7 +743,7 @@ config LOG_CPU_MAX_BUF_SHIFT
 	int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)"
 	depends on SMP
 	range 0 21
-	default 0 if BASE_SMALL != 0
+	default 0 if BASE_SMALL
 	default 12
 	depends on PRINTK
 	help
@@ -1945,9 +1945,7 @@ config RT_MUTEXES
 	default y if PREEMPT_RT
 
 config BASE_SMALL
-	int
-	default 0 if BASE_FULL
-	default 1 if !BASE_FULL
+	def_bool !BASE_FULL
 
 config MODULE_SIG_FORMAT
 	def_bool n
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 1e78ef24321e..06a1f091be81 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -1150,7 +1150,7 @@ static int __init futex_init(void)
 	unsigned int futex_shift;
 	unsigned long i;
 
-#if CONFIG_BASE_SMALL
+#ifdef CONFIG_BASE_SMALL
 	futex_hashsize = 16;
 #else
 	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
diff --git a/kernel/user.c b/kernel/user.c
index 03cedc366dc9..aa1162deafe4 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -88,7 +88,7 @@ EXPORT_SYMBOL_GPL(init_user_ns);
  * when changing user ID's (ie setuid() and friends).
  */
 
-#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_BITS	(IS_ENABLED(CONFIG_BASE_SMALL) ? 3 : 7)
 #define UIDHASH_SZ	(1 << UIDHASH_BITS)
 #define UIDHASH_MASK		(UIDHASH_SZ - 1)
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-- 
cgit v1.2.3


From 939cb14d51a150e3c12ef7a8ce0ba04ce6131bd2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@hammerspace.com>
Date: Mon, 6 May 2024 12:30:05 -0400
Subject: NFS/knfsd: Remove the invalid NFS error 'NFSERR_OPNOTSUPP'

NFSERR_OPNOTSUPP is not described by any RFC, and should not be used.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfsd/nfsd.h           | 1 -
 include/trace/misc/nfs.h | 2 --
 include/uapi/linux/nfs.h | 1 -
 3 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 2f6c6f3815b4..8f4f239d9f8a 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -230,7 +230,6 @@ void		nfsd_lockd_shutdown(void);
 #define	nfserr_nospc		cpu_to_be32(NFSERR_NOSPC)
 #define	nfserr_rofs		cpu_to_be32(NFSERR_ROFS)
 #define	nfserr_mlink		cpu_to_be32(NFSERR_MLINK)
-#define	nfserr_opnotsupp	cpu_to_be32(NFSERR_OPNOTSUPP)
 #define	nfserr_nametoolong	cpu_to_be32(NFSERR_NAMETOOLONG)
 #define	nfserr_notempty		cpu_to_be32(NFSERR_NOTEMPTY)
 #define	nfserr_dquot		cpu_to_be32(NFSERR_DQUOT)
diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h
index e43e74591561..7b221d51133a 100644
--- a/include/trace/misc/nfs.h
+++ b/include/trace/misc/nfs.h
@@ -28,7 +28,6 @@ TRACE_DEFINE_ENUM(NFSERR_FBIG);
 TRACE_DEFINE_ENUM(NFSERR_NOSPC);
 TRACE_DEFINE_ENUM(NFSERR_ROFS);
 TRACE_DEFINE_ENUM(NFSERR_MLINK);
-TRACE_DEFINE_ENUM(NFSERR_OPNOTSUPP);
 TRACE_DEFINE_ENUM(NFSERR_NAMETOOLONG);
 TRACE_DEFINE_ENUM(NFSERR_NOTEMPTY);
 TRACE_DEFINE_ENUM(NFSERR_DQUOT);
@@ -64,7 +63,6 @@ TRACE_DEFINE_ENUM(NFSERR_JUKEBOX);
 		{ NFSERR_NOSPC,			"NOSPC" }, \
 		{ NFSERR_ROFS,			"ROFS" }, \
 		{ NFSERR_MLINK,			"MLINK" }, \
-		{ NFSERR_OPNOTSUPP,		"OPNOTSUPP" }, \
 		{ NFSERR_NAMETOOLONG,		"NAMETOOLONG" }, \
 		{ NFSERR_NOTEMPTY,		"NOTEMPTY" }, \
 		{ NFSERR_DQUOT,			"DQUOT" }, \
diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 946cb62d64b0..f356f2ba3814 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -61,7 +61,6 @@
 	NFSERR_NOSPC = 28,		/* v2 v3 v4 */
 	NFSERR_ROFS = 30,		/* v2 v3 v4 */
 	NFSERR_MLINK = 31,		/*    v3 v4 */
-	NFSERR_OPNOTSUPP = 45,		/* v2 v3 */
 	NFSERR_NAMETOOLONG = 63,	/* v2 v3 v4 */
 	NFSERR_NOTEMPTY = 66,		/* v2 v3 v4 */
 	NFSERR_DQUOT = 69,		/* v2 v3 v4 */
-- 
cgit v1.2.3


From 3a35c13007dea132a65f07de05c26b87837fadc2 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Mon, 8 Apr 2024 09:46:22 +0200
Subject: kunit: Handle test faults
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, when a kernel test thread crashed (e.g. NULL pointer
dereference, general protection fault), the KUnit test hanged for 30
seconds and exited with a timeout error.

Fix this issue by waiting on task_struct->vfork_done instead of the
custom kunit_try_catch.try_completion, and track the execution state by
initially setting try_result with -EINTR and only setting it to 0 if
the test passed.

Fix kunit_generic_run_threadfn_adapter() signature by returning 0
instead of calling kthread_complete_and_exit().  Because thread's exit
code is never checked, always set it to 0 to make it clear.  To make
this explicit, export kthread_exit() for KUnit tests built as module.

Fix the -EINTR error message, which couldn't be reached until now.

This is tested with a following patch.

Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Gow <davidgow@google.com>
Tested-by: Rae Moar <rmoar@google.com>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Link: https://lore.kernel.org/r/20240408074625.65017-5-mic@digikod.net
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/try-catch.h |  3 ---
 kernel/kthread.c          |  1 +
 lib/kunit/try-catch.c     | 19 ++++++++++++-------
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/kunit/try-catch.h b/include/kunit/try-catch.h
index c507dd43119d..7c966a1adbd3 100644
--- a/include/kunit/try-catch.h
+++ b/include/kunit/try-catch.h
@@ -14,13 +14,11 @@
 
 typedef void (*kunit_try_catch_func_t)(void *);
 
-struct completion;
 struct kunit;
 
 /**
  * struct kunit_try_catch - provides a generic way to run code which might fail.
  * @test: The test case that is currently being executed.
- * @try_completion: Completion that the control thread waits on while test runs.
  * @try_result: Contains any errno obtained while running test case.
  * @try: The function, the test case, to attempt to run.
  * @catch: The function called if @try bails out.
@@ -46,7 +44,6 @@ struct kunit;
 struct kunit_try_catch {
 	/* private: internal use only. */
 	struct kunit *test;
-	struct completion *try_completion;
 	int try_result;
 	kunit_try_catch_func_t try;
 	kunit_try_catch_func_t catch;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5e40830c1f2..f7be976ff88a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -315,6 +315,7 @@ void __noreturn kthread_exit(long result)
 	kthread->result = result;
 	do_exit(0);
 }
+EXPORT_SYMBOL(kthread_exit);
 
 /**
  * kthread_complete_and_exit - Exit the current kthread.
diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c
index cab8b24b5d5a..7a3910dd78a6 100644
--- a/lib/kunit/try-catch.c
+++ b/lib/kunit/try-catch.c
@@ -18,7 +18,7 @@
 void __noreturn kunit_try_catch_throw(struct kunit_try_catch *try_catch)
 {
 	try_catch->try_result = -EFAULT;
-	kthread_complete_and_exit(try_catch->try_completion, -EFAULT);
+	kthread_exit(0);
 }
 EXPORT_SYMBOL_GPL(kunit_try_catch_throw);
 
@@ -26,9 +26,12 @@ static int kunit_generic_run_threadfn_adapter(void *data)
 {
 	struct kunit_try_catch *try_catch = data;
 
+	try_catch->try_result = -EINTR;
 	try_catch->try(try_catch->context);
+	if (try_catch->try_result == -EINTR)
+		try_catch->try_result = 0;
 
-	kthread_complete_and_exit(try_catch->try_completion, 0);
+	return 0;
 }
 
 static unsigned long kunit_test_timeout(void)
@@ -58,13 +61,11 @@ static unsigned long kunit_test_timeout(void)
 
 void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
 {
-	DECLARE_COMPLETION_ONSTACK(try_completion);
 	struct kunit *test = try_catch->test;
 	struct task_struct *task_struct;
 	int exit_code, time_remaining;
 
 	try_catch->context = context;
-	try_catch->try_completion = &try_completion;
 	try_catch->try_result = 0;
 	task_struct = kthread_create(kunit_generic_run_threadfn_adapter,
 				     try_catch, "kunit_try_catch_thread");
@@ -75,8 +76,12 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
 	}
 	get_task_struct(task_struct);
 	wake_up_process(task_struct);
-
-	time_remaining = wait_for_completion_timeout(&try_completion,
+	/*
+	 * As for a vfork(2), task_struct->vfork_done (pointing to the
+	 * underlying kthread->exited) can be used to wait for the end of a
+	 * kernel thread.
+	 */
+	time_remaining = wait_for_completion_timeout(task_struct->vfork_done,
 						     kunit_test_timeout());
 	if (time_remaining == 0) {
 		try_catch->try_result = -ETIMEDOUT;
@@ -92,7 +97,7 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
 	if (exit_code == -EFAULT)
 		try_catch->try_result = 0;
 	else if (exit_code == -EINTR)
-		kunit_err(test, "wake_up_process() was never called\n");
+		kunit_err(test, "try faulted\n");
 	else if (exit_code == -ETIMEDOUT)
 		kunit_err(test, "try timed out\n");
 	else if (exit_code)
-- 
cgit v1.2.3


From 8bd5d74babc92558da67497918210e053515b2c4 Mon Sep 17 00:00:00 2001
From: Mickaël Salaün <mic@digikod.net>
Date: Mon, 8 Apr 2024 09:46:24 +0200
Subject: kunit: Print last test location on fault
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This helps identify the location of test faults with opportunistic calls
to _KUNIT_SAVE_LOC().  This can be useful while writing tests or
debugging them.  It is possible to call KUNIT_SUCCESS() to explicit save
last location.

Cc: Brendan Higgins <brendanhiggins@google.com>
Cc: David Gow <davidgow@google.com>
Cc: Rae Moar <rmoar@google.com>
Cc: Shuah Khan <skhan@linuxfoundation.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Mickaël Salaün <mic@digikod.net>
Link: https://lore.kernel.org/r/20240408074625.65017-7-mic@digikod.net
Signed-off-by: Shuah Khan <skhan@linuxfoundation.org>
---
 include/kunit/test.h  | 24 +++++++++++++++++++++---
 lib/kunit/try-catch.c | 10 +++++++---
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/kunit/test.h b/include/kunit/test.h
index 61637ef32302..e32b4cb7afa2 100644
--- a/include/kunit/test.h
+++ b/include/kunit/test.h
@@ -301,6 +301,8 @@ struct kunit {
 	struct list_head resources; /* Protected by lock. */
 
 	char status_comment[KUNIT_STATUS_COMMENT_SIZE];
+	/* Saves the last seen test. Useful to help with faults. */
+	struct kunit_loc last_seen;
 };
 
 static inline void kunit_set_failure(struct kunit *test)
@@ -567,6 +569,15 @@ void __printf(2, 3) kunit_log_append(struct string_stream *log, const char *fmt,
 #define kunit_err(test, fmt, ...) \
 	kunit_printk(KERN_ERR, test, fmt, ##__VA_ARGS__)
 
+/*
+ * Must be called at the beginning of each KUNIT_*_ASSERTION().
+ * Cf. KUNIT_CURRENT_LOC.
+ */
+#define _KUNIT_SAVE_LOC(test) do {					       \
+	WRITE_ONCE(test->last_seen.file, __FILE__);			       \
+	WRITE_ONCE(test->last_seen.line, __LINE__);			       \
+} while (0)
+
 /**
  * KUNIT_SUCCEED() - A no-op expectation. Only exists for code clarity.
  * @test: The test context object.
@@ -575,7 +586,7 @@ void __printf(2, 3) kunit_log_append(struct string_stream *log, const char *fmt,
  * words, it does nothing and only exists for code clarity. See
  * KUNIT_EXPECT_TRUE() for more information.
  */
-#define KUNIT_SUCCEED(test) do {} while (0)
+#define KUNIT_SUCCEED(test) _KUNIT_SAVE_LOC(test)
 
 void __noreturn __kunit_abort(struct kunit *test);
 
@@ -601,14 +612,16 @@ void __printf(6, 7) __kunit_do_failed_assertion(struct kunit *test,
 } while (0)
 
 
-#define KUNIT_FAIL_ASSERTION(test, assert_type, fmt, ...)		       \
+#define KUNIT_FAIL_ASSERTION(test, assert_type, fmt, ...) do {		       \
+	_KUNIT_SAVE_LOC(test);						       \
 	_KUNIT_FAILED(test,						       \
 		      assert_type,					       \
 		      kunit_fail_assert,				       \
 		      kunit_fail_assert_format,				       \
 		      {},						       \
 		      fmt,						       \
-		      ##__VA_ARGS__)
+		      ##__VA_ARGS__);					       \
+} while (0)
 
 /**
  * KUNIT_FAIL() - Always causes a test to fail when evaluated.
@@ -637,6 +650,7 @@ void __printf(6, 7) __kunit_do_failed_assertion(struct kunit *test,
 			      fmt,					       \
 			      ...)					       \
 do {									       \
+	_KUNIT_SAVE_LOC(test);						       \
 	if (likely(!!(condition_) == !!expected_true_))			       \
 		break;							       \
 									       \
@@ -698,6 +712,7 @@ do {									       \
 		.right_text = #right,					       \
 	};								       \
 									       \
+	_KUNIT_SAVE_LOC(test);						       \
 	if (likely(__left op __right))					       \
 		break;							       \
 									       \
@@ -758,6 +773,7 @@ do {									       \
 		.right_text = #right,					       \
 	};								       \
 									       \
+	_KUNIT_SAVE_LOC(test);						       \
 	if (likely((__left) && (__right) && (strcmp(__left, __right) op 0)))   \
 		break;							       \
 									       \
@@ -791,6 +807,7 @@ do {									       \
 		.right_text = #right,					       \
 	};								       \
 									       \
+	_KUNIT_SAVE_LOC(test);						       \
 	if (likely(__left && __right))					       \
 		if (likely(memcmp(__left, __right, __size) op 0))	       \
 			break;						       \
@@ -815,6 +832,7 @@ do {									       \
 do {									       \
 	const typeof(ptr) __ptr = (ptr);				       \
 									       \
+	_KUNIT_SAVE_LOC(test);						       \
 	if (!IS_ERR_OR_NULL(__ptr))					       \
 		break;							       \
 									       \
diff --git a/lib/kunit/try-catch.c b/lib/kunit/try-catch.c
index 7a3910dd78a6..fa687278ccc9 100644
--- a/lib/kunit/try-catch.c
+++ b/lib/kunit/try-catch.c
@@ -96,9 +96,13 @@ void kunit_try_catch_run(struct kunit_try_catch *try_catch, void *context)
 
 	if (exit_code == -EFAULT)
 		try_catch->try_result = 0;
-	else if (exit_code == -EINTR)
-		kunit_err(test, "try faulted\n");
-	else if (exit_code == -ETIMEDOUT)
+	else if (exit_code == -EINTR) {
+		if (test->last_seen.file)
+			kunit_err(test, "try faulted: last line seen %s:%d\n",
+				  test->last_seen.file, test->last_seen.line);
+		else
+			kunit_err(test, "try faulted\n");
+	} else if (exit_code == -ETIMEDOUT)
 		kunit_err(test, "try timed out\n");
 	else if (exit_code)
 		kunit_err(test, "Unknown error: %d\n", exit_code);
-- 
cgit v1.2.3


From 3628e0383dd349f02f882e612ab6184e4bb3dc10 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 6 May 2024 13:28:59 -0700
Subject: Reapply "drm/qxl: simplify qxl_fence_wait"

This reverts commit 07ed11afb68d94eadd4ffc082b97c2331307c5ea.

Stephen Rostedt reports:
 "I went to run my tests on my VMs and the tests hung on boot up.
  Unfortunately, the most I ever got out was:

  [   93.607888] Testing event system initcall: OK
  [   93.667730] Running tests on all trace events:
  [   93.669757] Testing all events: OK
  [   95.631064] ------------[ cut here ]------------
  Timed out after 60 seconds"

and further debugging points to a possible circular locking dependency
between the console_owner locking and the worker pool locking.

Reverting the commit allows Steve's VM to boot to completion again.

[ This may obviously result in the "[TTM] Buffer eviction failed"
  messages again, which was the reason for that original revert. But at
  this point this seems preferable to a non-booting system... ]

Reported-and-bisected-by: Steven Rostedt <rostedt@goodmis.org>
Link: https://lore.kernel.org/all/20240502081641.457aa25f@gandalf.local.home/
Acked-by: Maxime Ripard <mripard@kernel.org>
Cc: Alex Constantino <dreaming.about.electric.sheep@gmail.com>
Cc: Maxime Ripard <mripard@kernel.org>
Cc: Timo Lindfors <timo.lindfors@iki.fi>
Cc: Dave Airlie <airlied@redhat.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Thomas Zimmermann <tzimmermann@suse.de>
Cc: Daniel Vetter <daniel@ffwll.ch>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/gpu/drm/qxl/qxl_release.c | 50 ++++-----------------------------------
 include/linux/dma-fence.h         |  7 ------
 2 files changed, 5 insertions(+), 52 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c
index 9febc8b73f09..368d26da0d6a 100644
--- a/drivers/gpu/drm/qxl/qxl_release.c
+++ b/drivers/gpu/drm/qxl/qxl_release.c
@@ -58,56 +58,16 @@ static long qxl_fence_wait(struct dma_fence *fence, bool intr,
 			   signed long timeout)
 {
 	struct qxl_device *qdev;
-	struct qxl_release *release;
-	int count = 0, sc = 0;
-	bool have_drawable_releases;
 	unsigned long cur, end = jiffies + timeout;
 
 	qdev = container_of(fence->lock, struct qxl_device, release_lock);
-	release = container_of(fence, struct qxl_release, base);
-	have_drawable_releases = release->type == QXL_RELEASE_DRAWABLE;
-
-retry:
-	sc++;
-
-	if (dma_fence_is_signaled(fence))
-		goto signaled;
-
-	qxl_io_notify_oom(qdev);
-
-	for (count = 0; count < 11; count++) {
-		if (!qxl_queue_garbage_collect(qdev, true))
-			break;
-
-		if (dma_fence_is_signaled(fence))
-			goto signaled;
-	}
-
-	if (dma_fence_is_signaled(fence))
-		goto signaled;
 
-	if (have_drawable_releases || sc < 4) {
-		if (sc > 2)
-			/* back off */
-			usleep_range(500, 1000);
-
-		if (time_after(jiffies, end))
-			return 0;
-
-		if (have_drawable_releases && sc > 300) {
-			DMA_FENCE_WARN(fence,
-				       "failed to wait on release %llu after spincount %d\n",
-				       fence->context & ~0xf0000000, sc);
-			goto signaled;
-		}
-		goto retry;
-	}
-	/*
-	 * yeah, original sync_obj_wait gave up after 3 spins when
-	 * have_drawable_releases is not set.
-	 */
+	if (!wait_event_timeout(qdev->release_event,
+				(dma_fence_is_signaled(fence) ||
+				 (qxl_io_notify_oom(qdev), 0)),
+				timeout))
+		return 0;
 
-signaled:
 	cur = jiffies;
 	if (time_after(cur, end))
 		return 0;
diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h
index c3f9bb6602ba..e06bad467f55 100644
--- a/include/linux/dma-fence.h
+++ b/include/linux/dma-fence.h
@@ -682,11 +682,4 @@ static inline bool dma_fence_is_container(struct dma_fence *fence)
 	return dma_fence_is_array(fence) || dma_fence_is_chain(fence);
 }
 
-#define DMA_FENCE_WARN(f, fmt, args...) \
-	do {								\
-		struct dma_fence *__ff = (f);				\
-		pr_warn("f %llu#%llu: " fmt, __ff->context, __ff->seqno,\
-			 ##args);					\
-	} while (0)
-
 #endif /* __LINUX_DMA_FENCE_H */
-- 
cgit v1.2.3


From b6fc0956ac532d1b35f6f517c083603b5e150b0d Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 7 May 2024 01:11:47 +0200
Subject: gtp: properly parse extension headers

Currently GTP packets are dropped if the next extension field is set to
non-zero value, but this are valid GTP packets.

TS 29.281 provides a longer header format, which is defined as struct
gtp1_header_long. Such long header format is used if any of the S, PN, E
flags is set.

This long header is 4 bytes longer than struct gtp1_header, plus
variable length (optional) extension headers. The next extension header
field is zero is no extension header is provided.

The extension header is composed of a length field which includes total
number of 4 byte words including the extension header itself (1 byte),
payload (variable length) and next type (1 byte). The extension header
size and its payload is aligned to 4 bytes.

A GTP packet might come with a chain extensions headers, which makes it
slightly cumbersome to parse because the extension next header field
comes at the end of the extension header, and there is a need to check
if this field becomes zero to stop the extension header parser.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 drivers/net/gtp.c | 41 +++++++++++++++++++++++++++++++++++++++++
 include/net/gtp.h |  5 +++++
 2 files changed, 46 insertions(+)

(limited to 'include')

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index dffb99a97e0f..0522b20242ba 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -567,6 +567,43 @@ static int gtp1u_handle_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb)
 				       msg, 0, GTP_GENL_MCGRP, GFP_ATOMIC);
 }
 
+static int gtp_parse_exthdrs(struct sk_buff *skb, unsigned int *hdrlen)
+{
+	struct gtp_ext_hdr *gtp_exthdr, _gtp_exthdr;
+	unsigned int offset = *hdrlen;
+	__u8 *next_type, _next_type;
+
+	/* From 29.060: "The Extension Header Length field specifies the length
+	 * of the particular Extension header in 4 octets units."
+	 *
+	 * This length field includes length field size itself (1 byte),
+	 * payload (variable length) and next type (1 byte). The extension
+	 * header is aligned to to 4 bytes.
+	 */
+
+	do {
+		gtp_exthdr = skb_header_pointer(skb, offset, sizeof(*gtp_exthdr),
+						&_gtp_exthdr);
+		if (!gtp_exthdr || !gtp_exthdr->len)
+			return -1;
+
+		offset += gtp_exthdr->len * 4;
+
+		/* From 29.060: "If no such Header follows, then the value of
+		 * the Next Extension Header Type shall be 0."
+		 */
+		next_type = skb_header_pointer(skb, offset - 1,
+					       sizeof(_next_type), &_next_type);
+		if (!next_type)
+			return -1;
+
+	} while (*next_type != 0);
+
+	*hdrlen = offset;
+
+	return 0;
+}
+
 static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 {
 	unsigned int hdrlen = sizeof(struct udphdr) +
@@ -616,6 +653,10 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb)
 		return 1;
 	}
 
+	if (gtp1->flags & GTP1_F_EXTHDR &&
+	    gtp_parse_exthdrs(skb, &hdrlen) < 0)
+		return -1;
+
 	return gtp_rx(pctx, skb, hdrlen, gtp->role);
 }
 
diff --git a/include/net/gtp.h b/include/net/gtp.h
index 2a503f035d18..c0253c8702d0 100644
--- a/include/net/gtp.h
+++ b/include/net/gtp.h
@@ -78,4 +78,9 @@ static inline bool netif_is_gtp(const struct net_device *dev)
 #define GTP1_F_EXTHDR	0x04
 #define GTP1_F_MASK	0x07
 
+struct gtp_ext_hdr {
+	__u8	len;
+	__u8	data[];
+};
+
 #endif
-- 
cgit v1.2.3


From 999cb275c807b92662f8a74fdaee9619700f64a5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 7 May 2024 01:12:08 +0200
Subject: gtp: add IPv6 support

Add new iflink attributes to configure in-kernel UDP listener socket
address: IFLA_GTP_LOCAL and IFLA_GTP_LOCAL6. If none of these attributes
are specified, default is still to IPv4 INADDR_ANY for backward
compatibility.

Add new attributes to set up family and IPv6 address of GTP tunnels:
GTPA_FAMILY, GTPA_PEER_ADDR6 and GTPA_MS_ADDR6. If no GTPA_FAMILY is
specified, AF_INET is assumed for backward compatibility.

setsockopt IPV6_ADDRFORM allows to downgrade socket from IPv6 to IPv4
after socket is bound. Assumption is that socket listener that is
attached to the gtp device needs to be either IPv4 or IPv6. Therefore,
GTP socket listener does not allow for IPv4-mapped-IPv6 listener.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 drivers/net/gtp.c            | 396 ++++++++++++++++++++++++++++++++++++++++---
 include/uapi/linux/gtp.h     |   3 +
 include/uapi/linux/if_link.h |   2 +
 3 files changed, 374 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
index 811e4a660636..84ff502a38bc 100644
--- a/drivers/net/gtp.c
+++ b/drivers/net/gtp.c
@@ -24,6 +24,7 @@
 #include <net/net_namespace.h>
 #include <net/protocol.h>
 #include <net/ip.h>
+#include <net/ipv6.h>
 #include <net/udp.h>
 #include <net/udp_tunnel.h>
 #include <net/icmp.h>
@@ -52,9 +53,11 @@ struct pdp_ctx {
 
 	union {
 		struct in_addr	addr;
+		struct in6_addr	addr6;
 	} ms;
 	union {
 		struct in_addr	addr;
+		struct in6_addr	addr6;
 	} peer;
 
 	struct sock		*sk;
@@ -131,6 +134,11 @@ static inline u32 ipv4_hashfn(__be32 ip)
 	return jhash_1word((__force u32)ip, gtp_h_initval);
 }
 
+static u32 ipv6_hashfn(const struct in6_addr *ip6)
+{
+	return jhash(ip6, sizeof(*ip6), gtp_h_initval);
+}
+
 /* Resolve a PDP context structure based on the 64bit TID. */
 static struct pdp_ctx *gtp0_pdp_find(struct gtp_dev *gtp, u64 tid)
 {
@@ -180,6 +188,23 @@ static struct pdp_ctx *ipv4_pdp_find(struct gtp_dev *gtp, __be32 ms_addr)
 	return NULL;
 }
 
+static struct pdp_ctx *ipv6_pdp_find(struct gtp_dev *gtp,
+				     const struct in6_addr *ms_addr)
+{
+	struct hlist_head *head;
+	struct pdp_ctx *pdp;
+
+	head = &gtp->addr_hash[ipv6_hashfn(ms_addr) % gtp->hash_size];
+
+	hlist_for_each_entry_rcu(pdp, head, hlist_addr) {
+		if (pdp->af == AF_INET6 &&
+		    memcmp(&pdp->ms.addr6, ms_addr, sizeof(struct in6_addr)) == 0)
+			return pdp;
+	}
+
+	return NULL;
+}
+
 static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx,
 				  unsigned int hdrlen, unsigned int role)
 {
@@ -196,6 +221,28 @@ static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx,
 		return iph->saddr == pctx->ms.addr.s_addr;
 }
 
+static bool gtp_check_ms_ipv6(struct sk_buff *skb, struct pdp_ctx *pctx,
+			      unsigned int hdrlen, unsigned int role)
+{
+	struct ipv6hdr *ip6h;
+	int ret;
+
+	if (!pskb_may_pull(skb, hdrlen + sizeof(struct ipv6hdr)))
+		return false;
+
+	ip6h = (struct ipv6hdr *)(skb->data + hdrlen);
+
+	if (role == GTP_ROLE_SGSN) {
+		ret = memcmp(&ip6h->daddr, &pctx->ms.addr6,
+			     sizeof(struct in6_addr));
+	} else {
+		ret = memcmp(&ip6h->saddr, &pctx->ms.addr6,
+			     sizeof(struct in6_addr));
+	}
+
+	return ret == 0;
+}
+
 /* Check if the inner IP address in this packet is assigned to any
  * existing mobile subscriber.
  */
@@ -205,6 +252,8 @@ static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx,
 	switch (ntohs(skb->protocol)) {
 	case ETH_P_IP:
 		return gtp_check_ms_ipv4(skb, pctx, hdrlen, role);
+	case ETH_P_IPV6:
+		return gtp_check_ms_ipv6(skb, pctx, hdrlen, role);
 	}
 	return false;
 }
@@ -260,6 +309,27 @@ static struct rtable *ip4_route_output_gtp(struct flowi4 *fl4,
 	return ip_route_output_key(sock_net(sk), fl4);
 }
 
+static struct rt6_info *ip6_route_output_gtp(struct net *net,
+					     struct flowi6 *fl6,
+					     const struct sock *sk,
+					     const struct in6_addr *daddr,
+					     struct in6_addr *saddr)
+{
+	struct dst_entry *dst;
+
+	memset(fl6, 0, sizeof(*fl6));
+	fl6->flowi6_oif		= sk->sk_bound_dev_if;
+	fl6->daddr		= *daddr;
+	fl6->saddr		= *saddr;
+	fl6->flowi6_proto	= sk->sk_protocol;
+
+	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, fl6, NULL);
+	if (IS_ERR(dst))
+		return ERR_PTR(-ENETUNREACH);
+
+	return (struct rt6_info *)dst;
+}
+
 /* GSM TS 09.60. 7.3
  * In all Path Management messages:
  * - TID: is not used and shall be set to 0.
@@ -838,12 +908,15 @@ struct gtp_pktinfo {
 	struct sock		*sk;
 	union {
 		struct iphdr	*iph;
+		struct ipv6hdr	*ip6h;
 	};
 	union {
 		struct flowi4	fl4;
+		struct flowi6	fl6;
 	};
 	union {
 		struct rtable	*rt;
+		struct rt6_info	*rt6;
 	};
 	struct pdp_ctx		*pctx;
 	struct net_device	*dev;
@@ -878,6 +951,20 @@ static inline void gtp_set_pktinfo_ipv4(struct gtp_pktinfo *pktinfo,
 	pktinfo->dev	= dev;
 }
 
+static void gtp_set_pktinfo_ipv6(struct gtp_pktinfo *pktinfo,
+				 struct sock *sk, struct ipv6hdr *ip6h,
+				 struct pdp_ctx *pctx, struct rt6_info *rt6,
+				 struct flowi6 *fl6,
+				 struct net_device *dev)
+{
+	pktinfo->sk	= sk;
+	pktinfo->ip6h	= ip6h;
+	pktinfo->pctx	= pctx;
+	pktinfo->rt6	= rt6;
+	pktinfo->fl6	= *fl6;
+	pktinfo->dev	= dev;
+}
+
 static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev,
 			     struct gtp_pktinfo *pktinfo)
 {
@@ -959,6 +1046,81 @@ err:
 	return -EBADMSG;
 }
 
+static int gtp_build_skb_ip6(struct sk_buff *skb, struct net_device *dev,
+			     struct gtp_pktinfo *pktinfo)
+{
+	struct gtp_dev *gtp = netdev_priv(dev);
+	struct net *net = gtp->net;
+	struct dst_entry *dst;
+	struct pdp_ctx *pctx;
+	struct ipv6hdr *ip6h;
+	struct rt6_info *rt;
+	struct flowi6 fl6;
+	int mtu;
+
+	/* Read the IP destination address and resolve the PDP context.
+	 * Prepend PDP header with TEI/TID from PDP ctx.
+	 */
+	ip6h = ipv6_hdr(skb);
+	if (gtp->role == GTP_ROLE_SGSN)
+		pctx = ipv6_pdp_find(gtp, &ip6h->saddr);
+	else
+		pctx = ipv6_pdp_find(gtp, &ip6h->daddr);
+
+	if (!pctx) {
+		netdev_dbg(dev, "no PDP ctx found for %pI6, skip\n",
+			   &ip6h->daddr);
+		return -ENOENT;
+	}
+	netdev_dbg(dev, "found PDP context %p\n", pctx);
+
+	rt = ip6_route_output_gtp(net, &fl6, pctx->sk, &pctx->peer.addr6,
+				  &inet6_sk(pctx->sk)->saddr);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to SSGN %pI6\n",
+			   &pctx->peer.addr6);
+		dev->stats.tx_carrier_errors++;
+		goto err;
+	}
+	dst = &rt->dst;
+
+	if (rt->dst.dev == dev) {
+		netdev_dbg(dev, "circular route to SSGN %pI6\n",
+			   &pctx->peer.addr6);
+		dev->stats.collisions++;
+		goto err_rt;
+	}
+
+	mtu = dst_mtu(&rt->dst) - dev->hard_header_len -
+		sizeof(struct ipv6hdr) - sizeof(struct udphdr);
+	switch (pctx->gtp_version) {
+	case GTP_V0:
+		mtu -= sizeof(struct gtp0_header);
+		break;
+	case GTP_V1:
+		mtu -= sizeof(struct gtp1_header);
+		break;
+	}
+
+	skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+	if ((!skb_is_gso(skb) && skb->len > mtu) ||
+	    (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) {
+		netdev_dbg(dev, "packet too big, fragmentation needed\n");
+		icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+		goto err_rt;
+	}
+
+	gtp_set_pktinfo_ipv6(pktinfo, pctx->sk, ip6h, pctx, rt, &fl6, dev);
+	gtp_push_header(skb, pktinfo);
+
+	return 0;
+err_rt:
+	dst_release(dst);
+err:
+	return -EBADMSG;
+}
+
 static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	unsigned int proto = ntohs(skb->protocol);
@@ -977,6 +1139,9 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	case ETH_P_IP:
 		err = gtp_build_skb_ip4(skb, dev, &pktinfo);
 		break;
+	case ETH_P_IPV6:
+		err = gtp_build_skb_ip6(skb, dev, &pktinfo);
+		break;
 	default:
 		err = -EOPNOTSUPP;
 		break;
@@ -1000,6 +1165,21 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 					    dev_net(dev)),
 				    false);
 		break;
+	case ETH_P_IPV6:
+#if IS_ENABLED(CONFIG_IPV6)
+		netdev_dbg(pktinfo.dev, "gtp -> IP src: %pI6 dst: %pI6\n",
+			   &pktinfo.ip6h->saddr, &pktinfo.ip6h->daddr);
+		udp_tunnel6_xmit_skb(&pktinfo.rt6->dst, pktinfo.sk, skb, dev,
+				     &pktinfo.fl6.saddr, &pktinfo.fl6.daddr,
+				     ipv6_get_dsfield(pktinfo.ip6h),
+				     ip6_dst_hoplimit(&pktinfo.rt->dst),
+				     0,
+				     pktinfo.gtph_port, pktinfo.gtph_port,
+				     false);
+#else
+		goto tx_err;
+#endif
+		break;
 	}
 
 	return NETDEV_TX_OK;
@@ -1057,17 +1237,45 @@ static void gtp_destructor(struct net_device *dev)
 	kfree(gtp->tid_hash);
 }
 
-static struct sock *gtp_create_sock(int type, struct gtp_dev *gtp)
+static int gtp_sock_udp_config(struct udp_port_cfg *udp_conf,
+			       const struct nlattr *nla, int family)
+{
+	udp_conf->family = family;
+
+	switch (udp_conf->family) {
+	case AF_INET:
+		udp_conf->local_ip.s_addr = nla_get_be32(nla);
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case AF_INET6:
+		udp_conf->local_ip6 = nla_get_in6_addr(nla);
+		break;
+#endif
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+static struct sock *gtp_create_sock(int type, struct gtp_dev *gtp,
+				    const struct nlattr *nla, int family)
 {
 	struct udp_tunnel_sock_cfg tuncfg = {};
-	struct udp_port_cfg udp_conf = {
-		.local_ip.s_addr	= htonl(INADDR_ANY),
-		.family			= AF_INET,
-	};
+	struct udp_port_cfg udp_conf = {};
 	struct net *net = gtp->net;
 	struct socket *sock;
 	int err;
 
+	if (nla) {
+		err = gtp_sock_udp_config(&udp_conf, nla, family);
+		if (err < 0)
+			return ERR_PTR(err);
+	} else {
+		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+		udp_conf.family = AF_INET;
+	}
+
 	if (type == UDP_ENCAP_GTP0)
 		udp_conf.local_udp_port = htons(GTP0_PORT);
 	else if (type == UDP_ENCAP_GTP1U)
@@ -1089,16 +1297,17 @@ static struct sock *gtp_create_sock(int type, struct gtp_dev *gtp)
 	return sock->sk;
 }
 
-static int gtp_create_sockets(struct gtp_dev *gtp, struct nlattr *data[])
+static int gtp_create_sockets(struct gtp_dev *gtp, const struct nlattr *nla,
+			      int family)
 {
 	struct sock *sk1u;
 	struct sock *sk0;
 
-	sk0 = gtp_create_sock(UDP_ENCAP_GTP0, gtp);
+	sk0 = gtp_create_sock(UDP_ENCAP_GTP0, gtp, nla, family);
 	if (IS_ERR(sk0))
 		return PTR_ERR(sk0);
 
-	sk1u = gtp_create_sock(UDP_ENCAP_GTP1U, gtp);
+	sk1u = gtp_create_sock(UDP_ENCAP_GTP1U, gtp, nla, family);
 	if (IS_ERR(sk1u)) {
 		udp_tunnel_sock_release(sk0->sk_socket);
 		return PTR_ERR(sk1u);
@@ -1111,6 +1320,9 @@ static int gtp_create_sockets(struct gtp_dev *gtp, struct nlattr *data[])
 	return 0;
 }
 
+#define GTP_TH_MAXLEN	(sizeof(struct udphdr) + sizeof(struct gtp0_header))
+#define GTP_IPV6_MAXLEN	(sizeof(struct ipv6hdr) + GTP_TH_MAXLEN)
+
 static int gtp_newlink(struct net *src_net, struct net_device *dev,
 		       struct nlattr *tb[], struct nlattr *data[],
 		       struct netlink_ext_ack *extack)
@@ -1120,6 +1332,11 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 	struct gtp_net *gn;
 	int hashsize, err;
 
+#if !IS_ENABLED(CONFIG_IPV6)
+	if (data[IFLA_GTP_LOCAL6])
+		return -EAFNOSUPPORT;
+#endif
+
 	gtp = netdev_priv(dev);
 
 	if (!data[IFLA_GTP_PDP_HASHSIZE]) {
@@ -1148,13 +1365,24 @@ static int gtp_newlink(struct net *src_net, struct net_device *dev,
 	if (err < 0)
 		return err;
 
-	if (data[IFLA_GTP_CREATE_SOCKETS])
-		err = gtp_create_sockets(gtp, data);
-	else
+	if (data[IFLA_GTP_CREATE_SOCKETS]) {
+		if (data[IFLA_GTP_LOCAL6])
+			err = gtp_create_sockets(gtp, data[IFLA_GTP_LOCAL6], AF_INET6);
+		else
+			err = gtp_create_sockets(gtp, data[IFLA_GTP_LOCAL], AF_INET);
+	} else {
 		err = gtp_encap_enable(gtp, data);
+	}
+
 	if (err < 0)
 		goto out_hashtable;
 
+	if ((gtp->sk0 && gtp->sk0->sk_family == AF_INET6) ||
+	    (gtp->sk1u && gtp->sk1u->sk_family == AF_INET6)) {
+		dev->mtu = ETH_DATA_LEN - GTP_IPV6_MAXLEN;
+		dev->needed_headroom = LL_MAX_HEADER + GTP_IPV6_MAXLEN;
+	}
+
 	err = register_netdevice(dev);
 	if (err < 0) {
 		netdev_dbg(dev, "failed to register new netdev %d\n", err);
@@ -1199,6 +1427,8 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = {
 	[IFLA_GTP_ROLE]			= { .type = NLA_U32 },
 	[IFLA_GTP_CREATE_SOCKETS]	= { .type = NLA_U8 },
 	[IFLA_GTP_RESTART_COUNT]	= { .type = NLA_U8 },
+	[IFLA_GTP_LOCAL]		= { .type = NLA_U32 },
+	[IFLA_GTP_LOCAL6]		= { .len = sizeof(struct in6_addr) },
 };
 
 static int gtp_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -1298,6 +1528,12 @@ static struct sock *gtp_encap_enable_socket(int fd, int type,
 		goto out_sock;
 	}
 
+	if (sk->sk_family == AF_INET6 &&
+	    !sk->sk_ipv6only) {
+		sk = ERR_PTR(-EADDRNOTAVAIL);
+		goto out_sock;
+	}
+
 	lock_sock(sk);
 	if (sk->sk_user_data) {
 		sk = ERR_PTR(-EBUSY);
@@ -1349,6 +1585,13 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[])
 	gtp->sk0 = sk0;
 	gtp->sk1u = sk1u;
 
+	if (sk0 && sk1u &&
+	    sk0->sk_family != sk1u->sk_family) {
+		gtp_encap_disable_sock(sk0);
+		gtp_encap_disable_sock(sk1u);
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -1378,14 +1621,9 @@ static struct gtp_dev *gtp_find_dev(struct net *src_net, struct nlattr *nla[])
 	return gtp;
 }
 
-static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
+static void gtp_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 {
 	pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]);
-	pctx->af = AF_INET;
-	pctx->peer.addr.s_addr =
-		nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]);
-	pctx->ms.addr.s_addr =
-		nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
 
 	switch (pctx->gtp_version) {
 	case GTP_V0:
@@ -1405,21 +1643,78 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
 	}
 }
 
+static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
+{
+	pctx->peer.addr.s_addr =
+		nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]);
+	pctx->ms.addr.s_addr =
+		nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
+	gtp_pdp_fill(pctx, info);
+}
+
+static void ipv6_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info)
+{
+	pctx->peer.addr6 = nla_get_in6_addr(info->attrs[GTPA_PEER_ADDR6]);
+	pctx->ms.addr6 = nla_get_in6_addr(info->attrs[GTPA_MS_ADDR6]);
+	gtp_pdp_fill(pctx, info);
+}
+
 static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 				   struct genl_info *info)
 {
 	struct pdp_ctx *pctx, *pctx_tid = NULL;
 	struct net_device *dev = gtp->dev;
 	u32 hash_ms, hash_tid = 0;
+	struct in6_addr ms_addr6;
 	unsigned int version;
 	bool found = false;
 	__be32 ms_addr;
+	int family;
 
-	ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
-	hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size;
 	version = nla_get_u32(info->attrs[GTPA_VERSION]);
 
-	pctx = ipv4_pdp_find(gtp, ms_addr);
+	if (info->attrs[GTPA_FAMILY])
+		family = nla_get_u8(info->attrs[GTPA_FAMILY]);
+	else
+		family = AF_INET;
+
+#if !IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6)
+		return ERR_PTR(-EAFNOSUPPORT);
+#endif
+
+	if ((info->attrs[GTPA_PEER_ADDRESS] &&
+	     sk->sk_family == AF_INET6) ||
+	    (info->attrs[GTPA_PEER_ADDR6] &&
+	     sk->sk_family == AF_INET))
+		return ERR_PTR(-EAFNOSUPPORT);
+
+	switch (family) {
+	case AF_INET:
+		if (!info->attrs[GTPA_MS_ADDRESS] ||
+		    !info->attrs[GTPA_PEER_ADDRESS] ||
+		    info->attrs[GTPA_MS_ADDR6] ||
+		    info->attrs[GTPA_PEER_ADDR6])
+			return ERR_PTR(-EINVAL);
+
+		ms_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]);
+		hash_ms = ipv4_hashfn(ms_addr) % gtp->hash_size;
+		pctx = ipv4_pdp_find(gtp, ms_addr);
+		break;
+	case AF_INET6:
+		if (!info->attrs[GTPA_MS_ADDR6] ||
+		    !info->attrs[GTPA_PEER_ADDR6] ||
+		    info->attrs[GTPA_MS_ADDRESS] ||
+		    info->attrs[GTPA_PEER_ADDRESS])
+			return ERR_PTR(-EINVAL);
+
+		ms_addr6 = nla_get_in6_addr(info->attrs[GTPA_MS_ADDR6]);
+		hash_ms = ipv6_hashfn(&ms_addr6) % gtp->hash_size;
+		pctx = ipv6_pdp_find(gtp, &ms_addr6);
+		break;
+	default:
+		return ERR_PTR(-EAFNOSUPPORT);
+	}
 	if (pctx)
 		found = true;
 	if (version == GTP_V0)
@@ -1442,7 +1737,14 @@ static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 		if (!pctx)
 			pctx = pctx_tid;
 
-		ipv4_pdp_fill(pctx, info);
+		switch (pctx->af) {
+		case AF_INET:
+			ipv4_pdp_fill(pctx, info);
+			break;
+		case AF_INET6:
+			ipv6_pdp_fill(pctx, info);
+			break;
+		}
 
 		if (pctx->gtp_version == GTP_V0)
 			netdev_dbg(dev, "GTPv0-U: update tunnel id = %llx (pdp %p)\n",
@@ -1462,7 +1764,30 @@ static struct pdp_ctx *gtp_pdp_add(struct gtp_dev *gtp, struct sock *sk,
 	sock_hold(sk);
 	pctx->sk = sk;
 	pctx->dev = gtp->dev;
-	ipv4_pdp_fill(pctx, info);
+	pctx->af = family;
+
+	switch (pctx->af) {
+	case AF_INET:
+		if (!info->attrs[GTPA_MS_ADDRESS] ||
+		    !info->attrs[GTPA_PEER_ADDRESS]) {
+			sock_put(sk);
+			kfree(pctx);
+			return ERR_PTR(-EINVAL);
+		}
+
+		ipv4_pdp_fill(pctx, info);
+		break;
+	case AF_INET6:
+		if (!info->attrs[GTPA_MS_ADDR6] ||
+		    !info->attrs[GTPA_PEER_ADDR6]) {
+			sock_put(sk);
+			kfree(pctx);
+			return ERR_PTR(-EINVAL);
+		}
+
+		ipv6_pdp_fill(pctx, info);
+		break;
+	}
 	atomic_set(&pctx->tx_seq, 0);
 
 	switch (pctx->gtp_version) {
@@ -1524,9 +1849,7 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info)
 	int err;
 
 	if (!info->attrs[GTPA_VERSION] ||
-	    !info->attrs[GTPA_LINK] ||
-	    !info->attrs[GTPA_PEER_ADDRESS] ||
-	    !info->attrs[GTPA_MS_ADDRESS])
+	    !info->attrs[GTPA_LINK])
 		return -EINVAL;
 
 	version = nla_get_u32(info->attrs[GTPA_VERSION]);
@@ -1593,6 +1916,10 @@ static struct pdp_ctx *gtp_find_pdp_by_link(struct net *net,
 		__be32 ip = nla_get_be32(nla[GTPA_MS_ADDRESS]);
 
 		return ipv4_pdp_find(gtp, ip);
+	} else if (nla[GTPA_MS_ADDR6]) {
+		struct in6_addr addr = nla_get_in6_addr(nla[GTPA_MS_ADDR6]);
+
+		return ipv6_pdp_find(gtp, &addr);
 	} else if (nla[GTPA_VERSION]) {
 		u32 gtp_version = nla_get_u32(nla[GTPA_VERSION]);
 
@@ -1663,10 +1990,22 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq,
 
 	if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) ||
 	    nla_put_u32(skb, GTPA_LINK, pctx->dev->ifindex) ||
-	    nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer.addr.s_addr) ||
-	    nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms.addr.s_addr))
+	    nla_put_u8(skb, GTPA_FAMILY, pctx->af))
 		goto nla_put_failure;
 
+	switch (pctx->af) {
+	case AF_INET:
+		if (nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer.addr.s_addr) ||
+		    nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms.addr.s_addr))
+			goto nla_put_failure;
+		break;
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, GTPA_PEER_ADDR6, &pctx->peer.addr6) ||
+		    nla_put_in6_addr(skb, GTPA_MS_ADDR6, &pctx->ms.addr6))
+			goto nla_put_failure;
+		break;
+	}
+
 	switch (pctx->gtp_version) {
 	case GTP_V0:
 		if (nla_put_u64_64bit(skb, GTPA_TID, pctx->u.v0.tid, GTPA_PAD) ||
@@ -1893,6 +2232,9 @@ static const struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = {
 	[GTPA_NET_NS_FD]	= { .type = NLA_U32, },
 	[GTPA_I_TEI]		= { .type = NLA_U32, },
 	[GTPA_O_TEI]		= { .type = NLA_U32, },
+	[GTPA_PEER_ADDR6]	= { .len = sizeof(struct in6_addr), },
+	[GTPA_MS_ADDR6]		= { .len = sizeof(struct in6_addr), },
+	[GTPA_FAMILY]		= { .type = NLA_U8, },
 };
 
 static const struct genl_small_ops gtp_genl_ops[] = {
diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h
index 3dcdb9e33cba..40f5388d6de0 100644
--- a/include/uapi/linux/gtp.h
+++ b/include/uapi/linux/gtp.h
@@ -31,6 +31,9 @@ enum gtp_attrs {
 	GTPA_I_TEI,	/* for GTPv1 only */
 	GTPA_O_TEI,	/* for GTPv1 only */
 	GTPA_PAD,
+	GTPA_PEER_ADDR6,
+	GTPA_MS_ADDR6,
+	GTPA_FAMILY,
 	__GTPA_MAX,
 };
 #define GTPA_MAX (__GTPA_MAX - 1)
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index e9f10860ec8e..6dc258993b17 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1466,6 +1466,8 @@ enum {
 	IFLA_GTP_ROLE,
 	IFLA_GTP_CREATE_SOCKETS,
 	IFLA_GTP_RESTART_COUNT,
+	IFLA_GTP_LOCAL,
+	IFLA_GTP_LOCAL6,
 	__IFLA_GTP_MAX,
 };
 #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1)
-- 
cgit v1.2.3


From 9b1fe0510494c989ab6a131ce8b97cdd02a1c869 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 6 May 2024 21:33:33 +0200
Subject: regmap: Reorder fields in 'struct regmap_config' to save some memory

On x86_64 and allmodconfig, this shrinks the size of 'struct regmap_config'
from 328 to 312 bytes.

This is usually a win, because this structure is used as a static global
variable.

When moving the kerneldoc fields, I've tried to keep the layout as
consistent as possible, which is not really easy!

Before:
	/* size: 328, cachelines: 6, members: 55 */
	/* sum members: 296, holes: 6, sum holes: 25 */
	/* padding: 7 */
	/* last cacheline: 8 bytes */

After:
	/* size: 312, cachelines: 5, members: 55 */
	/* sum members: 296, holes: 5, sum holes: 16 */
	/* last cacheline: 56 bytes */

For the records, this is also widely used:
	$git grep static.*regmap_config | wc -l
	1327

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Link: https://lore.kernel.org/r/5e039cd8fe415dd7ab3169948c08a5311db9fb9a.1715024007.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/linux/regmap.h | 62 +++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

(limited to 'include')

diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index d470303b1bbb..a6bc2980a98b 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -297,15 +297,6 @@ typedef void (*regmap_unlock)(void *);
  *			performed on such table (a register is no increment
  *			readable if it belongs to one of the ranges specified
  *			by rd_noinc_table).
- * @disable_locking: This regmap is either protected by external means or
- *                   is guaranteed not to be accessed from multiple threads.
- *                   Don't use any locking mechanisms.
- * @lock:	  Optional lock callback (overrides regmap's default lock
- *		  function, based on spinlock or mutex).
- * @unlock:	  As above for unlocking.
- * @lock_arg:	  this field is passed as the only argument of lock/unlock
- *		  functions (ignored in case regular lock/unlock functions
- *		  are not overridden).
  * @reg_read:	  Optional callback that if filled will be used to perform
  *           	  all the reads from the registers. Should only be provided for
  *		  devices whose read operation cannot be represented as a simple
@@ -323,6 +314,7 @@ typedef void (*regmap_unlock)(void *);
  * @write: Same as above for writing.
  * @max_raw_read: Max raw read size that can be used on the device.
  * @max_raw_write: Max raw write size that can be used on the device.
+ * @can_sleep:	  Optional, specifies whether regmap operations can sleep.
  * @fast_io:	  Register IO is fast. Use a spinlock instead of a mutex
  *	     	  to perform locking. This field is ignored if custom lock/unlock
  *	     	  functions are used (see fields lock/unlock of struct regmap_config).
@@ -331,6 +323,15 @@ typedef void (*regmap_unlock)(void *);
  *		   Use it only for "no-bus" cases.
  * @io_port:	  Support IO port accessors. Makes sense only when MMIO vs. IO port
  *		  access can be distinguished.
+ * @disable_locking: This regmap is either protected by external means or
+ *		     is guaranteed not to be accessed from multiple threads.
+ *		     Don't use any locking mechanisms.
+ * @lock:	  Optional lock callback (overrides regmap's default lock
+ *		  function, based on spinlock or mutex).
+ * @unlock:	  As above for unlocking.
+ * @lock_arg:	  This field is passed as the only argument of lock/unlock
+ *		  functions (ignored in case regular lock/unlock functions
+ *		  are not overridden).
  * @max_register: Optional, specifies the maximum valid register address.
  * @max_register_is_0: Optional, specifies that zero value in @max_register
  *                     should be taken into account. This is a workaround to
@@ -373,21 +374,20 @@ typedef void (*regmap_unlock)(void *);
  * @reg_defaults_raw: Power on reset values for registers (for use with
  *                    register cache support).
  * @num_reg_defaults_raw: Number of elements in reg_defaults_raw.
- * @reg_format_endian: Endianness for formatted register addresses. If this is
- *                     DEFAULT, the @reg_format_endian_default value from the
- *                     regmap bus is used.
- * @val_format_endian: Endianness for formatted register values. If this is
- *                     DEFAULT, the @reg_format_endian_default value from the
- *                     regmap bus is used.
- *
- * @ranges: Array of configuration entries for virtual address ranges.
- * @num_ranges: Number of range configuration entries.
  * @use_hwlock: Indicate if a hardware spinlock should be used.
  * @use_raw_spinlock: Indicate if a raw spinlock should be used.
  * @hwlock_id: Specify the hardware spinlock id.
  * @hwlock_mode: The hardware spinlock mode, should be HWLOCK_IRQSTATE,
  *		 HWLOCK_IRQ or 0.
- * @can_sleep: Optional, specifies whether regmap operations can sleep.
+ * @reg_format_endian: Endianness for formatted register addresses. If this is
+ *		       DEFAULT, the @reg_format_endian_default value from the
+ *		       regmap bus is used.
+ * @val_format_endian: Endianness for formatted register values. If this is
+ *		       DEFAULT, the @reg_format_endian_default value from the
+ *		       regmap bus is used.
+ *
+ * @ranges: Array of configuration entries for virtual address ranges.
+ * @num_ranges: Number of range configuration entries.
  */
 struct regmap_config {
 	const char *name;
@@ -406,11 +406,6 @@ struct regmap_config {
 	bool (*writeable_noinc_reg)(struct device *dev, unsigned int reg);
 	bool (*readable_noinc_reg)(struct device *dev, unsigned int reg);
 
-	bool disable_locking;
-	regmap_lock lock;
-	regmap_unlock unlock;
-	void *lock_arg;
-
 	int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
 	int (*reg_write)(void *context, unsigned int reg, unsigned int val);
 	int (*reg_update_bits)(void *context, unsigned int reg,
@@ -422,9 +417,16 @@ struct regmap_config {
 	size_t max_raw_read;
 	size_t max_raw_write;
 
+	bool can_sleep;
+
 	bool fast_io;
 	bool io_port;
 
+	bool disable_locking;
+	regmap_lock lock;
+	regmap_unlock unlock;
+	void *lock_arg;
+
 	unsigned int max_register;
 	bool max_register_is_0;
 	const struct regmap_access_table *wr_table;
@@ -448,18 +450,16 @@ struct regmap_config {
 	bool use_relaxed_mmio;
 	bool can_multi_write;
 
-	enum regmap_endian reg_format_endian;
-	enum regmap_endian val_format_endian;
-
-	const struct regmap_range_cfg *ranges;
-	unsigned int num_ranges;
-
 	bool use_hwlock;
 	bool use_raw_spinlock;
 	unsigned int hwlock_id;
 	unsigned int hwlock_mode;
 
-	bool can_sleep;
+	enum regmap_endian reg_format_endian;
+	enum regmap_endian val_format_endian;
+
+	const struct regmap_range_cfg *ranges;
+	unsigned int num_ranges;
 };
 
 /**
-- 
cgit v1.2.3


From 2b5ae9c7d9e5ef4bc52c932fdf10328feb5167c6 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andy.shevchenko@gmail.com>
Date: Sun, 5 May 2024 17:14:20 +0300
Subject: gpiolib: Discourage to use formatting strings in line names

Currently the documentation for line names allows to use %u inside
the alternative name. This is broken in character device approach
from day 1 and being in use solely in sysfs.

Character device interface has a line number as a part of its address,
so the users better rely on it. Hence remove the misleading documentation.

On top of that, there are no in-kernel users (out of 6, if I'm correct)
for such names and moreover if one exists it won't help in distinguishing
lines with the same naming as '%u' will also be in them and we will get
a warning in gpiochip_set_desc_names() for such cases.

Signed-off-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Kent Gibson <warthog618@gmail.com>
Link: https://lore.kernel.org/r/20240505141420.627398-1-andy.shevchenko@gmail.com
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
---
 include/linux/gpio/driver.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index f8617eaf08ba..0032bb6e7d8f 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -376,9 +376,7 @@ struct gpio_irq_chip {
  * @names: if set, must be an array of strings to use as alternative
  *      names for the GPIOs in this chip. Any entry in the array
  *      may be NULL if there is no alias for the GPIO, however the
- *      array must be @ngpio entries long.  A name can include a single printk
- *      format specifier for an unsigned int.  It is substituted by the actual
- *      number of the gpio.
+ *      array must be @ngpio entries long.
  * @can_sleep: flag must be set iff get()/set() methods sleep, as they
  *	must while accessing GPIO expander chips over I2C or SPI. This
  *	implies that if the chip supports IRQs, these IRQs need to be threaded
-- 
cgit v1.2.3


From e0550222e03bae3fd629641e246ef7f47803d795 Mon Sep 17 00:00:00 2001
From: Justin Stitt <justinstitt@google.com>
Date: Mon, 29 Apr 2024 23:06:54 +0000
Subject: printk: cleanup deprecated uses of strncpy/strcpy

Cleanup some deprecated uses of strncpy() and strcpy() [1].

There doesn't seem to be any bugs with the current code but the
readability of this code could benefit from a quick makeover while
removing some deprecated stuff as a benefit.

The most interesting replacement made in this patch involves
concatenating "ttyS" with a digit-led user-supplied string. Instead of
doing two distinct string copies with carefully managed offsets and
lengths, let's use the more robust and self-explanatory scnprintf().
scnprintf will 1) respect the bounds of @buf, 2) null-terminate @buf, 3)
do the concatenation. This allows us to drop the manual NUL-byte assignment.

Also, since isdigit() is used about a dozen lines after the open-coded
version we'll replace it for uniformity's sake.

All the strcpy() --> strscpy() replacements are trivial as the source
strings are literals and much smaller than the destination size. No
behavioral change here.

Use the new 2-argument version of strscpy() introduced in Commit
e6584c3964f2f ("string: Allow 2-argument strscpy()"). However, to make
this work fully (since the size must be known at compile time), also
update the extern-qualified declaration to have the proper size
information.

Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1]
Link: https://github.com/KSPP/linux/issues/90 [2]
Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [3]
Cc: linux-hardening@vger.kernel.org
Signed-off-by: Justin Stitt <justinstitt@google.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Link: https://lore.kernel.org/r/20240429-strncpy-kernel-printk-printk-c-v1-1-4da7926d7b69@google.com
[pmladek@suse.com: Removed obsolete brackets and added empty lines.]
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/printk.h |  2 +-
 kernel/printk/printk.c | 26 +++++++++++++-------------
 2 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/printk.h b/include/linux/printk.h
index 2fde40cc9677..dbbd202b1cb3 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -71,7 +71,7 @@ extern void console_verbose(void);
 
 /* strlen("ratelimit") + 1 */
 #define DEVKMSG_STR_MAX_SIZE 10
-extern char devkmsg_log_str[];
+extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE];
 struct ctl_table;
 
 extern int suppress_printk;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b06f63e276c1..360182dfe4cf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -178,9 +178,9 @@ static int __init control_devkmsg(char *str)
 	 * Set sysctl string accordingly:
 	 */
 	if (devkmsg_log == DEVKMSG_LOG_MASK_ON)
-		strcpy(devkmsg_log_str, "on");
+		strscpy(devkmsg_log_str, "on");
 	else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF)
-		strcpy(devkmsg_log_str, "off");
+		strscpy(devkmsg_log_str, "off");
 	/* else "ratelimit" which is set by default. */
 
 	/*
@@ -209,7 +209,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 			return -EINVAL;
 
 		old = devkmsg_log;
-		strncpy(old_str, devkmsg_log_str, DEVKMSG_STR_MAX_SIZE);
+		strscpy(old_str, devkmsg_log_str);
 	}
 
 	err = proc_dostring(table, write, buffer, lenp, ppos);
@@ -227,7 +227,7 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 
 			/* ... and restore old setting. */
 			devkmsg_log = old;
-			strncpy(devkmsg_log_str, old_str, DEVKMSG_STR_MAX_SIZE);
+			strscpy(devkmsg_log_str, old_str);
 
 			return -EINVAL;
 		}
@@ -2500,22 +2500,22 @@ static int __init console_setup(char *str)
 	/*
 	 * Decode str into name, index, options.
 	 */
-	if (str[0] >= '0' && str[0] <= '9') {
-		strcpy(buf, "ttyS");
-		strncpy(buf + 4, str, sizeof(buf) - 5);
-	} else {
-		strncpy(buf, str, sizeof(buf) - 1);
-	}
-	buf[sizeof(buf) - 1] = 0;
+	if (isdigit(str[0]))
+		scnprintf(buf, sizeof(buf), "ttyS%s", str);
+	else
+		strscpy(buf, str);
+
 	options = strchr(str, ',');
 	if (options)
 		*(options++) = 0;
+
 #ifdef __sparc__
 	if (!strcmp(str, "ttya"))
-		strcpy(buf, "ttyS0");
+		strscpy(buf, "ttyS0");
 	if (!strcmp(str, "ttyb"))
-		strcpy(buf, "ttyS1");
+		strscpy(buf, "ttyS1");
 #endif
+
 	for (s = buf; *s; s++)
 		if (isdigit(*s) || *s == ',')
 			break;
-- 
cgit v1.2.3


From 327e2c97c46a4d971c5450a9d05b4a673f46c4da Mon Sep 17 00:00:00 2001
From: Michael Kelley <mhklinux@outlook.com>
Date: Sun, 7 Apr 2024 21:11:41 -0700
Subject: swiotlb: remove alloc_size argument to swiotlb_tbl_map_single()

Currently swiotlb_tbl_map_single() takes alloc_align_mask and
alloc_size arguments to specify an swiotlb allocation that is larger
than mapping_size.  This larger allocation is used solely by
iommu_dma_map_single() to handle untrusted devices that should not have
DMA visibility to memory pages that are partially used for unrelated
kernel data.

Having two arguments to specify the allocation is redundant. While
alloc_align_mask naturally specifies the alignment of the starting
address of the allocation, it can also implicitly specify the size
by rounding up the mapping_size to that alignment.

Additionally, the current approach has an edge case bug.
iommu_dma_map_page() already does the rounding up to compute the
alloc_size argument. But swiotlb_tbl_map_single() then calculates the
alignment offset based on the DMA min_align_mask, and adds that offset to
alloc_size. If the offset is non-zero, the addition may result in a value
that is larger than the max the swiotlb can allocate.  If the rounding up
is done _after_ the alignment offset is added to the mapping_size (and
the original mapping_size conforms to the value returned by
swiotlb_max_mapping_size), then the max that the swiotlb can allocate
will not be exceeded.

In view of these issues, simplify the swiotlb_tbl_map_single() interface
by removing the alloc_size argument. Most call sites pass the same value
for mapping_size and alloc_size, and they pass alloc_align_mask as zero.
Just remove the redundant argument from these callers, as they will see
no functional change. For iommu_dma_map_page() also remove the alloc_size
argument, and have swiotlb_tbl_map_single() compute the alloc_size by
rounding up mapping_size after adding the offset based on min_align_mask.
This has the side effect of fixing the edge case bug but with no other
functional change.

Also add a sanity test on the alloc_align_mask. While IOMMU code
currently ensures the granule is not larger than PAGE_SIZE, if that
guarantee were to be removed in the future, the downstream effect on the
swiotlb might go unnoticed until strange allocation failures occurred.

Tested on an ARM64 system with 16K page size and some kernel test-only
hackery to allow modifying the DMA min_align_mask and the granule size
that becomes the alloc_align_mask. Tested these combinations with a
variety of original memory addresses and sizes, including those that
reproduce the edge case bug:

 * 4K granule and 0 min_align_mask
 * 4K granule and 0xFFF min_align_mask (4K - 1)
 * 16K granule and 0xFFF min_align_mask
 * 64K granule and 0xFFF min_align_mask
 * 64K granule and 0x3FFF min_align_mask (16K - 1)

With the changes, all combinations pass.

Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Petr Tesarik <petr@tesarici.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c |  2 +-
 drivers/xen/swiotlb-xen.c |  2 +-
 include/linux/swiotlb.h   |  2 +-
 kernel/dma/swiotlb.c      | 56 +++++++++++++++++++++++++++++++++++------------
 4 files changed, 45 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index e4cb26f6a943..7a9e814fc6b3 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1165,7 +1165,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
 		trace_swiotlb_bounced(dev, phys, size);
 
 		aligned_size = iova_align(iovad, size);
-		phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size,
+		phys = swiotlb_tbl_map_single(dev, phys, size,
 					      iova_mask(iovad), dir, attrs);
 
 		if (phys == DMA_MAPPING_ERROR)
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 0e6c6c25d154..d4f1f8d1ebd8 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -216,7 +216,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 	 */
 	trace_swiotlb_bounced(dev, dev_addr, size);
 
-	map = swiotlb_tbl_map_single(dev, phys, size, size, 0, dir, attrs);
+	map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs);
 	if (map == (phys_addr_t)DMA_MAPPING_ERROR)
 		return DMA_MAPPING_ERROR;
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ea23097e351f..14bc10c1bb23 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -43,7 +43,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask,
 extern void __init swiotlb_update_mem_attributes(void);
 
 phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
-		size_t mapping_size, size_t alloc_size,
+		size_t mapping_size,
 		unsigned int alloc_aligned_mask, enum dma_data_direction dir,
 		unsigned long attrs);
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index a5e0dfc44d24..046da973a7e2 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1340,15 +1340,40 @@ static unsigned long mem_used(struct io_tlb_mem *mem)
 
 #endif /* CONFIG_DEBUG_FS */
 
+/**
+ * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area
+ * @dev:		Device which maps the buffer.
+ * @orig_addr:		Original (non-bounced) physical IO buffer address
+ * @mapping_size:	Requested size of the actual bounce buffer, excluding
+ *			any pre- or post-padding for alignment
+ * @alloc_align_mask:	Required start and end alignment of the allocated buffer
+ * @dir:		DMA direction
+ * @attrs:		Optional DMA attributes for the map operation
+ *
+ * Find and allocate a suitable sequence of IO TLB slots for the request.
+ * The allocated space starts at an alignment specified by alloc_align_mask,
+ * and the size of the allocated space is rounded up so that the total amount
+ * of allocated space is a multiple of (alloc_align_mask + 1). If
+ * alloc_align_mask is zero, the allocated space may be at any alignment and
+ * the size is not rounded up.
+ *
+ * The returned address is within the allocated space and matches the bits
+ * of orig_addr that are specified in the DMA min_align_mask for the device. As
+ * such, this returned address may be offset from the beginning of the allocated
+ * space. The bounce buffer space starting at the returned address for
+ * mapping_size bytes is initialized to the contents of the original IO buffer
+ * area. Any pre-padding (due to an offset) and any post-padding (due to
+ * rounding-up the size) is not initialized.
+ */
 phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
-		size_t mapping_size, size_t alloc_size,
-		unsigned int alloc_align_mask, enum dma_data_direction dir,
-		unsigned long attrs)
+		size_t mapping_size, unsigned int alloc_align_mask,
+		enum dma_data_direction dir, unsigned long attrs)
 {
 	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
 	unsigned int offset;
 	struct io_tlb_pool *pool;
 	unsigned int i;
+	size_t size;
 	int index;
 	phys_addr_t tlb_addr;
 	unsigned short pad_slots;
@@ -1362,20 +1387,24 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
 		pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
 
-	if (mapping_size > alloc_size) {
-		dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)",
-			      mapping_size, alloc_size);
-		return (phys_addr_t)DMA_MAPPING_ERROR;
-	}
+	/*
+	 * The default swiotlb memory pool is allocated with PAGE_SIZE
+	 * alignment. If a mapping is requested with larger alignment,
+	 * the mapping may be unable to use the initial slot(s) in all
+	 * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request
+	 * of or near the maximum mapping size would always fail.
+	 */
+	dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK,
+		"Alloc alignment may prevent fulfilling requests with max mapping_size\n");
 
 	offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr);
-	index = swiotlb_find_slots(dev, orig_addr,
-				   alloc_size + offset, alloc_align_mask, &pool);
+	size = ALIGN(mapping_size + offset, alloc_align_mask + 1);
+	index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool);
 	if (index == -1) {
 		if (!(attrs & DMA_ATTR_NO_WARN))
 			dev_warn_ratelimited(dev,
 	"swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
-				 alloc_size, mem->nslabs, mem_used(mem));
+				 size, mem->nslabs, mem_used(mem));
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
@@ -1388,7 +1417,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	offset &= (IO_TLB_SIZE - 1);
 	index += pad_slots;
 	pool->slots[index].pad_slots = pad_slots;
-	for (i = 0; i < nr_slots(alloc_size + offset); i++)
+	for (i = 0; i < (nr_slots(size) - pad_slots); i++)
 		pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
 	tlb_addr = slot_addr(pool->start, index) + offset;
 	/*
@@ -1543,8 +1572,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
 
 	trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size);
 
-	swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
-			attrs);
+	swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs);
 	if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
 		return DMA_MAPPING_ERROR;
 
-- 
cgit v1.2.3


From 2650073f1b5858008c32712f3d9e1e808ce7e967 Mon Sep 17 00:00:00 2001
From: Michael Kelley <mhklinux@outlook.com>
Date: Sun, 7 Apr 2024 21:11:42 -0700
Subject: iommu/dma: fix zeroing of bounce buffer padding used by untrusted
 devices

iommu_dma_map_page() allocates swiotlb memory as a bounce buffer when an
untrusted device wants to map only part of the memory in an granule.  The
goal is to disallow the untrusted device having DMA access to unrelated
kernel data that may be sharing the granule.  To meet this goal, the
bounce buffer itself is zeroed, and any additional swiotlb memory up to
alloc_size after the bounce buffer end (i.e., "post-padding") is also
zeroed.

However, as of commit 901c7280ca0d ("Reinstate some of "swiotlb: rework
"fix info leak with DMA_FROM_DEVICE"""), swiotlb_tbl_map_single() always
initializes the contents of the bounce buffer to the original memory.
Zeroing the bounce buffer is redundant and probably wrong per the
discussion in that commit. Only the post-padding needs to be zeroed.

Also, when the DMA min_align_mask is non-zero, the allocated bounce
buffer space may not start on a granule boundary.  The swiotlb memory
from the granule boundary to the start of the allocated bounce buffer
might belong to some unrelated bounce buffer. So as described in the
"second issue" in [1], it can't be zeroed to protect against untrusted
devices. But as of commit af133562d5af ("swiotlb: extend buffer
pre-padding to alloc_align_mask if necessary"), swiotlb_tbl_map_single()
allocates pre-padding slots when necessary to meet min_align_mask
requirements, making it possible to zero the pre-padding area as well.

Finally, iommu_dma_map_page() uses the swiotlb for untrusted devices
and also for certain kmalloc() memory. Current code does the zeroing
for both cases, but it is needed only for the untrusted device case.

Fix all of this by updating iommu_dma_map_page() to zero both the
pre-padding and post-padding areas, but not the actual bounce buffer.
Do this only in the case where the bounce buffer is used because
of an untrusted device.

[1] https://lore.kernel.org/all/20210929023300.335969-1-stevensd@google.com/

Signed-off-by: Michael Kelley <mhklinux@outlook.com>
Reviewed-by: Petr Tesarik <petr@tesarici.cz>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/iommu/dma-iommu.c | 29 ++++++++++++++++-------------
 include/linux/iova.h      |  5 +++++
 2 files changed, 21 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7a9e814fc6b3..c745196bc150 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1154,9 +1154,6 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
 	 */
 	if (dev_use_swiotlb(dev, size, dir) &&
 	    iova_offset(iovad, phys | size)) {
-		void *padding_start;
-		size_t padding_size, aligned_size;
-
 		if (!is_swiotlb_active(dev)) {
 			dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n");
 			return DMA_MAPPING_ERROR;
@@ -1164,24 +1161,30 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
 
 		trace_swiotlb_bounced(dev, phys, size);
 
-		aligned_size = iova_align(iovad, size);
 		phys = swiotlb_tbl_map_single(dev, phys, size,
 					      iova_mask(iovad), dir, attrs);
 
 		if (phys == DMA_MAPPING_ERROR)
 			return DMA_MAPPING_ERROR;
 
-		/* Cleanup the padding area. */
-		padding_start = phys_to_virt(phys);
-		padding_size = aligned_size;
+		/*
+		 * Untrusted devices should not see padding areas with random
+		 * leftover kernel data, so zero the pre- and post-padding.
+		 * swiotlb_tbl_map_single() has initialized the bounce buffer
+		 * proper to the contents of the original memory buffer.
+		 */
+		if (dev_is_untrusted(dev)) {
+			size_t start, virt = (size_t)phys_to_virt(phys);
 
-		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-		    (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) {
-			padding_start += size;
-			padding_size -= size;
-		}
+			/* Pre-padding */
+			start = iova_align_down(iovad, virt);
+			memset((void *)start, 0, virt - start);
 
-		memset(padding_start, 0, padding_size);
+			/* Post-padding */
+			start = virt + size;
+			memset((void *)start, 0,
+			       iova_align(iovad, start) - start);
+		}
 	}
 
 	if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 83c00fac2acb..d2c4fd923efa 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -65,6 +65,11 @@ static inline size_t iova_align(struct iova_domain *iovad, size_t size)
 	return ALIGN(size, iovad->granule);
 }
 
+static inline size_t iova_align_down(struct iova_domain *iovad, size_t size)
+{
+	return ALIGN_DOWN(size, iovad->granule);
+}
+
 static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova)
 {
 	return (dma_addr_t)iova->pfn_lo << iova_shift(iovad);
-- 
cgit v1.2.3


From fe7514b149e0a8a6f3031d286e52d40163b0b11a Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 7 May 2024 13:20:20 +0200
Subject: dma: compile-out DMA sync op calls when not used

Some platforms do have DMA, but DMA there is always direct and coherent.
Currently, even on such platforms DMA sync operations are compiled and
called.
Add a new hidden Kconfig symbol, DMA_NEED_SYNC, and set it only when
either sync operations are needed or there is DMA ops or swiotlb
or DMA debug is enabled. Compile global dma_sync_*() and dma_need_sync()
only when it's set, otherwise provide empty inline stubs.
The change allows for future optimizations of DMA sync calls depending
on runtime conditions.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/dma-mapping.h | 62 ++++++++++++++++++++++++---------------------
 kernel/dma/Kconfig          |  5 ++++
 kernel/dma/mapping.c        | 22 ++++++++--------
 3 files changed, 50 insertions(+), 39 deletions(-)

(limited to 'include')

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 4a658de44ee9..a569b56b25e2 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -117,14 +117,6 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs);
 void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir, unsigned long attrs);
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
-		enum dma_data_direction dir);
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
-		    int nelems, enum dma_data_direction dir);
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
-		       int nelems, enum dma_data_direction dir);
 void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
 		gfp_t flag, unsigned long attrs);
 void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr,
@@ -147,7 +139,6 @@ u64 dma_get_required_mask(struct device *dev);
 bool dma_addressing_limited(struct device *dev);
 size_t dma_max_mapping_size(struct device *dev);
 size_t dma_opt_mapping_size(struct device *dev);
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 unsigned long dma_get_merge_boundary(struct device *dev);
 struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
 		enum dma_data_direction dir, gfp_t gfp, unsigned long attrs);
@@ -195,22 +186,6 @@ static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 }
-static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
-		size_t size, enum dma_data_direction dir)
-{
-}
-static inline void dma_sync_single_for_device(struct device *dev,
-		dma_addr_t addr, size_t size, enum dma_data_direction dir)
-{
-}
-static inline void dma_sync_sg_for_cpu(struct device *dev,
-		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
-{
-}
-static inline void dma_sync_sg_for_device(struct device *dev,
-		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
-{
-}
 static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
 {
 	return -ENOMEM;
@@ -277,10 +252,6 @@ static inline size_t dma_opt_mapping_size(struct device *dev)
 {
 	return 0;
 }
-static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
-{
-	return false;
-}
 static inline unsigned long dma_get_merge_boundary(struct device *dev)
 {
 	return 0;
@@ -310,6 +281,39 @@ static inline int dma_mmap_noncontiguous(struct device *dev,
 }
 #endif /* CONFIG_HAS_DMA */
 
+#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+		enum dma_data_direction dir);
+void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir);
+void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+		int nelems, enum dma_data_direction dir);
+void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+		int nelems, enum dma_data_direction dir);
+bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+#else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+}
+static inline void dma_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+}
+static inline void dma_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+}
+static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+	return false;
+}
+#endif /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */
+
 struct page *dma_alloc_pages(struct device *dev, size_t size,
 		dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp);
 void dma_free_pages(struct device *dev, size_t size, struct page *page,
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index d62f5957f36b..c06e56be0ca1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -107,6 +107,11 @@ config DMA_BOUNCE_UNALIGNED_KMALLOC
 	bool
 	depends on SWIOTLB
 
+config DMA_NEED_SYNC
+	def_bool ARCH_HAS_SYNC_DMA_FOR_DEVICE || ARCH_HAS_SYNC_DMA_FOR_CPU || \
+		 ARCH_HAS_SYNC_DMA_FOR_CPU_ALL || DMA_API_DEBUG || DMA_OPS || \
+		 SWIOTLB
+
 config DMA_RESTRICTED_POOL
 	bool "DMA Restricted Pool"
 	depends on OF && OF_RESERVED_MEM && SWIOTLB
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 58db8fd70471..c78b78e95a26 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -329,6 +329,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 }
 EXPORT_SYMBOL(dma_unmap_resource);
 
+#ifdef CONFIG_DMA_NEED_SYNC
 void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir)
 {
@@ -385,6 +386,17 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(dma_sync_sg_for_device);
 
+bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (dma_map_direct(dev, ops))
+		return dma_direct_need_sync(dev, dma_addr);
+	return ops->sync_single_for_cpu || ops->sync_single_for_device;
+}
+EXPORT_SYMBOL_GPL(dma_need_sync);
+#endif /* CONFIG_DMA_NEED_SYNC */
+
 /*
  * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
  * that the intention is to allow exporting memory allocated via the
@@ -841,16 +853,6 @@ size_t dma_opt_mapping_size(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dma_opt_mapping_size);
 
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
-{
-	const struct dma_map_ops *ops = get_dma_ops(dev);
-
-	if (dma_map_direct(dev, ops))
-		return dma_direct_need_sync(dev, dma_addr);
-	return ops->sync_single_for_cpu || ops->sync_single_for_device;
-}
-EXPORT_SYMBOL_GPL(dma_need_sync);
-
 unsigned long dma_get_merge_boundary(struct device *dev)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-- 
cgit v1.2.3


From f406c8e4b770ca3b0df84a17349e13f2b6b07d10 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 7 May 2024 13:20:21 +0200
Subject: dma: avoid redundant calls for sync operations

Quite often, devices do not need dma_sync operations on x86_64 at least.
Indeed, when dev_is_dma_coherent(dev) is true and
dev_use_swiotlb(dev) is false, iommu_dma_sync_single_for_cpu()
and friends do nothing.

However, indirectly calling them when CONFIG_RETPOLINE=y consumes about
10% of cycles on a cpu receiving packets from softirq at ~100Gbit rate.
Even if/when CONFIG_RETPOLINE is not set, there is a cost of about 3%.

Add dev->need_dma_sync boolean and turn it off during the device
initialization (dma_set_mask()) depending on the setup:
dev_is_dma_coherent() for the direct DMA, !(sync_single_for_device ||
sync_single_for_cpu) or the new dma_map_ops flag, %DMA_F_CAN_SKIP_SYNC,
advertised for non-NULL DMA ops.
Then later, if/when swiotlb is used for the first time, the flag
is reset back to on, from swiotlb_tbl_map_single().

On iavf, the UDP trafficgen with XDP_DROP in skb mode test shows
+3-5% increase for direct DMA.

Suggested-by: Christoph Hellwig <hch@lst.de> # direct DMA shortcut
Co-developed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/device.h      |  4 ++++
 include/linux/dma-map-ops.h | 12 ++++++++++
 include/linux/dma-mapping.h | 53 ++++++++++++++++++++++++++++++++++++++-----
 kernel/dma/mapping.c        | 55 +++++++++++++++++++++++++++++++++++----------
 kernel/dma/swiotlb.c        |  6 +++++
 5 files changed, 113 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index b9f5464f44ed..ed95b829f05b 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -691,6 +691,7 @@ struct device_physical_location {
  *		and optionall (if the coherent mask is large enough) also
  *		for dma allocations.  This flag is managed by the dma ops
  *		instance from ->dma_supported.
+ * @dma_need_sync: The device needs performing DMA sync operations.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -803,6 +804,9 @@ struct device {
 #ifdef CONFIG_DMA_OPS_BYPASS
 	bool			dma_ops_bypass : 1;
 #endif
+#ifdef CONFIG_DMA_NEED_SYNC
+	bool			dma_need_sync:1;
+#endif
 };
 
 /**
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4abc60f04209..4893cb89cb52 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -18,8 +18,11 @@ struct iommu_ops;
  *
  * DMA_F_PCI_P2PDMA_SUPPORTED: Indicates the dma_map_ops implementation can
  * handle PCI P2PDMA pages in the map_sg/unmap_sg operation.
+ * DMA_F_CAN_SKIP_SYNC: DMA sync operations can be skipped if the device is
+ * coherent and it's not an SWIOTLB buffer.
  */
 #define DMA_F_PCI_P2PDMA_SUPPORTED     (1 << 0)
+#define DMA_F_CAN_SKIP_SYNC            (1 << 1)
 
 struct dma_map_ops {
 	unsigned int flags;
@@ -273,6 +276,15 @@ static inline bool dev_is_dma_coherent(struct device *dev)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_COHERENCE_H */
 
+static inline void dma_reset_need_sync(struct device *dev)
+{
+#ifdef CONFIG_DMA_NEED_SYNC
+	/* Reset it only once so that the function can be called on hotpath */
+	if (unlikely(!dev->dma_need_sync))
+		dev->dma_need_sync = true;
+#endif
+}
+
 /*
  * Check whether potential kmalloc() buffers are safe for non-coherent DMA.
  */
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index a569b56b25e2..eb4e15893b6c 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -282,16 +282,59 @@ static inline int dma_mmap_noncontiguous(struct device *dev,
 #endif /* CONFIG_HAS_DMA */
 
 #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir);
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir);
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		int nelems, enum dma_data_direction dir);
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		int nelems, enum dma_data_direction dir);
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
+
+static inline bool dma_dev_need_sync(const struct device *dev)
+{
+	/* Always call DMA sync operations when debugging is enabled */
+	return dev->dma_need_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG);
+}
+
+static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
+		size_t size, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_single_for_cpu(dev, addr, size, dir);
+}
+
+static inline void dma_sync_single_for_device(struct device *dev,
+		dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_single_for_device(dev, addr, size, dir);
+}
+
+static inline void dma_sync_sg_for_cpu(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_sg_for_cpu(dev, sg, nelems, dir);
+}
+
+static inline void dma_sync_sg_for_device(struct device *dev,
+		struct scatterlist *sg, int nelems, enum dma_data_direction dir)
+{
+	if (dma_dev_need_sync(dev))
+		__dma_sync_sg_for_device(dev, sg, nelems, dir);
+}
+
+static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_dev_need_sync(dev) ? __dma_need_sync(dev, dma_addr) : false;
+}
 #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */
+static inline bool dma_dev_need_sync(const struct device *dev)
+{
+	return false;
+}
 static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir)
 {
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index c78b78e95a26..3524bc92c37f 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -330,7 +330,7 @@ void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
 EXPORT_SYMBOL(dma_unmap_resource);
 
 #ifdef CONFIG_DMA_NEED_SYNC
-void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
+void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -342,9 +342,9 @@ void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size,
 		ops->sync_single_for_cpu(dev, addr, size, dir);
 	debug_dma_sync_single_for_cpu(dev, addr, size, dir);
 }
-EXPORT_SYMBOL(dma_sync_single_for_cpu);
+EXPORT_SYMBOL(__dma_sync_single_for_cpu);
 
-void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
+void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		size_t size, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -356,9 +356,9 @@ void dma_sync_single_for_device(struct device *dev, dma_addr_t addr,
 		ops->sync_single_for_device(dev, addr, size, dir);
 	debug_dma_sync_single_for_device(dev, addr, size, dir);
 }
-EXPORT_SYMBOL(dma_sync_single_for_device);
+EXPORT_SYMBOL(__dma_sync_single_for_device);
 
-void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		    int nelems, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -370,9 +370,9 @@ void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		ops->sync_sg_for_cpu(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir);
 }
-EXPORT_SYMBOL(dma_sync_sg_for_cpu);
+EXPORT_SYMBOL(__dma_sync_sg_for_cpu);
 
-void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
+void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		       int nelems, enum dma_data_direction dir)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
@@ -384,18 +384,47 @@ void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		ops->sync_sg_for_device(dev, sg, nelems, dir);
 	debug_dma_sync_sg_for_device(dev, sg, nelems, dir);
 }
-EXPORT_SYMBOL(dma_sync_sg_for_device);
+EXPORT_SYMBOL(__dma_sync_sg_for_device);
 
-bool dma_need_sync(struct device *dev, dma_addr_t dma_addr)
+bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 
 	if (dma_map_direct(dev, ops))
+		/*
+		 * dma_need_sync could've been reset on first SWIOTLB buffer
+		 * mapping, but @dma_addr is not necessary an SWIOTLB buffer.
+		 * In this case, fall back to more granular check.
+		 */
 		return dma_direct_need_sync(dev, dma_addr);
-	return ops->sync_single_for_cpu || ops->sync_single_for_device;
+	return true;
 }
-EXPORT_SYMBOL_GPL(dma_need_sync);
-#endif /* CONFIG_DMA_NEED_SYNC */
+EXPORT_SYMBOL_GPL(__dma_need_sync);
+
+static void dma_setup_need_sync(struct device *dev)
+{
+	const struct dma_map_ops *ops = get_dma_ops(dev);
+
+	if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC))
+		/*
+		 * dma_need_sync will be reset to %true on first SWIOTLB buffer
+		 * mapping, if any. During the device initialization, it's
+		 * enough to check only for the DMA coherence.
+		 */
+		dev->dma_need_sync = !dev_is_dma_coherent(dev);
+	else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
+		 !ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
+		/*
+		 * Synchronization is not possible when none of DMA sync ops
+		 * is set.
+		 */
+		dev->dma_need_sync = false;
+	else
+		dev->dma_need_sync = true;
+}
+#else /* !CONFIG_DMA_NEED_SYNC */
+static inline void dma_setup_need_sync(struct device *dev) { }
+#endif /* !CONFIG_DMA_NEED_SYNC */
 
 /*
  * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems
@@ -785,6 +814,8 @@ int dma_set_mask(struct device *dev, u64 mask)
 
 	arch_dma_set_mask(dev, mask);
 	*dev->dma_mask = mask;
+	dma_setup_need_sync(dev);
+
 	return 0;
 }
 EXPORT_SYMBOL(dma_set_mask);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 046da973a7e2..ae3e593eaadb 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1408,6 +1408,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 		return (phys_addr_t)DMA_MAPPING_ERROR;
 	}
 
+	/*
+	 * If dma_need_sync wasn't set, reset it on first SWIOTLB buffer
+	 * mapping to always sync SWIOTLB buffers.
+	 */
+	dma_reset_need_sync(dev);
+
 	/*
 	 * Save away the mapping from the original address to the DMA address.
 	 * This is needed when we sync the memory.  Then we sync the buffer if
-- 
cgit v1.2.3


From 1f20a5769446a1acae67ac9e63d07a594829a789 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 7 May 2024 13:20:23 +0200
Subject: page_pool: make sure frag API fields don't span between cachelines

After commit 5027ec19f104 ("net: page_pool: split the page_pool_params
into fast and slow") that made &page_pool contain only "hot" params at
the start, cacheline boundary chops frag API fields group in the middle
again.
To not bother with this each time fast params get expanded or shrunk,
let's just align them to `4 * sizeof(long)`, the closest upper pow-2 to
their actual size (2 longs + 1 int). This ensures 16-byte alignment for
the 32-bit architectures and 32-byte alignment for the 64-bit ones,
excluding unnecessary false-sharing.
::page_state_hold_cnt is used quite intensively on hotpath no matter if
frag API is used, so move it to the newly created hole in the first
cacheline.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/page_pool/types.h | 12 +++++++++++-
 net/core/page_pool.c          | 10 ++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 5e43a08d3231..5460cbab5de0 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -130,12 +130,22 @@ struct page_pool {
 	struct page_pool_params_fast p;
 
 	int cpuid;
+	u32 pages_state_hold_cnt;
 	bool has_init_callback;
 
+	/* The following block must stay within one cacheline. On 32-bit
+	 * systems, sizeof(long) == sizeof(int), so that the block size is
+	 * ``3 * sizeof(long)``. On 64-bit systems, the actual size is
+	 * ``2 * sizeof(long) + sizeof(int)``. The closest pow-2 to both of
+	 * them is ``4 * sizeof(long)``, so just use that one for simplicity.
+	 * Having it aligned to a cacheline boundary may be excessive and
+	 * doesn't bring any good.
+	 */
+	__cacheline_group_begin(frag) __aligned(4 * sizeof(long));
 	long frag_users;
 	struct page *frag_page;
 	unsigned int frag_offset;
-	u32 pages_state_hold_cnt;
+	__cacheline_group_end(frag);
 
 	struct delayed_work release_dw;
 	void (*disconnect)(void *pool);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index dd364d738c00..95eac12e8790 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -172,12 +172,22 @@ static void page_pool_producer_unlock(struct page_pool *pool,
 		spin_unlock_bh(&pool->ring.producer_lock);
 }
 
+static void page_pool_struct_check(void)
+{
+	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
+	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long));
+}
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params,
 			  int cpuid)
 {
 	unsigned int ring_qsize = 1024; /* Default */
 
+	page_pool_struct_check();
+
 	memcpy(&pool->p, &params->fast, sizeof(pool->p));
 	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
 
-- 
cgit v1.2.3


From 403f11ac9ab72fc3bee0b8c80c16e33212ea8cd9 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 7 May 2024 13:20:24 +0200
Subject: page_pool: don't use driver-set flags field directly

page_pool::p is driver-defined params, copied directly from the
structure passed to page_pool_create(). The structure isn't meant
to be modified by the Page Pool core code and this even might look
confusing[0][1].
In order to be able to alter some flags, let's define our own, internal
fields the same way as the already existing one (::has_init_callback).
They are defined as bits in the driver-set params, leave them so here
as well, to not waste byte-per-bit or so. Almost 30 bits are still free
for future extensions.
We could've defined only new flags here or only the ones we may need
to alter, but checking some flags in one place while others in another
doesn't sound convenient or intuitive. ::flags passed by the driver can
now go to the "slow" PP params.

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Link[0]: https://lore.kernel.org/netdev/20230703133207.4f0c54ce@kernel.org
Suggested-by: Alexander Duyck <alexanderduyck@fb.com>
Link[1]: https://lore.kernel.org/netdev/CAKgT0UfZCGnWgOH96E4GV3ZP6LLbROHM7SHE8NKwq+exX+Gk_Q@mail.gmail.com
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/net/page_pool/types.h | 13 ++++++++++---
 net/core/page_pool.c          | 41 ++++++++++++++++++++++-------------------
 2 files changed, 32 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h
index 5460cbab5de0..8c6d9f16bf65 100644
--- a/include/net/page_pool/types.h
+++ b/include/net/page_pool/types.h
@@ -45,7 +45,6 @@ struct pp_alloc_cache {
 
 /**
  * struct page_pool_params - page pool parameters
- * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV
  * @order:	2^order pages on allocation
  * @pool_size:	size of the ptr_ring
  * @nid:	NUMA node id to allocate from pages from
@@ -55,10 +54,11 @@ struct pp_alloc_cache {
  * @dma_dir:	DMA mapping direction
  * @max_len:	max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV
  * @offset:	DMA sync address offset for PP_FLAG_DMA_SYNC_DEV
+ * @netdev:	corresponding &net_device for Netlink introspection
+ * @flags:	PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV, PP_FLAG_SYSTEM_POOL
  */
 struct page_pool_params {
 	struct_group_tagged(page_pool_params_fast, fast,
-		unsigned int	flags;
 		unsigned int	order;
 		unsigned int	pool_size;
 		int		nid;
@@ -70,6 +70,7 @@ struct page_pool_params {
 	);
 	struct_group_tagged(page_pool_params_slow, slow,
 		struct net_device *netdev;
+		unsigned int	flags;
 /* private: used by test code only */
 		void (*init_callback)(struct page *page, void *arg);
 		void *init_arg;
@@ -131,7 +132,13 @@ struct page_pool {
 
 	int cpuid;
 	u32 pages_state_hold_cnt;
-	bool has_init_callback;
+
+	bool has_init_callback:1;	/* slow::init_callback is set */
+	bool dma_map:1;			/* Perform DMA mapping */
+	bool dma_sync:1;		/* Perform DMA sync */
+#ifdef CONFIG_PAGE_POOL_STATS
+	bool system:1;			/* This is a global percpu pool */
+#endif
 
 	/* The following block must stay within one cacheline. On 32-bit
 	 * systems, sizeof(long) == sizeof(int), so that the block size is
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 95eac12e8790..c2819ff03dd2 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -194,7 +194,7 @@ static int page_pool_init(struct page_pool *pool,
 	pool->cpuid = cpuid;
 
 	/* Validate only known flags were used */
-	if (pool->p.flags & ~(PP_FLAG_ALL))
+	if (pool->slow.flags & ~PP_FLAG_ALL)
 		return -EINVAL;
 
 	if (pool->p.pool_size)
@@ -208,22 +208,26 @@ static int page_pool_init(struct page_pool *pool,
 	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
 	 * which is the XDP_TX use-case.
 	 */
-	if (pool->p.flags & PP_FLAG_DMA_MAP) {
+	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
 		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
 		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 			return -EINVAL;
+
+		pool->dma_map = true;
 	}
 
-	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
 		/* In order to request DMA-sync-for-device the page
 		 * needs to be mapped
 		 */
-		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
 			return -EINVAL;
 
 		if (!pool->p.max_len)
 			return -EINVAL;
 
+		pool->dma_sync = true;
+
 		/* pool->p.offset has to be set according to the address
 		 * offset used by the DMA engine to start copying rx data
 		 */
@@ -232,7 +236,7 @@ static int page_pool_init(struct page_pool *pool,
 	pool->has_init_callback = !!pool->slow.init_callback;
 
 #ifdef CONFIG_PAGE_POOL_STATS
-	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
+	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
 		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
 		if (!pool->recycle_stats)
 			return -ENOMEM;
@@ -242,12 +246,13 @@ static int page_pool_init(struct page_pool *pool,
 		 * (also percpu) page pool instance.
 		 */
 		pool->recycle_stats = &pp_system_recycle_stats;
+		pool->system = true;
 	}
 #endif
 
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
 #ifdef CONFIG_PAGE_POOL_STATS
-		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+		if (!pool->system)
 			free_percpu(pool->recycle_stats);
 #endif
 		return -ENOMEM;
@@ -258,7 +263,7 @@ static int page_pool_init(struct page_pool *pool,
 	/* Driver calling page_pool_create() also call page_pool_destroy() */
 	refcount_set(&pool->user_cnt, 1);
 
-	if (pool->p.flags & PP_FLAG_DMA_MAP)
+	if (pool->dma_map)
 		get_device(pool->p.dev);
 
 	return 0;
@@ -268,11 +273,11 @@ static void page_pool_uninit(struct page_pool *pool)
 {
 	ptr_ring_cleanup(&pool->ring, NULL);
 
-	if (pool->p.flags & PP_FLAG_DMA_MAP)
+	if (pool->dma_map)
 		put_device(pool->p.dev);
 
 #ifdef CONFIG_PAGE_POOL_STATS
-	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
+	if (!pool->system)
 		free_percpu(pool->recycle_stats);
 #endif
 }
@@ -424,7 +429,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (page_pool_set_dma_addr(page, dma))
 		goto unmap_failed;
 
-	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+	if (pool->dma_sync)
 		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 	return true;
@@ -470,8 +475,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
 	if (unlikely(!page))
 		return NULL;
 
-	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
-	    unlikely(!page_pool_dma_map(pool, page))) {
+	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page))) {
 		put_page(page);
 		return NULL;
 	}
@@ -491,8 +495,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 						 gfp_t gfp)
 {
 	const int bulk = PP_ALLOC_CACHE_REFILL;
-	unsigned int pp_flags = pool->p.flags;
 	unsigned int pp_order = pool->p.order;
+	bool dma_map = pool->dma_map;
 	struct page *page;
 	int i, nr_pages;
 
@@ -517,8 +521,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	 */
 	for (i = 0; i < nr_pages; i++) {
 		page = pool->alloc.cache[i];
-		if ((pp_flags & PP_FLAG_DMA_MAP) &&
-		    unlikely(!page_pool_dma_map(pool, page))) {
+		if (dma_map && unlikely(!page_pool_dma_map(pool, page))) {
 			put_page(page);
 			continue;
 		}
@@ -590,7 +593,7 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
 {
 	dma_addr_t dma;
 
-	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+	if (!pool->dma_map)
 		/* Always account for inflight pages, even if we didn't
 		 * map them
 		 */
@@ -673,7 +676,7 @@ static bool __page_pool_page_can_be_recycled(const struct page *page)
 }
 
 /* If the page refcnt == 1, this will try to recycle the page.
- * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * If pool->dma_sync is set, we'll try to sync the DMA area for
  * the configured size min(dma_sync_size, pool->max_len).
  * If the page refcnt != 1, then the page will be returned to memory
  * subsystem.
@@ -696,7 +699,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	if (likely(__page_pool_page_can_be_recycled(page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
-		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		if (pool->dma_sync)
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);
 
@@ -809,7 +812,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
 		return NULL;
 
 	if (__page_pool_page_can_be_recycled(page)) {
-		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		if (pool->dma_sync)
 			page_pool_dma_sync_for_device(pool, page, -1);
 
 		return page;
-- 
cgit v1.2.3


From f9f67e5adc8dc2e1cc51ab2d3d6382fa97f074d4 Mon Sep 17 00:00:00 2001
From: Robert Richter <rrichter@amd.com>
Date: Thu, 2 May 2024 15:10:09 +0200
Subject: x86/numa: Fix SRAT lookup of CFMWS ranges with numa_fill_memblks()

For configurations that have the kconfig option NUMA_KEEP_MEMINFO
disabled, numa_fill_memblks() only returns with NUMA_NO_MEMBLK (-1).
SRAT lookup fails then because an existing SRAT memory range cannot be
found for a CFMWS address range. This causes the addition of a
duplicate numa_memblk with a different node id and a subsequent page
fault and kernel crash during boot.

Fix this by making numa_fill_memblks() always available regardless of
NUMA_KEEP_MEMINFO.

As Dan suggested, the fix is implemented to remove numa_fill_memblks()
from sparsemem.h and alos using __weak for the function.

Note that the issue was initially introduced with [1]. But since
phys_to_target_node() was originally used that returned the valid node
0, an additional numa_memblk was not added. Though, the node id was
wrong too, a message is seen then in the logs:

 kernel/numa.c:  pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",

[1] commit fd49f99c1809 ("ACPI: NUMA: Add a node and memblk for each
    CFMWS not in SRAT")

Suggested-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/all/66271b0072317_69102944c@dwillia2-xfh.jf.intel.com.notmuch/
Fixes: 8f1004679987 ("ACPI/NUMA: Apply SRAT proximity domain to entire CFMWS window")
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Acked-by: Borislav Petkov (AMD) <bp@alien8.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/sparsemem.h | 2 --
 arch/x86/mm/numa.c               | 4 ++--
 drivers/acpi/numa/srat.c         | 5 +++++
 include/linux/numa.h             | 7 +------
 4 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index 1be13b2dfe8b..64df897c0ee3 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -37,8 +37,6 @@ extern int phys_to_target_node(phys_addr_t start);
 #define phys_to_target_node phys_to_target_node
 extern int memory_add_physaddr_to_nid(u64 start);
 #define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-extern int numa_fill_memblks(u64 start, u64 end);
-#define numa_fill_memblks numa_fill_memblks
 #endif
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 65e9a6e391c0..ce84ba86e69e 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -929,6 +929,8 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
+#endif
+
 static int __init cmp_memblk(const void *a, const void *b)
 {
 	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
@@ -1001,5 +1003,3 @@ int __init numa_fill_memblks(u64 start, u64 end)
 	}
 	return 0;
 }
-
-#endif
diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index e45e64993c50..3b09fd39eeb4 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -208,6 +208,11 @@ int __init srat_disabled(void)
 	return acpi_numa < 0;
 }
 
+__weak int __init numa_fill_memblks(u64 start, u64 end)
+{
+	return NUMA_NO_MEMBLK;
+}
+
 #if defined(CONFIG_X86) || defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH)
 /*
  * Callback for SLIT parsing.  pxm_to_node() returns NUMA_NO_NODE for
diff --git a/include/linux/numa.h b/include/linux/numa.h
index 915033a75731..1d43371fafd2 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -36,12 +36,7 @@ int memory_add_physaddr_to_nid(u64 start);
 int phys_to_target_node(u64 start);
 #endif
 
-#ifndef numa_fill_memblks
-static inline int __init numa_fill_memblks(u64 start, u64 end)
-{
-	return NUMA_NO_MEMBLK;
-}
-#endif
+int numa_fill_memblks(u64 start, u64 end);
 
 #else /* !CONFIG_NUMA */
 static inline int numa_nearest_node(int node, unsigned int state)
-- 
cgit v1.2.3


From 3a785e19f432672b9ef53f07c506d5e698439033 Mon Sep 17 00:00:00 2001
From: Robert Richter <rrichter@amd.com>
Date: Thu, 2 May 2024 15:10:10 +0200
Subject: ACPI/NUMA: Remove architecture dependent remainings

With the removal of the Itanium architecture [1] the last architecture
dependent functions:

 acpi_numa_slit_init(), acpi_numa_memory_affinity_init()

were removed. Remove its remainings in the header files too and make
them static.

[1] commit cf8e8658100d ("arch: Remove Itanium (IA-64) architecture")

Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Alison Schofield <alison.schofield@intel.com>
Signed-off-by: Robert Richter <rrichter@amd.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/acpi/numa/srat.c | 16 ++--------------
 include/linux/acpi.h     |  5 -----
 2 files changed, 2 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c
index 3b09fd39eeb4..e4d53e3660fd 100644
--- a/drivers/acpi/numa/srat.c
+++ b/drivers/acpi/numa/srat.c
@@ -213,13 +213,12 @@ __weak int __init numa_fill_memblks(u64 start, u64 end)
 	return NUMA_NO_MEMBLK;
 }
 
-#if defined(CONFIG_X86) || defined(CONFIG_ARM64) || defined(CONFIG_LOONGARCH)
 /*
  * Callback for SLIT parsing.  pxm_to_node() returns NUMA_NO_NODE for
  * I/O localities since SRAT does not list them.  I/O localities are
  * not supported at this point.
  */
-void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
+static void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
 	int i, j;
 
@@ -241,11 +240,7 @@ void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 	}
 }
 
-/*
- * Default callback for parsing of the Proximity Domain <-> Memory
- * Area mappings
- */
-int __init
+static int __init
 acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 {
 	u64 start, end;
@@ -345,13 +340,6 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
 	(*fake_pxm)++;
 	return 0;
 }
-#else
-static int __init acpi_parse_cfmws(union acpi_subtable_headers *header,
-				   void *arg, const unsigned long table_end)
-{
-	return 0;
-}
-#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */
 
 static int __init acpi_parse_slit(struct acpi_table_header *table)
 {
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 34829f2c517a..2c227b61a452 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -242,9 +242,6 @@ static inline bool acpi_gicc_is_usable(struct acpi_madt_generic_interrupt *gicc)
 	return gicc->flags & ACPI_MADT_ENABLED;
 }
 
-/* the following numa functions are architecture-dependent */
-void acpi_numa_slit_init (struct acpi_table_slit *slit);
-
 #if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
 void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa);
 #else
@@ -267,8 +264,6 @@ static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
 #endif
 
-int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
-
 #ifndef PHYS_CPUID_INVALID
 typedef u32 phys_cpuid_t;
 #define PHYS_CPUID_INVALID (phys_cpuid_t)(-1)
-- 
cgit v1.2.3


From e7cda7fe37ff1ece39bd2bf35ea68b1175395d95 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@toblux.com>
Date: Fri, 12 Apr 2024 15:54:07 +0200
Subject: bug: Improve comment

Add parentheses to WARN_ON_ONCE() for consistency.

Signed-off-by: Thorsten Blum <thorsten.blum@toblux.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
---
 include/asm-generic/bug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index b7de3a4eade1..387720933973 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -72,7 +72,7 @@ struct bug_entry {
 #endif
 
 /*
- * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report
+ * WARN(), WARN_ON(), WARN_ON_ONCE(), and so on can be used to report
  * significant kernel issues that need prompt attention if they should ever
  * appear at runtime.
  *
-- 
cgit v1.2.3


From 81c2168c229bab0665e862937bb476f18cff056d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 6 May 2024 06:20:24 +0200
Subject: block: add a bio_chain_and_submit helper

This is basically blk_next_bio just with the bio allocation moved
to the caller to allow for more flexible bio handling in the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240506042027.2289826-4-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 27 +++++++++++++++++++--------
 include/linux/bio.h |  1 +
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/block/bio.c b/block/bio.c
index 38baedb39c6f..d82ef4fd545c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent)
 }
 EXPORT_SYMBOL(bio_chain);
 
-struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
-		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+/**
+ * bio_chain_and_submit - submit a bio after chaining it to another one
+ * @prev: bio to chain and submit
+ * @new: bio to chain to
+ *
+ * If @prev is non-NULL, chain it to @new and submit it.
+ *
+ * Return: @new.
+ */
+struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
 {
-	struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
-
-	if (bio) {
-		bio_chain(bio, new);
-		submit_bio(bio);
+	if (prev) {
+		bio_chain(prev, new);
+		submit_bio(prev);
 	}
-
 	return new;
 }
+
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+{
+	return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
+}
 EXPORT_SYMBOL_GPL(blk_next_bio);
 
 static void bio_alloc_rescue(struct work_struct *work)
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9b8a369f44bc..283a0dcbd1de 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -831,5 +831,6 @@ static inline void bio_clear_polled(struct bio *bio)
 
 struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
 		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp);
+struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new);
 
 #endif /* __LINUX_BIO_H */
-- 
cgit v1.2.3


From e8b4869bc78da1a71f2a2ab476caf50c1dcfeed0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 6 May 2024 06:20:25 +0200
Subject: block: add a blk_alloc_discard_bio helper

Factor out a helper from __blkdev_issue_discard that chews off as much as
possible from a discard range and allocates a bio for it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20240506042027.2289826-5-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-lib.c     | 50 +++++++++++++++++++++++++++++---------------------
 include/linux/bio.h |  3 +++
 2 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 6e54ef140bab..442da9dad042 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -35,31 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
 	return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
 }
 
-int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
-		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+struct bio *blk_alloc_discard_bio(struct block_device *bdev,
+		sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask)
 {
-	struct bio *bio = *biop;
+	sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector));
+	struct bio *bio;
 
-	while (nr_sects) {
-		sector_t req_sects =
-			min(nr_sects, bio_discard_limit(bdev, sector));
+	if (!bio_sects)
+		return NULL;
 
-		bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask);
-		bio->bi_iter.bi_sector = sector;
-		bio->bi_iter.bi_size = req_sects << 9;
-		sector += req_sects;
-		nr_sects -= req_sects;
-
-		/*
-		 * We can loop for a long time in here, if someone does
-		 * full device discards (like mkfs). Be nice and allow
-		 * us to schedule out to avoid softlocking if preempt
-		 * is disabled.
-		 */
-		cond_resched();
-	}
+	bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask);
+	if (!bio)
+		return NULL;
+	bio->bi_iter.bi_sector = *sector;
+	bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT;
+	*sector += bio_sects;
+	*nr_sects -= bio_sects;
+	/*
+	 * We can loop for a long time in here if someone does full device
+	 * discards (like mkfs).  Be nice and allow us to schedule out to avoid
+	 * softlocking if preempt is disabled.
+	 */
+	cond_resched();
+	return bio;
+}
 
-	*biop = bio;
+int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
+{
+	struct bio *bio;
+
+	while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
+			gfp_mask)))
+		*biop = bio_chain_and_submit(*biop, bio);
 	return 0;
 }
 EXPORT_SYMBOL(__blkdev_issue_discard);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 283a0dcbd1de..d5379548d684 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -833,4 +833,7 @@ struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
 		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp);
 struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new);
 
+struct bio *blk_alloc_discard_bio(struct block_device *bdev,
+		sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask);
+
 #endif /* __LINUX_BIO_H */
-- 
cgit v1.2.3


From 504fbcffea649cad69111e7597081dd8adc3b395 Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Tue, 7 May 2024 10:31:03 +0800
Subject: md: Revert "md: Fix overflow in is_mddev_idle"

This reverts commit 3f9f231236ce7e48780d8a4f1f8cb9fae2df1e4e.

Using 64bit for 'sync_io' is unnecessary from the gendisk side. This
overflow will not cause any functional impact, except for a UBSAN
warning. Solving this overflow requires introducing additional
calculations and checks which are not necessary. So just keep using
32bit for 'sync_io'.

Signed-off-by: Li Nan <linan122@huawei.com>
Link: https://lore.kernel.org/r/20240507023103.781816-1-linan666@huaweicloud.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/md.c        | 7 +++----
 drivers/md/md.h        | 4 ++--
 include/linux/blkdev.h | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 00bbafcd27bb..aff9118ff697 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8577,7 +8577,7 @@ static int is_mddev_idle(struct mddev *mddev, int init)
 {
 	struct md_rdev *rdev;
 	int idle;
-	long long curr_events;
+	int curr_events;
 
 	idle = 1;
 	rcu_read_lock();
@@ -8587,9 +8587,8 @@ static int is_mddev_idle(struct mddev *mddev, int init)
 		if (!init && !blk_queue_io_stat(disk->queue))
 			continue;
 
-		curr_events =
-			(long long)part_stat_read_accum(disk->part0, sectors) -
-			atomic64_read(&disk->sync_io);
+		curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
+			      atomic_read(&disk->sync_io);
 		/* sync IO will cause sync_io to increase before the disk_stats
 		 * as sync_io is counted when a request starts, and
 		 * disk_stats is counted when it completes.
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 029dd0491a36..ca085ecad504 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -51,7 +51,7 @@ struct md_rdev {
 
 	sector_t sectors;		/* Device size (in 512bytes sectors) */
 	struct mddev *mddev;		/* RAID array if running */
-	long long last_events;		/* IO event timestamp */
+	int last_events;		/* IO event timestamp */
 
 	/*
 	 * If meta_bdev is non-NULL, it means that a separate device is
@@ -622,7 +622,7 @@ extern void mddev_unlock(struct mddev *mddev);
 static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
 {
 	if (blk_queue_io_stat(bdev->bd_disk->queue))
-		atomic64_add(nr_sectors, &bdev->bd_disk->sync_io);
+		atomic_add(nr_sectors, &bdev->bd_disk->sync_io);
 }
 
 static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3b18a40a1fc1..26acf80c50c0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -172,7 +172,7 @@ struct gendisk {
 	struct list_head slave_bdevs;
 #endif
 	struct timer_rand_state *random;
-	atomic64_t sync_io;		/* RAID */
+	atomic_t sync_io;		/* RAID */
 	struct disk_events *ev;
 
 #ifdef CONFIG_BLK_DEV_ZONED
-- 
cgit v1.2.3


From 65ad580a14e875c2d8c027cf1a2ca03b849ff843 Mon Sep 17 00:00:00 2001
From: Benjamin Tissoires <bentiss@kernel.org>
Date: Wed, 10 Apr 2024 19:19:21 +0200
Subject: HID: do not assume HAT Switch logical max < 8

Turns out that the code can handle a greater range, but the data stored
can not. This is problematic on the Raptor Mach 2 joystick which
logical max is 239. The kernel interprets it as `-15` and thus ignores
the Hat Switch handling.

Link: https://gitlab.freedesktop.org/libevdev/udev-hid-bpf/-/issues/17
Link: https://lore.kernel.org/r/20240410-bpf_sources-v1-1-a8bf16033ef8@kernel.org
Reviewed-by: Peter Hutterer <peter.hutterer@who-t.net>
Signed-off-by: Benjamin Tissoires <bentiss@kernel.org>
---
 include/linux/hid.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/hid.h b/include/linux/hid.h
index b12cb1c8e682..8e06d89698e6 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -474,9 +474,9 @@ struct hid_usage {
 	__s8	  wheel_factor;		/* 120/resolution_multiplier */
 	__u16     code;			/* input driver code */
 	__u8      type;			/* input driver type */
-	__s8	  hat_min;		/* hat switch fun */
-	__s8	  hat_max;		/* ditto */
-	__s8	  hat_dir;		/* ditto */
+	__s16	  hat_min;		/* hat switch fun */
+	__s16	  hat_max;		/* ditto */
+	__s16	  hat_dir;		/* ditto */
 	__s16	  wheel_accumulated;	/* hi-res wheel */
 };
 
-- 
cgit v1.2.3


From 651d61bc8b7d8bb622cfc24be2ee92eebb4ed3cc Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Tue, 11 Apr 2023 15:44:46 +0930
Subject: KVM: PPC: Fix documentation for ppc mmu caps

The documentation mentions KVM_CAP_PPC_RADIX_MMU, but the defines in the
kvm headers spell it KVM_CAP_PPC_MMU_RADIX. Similarly with
KVM_CAP_PPC_MMU_HASH_V3.

Fixes: c92701322711 ("KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU")
Signed-off-by: Joel Stanley <joel@jms.id.au>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://msgid.link/20230411061446.26324-1-joel@jms.id.au
---
 Documentation/virt/kvm/api.rst | 8 ++++----
 include/uapi/linux/kvm.h       | 4 ++--
 tools/include/uapi/linux/kvm.h | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 0b5a33ee71ee..636e6794828f 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -4300,7 +4300,7 @@ operating system that uses the PIT for timing (e.g. Linux 2.4.x).
 4.100 KVM_PPC_CONFIGURE_V3_MMU
 ------------------------------
 
-:Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3
+:Capability: KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3
 :Architectures: ppc
 :Type: vm ioctl
 :Parameters: struct kvm_ppc_mmuv3_cfg (in)
@@ -4334,7 +4334,7 @@ the Power ISA V3.00, Book III section 5.7.6.1.
 4.101 KVM_PPC_GET_RMMU_INFO
 ---------------------------
 
-:Capability: KVM_CAP_PPC_RADIX_MMU
+:Capability: KVM_CAP_PPC_MMU_RADIX
 :Architectures: ppc
 :Type: vm ioctl
 :Parameters: struct kvm_ppc_rmmu_info (out)
@@ -8095,7 +8095,7 @@ capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
 will disable the use of APIC hardware virtualization even if supported
 by the CPU, as it's incompatible with SynIC auto-EOI behavior.
 
-8.3 KVM_CAP_PPC_RADIX_MMU
+8.3 KVM_CAP_PPC_MMU_RADIX
 -------------------------
 
 :Architectures: ppc
@@ -8105,7 +8105,7 @@ available, means that the kernel can support guests using the
 radix MMU defined in Power ISA V3.00 (as implemented in the POWER9
 processor).
 
-8.4 KVM_CAP_PPC_HASH_MMU_V3
+8.4 KVM_CAP_PPC_MMU_HASH_V3
 ---------------------------
 
 :Architectures: ppc
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2190adbe3002..d03842abae57 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1221,9 +1221,9 @@ struct kvm_vfio_spapr_tce {
 /* Available with KVM_CAP_SPAPR_RESIZE_HPT */
 #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
 #define KVM_PPC_RESIZE_HPT_COMMIT  _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
-/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */
+/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */
 #define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
-/* Available with KVM_CAP_PPC_RADIX_MMU */
+/* Available with KVM_CAP_PPC_MMU_RADIX */
 #define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
 /* Available with KVM_CAP_PPC_GET_CPU_CHAR */
 #define KVM_PPC_GET_CPU_CHAR	  _IOR(KVMIO,  0xb1, struct kvm_ppc_cpu_char)
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index c3308536482b..4b6b635b8bd1 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -1502,7 +1502,7 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_SPAPR_RESIZE_HPT */
 #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt)
 #define KVM_PPC_RESIZE_HPT_COMMIT  _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt)
-/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */
+/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */
 #define KVM_PPC_CONFIGURE_V3_MMU  _IOW(KVMIO,  0xaf, struct kvm_ppc_mmuv3_cfg)
 /* Available with KVM_CAP_PPC_RADIX_MMU */
 #define KVM_PPC_GET_RMMU_INFO	  _IOW(KVMIO,  0xb0, struct kvm_ppc_rmmu_info)
-- 
cgit v1.2.3


From 70a64b7919cbd6c12306051ff2825839a9d65605 Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 1 May 2024 10:26:11 -0700
Subject: memcg: dynamically allocate lruvec_stats

To decouple the dependency of lruvec_stats on NR_VM_NODE_STAT_ITEMS, we
need to dynamically allocate lruvec_stats in the mem_cgroup_per_node
structure.  Also move the definition of lruvec_stats_percpu and
lruvec_stats and related functions to the memcontrol.c to facilitate later
patches.  No functional changes in the patch.

Link: https://lkml.kernel.org/r/20240501172617.678560-3-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Yosry Ahmed <yosryahmed@google.com>
Reviewed-by: T.J. Mercier <tjmercier@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 62 ++++-----------------------------
 mm/memcontrol.c            | 87 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 81 insertions(+), 68 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9aba0d0462ca..ab8a6e884375 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -83,6 +83,8 @@ enum mem_cgroup_events_target {
 
 struct memcg_vmstats_percpu;
 struct memcg_vmstats;
+struct lruvec_stats_percpu;
+struct lruvec_stats;
 
 struct mem_cgroup_reclaim_iter {
 	struct mem_cgroup *position;
@@ -90,25 +92,6 @@ struct mem_cgroup_reclaim_iter {
 	unsigned int generation;
 };
 
-struct lruvec_stats_percpu {
-	/* Local (CPU and cgroup) state */
-	long state[NR_VM_NODE_STAT_ITEMS];
-
-	/* Delta calculation for lockless upward propagation */
-	long state_prev[NR_VM_NODE_STAT_ITEMS];
-};
-
-struct lruvec_stats {
-	/* Aggregated (CPU and subtree) state */
-	long state[NR_VM_NODE_STAT_ITEMS];
-
-	/* Non-hierarchical (CPU aggregated) state */
-	long state_local[NR_VM_NODE_STAT_ITEMS];
-
-	/* Pending child counts during tree propagation */
-	long state_pending[NR_VM_NODE_STAT_ITEMS];
-};
-
 /*
  * per-node information in memory controller.
  */
@@ -116,7 +99,7 @@ struct mem_cgroup_per_node {
 	struct lruvec		lruvec;
 
 	struct lruvec_stats_percpu __percpu	*lruvec_stats_percpu;
-	struct lruvec_stats			lruvec_stats;
+	struct lruvec_stats			*lruvec_stats;
 
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
@@ -1037,42 +1020,9 @@ static inline void mod_memcg_page_state(struct page *page,
 }
 
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
-
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
-					      enum node_stat_item idx)
-{
-	struct mem_cgroup_per_node *pn;
-	long x;
-
-	if (mem_cgroup_disabled())
-		return node_page_state(lruvec_pgdat(lruvec), idx);
-
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats.state[idx]);
-#ifdef CONFIG_SMP
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
-
-static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
-						    enum node_stat_item idx)
-{
-	struct mem_cgroup_per_node *pn;
-	long x = 0;
-
-	if (mem_cgroup_disabled())
-		return node_page_state(lruvec_pgdat(lruvec), idx);
-
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
-#ifdef CONFIG_SMP
-	if (x < 0)
-		x = 0;
-#endif
-	return x;
-}
+unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
+unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+				      enum node_stat_item idx);
 
 void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
 void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 96b72a6e04ba..cf6819a3310d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -575,6 +575,60 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
+struct lruvec_stats_percpu {
+	/* Local (CPU and cgroup) state */
+	long state[NR_VM_NODE_STAT_ITEMS];
+
+	/* Delta calculation for lockless upward propagation */
+	long state_prev[NR_VM_NODE_STAT_ITEMS];
+};
+
+struct lruvec_stats {
+	/* Aggregated (CPU and subtree) state */
+	long state[NR_VM_NODE_STAT_ITEMS];
+
+	/* Non-hierarchical (CPU aggregated) state */
+	long state_local[NR_VM_NODE_STAT_ITEMS];
+
+	/* Pending child counts during tree propagation */
+	long state_pending[NR_VM_NODE_STAT_ITEMS];
+};
+
+unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
+{
+	struct mem_cgroup_per_node *pn;
+	long x;
+
+	if (mem_cgroup_disabled())
+		return node_page_state(lruvec_pgdat(lruvec), idx);
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	x = READ_ONCE(pn->lruvec_stats->state[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
+unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+				      enum node_stat_item idx)
+{
+	struct mem_cgroup_per_node *pn;
+	long x = 0;
+
+	if (mem_cgroup_disabled())
+		return node_page_state(lruvec_pgdat(lruvec), idx);
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 /* Subset of vm_event_item to report for memcg event stats */
 static const unsigned int memcg_vm_event_stat[] = {
 	PGPGIN,
@@ -5486,18 +5540,25 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 	if (!pn)
 		return 1;
 
+	pn->lruvec_stats = kzalloc_node(sizeof(struct lruvec_stats), GFP_KERNEL,
+					node);
+	if (!pn->lruvec_stats)
+		goto fail;
+
 	pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
 						   GFP_KERNEL_ACCOUNT);
-	if (!pn->lruvec_stats_percpu) {
-		kfree(pn);
-		return 1;
-	}
+	if (!pn->lruvec_stats_percpu)
+		goto fail;
 
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
 
 	memcg->nodeinfo[node] = pn;
 	return 0;
+fail:
+	kfree(pn->lruvec_stats);
+	kfree(pn);
+	return 1;
 }
 
 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
@@ -5508,6 +5569,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 		return;
 
 	free_percpu(pn->lruvec_stats_percpu);
+	kfree(pn->lruvec_stats);
 	kfree(pn);
 }
 
@@ -5860,18 +5922,19 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 
 	for_each_node_state(nid, N_MEMORY) {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
-		struct mem_cgroup_per_node *ppn = NULL;
+		struct lruvec_stats *lstats = pn->lruvec_stats;
+		struct lruvec_stats *plstats = NULL;
 		struct lruvec_stats_percpu *lstatc;
 
 		if (parent)
-			ppn = parent->nodeinfo[nid];
+			plstats = parent->nodeinfo[nid]->lruvec_stats;
 
 		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
 
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
-			delta = pn->lruvec_stats.state_pending[i];
+			delta = lstats->state_pending[i];
 			if (delta)
-				pn->lruvec_stats.state_pending[i] = 0;
+				lstats->state_pending[i] = 0;
 
 			delta_cpu = 0;
 			v = READ_ONCE(lstatc->state[i]);
@@ -5882,12 +5945,12 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 			}
 
 			if (delta_cpu)
-				pn->lruvec_stats.state_local[i] += delta_cpu;
+				lstats->state_local[i] += delta_cpu;
 
 			if (delta) {
-				pn->lruvec_stats.state[i] += delta;
-				if (ppn)
-					ppn->lruvec_stats.state_pending[i] += delta;
+				lstats->state[i] += delta;
+				if (plstats)
+					plstats->state_pending[i] += delta;
 			}
 		}
 	}
-- 
cgit v1.2.3


From a94032b35e5f97dc1023030d92998418c9feb27b Mon Sep 17 00:00:00 2001
From: Shakeel Butt <shakeel.butt@linux.dev>
Date: Wed, 1 May 2024 10:26:17 -0700
Subject: memcg: use proper type for mod_memcg_state

The memcg stats update functions can take arbitrary integer but the only
input which make sense is enum memcg_stat_item and we don't want these
functions to be called with arbitrary integer, so replace the parameter
type with enum memcg_stat_item and compiler will be able to warn if memcg
stat update functions are called with incorrect index value.

Link: https://lkml.kernel.org/r/20240501172617.678560-9-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: T.J. Mercier <tjmercier@google.com>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h | 13 +++++++------
 mm/memcontrol.c            |  3 ++-
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ab8a6e884375..030d34e9d117 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -974,7 +974,8 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
 void folio_memcg_lock(struct folio *folio);
 void folio_memcg_unlock(struct folio *folio);
 
-void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
+void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+		       int val);
 
 /* try to stablize folio_memcg() for all the pages in a memcg */
 static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
@@ -995,7 +996,7 @@ static inline void mem_cgroup_unlock_pages(void)
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
-				   int idx, int val)
+				   enum memcg_stat_item idx, int val)
 {
 	unsigned long flags;
 
@@ -1005,7 +1006,7 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg,
 }
 
 static inline void mod_memcg_page_state(struct page *page,
-					int idx, int val)
+					enum memcg_stat_item idx, int val)
 {
 	struct mem_cgroup *memcg;
 
@@ -1491,19 +1492,19 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 }
 
 static inline void __mod_memcg_state(struct mem_cgroup *memcg,
-				     int idx,
+				     enum memcg_stat_item idx,
 				     int nr)
 {
 }
 
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
-				   int idx,
+				   enum memcg_stat_item idx,
 				   int nr)
 {
 }
 
 static inline void mod_memcg_page_state(struct page *page,
-					int idx, int val)
+					enum memcg_stat_item idx, int val)
 {
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 87411116ec5e..e6302f42e40f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -955,7 +955,8 @@ static int memcg_state_val_in_pages(int idx, int val)
  * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
  * @val: delta to add to the counter, can be negative
  */
-void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
+void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
+		       int val)
 {
 	int i = memcg_stats_index(idx);
 
-- 
cgit v1.2.3


From 31f605a308e627f06e4e6ab77254473f1c90f0bf Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 2 May 2024 16:12:17 +0200
Subject: kcsan, compiler_types: Introduce __data_racy type qualifier

Based on the discussion at [1], it would be helpful to mark certain
variables as explicitly "data racy", which would result in KCSAN not
reporting data races involving any accesses on such variables. To do
that, introduce the __data_racy type qualifier:

	struct foo {
		...
		int __data_racy bar;
		...
	};

In KCSAN-kernels, __data_racy turns into volatile, which KCSAN already
treats specially by considering them "marked". In non-KCSAN kernels the
type qualifier turns into no-op.

The generated code between KCSAN-instrumented kernels and non-KCSAN
kernels is already huge (inserted calls into runtime for every memory
access), so the extra generated code (if any) due to volatile for few
such __data_racy variables are unlikely to have measurable impact on
performance.

Link: https://lore.kernel.org/all/CAHk-=wi3iondeh_9V2g3Qz5oHTRjLsOpoy83hb58MVh=nRZe0A@mail.gmail.com/ [1]
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Marco Elver <elver@google.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/dev-tools/kcsan.rst | 10 ++++++++++
 include/linux/compiler_types.h    |  7 +++++++
 kernel/kcsan/kcsan_test.c         | 17 +++++++++++++++++
 3 files changed, 34 insertions(+)

(limited to 'include')

diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index 94b6802ab0ab..02143f060b22 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -91,6 +91,16 @@ the below options are available:
   behaviour when encountering a data race is deemed safe.  Please see
   `"Marking Shared-Memory Accesses" in the LKMM`_ for more information.
 
+* Similar to ``data_race(...)``, the type qualifier ``__data_racy`` can be used
+  to document that all data races due to accesses to a variable are intended
+  and should be ignored by KCSAN::
+
+    struct foo {
+        ...
+        int __data_racy stats_counter;
+        ...
+    };
+
 * Disabling data race detection for entire functions can be accomplished by
   using the function attribute ``__no_kcsan``::
 
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 2abaa3a825a9..a38162a8590d 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -273,9 +273,16 @@ struct ftrace_likely_data {
  * disable all instrumentation. See Kconfig.kcsan where this is mandatory.
  */
 # define __no_kcsan __no_sanitize_thread __disable_sanitizer_instrumentation
+/*
+ * Type qualifier to mark variables where all data-racy accesses should be
+ * ignored by KCSAN. Note, the implementation simply marks these variables as
+ * volatile, since KCSAN will treat such accesses as "marked".
+ */
+# define __data_racy volatile
 # define __no_sanitize_or_inline __no_kcsan notrace __maybe_unused
 #else
 # define __no_kcsan
+# define __data_racy
 #endif
 
 #ifndef __no_sanitize_or_inline
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 015586217875..0c17b4c83e1c 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -304,6 +304,7 @@ static long test_array[3 * PAGE_SIZE / sizeof(long)];
 static struct {
 	long val[8];
 } test_struct;
+static long __data_racy test_data_racy;
 static DEFINE_SEQLOCK(test_seqlock);
 static DEFINE_SPINLOCK(test_spinlock);
 static DEFINE_MUTEX(test_mutex);
@@ -358,6 +359,8 @@ static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
 
 static noinline void test_kernel_data_race(void) { data_race(test_var++); }
 
+static noinline void test_kernel_data_racy_qualifier(void) { test_data_racy++; }
+
 static noinline void test_kernel_assert_writer(void)
 {
 	ASSERT_EXCLUSIVE_WRITER(test_var);
@@ -1009,6 +1012,19 @@ static void test_data_race(struct kunit *test)
 	KUNIT_EXPECT_FALSE(test, match_never);
 }
 
+/* Test the __data_racy type qualifier. */
+__no_kcsan
+static void test_data_racy_qualifier(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_data_racy_qualifier, test_kernel_data_racy_qualifier);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
 __no_kcsan
 static void test_assert_exclusive_writer(struct kunit *test)
 {
@@ -1424,6 +1440,7 @@ static struct kunit_case kcsan_test_cases[] = {
 	KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
 	KCSAN_KUNIT_CASE(test_zero_size_access),
 	KCSAN_KUNIT_CASE(test_data_race),
+	KCSAN_KUNIT_CASE(test_data_racy_qualifier),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_access),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
-- 
cgit v1.2.3


From 2e438442ba4d864c474ff38158054146010bd29b Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 2 Apr 2024 14:24:00 +0100
Subject: btrfs: remove not needed mod_start and mod_len from struct extent_map

The mod_start and mod_len fields of struct extent_map were introduced by
commit 4e2f84e63dc1 ("Btrfs: improve fsync by filtering extents that we
want") in order to avoid too low performance when fsyncing a file that
keeps getting extent maps merge, because it resulted in each fsync logging
again csum ranges that were already merged before.

We don't need this anymore as extent maps in the list of modified extents
are never merged with other extent maps and once we log an extent map we
remove it from the list of modified extent maps, so it's never logged
twice.

So remove the mod_start and mod_len fields from struct extent_map and use
instead the start and len fields when logging checksums in the fast fsync
path. This also makes EXTENT_FLAG_FILLING unused so remove it as well.

Running the reproducer from the commit mentioned before, with a larger
number of extents and against a null block device, so that IO is fast
and we can better see any impact from searching checksums items and
logging them, gave the following results from dd:

Before this change:

   409600000 bytes (410 MB, 391 MiB) copied, 22.948 s, 17.8 MB/s

After this change:

   409600000 bytes (410 MB, 391 MiB) copied, 22.9997 s, 17.8 MB/s

So no changes in throughput.
The test was done in a release kernel (non-debug, Debian's default kernel
config) and its steps are the following:

   $ mkfs.btrfs -f /dev/nullb0
   $ mount /dev/sdb /mnt
   $ dd if=/dev/zero of=/mnt/foobar bs=4k count=100000 oflag=sync
   $ umount /mnt

This also reduces the size of struct extent_map from 128 bytes down to 112
bytes, so now we can have 36 extents maps per 4K page instead of 32.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_map.c        | 18 ------------------
 fs/btrfs/extent_map.h        |  4 ----
 fs/btrfs/inode.c             |  4 +---
 fs/btrfs/tree-log.c          |  4 ++--
 include/trace/events/btrfs.h |  3 +--
 5 files changed, 4 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 24a048210b15..955ce300e5a1 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -252,8 +252,6 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 			em->len += merge->len;
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
-			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
-			em->mod_start = merge->mod_start;
 			em->generation = max(em->generation, merge->generation);
 			em->flags |= EXTENT_FLAG_MERGED;
 
@@ -271,7 +269,6 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 		em->block_len += merge->block_len;
 		rb_erase_cached(&merge->rb_node, &tree->map);
 		RB_CLEAR_NODE(&merge->rb_node);
-		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
 		em->generation = max(em->generation, merge->generation);
 		em->flags |= EXTENT_FLAG_MERGED;
 		free_extent_map(merge);
@@ -300,7 +297,6 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
 	struct extent_map_tree *tree = &inode->extent_tree;
 	int ret = 0;
 	struct extent_map *em;
-	bool prealloc = false;
 
 	write_lock(&tree->lock);
 	em = lookup_extent_mapping(tree, start, len);
@@ -325,21 +321,9 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
 
 	em->generation = gen;
 	em->flags &= ~EXTENT_FLAG_PINNED;
-	em->mod_start = em->start;
-	em->mod_len = em->len;
-
-	if (em->flags & EXTENT_FLAG_FILLING) {
-		prealloc = true;
-		em->flags &= ~EXTENT_FLAG_FILLING;
-	}
 
 	try_merge_map(tree, em);
 
-	if (prealloc) {
-		em->mod_start = em->start;
-		em->mod_len = em->len;
-	}
-
 out:
 	write_unlock(&tree->lock);
 	free_extent_map(em);
@@ -361,8 +345,6 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
 					int modified)
 {
 	refcount_inc(&em->refs);
-	em->mod_start = em->start;
-	em->mod_len = em->len;
 
 	ASSERT(list_empty(&em->list));
 
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index c5a098c99cc6..10e9491865c9 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -30,8 +30,6 @@ enum {
 	ENUM_BIT(EXTENT_FLAG_PREALLOC),
 	/* Logging this extent */
 	ENUM_BIT(EXTENT_FLAG_LOGGING),
-	/* Filling in a preallocated extent */
-	ENUM_BIT(EXTENT_FLAG_FILLING),
 	/* This em is merged from two or more physically adjacent ems */
 	ENUM_BIT(EXTENT_FLAG_MERGED),
 };
@@ -46,8 +44,6 @@ struct extent_map {
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
-	u64 mod_start;
-	u64 mod_len;
 	u64 orig_start;
 	u64 orig_block_len;
 	u64 ram_bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 15a13e191ee7..12b1a83a4303 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7337,9 +7337,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
 	em->ram_bytes = ram_bytes;
 	em->generation = -1;
 	em->flags |= EXTENT_FLAG_PINNED;
-	if (type == BTRFS_ORDERED_PREALLOC)
-		em->flags |= EXTENT_FLAG_FILLING;
-	else if (type == BTRFS_ORDERED_COMPRESSED)
+	if (type == BTRFS_ORDERED_COMPRESSED)
 		extent_map_set_compression(em, compress_type);
 
 	ret = btrfs_replace_extent_map_range(inode, em, true);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 472918a5bc73..d9777649e170 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4574,8 +4574,8 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
 	struct btrfs_root *csum_root;
 	u64 csum_offset;
 	u64 csum_len;
-	u64 mod_start = em->mod_start;
-	u64 mod_len = em->mod_len;
+	u64 mod_start = em->start;
+	u64 mod_len = em->len;
 	LIST_HEAD(ordered_sums);
 	int ret = 0;
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 90b0222390e5..766cfd48386c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -277,8 +277,7 @@ DEFINE_EVENT(btrfs__inode, btrfs_inode_evict,
 		{ EXTENT_FLAG_COMPRESS_LZO,	"COMPRESS_LZO"	},\
 		{ EXTENT_FLAG_COMPRESS_ZSTD,	"COMPRESS_ZSTD"	},\
 		{ EXTENT_FLAG_PREALLOC,		"PREALLOC"	},\
-		{ EXTENT_FLAG_LOGGING,		"LOGGING"	},\
-		{ EXTENT_FLAG_FILLING,		"FILLING"	})
+		{ EXTENT_FLAG_LOGGING,		"LOGGING"	})
 
 TRACE_EVENT_CONDITION(btrfs_get_extent,
 
-- 
cgit v1.2.3


From 1bff6d4f873790cfc675afce9860208576508c5a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 12 Apr 2024 20:27:00 -0400
Subject: btrfs: simplify delayed ref tracepoints

Now that all of the delayed ref information is in the delayed ref node,
drastically simplify the delayed ref tracepoints by simply passing in
the btrfs_delayed_ref_node and populating the tracepoints with the
values from the structure itself.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/delayed-ref.c       | 14 ++----------
 fs/btrfs/extent-tree.c       |  4 ++--
 include/trace/events/btrfs.h | 54 +++++++++++++++++---------------------------
 3 files changed, 25 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 743cc52c30af..cc1510d7eee8 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -1064,7 +1064,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -1093,8 +1092,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		}
 	}
 
-	ref = btrfs_delayed_node_to_tree_ref(node);
-
 	init_delayed_ref_common(fs_info, node, generic_ref);
 
 	init_delayed_ref_head(head_ref, generic_ref, record, 0);
@@ -1119,9 +1116,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_update_delayed_refs_rsv(trans);
 
-	trace_add_delayed_tree_ref(fs_info, node, ref,
-				   action == BTRFS_ADD_DELAYED_EXTENT ?
-				   BTRFS_ADD_DELAYED_REF : action);
+	trace_add_delayed_tree_ref(fs_info, node);
 	if (merged)
 		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
 
@@ -1139,7 +1134,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 			       u64 reserved)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_node *node;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -1153,8 +1147,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	if (!node)
 		return -ENOMEM;
 
-	ref = btrfs_delayed_node_to_data_ref(node);
-
 	init_delayed_ref_common(fs_info, node, generic_ref);
 
 	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
@@ -1195,9 +1187,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_update_delayed_refs_rsv(trans);
 
-	trace_add_delayed_data_ref(trans->fs_info, node, ref,
-				   action == BTRFS_ADD_DELAYED_EXTENT ?
-				   BTRFS_ADD_DELAYED_REF : action);
+	trace_add_delayed_data_ref(trans->fs_info, node);
 	if (merged)
 		kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9a5e553707b7..e633ca2dc498 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1574,7 +1574,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 	u64 flags = 0;
 
 	ref = btrfs_delayed_node_to_data_ref(node);
-	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
+	trace_run_delayed_data_ref(trans->fs_info, node);
 
 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
 		parent = ref->parent;
@@ -1737,7 +1737,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	u64 ref_root = 0;
 
 	ref = btrfs_delayed_node_to_tree_ref(node);
-	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
+	trace_run_delayed_tree_ref(trans->fs_info, node);
 
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 		parent = ref->parent;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 766cfd48386c..dae29f6d6b4c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -868,11 +868,9 @@ TRACE_EVENT(btrfs_add_block_group,
 DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_delayed_ref_node *ref,
-		 const struct btrfs_delayed_tree_ref *full_ref,
-		 int action),
+		 const struct btrfs_delayed_ref_node *ref),
 
-	TP_ARGS(fs_info, ref, full_ref, action),
+	TP_ARGS(fs_info, ref),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,  bytenr		)
@@ -888,10 +886,10 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 	TP_fast_assign_btrfs(fs_info,
 		__entry->bytenr		= ref->bytenr;
 		__entry->num_bytes	= ref->num_bytes;
-		__entry->action		= action;
-		__entry->parent		= full_ref->parent;
-		__entry->ref_root	= full_ref->root;
-		__entry->level		= full_ref->level;
+		__entry->action		= ref->action;
+		__entry->parent		= ref->tree_ref.parent;
+		__entry->ref_root	= ref->tree_ref.root;
+		__entry->level		= ref->tree_ref.level;
 		__entry->type		= ref->type;
 		__entry->seq		= ref->seq;
 	),
@@ -911,31 +909,25 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_delayed_ref_node *ref,
-		 const struct btrfs_delayed_tree_ref *full_ref,
-		 int action),
+		 const struct btrfs_delayed_ref_node *ref),
 
-	TP_ARGS(fs_info, ref, full_ref, action)
+	TP_ARGS(fs_info, ref)
 );
 
 DEFINE_EVENT(btrfs_delayed_tree_ref,  run_delayed_tree_ref,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_delayed_ref_node *ref,
-		 const struct btrfs_delayed_tree_ref *full_ref,
-		 int action),
+		 const struct btrfs_delayed_ref_node *ref),
 
-	TP_ARGS(fs_info, ref, full_ref, action)
+	TP_ARGS(fs_info, ref)
 );
 
 DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_delayed_ref_node *ref,
-		 const struct btrfs_delayed_data_ref *full_ref,
-		 int action),
+		 const struct btrfs_delayed_ref_node *ref),
 
-	TP_ARGS(fs_info, ref, full_ref, action),
+	TP_ARGS(fs_info, ref),
 
 	TP_STRUCT__entry_btrfs(
 		__field(	u64,  bytenr		)
@@ -952,11 +944,11 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 	TP_fast_assign_btrfs(fs_info,
 		__entry->bytenr		= ref->bytenr;
 		__entry->num_bytes	= ref->num_bytes;
-		__entry->action		= action;
-		__entry->parent		= full_ref->parent;
-		__entry->ref_root	= full_ref->root;
-		__entry->owner		= full_ref->objectid;
-		__entry->offset		= full_ref->offset;
+		__entry->action		= ref->action;
+		__entry->parent		= ref->data_ref.parent;
+		__entry->ref_root	= ref->data_ref.root;
+		__entry->owner		= ref->data_ref.objectid;
+		__entry->offset		= ref->data_ref.offset;
 		__entry->type		= ref->type;
 		__entry->seq		= ref->seq;
 	),
@@ -978,21 +970,17 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_delayed_ref_node *ref,
-		 const struct btrfs_delayed_data_ref *full_ref,
-		 int action),
+		 const struct btrfs_delayed_ref_node *ref),
 
-	TP_ARGS(fs_info, ref, full_ref, action)
+	TP_ARGS(fs_info, ref)
 );
 
 DEFINE_EVENT(btrfs_delayed_data_ref,  run_delayed_data_ref,
 
 	TP_PROTO(const struct btrfs_fs_info *fs_info,
-		 const struct btrfs_delayed_ref_node *ref,
-		 const struct btrfs_delayed_data_ref *full_ref,
-		 int action),
+		 const struct btrfs_delayed_ref_node *ref),
 
-	TP_ARGS(fs_info, ref, full_ref, action)
+	TP_ARGS(fs_info, ref)
 );
 
 DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
-- 
cgit v1.2.3


From cf4f04325b2b27efa5697ba0ea4c1abdee0035b4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 12 Apr 2024 22:57:13 -0400
Subject: btrfs: move ->parent and ->ref_root into btrfs_delayed_ref_node

These two members are shared by both the tree refs and data refs, so
move them into btrfs_delayed_ref_node proper.  This allows us to greatly
simplify the comparison code, as the shared refs always only sort on
parent, and the non shared refs always sort first on ref_root, and then
only data refs sort on their specific fields.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c           | 12 +++----
 fs/btrfs/delayed-ref.c       | 82 ++++++++++++++------------------------------
 fs/btrfs/delayed-ref.h       | 13 ++++---
 fs/btrfs/extent-tree.c       | 18 +++++-----
 include/trace/events/btrfs.h |  8 ++---
 5 files changed, 52 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 58110c968667..abf2f4bd9f4a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -928,7 +928,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			}
 
 			ref = btrfs_delayed_node_to_tree_ref(node);
-			ret = add_indirect_ref(fs_info, preftrees, ref->root,
+			ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
 					       key_ptr, ref->level + 1,
 					       node->bytenr, count, sc,
 					       GFP_ATOMIC);
@@ -941,7 +941,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			ref = btrfs_delayed_node_to_tree_ref(node);
 
 			ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
-					     ref->parent, node->bytenr, count,
+					     node->parent, node->bytenr, count,
 					     sc, GFP_ATOMIC);
 			break;
 		}
@@ -972,18 +972,14 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 			if (sc && count < 0)
 				sc->have_delayed_delete_refs = true;
 
-			ret = add_indirect_ref(fs_info, preftrees, ref->root,
+			ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
 					       &key, 0, node->bytenr, count, sc,
 					       GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			/* SHARED DIRECT FULL backref */
-			struct btrfs_delayed_data_ref *ref;
-
-			ref = btrfs_delayed_node_to_data_ref(node);
-
-			ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+			ret = add_direct_ref(fs_info, preftrees, 0, node->parent,
 					     node->bytenr, count, sc,
 					     GFP_ATOMIC);
 			break;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 0f5e1f1f3efb..7342e8778166 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -303,55 +303,20 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-/*
- * compare two delayed tree backrefs with same bytenr and type
- */
-static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
-			  struct btrfs_delayed_tree_ref *ref2)
-{
-	struct btrfs_delayed_ref_node *node = btrfs_delayed_tree_ref_to_node(ref1);
-
-	if (node->type == BTRFS_TREE_BLOCK_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
-	return 0;
-}
-
 /*
  * compare two delayed data backrefs with same bytenr and type
  */
-static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
-			  struct btrfs_delayed_data_ref *ref2)
+static int comp_data_refs(struct btrfs_delayed_ref_node *ref1,
+			  struct btrfs_delayed_ref_node *ref2)
 {
-	struct btrfs_delayed_ref_node *node = btrfs_delayed_data_ref_to_node(ref1);
-
-	if (node->type == BTRFS_EXTENT_DATA_REF_KEY) {
-		if (ref1->root < ref2->root)
-			return -1;
-		if (ref1->root > ref2->root)
-			return 1;
-		if (ref1->objectid < ref2->objectid)
-			return -1;
-		if (ref1->objectid > ref2->objectid)
-			return 1;
-		if (ref1->offset < ref2->offset)
-			return -1;
-		if (ref1->offset > ref2->offset)
-			return 1;
-	} else {
-		if (ref1->parent < ref2->parent)
-			return -1;
-		if (ref1->parent > ref2->parent)
-			return 1;
-	}
+	if (ref1->data_ref.objectid < ref2->data_ref.objectid)
+		return -1;
+	if (ref1->data_ref.objectid > ref2->data_ref.objectid)
+		return 1;
+	if (ref1->data_ref.offset < ref2->data_ref.offset)
+		return -1;
+	if (ref1->data_ref.offset > ref2->data_ref.offset)
+		return 1;
 	return 0;
 }
 
@@ -365,13 +330,20 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
 		return -1;
 	if (ref1->type > ref2->type)
 		return 1;
-	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
-	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
-		ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
-				     btrfs_delayed_node_to_tree_ref(ref2));
-	else
-		ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
-				     btrfs_delayed_node_to_data_ref(ref2));
+	if (ref1->type == BTRFS_SHARED_BLOCK_REF_KEY ||
+	    ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	} else {
+		if (ref1->ref_root < ref2->ref_root)
+			return -1;
+		if (ref1->ref_root > ref2->ref_root)
+			return -1;
+		if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
+			ret = comp_data_refs(ref1, ref2);
+	}
 	if (ret)
 		return ret;
 	if (check_seq) {
@@ -1005,17 +977,15 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->seq = seq;
 	ref->type = btrfs_ref_type(generic_ref);
+	ref->ref_root = generic_ref->ref_root;
+	ref->parent = generic_ref->parent;
 	RB_CLEAR_NODE(&ref->ref_node);
 	INIT_LIST_HEAD(&ref->add_list);
 
 	if (generic_ref->type == BTRFS_REF_DATA) {
-		ref->data_ref.root = generic_ref->ref_root;
-		ref->data_ref.parent = generic_ref->parent;
 		ref->data_ref.objectid = generic_ref->data_ref.ino;
 		ref->data_ref.offset = generic_ref->data_ref.offset;
 	} else {
-		ref->tree_ref.root = generic_ref->ref_root;
-		ref->tree_ref.parent = generic_ref->parent;
 		ref->tree_ref.level = generic_ref->tree_ref.level;
 	}
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 05cf6eb2e604..ec53404cec0c 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -31,14 +31,10 @@ enum btrfs_delayed_ref_action {
 } __packed;
 
 struct btrfs_delayed_tree_ref {
-	u64 root;
-	u64 parent;
 	int level;
 };
 
 struct btrfs_delayed_data_ref {
-	u64 root;
-	u64 parent;
 	u64 objectid;
 	u64 offset;
 };
@@ -61,6 +57,15 @@ struct btrfs_delayed_ref_node {
 	/* seq number to keep track of insertion order */
 	u64 seq;
 
+	/* The ref_root for this ref */
+	u64 ref_root;
+
+	/*
+	 * The parent for this ref, if this isn't set the ref_root is the
+	 * reference owner.
+	 */
+	u64 parent;
+
 	/* ref count on this data structure */
 	refcount_t refs;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 96fb67a88c1e..60fb481e7411 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1577,7 +1577,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 	trace_run_delayed_data_ref(trans->fs_info, node);
 
 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
-		parent = ref->parent;
+		parent = node->parent;
 
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 		struct btrfs_key key;
@@ -1596,7 +1596,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = node->num_bytes;
 
-		ret = alloc_reserved_file_extent(trans, parent, ref->root,
+		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
 						 flags, ref->objectid,
 						 ref->offset, &key,
 						 node->ref_mod, href->owning_root);
@@ -1604,12 +1604,12 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 		if (!ret)
 			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root,
+		ret = __btrfs_inc_extent_ref(trans, node, parent, node->ref_root,
 					     ref->objectid, ref->offset,
 					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
 		ret = __btrfs_free_extent(trans, href, node, parent,
-					  ref->root, ref->objectid,
+					  node->ref_root, ref->objectid,
 					  ref->offset, extent_op);
 	} else {
 		BUG();
@@ -1740,8 +1740,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	trace_run_delayed_tree_ref(trans->fs_info, node);
 
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
-		parent = ref->parent;
-	ref_root = ref->root;
+		parent = node->parent;
+	ref_root = node->ref_root;
 
 	if (unlikely(node->ref_mod != 1)) {
 		btrfs_err(trans->fs_info,
@@ -2359,7 +2359,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
 		 * If our ref doesn't match the one we're currently looking at
 		 * then we have a cross reference.
 		 */
-		if (data_ref->root != root->root_key.objectid ||
+		if (ref->ref_root != root->root_key.objectid ||
 		    data_ref->objectid != objectid ||
 		    data_ref->offset != offset) {
 			ret = 1;
@@ -4946,11 +4946,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		btrfs_set_extent_inline_ref_type(leaf, iref,
 						 BTRFS_SHARED_BLOCK_REF_KEY);
-		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, node->parent);
 	} else {
 		btrfs_set_extent_inline_ref_type(leaf, iref,
 						 BTRFS_TREE_BLOCK_REF_KEY);
-		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, node->ref_root);
 	}
 
 	btrfs_mark_buffer_dirty(trans, leaf);
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index dae29f6d6b4c..e6cee75c384c 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -887,8 +887,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 		__entry->bytenr		= ref->bytenr;
 		__entry->num_bytes	= ref->num_bytes;
 		__entry->action		= ref->action;
-		__entry->parent		= ref->tree_ref.parent;
-		__entry->ref_root	= ref->tree_ref.root;
+		__entry->parent		= ref->parent;
+		__entry->ref_root	= ref->ref_root;
 		__entry->level		= ref->tree_ref.level;
 		__entry->type		= ref->type;
 		__entry->seq		= ref->seq;
@@ -945,8 +945,8 @@ DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 		__entry->bytenr		= ref->bytenr;
 		__entry->num_bytes	= ref->num_bytes;
 		__entry->action		= ref->action;
-		__entry->parent		= ref->data_ref.parent;
-		__entry->ref_root	= ref->data_ref.root;
+		__entry->parent		= ref->parent;
+		__entry->ref_root	= ref->ref_root;
 		__entry->owner		= ref->data_ref.objectid;
 		__entry->offset		= ref->data_ref.offset;
 		__entry->type		= ref->type;
-- 
cgit v1.2.3


From 44cc2e38e67b7b7551e923571ef6ea51f3ae1e05 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 12 Apr 2024 23:53:49 -0400
Subject: btrfs: stop referencing btrfs_delayed_data_ref directly

Now that most of our elements are inside of btrfs_delayed_ref_node
directly and we have helpers for the delayed_data_ref bits, go ahead and
remove all direct usage of btrfs_delayed_data_ref and use the helpers
where needed.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c           |  7 ++-----
 fs/btrfs/extent-tree.c       | 20 +++++++++++---------
 include/trace/events/btrfs.h |  1 -
 3 files changed, 13 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index abf2f4bd9f4a..06849122a022 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -947,12 +947,9 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			/* NORMAL INDIRECT DATA backref */
-			struct btrfs_delayed_data_ref *ref;
-			ref = btrfs_delayed_node_to_data_ref(node);
-
-			key.objectid = ref->objectid;
+			key.objectid = btrfs_delayed_ref_owner(node);
 			key.type = BTRFS_EXTENT_DATA_KEY;
-			key.offset = ref->offset;
+			key.offset = btrfs_delayed_ref_offset(node);
 
 			/*
 			 * If we have a share check context and a reference for
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 618dd1c50ecf..24997c83dc14 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1543,11 +1543,9 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 				bool insert_reserved)
 {
 	int ret = 0;
-	struct btrfs_delayed_data_ref *ref;
 	u64 parent = 0;
 	u64 flags = 0;
 
-	ref = btrfs_delayed_node_to_data_ref(node);
 	trace_run_delayed_data_ref(trans->fs_info, node);
 
 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -1562,6 +1560,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 			.is_inc	= true,
 			.generation = trans->transid,
 		};
+		u64 owner = btrfs_delayed_ref_owner(node);
+		u64 offset = btrfs_delayed_ref_offset(node);
 
 		if (extent_op)
 			flags |= extent_op->flags_to_set;
@@ -1571,9 +1571,9 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 		key.offset = node->num_bytes;
 
 		ret = alloc_reserved_file_extent(trans, parent, node->ref_root,
-						 flags, ref->objectid,
-						 ref->offset, &key,
-						 node->ref_mod, href->owning_root);
+						 flags, owner, offset, &key,
+						 node->ref_mod,
+						 href->owning_root);
 		free_head_ref_squota_rsv(trans->fs_info, href);
 		if (!ret)
 			ret = btrfs_record_squota_delta(trans->fs_info, &delta);
@@ -2258,7 +2258,6 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
 {
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_node *ref;
-	struct btrfs_delayed_data_ref *data_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_transaction *cur_trans;
 	struct rb_node *node;
@@ -2312,6 +2311,9 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
 	 */
 	for (node = rb_first_cached(&head->ref_tree); node;
 	     node = rb_next(node)) {
+		u64 ref_owner;
+		u64 ref_offset;
+
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
 		/* If it's a shared ref we know a cross reference exists */
 		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -2319,15 +2321,15 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
 			break;
 		}
 
-		data_ref = btrfs_delayed_node_to_data_ref(ref);
+		ref_owner = btrfs_delayed_ref_owner(ref);
+		ref_offset = btrfs_delayed_ref_offset(ref);
 
 		/*
 		 * If our ref doesn't match the one we're currently looking at
 		 * then we have a cross reference.
 		 */
 		if (ref->ref_root != root->root_key.objectid ||
-		    data_ref->objectid != objectid ||
-		    data_ref->offset != offset) {
+		    ref_owner != objectid || ref_offset != offset) {
 			ret = 1;
 			break;
 		}
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index e6cee75c384c..89fa96fd95b4 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -17,7 +17,6 @@ struct btrfs_file_extent_item;
 struct btrfs_ordered_extent;
 struct btrfs_delayed_ref_node;
 struct btrfs_delayed_tree_ref;
-struct btrfs_delayed_data_ref;
 struct btrfs_delayed_ref_head;
 struct btrfs_block_group;
 struct btrfs_free_cluster;
-- 
cgit v1.2.3


From efc7d5dbf8b42eb53db31443520e979b05cd1908 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Sat, 13 Apr 2024 00:07:06 -0400
Subject: btrfs: stop referencing btrfs_delayed_tree_ref directly

We only ever need to use this to get the level of the tree block ref, so
use the btrfs_delayed_ref_owner() helper, which returns the level for
the given reference.

Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/backref.c           | 21 +++++++++++----------
 fs/btrfs/extent-tree.c       | 10 +++++-----
 include/trace/events/btrfs.h |  1 -
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 06849122a022..5bb71dd1ab2a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -919,28 +919,29 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			/* NORMAL INDIRECT METADATA backref */
-			struct btrfs_delayed_tree_ref *ref;
 			struct btrfs_key *key_ptr = NULL;
+			/* The owner of a tree block ref is the level. */
+			int level = btrfs_delayed_ref_owner(node);
 
 			if (head->extent_op && head->extent_op->update_key) {
 				btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
 				key_ptr = &key;
 			}
 
-			ref = btrfs_delayed_node_to_tree_ref(node);
 			ret = add_indirect_ref(fs_info, preftrees, node->ref_root,
-					       key_ptr, ref->level + 1,
-					       node->bytenr, count, sc,
-					       GFP_ATOMIC);
+					       key_ptr, level + 1, node->bytenr,
+					       count, sc, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
-			/* SHARED DIRECT METADATA backref */
-			struct btrfs_delayed_tree_ref *ref;
-
-			ref = btrfs_delayed_node_to_tree_ref(node);
+			/*
+			 * SHARED DIRECT METADATA backref
+			 *
+			 * The owner of a tree block ref is the level.
+			 */
+			int level = btrfs_delayed_ref_owner(node);
 
-			ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
+			ret = add_direct_ref(fs_info, preftrees, level + 1,
 					     node->parent, node->bytenr, count,
 					     sc, GFP_ATOMIC);
 			break;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 24997c83dc14..87a921b84bab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4865,16 +4865,16 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	struct btrfs_delayed_tree_ref *ref;
 	u32 size = sizeof(*extent_item) + sizeof(*iref);
 	u64 flags = extent_op->flags_to_set;
+	/* The owner of a tree block is the level. */
+	int level = btrfs_delayed_ref_owner(node);
 	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 
-	ref = btrfs_delayed_node_to_tree_ref(node);
-
 	extent_key.objectid = node->bytenr;
 	if (skinny_metadata) {
-		extent_key.offset = ref->level;
+		/* The owner of a tree block is the level. */
+		extent_key.offset = level;
 		extent_key.type = BTRFS_METADATA_ITEM_KEY;
 	} else {
 		extent_key.offset = node->num_bytes;
@@ -4907,7 +4907,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	} else {
 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
 		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
-		btrfs_set_tree_block_level(leaf, block_info, ref->level);
+		btrfs_set_tree_block_level(leaf, block_info, level);
 		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
 	}
 
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 89fa96fd95b4..8f2497603cb5 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -16,7 +16,6 @@ struct extent_map;
 struct btrfs_file_extent_item;
 struct btrfs_ordered_extent;
 struct btrfs_delayed_ref_node;
-struct btrfs_delayed_tree_ref;
 struct btrfs_delayed_ref_head;
 struct btrfs_block_group;
 struct btrfs_free_cluster;
-- 
cgit v1.2.3


From 0d89a15e1a0dcc861481ee696ec332c998140510 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Tue, 9 Apr 2024 17:07:32 +0100
Subject: btrfs: add tracepoints for extent map shrinker events

Add some tracepoints for the extent map shrinker to help debug and analyse
main events. These have proved useful during development of the shrinker.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_map.c        | 13 ++++++
 fs/btrfs/super.c             |  5 ++-
 include/trace/events/btrfs.h | 99 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 62a21491c044..744e8952abb0 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1080,6 +1080,7 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, long *scanned, long nr_t
 			btrfs_set_inode_full_sync(inode);
 
 		remove_extent_mapping(inode, em);
+		trace_btrfs_extent_map_shrinker_remove_em(inode, em);
 		/* Drop the reference for the tree. */
 		free_extent_map(em);
 		nr_dropped++;
@@ -1152,6 +1153,12 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 	long nr_dropped = 0;
 	long scanned = 0;
 
+	if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
+		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+		trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan, nr);
+	}
+
 	while (scanned < nr_to_scan) {
 		struct btrfs_root *root;
 		unsigned long count;
@@ -1184,5 +1191,11 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
 		btrfs_put_root(root);
 	}
 
+	if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
+		s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+
+		trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
+	}
+
 	return nr_dropped;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 571f90d18c7f..2dbc930a20f7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2374,8 +2374,11 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
 static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	const s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
 
-	return percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
+	trace_btrfs_extent_map_shrinker_count(fs_info, nr);
+
+	return nr;
 }
 
 static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 8f2497603cb5..d2d94d7c3fb5 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -2537,6 +2537,105 @@ TRACE_EVENT(btrfs_get_raid_extent_offset,
 			__entry->devid)
 );
 
+TRACE_EVENT(btrfs_extent_map_shrinker_count,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr),
+
+	TP_ARGS(fs_info, nr),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	long,	nr	)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->nr		= nr;
+	),
+
+	TP_printk_btrfs("nr=%ld", __entry->nr)
+);
+
+TRACE_EVENT(btrfs_extent_map_shrinker_scan_enter,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_to_scan, long nr),
+
+	TP_ARGS(fs_info, nr_to_scan, nr),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	long,	nr_to_scan	)
+		__field(	long,	nr		)
+		__field(	u64,	last_root_id	)
+		__field(	u64,	last_ino	)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->nr_to_scan	= nr_to_scan;
+		__entry->nr		= nr;
+		__entry->last_root_id	= fs_info->extent_map_shrinker_last_root;
+		__entry->last_ino	= fs_info->extent_map_shrinker_last_ino;
+	),
+
+	TP_printk_btrfs("nr_to_scan=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
+			__entry->nr_to_scan, __entry->nr,
+			show_root_type(__entry->last_root_id), __entry->last_ino)
+);
+
+TRACE_EVENT(btrfs_extent_map_shrinker_scan_exit,
+
+	TP_PROTO(const struct btrfs_fs_info *fs_info, long nr_dropped, long nr),
+
+	TP_ARGS(fs_info, nr_dropped, nr),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	long,	nr_dropped	)
+		__field(	long,	nr		)
+		__field(	u64,	last_root_id	)
+		__field(	u64,	last_ino	)
+	),
+
+	TP_fast_assign_btrfs(fs_info,
+		__entry->nr_dropped	= nr_dropped;
+		__entry->nr		= nr;
+		__entry->last_root_id	= fs_info->extent_map_shrinker_last_root;
+		__entry->last_ino	= fs_info->extent_map_shrinker_last_ino;
+	),
+
+	TP_printk_btrfs("nr_dropped=%ld nr=%ld last_root=%llu(%s) last_ino=%llu",
+			__entry->nr_dropped, __entry->nr,
+			show_root_type(__entry->last_root_id), __entry->last_ino)
+);
+
+TRACE_EVENT(btrfs_extent_map_shrinker_remove_em,
+
+	TP_PROTO(const struct btrfs_inode *inode, const struct extent_map *em),
+
+	TP_ARGS(inode, em),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	ino		)
+		__field(	u64,	root_id		)
+		__field(	u64,	start		)
+		__field(	u64,	len		)
+		__field(	u64,	block_start	)
+		__field(	u32,	flags		)
+	),
+
+	TP_fast_assign_btrfs(inode->root->fs_info,
+		__entry->ino		= btrfs_ino(inode);
+		__entry->root_id	= inode->root->root_key.objectid;
+		__entry->start		= em->start;
+		__entry->len		= em->len;
+		__entry->block_start	= em->block_start;
+		__entry->flags		= em->flags;
+	),
+
+	TP_printk_btrfs(
+"ino=%llu root=%llu(%s) start=%llu len=%llu block_start=%llu(%s) flags=%s",
+			__entry->ino, show_root_type(__entry->root_id),
+			__entry->start, __entry->len,
+			show_map_type(__entry->block_start),
+			show_map_flags(__entry->flags))
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 098c290a490d0121e209617097ea563e5e607066 Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 24 Apr 2024 09:42:08 +0100
Subject: clock, reset: microchip: move all mpfs reset code to the reset
 subsystem

Stephen and Philipp, while reviewing patches, said that all of the aux
device creation and the register read/write code could be moved to the
reset subsystem, leaving the clock driver with no implementations of
reset_* functions at all. Move them.

Suggested-by: Philipp Zabel <p.zabel@pengutronix.de>
Suggested-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240424-strangle-sharpener-34755c5e6e3e@spud
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 drivers/clk/microchip/clk-mpfs.c | 92 +-------------------------------------
 drivers/reset/reset-mpfs.c       | 95 ++++++++++++++++++++++++++++++++++++----
 include/soc/microchip/mpfs.h     | 10 ++---
 3 files changed, 93 insertions(+), 104 deletions(-)

(limited to 'include')

diff --git a/drivers/clk/microchip/clk-mpfs.c b/drivers/clk/microchip/clk-mpfs.c
index 22eab91a6712..28ec0da88cb3 100644
--- a/drivers/clk/microchip/clk-mpfs.c
+++ b/drivers/clk/microchip/clk-mpfs.c
@@ -4,12 +4,10 @@
  *
  * Copyright (C) 2020-2022 Microchip Technology Inc. All rights reserved.
  */
-#include <linux/auxiliary_bus.h>
 #include <linux/clk-provider.h>
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
-#include <linux/slab.h>
 #include <dt-bindings/clock/microchip,mpfs-clock.h>
 #include <soc/microchip/mpfs.h>
 
@@ -361,93 +359,6 @@ static int mpfs_clk_register_periphs(struct device *dev, struct mpfs_periph_hw_c
 	return 0;
 }
 
-/*
- * Peripheral clock resets
- */
-
-#if IS_ENABLED(CONFIG_RESET_CONTROLLER)
-
-u32 mpfs_reset_read(struct device *dev)
-{
-	struct mpfs_clock_data *clock_data = dev_get_drvdata(dev->parent);
-
-	return readl_relaxed(clock_data->base + REG_SUBBLK_RESET_CR);
-}
-EXPORT_SYMBOL_NS_GPL(mpfs_reset_read, MCHP_CLK_MPFS);
-
-void mpfs_reset_write(struct device *dev, u32 val)
-{
-	struct mpfs_clock_data *clock_data = dev_get_drvdata(dev->parent);
-
-	writel_relaxed(val, clock_data->base + REG_SUBBLK_RESET_CR);
-}
-EXPORT_SYMBOL_NS_GPL(mpfs_reset_write, MCHP_CLK_MPFS);
-
-static void mpfs_reset_unregister_adev(void *_adev)
-{
-	struct auxiliary_device *adev = _adev;
-
-	auxiliary_device_delete(adev);
-	auxiliary_device_uninit(adev);
-}
-
-static void mpfs_reset_adev_release(struct device *dev)
-{
-	struct auxiliary_device *adev = to_auxiliary_dev(dev);
-
-	kfree(adev);
-}
-
-static struct auxiliary_device *mpfs_reset_adev_alloc(struct mpfs_clock_data *clk_data)
-{
-	struct auxiliary_device *adev;
-	int ret;
-
-	adev = kzalloc(sizeof(*adev), GFP_KERNEL);
-	if (!adev)
-		return ERR_PTR(-ENOMEM);
-
-	adev->name = "reset-mpfs";
-	adev->dev.parent = clk_data->dev;
-	adev->dev.release = mpfs_reset_adev_release;
-	adev->id = 666u;
-
-	ret = auxiliary_device_init(adev);
-	if (ret) {
-		kfree(adev);
-		return ERR_PTR(ret);
-	}
-
-	return adev;
-}
-
-static int mpfs_reset_controller_register(struct mpfs_clock_data *clk_data)
-{
-	struct auxiliary_device *adev;
-	int ret;
-
-	adev = mpfs_reset_adev_alloc(clk_data);
-	if (IS_ERR(adev))
-		return PTR_ERR(adev);
-
-	ret = auxiliary_device_add(adev);
-	if (ret) {
-		auxiliary_device_uninit(adev);
-		return ret;
-	}
-
-	return devm_add_action_or_reset(clk_data->dev, mpfs_reset_unregister_adev, adev);
-}
-
-#else /* !CONFIG_RESET_CONTROLLER */
-
-static int mpfs_reset_controller_register(struct mpfs_clock_data *clk_data)
-{
-	return 0;
-}
-
-#endif /* !CONFIG_RESET_CONTROLLER */
-
 static int mpfs_clk_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -499,7 +410,7 @@ static int mpfs_clk_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	return mpfs_reset_controller_register(clk_data);
+	return mpfs_reset_controller_register(dev, clk_data->base + REG_SUBBLK_RESET_CR);
 }
 
 static const struct of_device_id mpfs_clk_of_match_table[] = {
@@ -532,3 +443,4 @@ MODULE_DESCRIPTION("Microchip PolarFire SoC Clock Driver");
 MODULE_AUTHOR("Padmarao Begari <padmarao.begari@microchip.com>");
 MODULE_AUTHOR("Daire McNamara <daire.mcnamara@microchip.com>");
 MODULE_AUTHOR("Conor Dooley <conor.dooley@microchip.com>");
+MODULE_IMPORT_NS(MCHP_CLK_MPFS);
diff --git a/drivers/reset/reset-mpfs.c b/drivers/reset/reset-mpfs.c
index 7f3fb2d472f4..710f9c1676f9 100644
--- a/drivers/reset/reset-mpfs.c
+++ b/drivers/reset/reset-mpfs.c
@@ -8,9 +8,11 @@
  */
 #include <linux/auxiliary_bus.h>
 #include <linux/delay.h>
+#include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
+#include <linux/slab.h>
 #include <linux/reset-controller.h>
 #include <dt-bindings/clock/microchip,mpfs-clock.h>
 #include <soc/microchip/mpfs.h>
@@ -28,20 +30,30 @@
 /* block concurrent access to the soft reset register */
 static DEFINE_SPINLOCK(mpfs_reset_lock);
 
+struct mpfs_reset {
+	void __iomem *base;
+	struct reset_controller_dev rcdev;
+};
+
+static inline struct mpfs_reset *to_mpfs_reset(struct reset_controller_dev *rcdev)
+{
+	return container_of(rcdev, struct mpfs_reset, rcdev);
+}
+
 /*
  * Peripheral clock resets
  */
-
 static int mpfs_assert(struct reset_controller_dev *rcdev, unsigned long id)
 {
+	struct mpfs_reset *rst = to_mpfs_reset(rcdev);
 	unsigned long flags;
 	u32 reg;
 
 	spin_lock_irqsave(&mpfs_reset_lock, flags);
 
-	reg = mpfs_reset_read(rcdev->dev);
+	reg = readl(rst->base);
 	reg |= BIT(id);
-	mpfs_reset_write(rcdev->dev, reg);
+	writel(reg, rst->base);
 
 	spin_unlock_irqrestore(&mpfs_reset_lock, flags);
 
@@ -50,14 +62,15 @@ static int mpfs_assert(struct reset_controller_dev *rcdev, unsigned long id)
 
 static int mpfs_deassert(struct reset_controller_dev *rcdev, unsigned long id)
 {
+	struct mpfs_reset *rst = to_mpfs_reset(rcdev);
 	unsigned long flags;
 	u32 reg;
 
 	spin_lock_irqsave(&mpfs_reset_lock, flags);
 
-	reg = mpfs_reset_read(rcdev->dev);
+	reg = readl(rst->base);
 	reg &= ~BIT(id);
-	mpfs_reset_write(rcdev->dev, reg);
+	writel(reg, rst->base);
 
 	spin_unlock_irqrestore(&mpfs_reset_lock, flags);
 
@@ -66,7 +79,8 @@ static int mpfs_deassert(struct reset_controller_dev *rcdev, unsigned long id)
 
 static int mpfs_status(struct reset_controller_dev *rcdev, unsigned long id)
 {
-	u32 reg = mpfs_reset_read(rcdev->dev);
+	struct mpfs_reset *rst = to_mpfs_reset(rcdev);
+	u32 reg = readl(rst->base);
 
 	/*
 	 * It is safe to return here as MPFS_NUM_RESETS makes sure the sign bit
@@ -121,11 +135,15 @@ static int mpfs_reset_probe(struct auxiliary_device *adev,
 {
 	struct device *dev = &adev->dev;
 	struct reset_controller_dev *rcdev;
+	struct mpfs_reset *rst;
 
-	rcdev = devm_kzalloc(dev, sizeof(*rcdev), GFP_KERNEL);
-	if (!rcdev)
+	rst = devm_kzalloc(dev, sizeof(*rst), GFP_KERNEL);
+	if (!rst)
 		return -ENOMEM;
 
+	rst->base = (void __iomem *)adev->dev.platform_data;
+
+	rcdev = &rst->rcdev;
 	rcdev->dev = dev;
 	rcdev->dev->parent = dev->parent;
 	rcdev->ops = &mpfs_reset_ops;
@@ -137,9 +155,68 @@ static int mpfs_reset_probe(struct auxiliary_device *adev,
 	return devm_reset_controller_register(dev, rcdev);
 }
 
+static void mpfs_reset_unregister_adev(void *_adev)
+{
+	struct auxiliary_device *adev = _adev;
+
+	auxiliary_device_delete(adev);
+	auxiliary_device_uninit(adev);
+}
+
+static void mpfs_reset_adev_release(struct device *dev)
+{
+	struct auxiliary_device *adev = to_auxiliary_dev(dev);
+
+	kfree(adev);
+}
+
+static struct auxiliary_device *mpfs_reset_adev_alloc(struct device *clk_dev)
+{
+	struct auxiliary_device *adev;
+	int ret;
+
+	adev = kzalloc(sizeof(*adev), GFP_KERNEL);
+	if (!adev)
+		return ERR_PTR(-ENOMEM);
+
+	adev->name = "reset-mpfs";
+	adev->dev.parent = clk_dev;
+	adev->dev.release = mpfs_reset_adev_release;
+	adev->id = 666u;
+
+	ret = auxiliary_device_init(adev);
+	if (ret) {
+		kfree(adev);
+		return ERR_PTR(ret);
+	}
+
+	return adev;
+}
+
+int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base)
+{
+	struct auxiliary_device *adev;
+	int ret;
+
+	adev = mpfs_reset_adev_alloc(clk_dev);
+	if (IS_ERR(adev))
+		return PTR_ERR(adev);
+
+	ret = auxiliary_device_add(adev);
+	if (ret) {
+		auxiliary_device_uninit(adev);
+		return ret;
+	}
+
+	adev->dev.platform_data = (__force void *)base;
+
+	return devm_add_action_or_reset(clk_dev, mpfs_reset_unregister_adev, adev);
+}
+EXPORT_SYMBOL_NS_GPL(mpfs_reset_controller_register, MCHP_CLK_MPFS);
+
 static const struct auxiliary_device_id mpfs_reset_ids[] = {
 	{
-		.name = "clk_mpfs.reset-mpfs",
+		.name = "reset_mpfs.reset-mpfs",
 	},
 	{ }
 };
diff --git a/include/soc/microchip/mpfs.h b/include/soc/microchip/mpfs.h
index 09722f83b0ca..d7e612b5e22e 100644
--- a/include/soc/microchip/mpfs.h
+++ b/include/soc/microchip/mpfs.h
@@ -43,11 +43,11 @@ struct mtd_info *mpfs_sys_controller_get_flash(struct mpfs_sys_controller *mpfs_
 #endif /* if IS_ENABLED(CONFIG_POLARFIRE_SOC_SYS_CTRL) */
 
 #if IS_ENABLED(CONFIG_MCHP_CLK_MPFS)
-
-u32 mpfs_reset_read(struct device *dev);
-
-void mpfs_reset_write(struct device *dev, u32 val);
-
+#if IS_ENABLED(CONFIG_RESET_CONTROLLER)
+int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base);
+#else
+static inline int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base) { return 0; }
+#endif /* if IS_ENABLED(CONFIG_RESET_CONTROLLER) */
 #endif /* if IS_ENABLED(CONFIG_MCHP_CLK_MPFS) */
 
 #endif /* __SOC_MPFS_H__ */
-- 
cgit v1.2.3


From ecc2ae6176a9f0b9f4eab015452ffc5e1f54c95f Mon Sep 17 00:00:00 2001
From: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Date: Thu, 2 May 2024 13:13:01 +0200
Subject: net: phy: marvell: add support for MV88E6250 family internal PHYs

The embedded PHYs of the 88E6250 family switches are very basic - they
do not even have an Extended Address / Page register.

This adds support for the PHYs to the driver to set up PHY interrupts
and retrieve error stats. To deal with PHYs without a page register,
"simple" variants of all stat handling functions are introduced.

The code should work with all 88E6250 family switches (6250/6220/6071/
6070/6020). The PHY ID 0x01410db0 was read from a 88E6020, under the
assumption that all switches of this family use the same ID. The spec
only lists the prefix 0x01410c00 and leaves the last 10 bits as reserved,
but that seems too unspecific to be useful, as it would cover several
existing PHY IDs already supported by the driver; therefore, the ID read
from the actual hardware is used.

Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/0695f699cd942e6e06da9d30daeedfd47785bc01.1714643285.git.matthias.schiffer@ew.tq-group.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/marvell.c   | 81 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/marvell_phy.h |  2 ++
 2 files changed, 82 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index a57ad53cd09c..9964bf3dea2f 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -301,6 +301,7 @@
 #define LPA_PAUSE_ASYM_FIBER	0x100
 
 #define NB_FIBER_STATS	1
+#define NB_STAT_MAX	3
 
 MODULE_DESCRIPTION("Marvell PHY driver");
 MODULE_AUTHOR("Andy Fleming");
@@ -319,6 +320,23 @@ static const struct marvell_hw_stat marvell_hw_stats[] = {
 	{ "phy_receive_errors_fiber", 1, 21, 16},
 };
 
+static_assert(ARRAY_SIZE(marvell_hw_stats) <= NB_STAT_MAX);
+
+/* "simple" stat list + corresponding marvell_get_*_simple functions are used
+ * on PHYs without a page register
+ */
+struct marvell_hw_stat_simple {
+	const char *string;
+	u8 reg;
+	u8 bits;
+};
+
+static const struct marvell_hw_stat_simple marvell_hw_stats_simple[] = {
+	{ "phy_receive_errors", 21, 16},
+};
+
+static_assert(ARRAY_SIZE(marvell_hw_stats_simple) <= NB_STAT_MAX);
+
 enum {
 	M88E3082_VCT_OFF,
 	M88E3082_VCT_PHASE1,
@@ -326,7 +344,7 @@ enum {
 };
 
 struct marvell_priv {
-	u64 stats[ARRAY_SIZE(marvell_hw_stats)];
+	u64 stats[NB_STAT_MAX];
 	char *hwmon_name;
 	struct device *hwmon_dev;
 	bool cable_test_tdr;
@@ -1978,6 +1996,11 @@ static int marvell_get_sset_count(struct phy_device *phydev)
 		return ARRAY_SIZE(marvell_hw_stats) - NB_FIBER_STATS;
 }
 
+static int marvell_get_sset_count_simple(struct phy_device *phydev)
+{
+	return ARRAY_SIZE(marvell_hw_stats_simple);
+}
+
 static void marvell_get_strings(struct phy_device *phydev, u8 *data)
 {
 	int count = marvell_get_sset_count(phydev);
@@ -1989,6 +2012,17 @@ static void marvell_get_strings(struct phy_device *phydev, u8 *data)
 	}
 }
 
+static void marvell_get_strings_simple(struct phy_device *phydev, u8 *data)
+{
+	int count = marvell_get_sset_count_simple(phydev);
+	int i;
+
+	for (i = 0; i < count; i++) {
+		strscpy(data + i * ETH_GSTRING_LEN,
+			marvell_hw_stats_simple[i].string, ETH_GSTRING_LEN);
+	}
+}
+
 static u64 marvell_get_stat(struct phy_device *phydev, int i)
 {
 	struct marvell_hw_stat stat = marvell_hw_stats[i];
@@ -2008,6 +2042,25 @@ static u64 marvell_get_stat(struct phy_device *phydev, int i)
 	return ret;
 }
 
+static u64 marvell_get_stat_simple(struct phy_device *phydev, int i)
+{
+	struct marvell_hw_stat_simple stat = marvell_hw_stats_simple[i];
+	struct marvell_priv *priv = phydev->priv;
+	int val;
+	u64 ret;
+
+	val = phy_read(phydev, stat.reg);
+	if (val < 0) {
+		ret = U64_MAX;
+	} else {
+		val = val & ((1 << stat.bits) - 1);
+		priv->stats[i] += val;
+		ret = priv->stats[i];
+	}
+
+	return ret;
+}
+
 static void marvell_get_stats(struct phy_device *phydev,
 			      struct ethtool_stats *stats, u64 *data)
 {
@@ -2018,6 +2071,16 @@ static void marvell_get_stats(struct phy_device *phydev,
 		data[i] = marvell_get_stat(phydev, i);
 }
 
+static void marvell_get_stats_simple(struct phy_device *phydev,
+				     struct ethtool_stats *stats, u64 *data)
+{
+	int count = marvell_get_sset_count_simple(phydev);
+	int i;
+
+	for (i = 0; i < count; i++)
+		data[i] = marvell_get_stat_simple(phydev, i);
+}
+
 static int m88e1510_loopback(struct phy_device *phydev, bool enable)
 {
 	int err;
@@ -3924,6 +3987,21 @@ static struct phy_driver marvell_drivers[] = {
 		.get_strings = marvell_get_strings,
 		.get_stats = marvell_get_stats,
 	},
+	{
+		.phy_id = MARVELL_PHY_ID_88E6250_FAMILY,
+		.phy_id_mask = MARVELL_PHY_ID_MASK,
+		.name = "Marvell 88E6250 Family",
+		/* PHY_BASIC_FEATURES */
+		.probe = marvell_probe,
+		.aneg_done = marvell_aneg_done,
+		.config_intr = marvell_config_intr,
+		.handle_interrupt = marvell_handle_interrupt,
+		.resume = genphy_resume,
+		.suspend = genphy_suspend,
+		.get_sset_count = marvell_get_sset_count_simple,
+		.get_strings = marvell_get_strings_simple,
+		.get_stats = marvell_get_stats_simple,
+	},
 	{
 		.phy_id = MARVELL_PHY_ID_88E6341_FAMILY,
 		.phy_id_mask = MARVELL_PHY_ID_MASK,
@@ -4072,6 +4150,7 @@ static struct mdio_device_id __maybe_unused marvell_tbl[] = {
 	{ MARVELL_PHY_ID_88E1540, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E1545, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E3016, MARVELL_PHY_ID_MASK },
+	{ MARVELL_PHY_ID_88E6250_FAMILY, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E6341_FAMILY, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E6390_FAMILY, MARVELL_PHY_ID_MASK },
 	{ MARVELL_PHY_ID_88E6393_FAMILY, MARVELL_PHY_ID_MASK },
diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h
index 88254f9aec2b..b1fbe4118414 100644
--- a/include/linux/marvell_phy.h
+++ b/include/linux/marvell_phy.h
@@ -32,6 +32,8 @@
 /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */
 #define MARVELL_PHY_ID_88E1111_FINISAR	0x01ff0cc0
 
+/* ID from 88E6020, assumed to be the same for the whole 6250 family */
+#define MARVELL_PHY_ID_88E6250_FAMILY	0x01410db0
 /* These Ethernet switch families contain embedded PHYs, but they do
  * not have a model ID. So the switch driver traps reads to the ID2
  * register and returns the switch family ID
-- 
cgit v1.2.3


From 163943ac00cb31ac1a88ce5f78a7e2ead37329ec Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 7 May 2024 13:20:26 +0200
Subject: xsk: use generic DMA sync shortcut instead of a custom one

XSk infra's been using its own DMA sync shortcut to try avoiding
redundant function calls. Now that there is a generic one, remove
the custom implementation and rely on the generic helpers.
xsk_buff_dma_sync_for_cpu() doesn't need the second argument anymore,
remove it.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/net/ethernet/engleder/tsnep_main.c         |  2 +-
 drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c   |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_xsk.c         |  2 +-
 drivers/net/ethernet/intel/ice/ice_xsk.c           |  2 +-
 drivers/net/ethernet/intel/igc/igc_main.c          |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c       |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en/xsk/rx.c    |  4 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |  2 +-
 drivers/net/ethernet/netronome/nfp/nfd3/xsk.c      |  2 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  2 +-
 include/net/xdp_sock_drv.h                         |  7 ++----
 include/net/xsk_buff_pool.h                        | 14 +++--------
 net/xdp/xsk_buff_pool.c                            | 29 +++-------------------
 13 files changed, 21 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
index 4b15af6b7122..44da335d66bd 100644
--- a/drivers/net/ethernet/engleder/tsnep_main.c
+++ b/drivers/net/ethernet/engleder/tsnep_main.c
@@ -1587,7 +1587,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
 		length = __le32_to_cpu(entry->desc_wb->properties) &
 			 TSNEP_DESC_LENGTH_MASK;
 		xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN);
-		xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
+		xsk_buff_dma_sync_for_cpu(entry->xdp);
 
 		/* RX metadata with timestamps is in front of actual data,
 		 * subtract metadata size to get length of actual data and
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c
index 051748b997f3..a466c2379146 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-xsk.c
@@ -55,7 +55,7 @@ static u32 dpaa2_xsk_run_xdp(struct dpaa2_eth_priv *priv,
 	xdp_set_data_meta_invalid(xdp_buff);
 	xdp_buff->rxq = &ch->xdp_rxq;
 
-	xsk_buff_dma_sync_for_cpu(xdp_buff, ch->xsk_pool);
+	xsk_buff_dma_sync_for_cpu(xdp_buff);
 	xdp_act = bpf_prog_run_xdp(xdp_prog, xdp_buff);
 
 	/* xdp.data pointer may have changed */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 11500003af0d..d20ce517426e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -483,7 +483,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 
 		bi = *i40e_rx_bi(rx_ring, next_to_process);
 		xsk_buff_set_size(bi, size);
-		xsk_buff_dma_sync_for_cpu(bi, rx_ring->xsk_pool);
+		xsk_buff_dma_sync_for_cpu(bi);
 
 		if (!first)
 			first = bi;
diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
index 1857220d27fe..cecd5b1e0757 100644
--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
+++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
@@ -879,7 +879,7 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
 				   ICE_RX_FLX_DESC_PKT_LEN_M;
 
 		xsk_buff_set_size(xdp, size);
-		xsk_buff_dma_sync_for_cpu(xdp, xsk_pool);
+		xsk_buff_dma_sync_for_cpu(xdp);
 
 		if (!first) {
 			first = xdp;
diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c
index 4d975d620a8e..07692e2a7c64 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -2813,7 +2813,7 @@ static int igc_clean_rx_irq_zc(struct igc_q_vector *q_vector, const int budget)
 		}
 
 		bi->xdp->data_end = bi->xdp->data + size;
-		xsk_buff_dma_sync_for_cpu(bi->xdp, ring->xsk_pool);
+		xsk_buff_dma_sync_for_cpu(bi->xdp);
 
 		res = __igc_xdp_run_prog(adapter, prog, bi->xdp);
 		switch (res) {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
index d34d715c59eb..ee2d0ec12b2d 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
@@ -304,7 +304,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
 		}
 
 		bi->xdp->data_end = bi->xdp->data + size;
-		xsk_buff_dma_sync_for_cpu(bi->xdp, rx_ring->xsk_pool);
+		xsk_buff_dma_sync_for_cpu(bi->xdp);
 		xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
 
 		if (likely(xdp_res & (IXGBE_XDP_TX | IXGBE_XDP_REDIR))) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
index b8dd74453655..1b7132fa70de 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
@@ -270,7 +270,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 	/* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
 	mxbuf->cqe = cqe;
 	xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
-	xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool);
+	xsk_buff_dma_sync_for_cpu(&mxbuf->xdp);
 	net_prefetch(mxbuf->xdp.data);
 
 	/* Possible flows:
@@ -319,7 +319,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 	/* mxbuf->rq is set on allocation, but cqe is per-packet so set it here */
 	mxbuf->cqe = cqe;
 	xsk_buff_set_size(&mxbuf->xdp, cqe_bcnt);
-	xsk_buff_dma_sync_for_cpu(&mxbuf->xdp, rq->xsk_pool);
+	xsk_buff_dma_sync_for_cpu(&mxbuf->xdp);
 	net_prefetch(mxbuf->xdp.data);
 
 	prog = rcu_dereference(rq->xdp_prog);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index d601b5faaed5..b5333da20e8a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -917,7 +917,7 @@ INDIRECT_CALLABLE_SCOPE bool mlx5e_post_rx_wqes(struct mlx5e_rq *rq)
 
 	if (!rq->xsk_pool) {
 		count = mlx5e_refill_rx_wqes(rq, head, wqe_bulk);
-	} else if (likely(!rq->xsk_pool->dma_need_sync)) {
+	} else if (likely(!dma_dev_need_sync(rq->pdev))) {
 		mlx5e_xsk_free_rx_wqes(rq, head, wqe_bulk);
 		count = mlx5e_xsk_alloc_rx_wqes_batched(rq, head, wqe_bulk);
 	} else {
diff --git a/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c b/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c
index 45be6954d5aa..01cfa9cc1b5e 100644
--- a/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c
+++ b/drivers/net/ethernet/netronome/nfp/nfd3/xsk.c
@@ -184,7 +184,7 @@ nfp_nfd3_xsk_rx(struct nfp_net_rx_ring *rx_ring, int budget,
 		xrxbuf->xdp->data += meta_len;
 		xrxbuf->xdp->data_end = xrxbuf->xdp->data + pkt_len;
 		xdp_set_data_meta_invalid(xrxbuf->xdp);
-		xsk_buff_dma_sync_for_cpu(xrxbuf->xdp, r_vec->xsk_pool);
+		xsk_buff_dma_sync_for_cpu(xrxbuf->xdp);
 		net_prefetch(xrxbuf->xdp->data);
 
 		if (meta_len) {
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 7c6fb14b5555..206cba44dd30 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -5352,7 +5352,7 @@ read_again:
 
 		/* RX buffer is good and fit into a XSK pool buffer */
 		buf->xdp->data_end = buf->xdp->data + buf1_len;
-		xsk_buff_dma_sync_for_cpu(buf->xdp, rx_q->xsk_pool);
+		xsk_buff_dma_sync_for_cpu(buf->xdp);
 
 		prog = READ_ONCE(priv->xdp_prog);
 		res = __stmmac_xdp_run_prog(priv, prog, buf->xdp);
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index c9aec9ab6191..0a5dca2b2b3f 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -219,13 +219,10 @@ static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool
 	return meta;
 }
 
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
 {
 	struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 
-	if (!pool->dma_need_sync)
-		return;
-
 	xp_dma_sync_for_cpu(xskb);
 }
 
@@ -402,7 +399,7 @@ static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool
 	return NULL;
 }
 
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
 {
 }
 
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 99dd7376df6a..bacb33f1e3e5 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -43,7 +43,6 @@ struct xsk_dma_map {
 	refcount_t users;
 	struct list_head list; /* Protected by the RTNL_LOCK */
 	u32 dma_pages_cnt;
-	bool dma_need_sync;
 };
 
 struct xsk_buff_pool {
@@ -82,7 +81,6 @@ struct xsk_buff_pool {
 	u8 tx_metadata_len; /* inherited from umem */
 	u8 cached_need_wakeup;
 	bool uses_need_wakeup;
-	bool dma_need_sync;
 	bool unaligned;
 	bool tx_sw_csum;
 	void *addrs;
@@ -155,21 +153,17 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
 	return xskb->frame_dma;
 }
 
-void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
 static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
 {
-	xp_dma_sync_for_cpu_slow(xskb);
+	dma_sync_single_for_cpu(xskb->pool->dev, xskb->dma,
+				xskb->pool->frame_len,
+				DMA_BIDIRECTIONAL);
 }
 
-void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
-				 size_t size);
 static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
 					  dma_addr_t dma, size_t size)
 {
-	if (!pool->dma_need_sync)
-		return;
-
-	xp_dma_sync_for_device_slow(pool, dma, size);
+	dma_sync_single_for_device(pool->dev, dma, size, DMA_BIDIRECTIONAL);
 }
 
 /* Masks for xdp_umem_page flags.
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index ce60ecd48a4d..c0e0204b9630 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -338,7 +338,6 @@ static struct xsk_dma_map *xp_create_dma_map(struct device *dev, struct net_devi
 
 	dma_map->netdev = netdev;
 	dma_map->dev = dev;
-	dma_map->dma_need_sync = false;
 	dma_map->dma_pages_cnt = nr_pages;
 	refcount_set(&dma_map->users, 1);
 	list_add(&dma_map->list, &umem->xsk_dma_list);
@@ -424,7 +423,6 @@ static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_
 
 	pool->dev = dma_map->dev;
 	pool->dma_pages_cnt = dma_map->dma_pages_cnt;
-	pool->dma_need_sync = dma_map->dma_need_sync;
 	memcpy(pool->dma_pages, dma_map->dma_pages,
 	       pool->dma_pages_cnt * sizeof(*pool->dma_pages));
 
@@ -460,8 +458,6 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 			__xp_dma_unmap(dma_map, attrs);
 			return -ENOMEM;
 		}
-		if (dma_need_sync(dev, dma))
-			dma_map->dma_need_sync = true;
 		dma_map->dma_pages[i] = dma;
 	}
 
@@ -557,11 +553,9 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
 	xskb->xdp.data_meta = xskb->xdp.data;
 	xskb->xdp.flags = 0;
 
-	if (pool->dma_need_sync) {
-		dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
-						 pool->frame_len,
-						 DMA_BIDIRECTIONAL);
-	}
+	if (pool->dev)
+		xp_dma_sync_for_device(pool, xskb->dma, pool->frame_len);
+
 	return &xskb->xdp;
 }
 EXPORT_SYMBOL(xp_alloc);
@@ -633,7 +627,7 @@ u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max)
 {
 	u32 nb_entries1 = 0, nb_entries2;
 
-	if (unlikely(pool->dma_need_sync)) {
+	if (unlikely(pool->dev && dma_dev_need_sync(pool->dev))) {
 		struct xdp_buff *buff;
 
 		/* Slow path */
@@ -693,18 +687,3 @@ dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr)
 		(addr & ~PAGE_MASK);
 }
 EXPORT_SYMBOL(xp_raw_get_dma);
-
-void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb)
-{
-	dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0,
-				      xskb->pool->frame_len, DMA_BIDIRECTIONAL);
-}
-EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow);
-
-void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma,
-				 size_t size)
-{
-	dma_sync_single_range_for_device(pool->dev, dma, 0,
-					 size, DMA_BIDIRECTIONAL);
-}
-EXPORT_SYMBOL(xp_dma_sync_for_device_slow);
-- 
cgit v1.2.3


From 6d305cbef1aa01b9714e01e35f3d5c28544cf04d Mon Sep 17 00:00:00 2001
From: Erick Archer <erick.archer@outlook.com>
Date: Tue, 7 May 2024 19:33:18 +0200
Subject: uapi: stddef.h: Provide UAPI macros for __counted_by_{le, be}

This commit can be considered an addition to commit ca7e324e8ad3
("compiler_types: add Endianness-dependent __counted_by_{le,be}") [1].

In the commit referenced above the __counted_by_{le,be}() attributes
were defined based on platform's endianness with the goal to that the
structures contain flexible arrays at the end, and the counter for,
can be annotated with these attributes.

So, this commit only provide UAPI macros for UAPI structs that will
gain annotations for __counted_by_{le, be} attributes. And it is the
previous step to be able to use these attributes in UAPI.

Link: https://lore.kernel.org/r/20240327142241.1745989-2-aleksander.lobakin@intel.com
Suggested-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Erick Archer <erick.archer@outlook.com>
Link: https://lore.kernel.org/r/AS8PR02MB72372E45071E8821C07236F78BE42@AS8PR02MB7237.eurprd02.prod.outlook.com
Fixes: ca7e324e8ad3 ("compiler_types: add Endianness-dependent __counted_by_{le,be}")
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/uapi/linux/stddef.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h
index 2ec6f35cda32..58154117d9b0 100644
--- a/include/uapi/linux/stddef.h
+++ b/include/uapi/linux/stddef.h
@@ -55,4 +55,12 @@
 #define __counted_by(m)
 #endif
 
+#ifndef __counted_by_le
+#define __counted_by_le(m)
+#endif
+
+#ifndef __counted_by_be
+#define __counted_by_be(m)
+#endif
+
 #endif /* _UAPI_LINUX_STDDEF_H */
-- 
cgit v1.2.3


From 2b8af5001abdf583da3a63201cc6137553019515 Mon Sep 17 00:00:00 2001
From: Michael Margolin <mrgolin@amazon.com>
Date: Mon, 6 May 2024 15:18:29 +0000
Subject: RDMA/efa: Support QP with unsolicited write w/ imm. receive

Add a new EFA flags attribute for QP creation, and support unsolicited
write with immediate flag. QPs created with this flag set will not
consume receive work requests for incoming RDMA write with immediate.
Expose device capability bit for this feature support.

Reviewed-by: Daniel Kranzdorf <dkkranzd@amazon.com>
Reviewed-by: Firas Jahjah <firasj@amazon.com>
Signed-off-by: Michael Margolin <mrgolin@amazon.com>
Link: https://lore.kernel.org/r/20240506151829.6475-1-mrgolin@amazon.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
---
 drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 11 +++++++++--
 drivers/infiniband/hw/efa/efa_com_cmd.c         |  3 +++
 drivers/infiniband/hw/efa/efa_com_cmd.h         |  1 +
 drivers/infiniband/hw/efa/efa_verbs.c           | 19 ++++++++++++++++++-
 include/uapi/rdma/efa-abi.h                     |  7 +++++++
 5 files changed, 38 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index 7377c8a9f4d5..4296662e59c3 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -110,7 +110,10 @@ struct efa_admin_create_qp_cmd {
 	 *    virtual (IOVA returned by MR registration)
 	 * 1 : rq_virt - If set, RQ ring base address is
 	 *    virtual (IOVA returned by MR registration)
-	 * 7:2 : reserved - MBZ
+	 * 2 : unsolicited_write_recv - If set, work requests
+	 *    will not be consumed for incoming RDMA write with
+	 *    immediate
+	 * 7:3 : reserved - MBZ
 	 */
 	u8 flags;
 
@@ -663,7 +666,9 @@ struct efa_admin_feature_device_attr_desc {
 	 *    polling is supported
 	 * 3 : rdma_write - If set, RDMA Write is supported
 	 *    on TX queues
-	 * 31:4 : reserved - MBZ
+	 * 4 : unsolicited_write_recv - If set, unsolicited
+	 *    write with imm. receive is supported
+	 * 31:5 : reserved - MBZ
 	 */
 	u32 device_caps;
 
@@ -1009,6 +1014,7 @@ struct efa_admin_host_info {
 /* create_qp_cmd */
 #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK                BIT(0)
 #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK                BIT(1)
+#define EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV_MASK BIT(2)
 
 /* modify_qp_cmd */
 #define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK               BIT(0)
@@ -1044,6 +1050,7 @@ struct efa_admin_host_info {
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK   BIT(1)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_DATA_POLLING_128_MASK BIT(2)
 #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_WRITE_MASK  BIT(3)
+#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_UNSOLICITED_WRITE_RECV_MASK BIT(4)
 
 /* create_eq_cmd */
 #define EFA_ADMIN_CREATE_EQ_CMD_ENTRY_SIZE_WORDS_MASK       GENMASK(4, 0)
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
index d3398c7b0bd0..5b9c2b16df0e 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.c
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -32,6 +32,9 @@ int efa_com_create_qp(struct efa_com_dev *edev,
 			params->rq_depth;
 	create_qp_cmd.uar = params->uarn;
 
+	if (params->unsolicited_write_recv)
+		EFA_SET(&create_qp_cmd.flags, EFA_ADMIN_CREATE_QP_CMD_UNSOLICITED_WRITE_RECV, 1);
+
 	err = efa_com_cmd_exec(aq,
 			       (struct efa_admin_aq_entry *)&create_qp_cmd,
 			       sizeof(create_qp_cmd),
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h
index 720a99ba0f7d..9714105fcf7e 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.h
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.h
@@ -27,6 +27,7 @@ struct efa_com_create_qp_params {
 	u16 pd;
 	u16 uarn;
 	u8 qp_type;
+	u8 unsolicited_write_recv : 1;
 };
 
 struct efa_com_create_qp_result {
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 2f412db2edcd..8f7a13b79cdc 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -263,6 +263,9 @@ int efa_query_device(struct ib_device *ibdev,
 		if (EFA_DEV_CAP(dev, RDMA_WRITE))
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_WRITE;
 
+		if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV))
+			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV;
+
 		if (dev->neqs)
 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
 
@@ -639,6 +642,7 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
 	struct efa_ibv_create_qp cmd = {};
 	struct efa_qp *qp = to_eqp(ibqp);
 	struct efa_ucontext *ucontext;
+	u16 supported_efa_flags = 0;
 	int err;
 
 	ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
@@ -676,13 +680,23 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
 		goto err_out;
 	}
 
-	if (cmd.comp_mask) {
+	if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_90)) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, unknown fields in udata\n");
 		err = -EINVAL;
 		goto err_out;
 	}
 
+	if (EFA_DEV_CAP(dev, UNSOLICITED_WRITE_RECV))
+		supported_efa_flags |= EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV;
+
+	if (cmd.flags & ~supported_efa_flags) {
+		ibdev_dbg(&dev->ibdev, "Unsupported EFA QP create flags[%#x], supported[%#x]\n",
+			  cmd.flags, supported_efa_flags);
+		err = -EOPNOTSUPP;
+		goto err_out;
+	}
+
 	create_qp_params.uarn = ucontext->uarn;
 	create_qp_params.pd = to_epd(ibqp->pd)->pdn;
 
@@ -722,6 +736,9 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
 		create_qp_params.rq_base_addr = qp->rq_dma_addr;
 	}
 
+	if (cmd.flags & EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV)
+		create_qp_params.unsolicited_write_recv = true;
+
 	err = efa_com_create_qp(&dev->edev, &create_qp_params,
 				&create_qp_resp);
 	if (err)
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
index 701e2d567e41..d689b8b34189 100644
--- a/include/uapi/rdma/efa-abi.h
+++ b/include/uapi/rdma/efa-abi.h
@@ -85,11 +85,17 @@ enum {
 	EFA_QP_DRIVER_TYPE_SRD = 0,
 };
 
+enum {
+	EFA_CREATE_QP_WITH_UNSOLICITED_WRITE_RECV = 1 << 0,
+};
+
 struct efa_ibv_create_qp {
 	__u32 comp_mask;
 	__u32 rq_ring_size; /* bytes */
 	__u32 sq_ring_size; /* bytes */
 	__u32 driver_qp_type;
+	__u16 flags;
+	__u8 reserved_90[6];
 };
 
 struct efa_ibv_create_qp_resp {
@@ -123,6 +129,7 @@ enum {
 	EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID     = 1 << 3,
 	EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4,
 	EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5,
+	EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV = 1 << 6,
 };
 
 struct efa_ibv_ex_query_device_resp {
-- 
cgit v1.2.3


From 96c6f337951a03ab40e13a44cf5ea59b14725721 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 3 May 2024 15:13:40 +0200
Subject: net: dsa: add support for DCB get/set apptrust configuration

Add DCB support to get/set trust configuration for different packet
priority information sources. Some switch allow to chose different
source of packet priority classification. For example on KSZ switches it
is possible to configure VLAN PCP and/or DSCP sources.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Reviewed-by: Florian Fainelli <florian.fainelli@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  4 ++++
 net/dsa/user.c    | 28 ++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index eef702dbea78..40c127a30187 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -955,6 +955,10 @@ struct dsa_switch_ops {
 				      u8 prio);
 	int	(*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
 				      u8 prio);
+	int	(*port_set_apptrust)(struct dsa_switch *ds, int port,
+				     const u8 *sel, int nsel);
+	int	(*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel,
+				     int *nsel);
 
 	/*
 	 * Suspend and resume
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 1f7b8f21db9e..89282c086776 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -2136,6 +2136,32 @@ out_conduit_failed:
 	return err;
 }
 
+static int __maybe_unused
+dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->port_set_apptrust)
+		return -EOPNOTSUPP;
+
+	return ds->ops->port_set_apptrust(ds, port, sel, nsel);
+}
+
+static int __maybe_unused
+dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+
+	if (!ds->ops->port_get_apptrust)
+		return -EOPNOTSUPP;
+
+	return ds->ops->port_get_apptrust(ds, port, sel, nsel);
+}
+
 static int __maybe_unused
 dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
 {
@@ -2376,6 +2402,8 @@ static const struct ethtool_ops dsa_user_ethtool_ops = {
 static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = {
 	.ieee_setapp		= dsa_user_dcbnl_ieee_setapp,
 	.ieee_delapp		= dsa_user_dcbnl_ieee_delapp,
+	.dcbnl_setapptrust	= dsa_user_dcbnl_set_apptrust,
+	.dcbnl_getapptrust	= dsa_user_dcbnl_get_apptrust,
 };
 
 static void dsa_user_get_stats64(struct net_device *dev,
-- 
cgit v1.2.3


From 768cf8413883892c6ff0db783a0295a60886ec33 Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 3 May 2024 15:13:42 +0200
Subject: net: add IEEE 802.1q specific helpers

IEEE 802.1q specification provides recommendation and examples which can
be used as good default values for different drivers.

This patch implements mapping examples documented in IEEE 802.1Q-2022 in
Annex I "I.3 Traffic type to traffic class mapping" and IETF DSCP naming
and mapping DSCP to Traffic Type inspired by RFC8325.

This helpers will be used in followup patches for dsa/microchip DCB
implementation.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dscp.h           |  76 ++++++++++++++
 include/net/ieee8021q.h      |  57 ++++++++++
 net/Kconfig                  |   3 +
 net/core/Makefile            |   1 +
 net/core/ieee8021q_helpers.c | 242 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 379 insertions(+)
 create mode 100644 include/net/dscp.h
 create mode 100644 include/net/ieee8021q.h
 create mode 100644 net/core/ieee8021q_helpers.c

(limited to 'include')

diff --git a/include/net/dscp.h b/include/net/dscp.h
new file mode 100644
index 000000000000..ba40540868c9
--- /dev/null
+++ b/include/net/dscp.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */
+
+#ifndef __DSCP_H__
+#define __DSCP_H__
+
+/*
+ * DSCP Pools and Codepoint Space Division:
+ *
+ * The Differentiated Services (Diffserv) architecture defines a method for
+ * classifying and managing network traffic using the DS field in IPv4 and IPv6
+ * packet headers. This field can carry one of 64 distinct DSCP (Differentiated
+ * Services Code Point) values, which are divided into three pools based on
+ * their Least Significant Bits (LSB) patterns and intended usage. Each pool has
+ * a specific registration procedure for assigning DSCP values:
+ *
+ * Pool 1 (Standards Action Pool):
+ * - Codepoint Space: xxxxx0
+ *   This pool includes DSCP values ending in '0' (binary), allocated via
+ *   Standards Action. It is intended for globally recognized traffic classes,
+ *   ensuring interoperability across the internet. This pool encompasses
+ *   well-known DSCP values such as CS0-CS7, AFxx, EF, and VOICE-ADMIT.
+ *
+ * Pool 2 (Experimental/Local Use Pool):
+ * - Codepoint Space: xxxx11
+ *   Reserved for DSCP values ending in '11' (binary), this pool is designated
+ *   for Experimental or Local Use. It allows for private or temporary traffic
+ *   marking schemes not intended for standardized global use, facilitating
+ *   testing and network-specific configurations without impacting
+ *   interoperability.
+ *
+ * Pool 3 (Preferential Standardization Pool):
+ * - Codepoint Space: xxxx01
+ *   Initially reserved for experimental or local use, this pool now serves as
+ *   a secondary standardization resource should Pool 1 become exhausted. DSCP
+ *   values ending in '01' (binary) are assigned via Standards Action, with a
+ *   focus on adopting new, standardized traffic classes as the need arises.
+ *
+ * For pool updates see:
+ * https://www.iana.org/assignments/dscp-registry/dscp-registry.xhtml
+ */
+
+/* Pool 1: Standardized DSCP values as per [RFC8126] */
+#define DSCP_CS0 0		/* 000000, [RFC2474] */
+/* CS0 is some times called default (DF) */
+#define DSCP_DF 0		/* 000000, [RFC2474] */
+#define DSCP_CS1 8		/* 001000, [RFC2474] */
+#define DSCP_CS2 16		/* 010000, [RFC2474] */
+#define DSCP_CS3 24		/* 011000, [RFC2474] */
+#define DSCP_CS4 32		/* 100000, [RFC2474] */
+#define DSCP_CS5 40		/* 101000, [RFC2474] */
+#define DSCP_CS6 48		/* 110000, [RFC2474] */
+#define DSCP_CS7 56		/* 111000, [RFC2474] */
+#define DSCP_AF11 10		/* 001010, [RFC2597] */
+#define DSCP_AF12 12		/* 001100, [RFC2597] */
+#define DSCP_AF13 14		/* 001110, [RFC2597] */
+#define DSCP_AF21 18		/* 010010, [RFC2597] */
+#define DSCP_AF22 20		/* 010100, [RFC2597] */
+#define DSCP_AF23 22		/* 010110, [RFC2597] */
+#define DSCP_AF31 26		/* 011010, [RFC2597] */
+#define DSCP_AF32 28		/* 011100, [RFC2597] */
+#define DSCP_AF33 30		/* 011110, [RFC2597] */
+#define DSCP_AF41 34		/* 100010, [RFC2597] */
+#define DSCP_AF42 36		/* 100100, [RFC2597] */
+#define DSCP_AF43 38		/* 100110, [RFC2597] */
+#define DSCP_EF 46		/* 101110, [RFC3246] */
+#define DSCP_VOICE_ADMIT 44	/* 101100, [RFC5865] */
+
+/* Pool 3: Standardized assignments, previously available for experimental/local
+ * use
+ */
+#define DSCP_LE 1		/* 000001, [RFC8622] */
+
+#define DSCP_MAX 64
+
+#endif /* __DSCP_H__ */
diff --git a/include/net/ieee8021q.h b/include/net/ieee8021q.h
new file mode 100644
index 000000000000..8bfe903dd3d0
--- /dev/null
+++ b/include/net/ieee8021q.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> */
+
+#ifndef _NET_IEEE8021Q_H
+#define _NET_IEEE8021Q_H
+
+#include <linux/errno.h>
+
+/**
+ * enum ieee8021q_traffic_type - 802.1Q traffic type priority values (802.1Q-2022)
+ *
+ * @IEEE8021Q_TT_BK: Background
+ * @IEEE8021Q_TT_BE: Best Effort (default). According to 802.1Q-2022, BE is 0
+ * but has higher priority than BK which is 1.
+ * @IEEE8021Q_TT_EE: Excellent Effort
+ * @IEEE8021Q_TT_CA: Critical Applications
+ * @IEEE8021Q_TT_VI: Video, < 100 ms latency and jitter
+ * @IEEE8021Q_TT_VO: Voice, < 10 ms latency and jitter
+ * @IEEE8021Q_TT_IC: Internetwork Control
+ * @IEEE8021Q_TT_NC: Network Control
+ */
+enum ieee8021q_traffic_type {
+	IEEE8021Q_TT_BK = 0,
+	IEEE8021Q_TT_BE = 1,
+	IEEE8021Q_TT_EE = 2,
+	IEEE8021Q_TT_CA = 3,
+	IEEE8021Q_TT_VI = 4,
+	IEEE8021Q_TT_VO = 5,
+	IEEE8021Q_TT_IC = 6,
+	IEEE8021Q_TT_NC = 7,
+
+	/* private: */
+	IEEE8021Q_TT_MAX,
+};
+
+#define SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp)		((dscp >> 3) & 0x7)
+
+#if IS_ENABLED(CONFIG_NET_IEEE8021Q_HELPERS)
+
+int ietf_dscp_to_ieee8021q_tt(u8 dscp);
+int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues);
+
+#else
+
+static inline int ietf_dscp_to_ieee8021q_tt(u8 dscp)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt,
+				     unsigned int num_queues)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif
+#endif /* _NET_IEEE8021Q_H */
diff --git a/net/Kconfig b/net/Kconfig
index d5ab791f7afa..f0a8692496ff 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -452,6 +452,9 @@ config GRO_CELLS
 config SOCK_VALIDATE_XMIT
 	bool
 
+config NET_IEEE8021Q_HELPERS
+	bool
+
 config NET_SELFTESTS
 	def_tristate PHYLIB
 	depends on PHYLIB && INET
diff --git a/net/core/Makefile b/net/core/Makefile
index 21d6fbc7e884..62be9aef2528 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
 obj-$(CONFIG_TRACEPOINTS) += net-traces.o
 obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o
 obj-$(CONFIG_NET_SELFTESTS) += selftests.o
 obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c
new file mode 100644
index 000000000000..759a9b9f3f89
--- /dev/null
+++ b/net/core/ieee8021q_helpers.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/array_size.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <net/dscp.h>
+#include <net/ieee8021q.h>
+
+/* The following arrays map Traffic Types (TT) to traffic classes (TC) for
+ * different number of queues as shown in the example provided by
+ * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and
+ * Table I-1 "Traffic type to traffic class mapping".
+ */
+static const u8 ieee8021q_8queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0,
+	[IEEE8021Q_TT_BE] = 1,
+	[IEEE8021Q_TT_EE] = 2,
+	[IEEE8021Q_TT_CA] = 3,
+	[IEEE8021Q_TT_VI] = 4,
+	[IEEE8021Q_TT_VO] = 5,
+	[IEEE8021Q_TT_IC] = 6,
+	[IEEE8021Q_TT_NC] = 7,
+};
+
+static const u8 ieee8021q_7queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0,
+	[IEEE8021Q_TT_BE] = 1,
+	[IEEE8021Q_TT_EE] = 2,
+	[IEEE8021Q_TT_CA] = 3,
+	[IEEE8021Q_TT_VI] = 4,	[IEEE8021Q_TT_VO] = 4,
+	[IEEE8021Q_TT_IC] = 5,
+	[IEEE8021Q_TT_NC] = 6,
+};
+
+static const u8 ieee8021q_6queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0,
+	[IEEE8021Q_TT_BE] = 1,
+	[IEEE8021Q_TT_EE] = 2,	[IEEE8021Q_TT_CA] = 2,
+	[IEEE8021Q_TT_VI] = 3,	[IEEE8021Q_TT_VO] = 3,
+	[IEEE8021Q_TT_IC] = 4,
+	[IEEE8021Q_TT_NC] = 5,
+};
+
+static const u8 ieee8021q_5queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+	[IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+	[IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+	[IEEE8021Q_TT_IC] = 3,
+	[IEEE8021Q_TT_NC] = 4,
+};
+
+static const u8 ieee8021q_4queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+	[IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+	[IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+	[IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3,
+};
+
+static const u8 ieee8021q_3queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+	[IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+	[IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2,
+};
+
+static const u8 ieee8021q_2queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+	[IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+	[IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1,
+};
+
+static const u8 ieee8021q_1queue_tt_tc_map[] = {
+	[IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+	[IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+	[IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0,
+	[IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0,
+};
+
+/**
+ * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class
+ * @tt: IEEE 802.1Q Traffic Type
+ * @num_queues: Number of queues
+ *
+ * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based
+ * on the number of queues configured on the NIC. The mapping is based on the
+ * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic
+ * class mapping" and Table I-1 "Traffic type to traffic class mapping".
+ *
+ * Return: Traffic Class corresponding to the given Traffic Type or negative
+ * value in case of error.
+ */
+int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues)
+{
+	if (tt < 0 || tt >= IEEE8021Q_TT_MAX) {
+		pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt,
+		       IEEE8021Q_TT_MAX);
+		return -EINVAL;
+	}
+
+	switch (num_queues) {
+	case 8:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_8queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_8queue_tt_tc_map != max - 1");
+		return ieee8021q_8queue_tt_tc_map[tt];
+	case 7:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_7queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_7queue_tt_tc_map != max - 1");
+
+		return ieee8021q_7queue_tt_tc_map[tt];
+	case 6:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_6queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_6queue_tt_tc_map != max - 1");
+
+		return ieee8021q_6queue_tt_tc_map[tt];
+	case 5:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_5queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_5queue_tt_tc_map != max - 1");
+
+		return ieee8021q_5queue_tt_tc_map[tt];
+	case 4:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_4queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_4queue_tt_tc_map != max - 1");
+
+		return ieee8021q_4queue_tt_tc_map[tt];
+	case 3:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_3queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_3queue_tt_tc_map != max - 1");
+
+		return ieee8021q_3queue_tt_tc_map[tt];
+	case 2:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_2queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_2queue_tt_tc_map != max - 1");
+
+		return ieee8021q_2queue_tt_tc_map[tt];
+	case 1:
+		compiletime_assert(ARRAY_SIZE(ieee8021q_1queue_tt_tc_map) !=
+				   IEEE8021Q_TT_MAX - 1,
+				   "ieee8021q_1queue_tt_tc_map != max - 1");
+
+		return ieee8021q_1queue_tt_tc_map[tt];
+	}
+
+	pr_err("Invalid number of queues %d\n", num_queues);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc);
+
+/**
+ * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type
+ * @dscp: IETF DSCP value
+ *
+ * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT).
+ * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic
+ * Type, this function is inspired by the RFC8325 documentation which describe
+ * the mapping between DSCP and 802.11 User Priority (UP) values.
+ *
+ * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value
+ */
+int ietf_dscp_to_ieee8021q_tt(u8 dscp)
+{
+	switch (dscp) {
+	case DSCP_CS0:
+	/* Comment from RFC8325:
+	 * [RFC4594], Section 4.8, recommends High-Throughput Data be marked
+	 * AF1x (that is, AF11, AF12, and AF13, according to the rules defined
+	 * in [RFC2475]).
+	 *
+	 * By default (as described in Section 2.3), High-Throughput Data will
+	 * map to UP 1 and, thus, to the Background Access Category (AC_BK),
+	 * which is contrary to the intent expressed in [RFC4594].
+
+	 * Unfortunately, there really is no corresponding fit for the High-
+	 * Throughput Data service class within the constrained 4 Access
+	 * Category [IEEE.802.11-2016] model.  If the High-Throughput Data
+	 * service class is assigned to the Best Effort Access Category (AC_BE),
+	 * then it would contend with Low-Latency Data (while [RFC4594]
+	 * recommends a distinction in servicing between these service classes)
+	 * as well as with the default service class; alternatively, if it is
+	 * assigned to the Background Access Category (AC_BK), then it would
+	 * receive a less-then-best-effort service and contend with Low-Priority
+	 * Data (as discussed in Section 4.2.10).
+	 *
+	 * As such, since there is no directly corresponding fit for the High-
+	 * Throughout Data service class within the [IEEE.802.11-2016] model, it
+	 * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby
+	 * admitting it to the Best Effort Access Category (AC_BE).
+	 *
+	 * Note: The above text is from RFC8325 which is describing the mapping
+	 * between DSCP and 802.11 User Priority (UP) values. The mapping
+	 * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but
+	 * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q
+	 * Traffic Types BE and BK.
+	 */
+	case DSCP_AF11:
+	case DSCP_AF12:
+	case DSCP_AF13:
+		return IEEE8021Q_TT_BE;
+	/* Comment from RFC8325:
+	 * RFC3662 and RFC4594 both recommend Low-Priority Data be marked
+	 * with DSCP CS1. The Low-Priority Data service class loosely
+	 * corresponds to the [IEEE.802.11-2016] Background Access Category
+	 */
+	case DSCP_CS1:
+		return IEEE8021Q_TT_BK;
+	case DSCP_CS2:
+	case DSCP_AF21:
+	case DSCP_AF22:
+	case DSCP_AF23:
+		return IEEE8021Q_TT_EE;
+	case DSCP_CS3:
+	case DSCP_AF31:
+	case DSCP_AF32:
+	case DSCP_AF33:
+		return IEEE8021Q_TT_CA;
+	case DSCP_CS4:
+	case DSCP_AF41:
+	case DSCP_AF42:
+	case DSCP_AF43:
+		return IEEE8021Q_TT_VI;
+	case DSCP_CS5:
+	case DSCP_EF:
+	case DSCP_VOICE_ADMIT:
+		return IEEE8021Q_TT_VO;
+	case DSCP_CS6:
+		return IEEE8021Q_TT_IC;
+	case DSCP_CS7:
+		return IEEE8021Q_TT_NC;
+	}
+
+	return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp);
+}
+EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt);
-- 
cgit v1.2.3


From 5f5109af47535bc613c12a089dded425a373a12f Mon Sep 17 00:00:00 2001
From: Oleksij Rempel <o.rempel@pengutronix.de>
Date: Fri, 3 May 2024 15:13:49 +0200
Subject: net: dsa: add support switches global DSCP priority mapping

Some switches like Microchip KSZ variants do not support per port DSCP
priority configuration. Instead there is a global DSCP mapping table.

To handle it, we will accept set/del request to any of user ports to
make global configuration and update dcb app entries for all other
ports.

Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h |  9 +++++++
 net/dsa/user.c    | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index 40c127a30187..b60e7e410aba 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -433,6 +433,11 @@ struct dsa_switch {
 	 */
 	u32			fdb_isolation:1;
 
+	/* Drivers that have global DSCP mapping settings must set this to
+	 * true to automatically apply the settings to all ports.
+	 */
+	u32			dscp_prio_mapping_is_global:1;
+
 	/* Listener for switch fabric events */
 	struct notifier_block	nb;
 
@@ -586,6 +591,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p)
 	dsa_switch_for_each_port((_dp), (_ds)) \
 		if (dsa_port_is_user((_dp)))
 
+#define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \
+	dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \
+		if (dsa_port_is_user((_dp)))
+
 #define dsa_switch_for_each_cpu_port(_dp, _ds) \
 	dsa_switch_for_each_port((_dp), (_ds)) \
 		if (dsa_port_is_cpu((_dp)))
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 89282c086776..867c5fe9a4da 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -2189,6 +2189,58 @@ dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
 	return 0;
 }
 
+/* Update the DSCP prio entries on all user ports of the switch in case
+ * the switch supports global DSCP prio instead of per port DSCP prios.
+ */
+static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev,
+						  struct dcb_app *app, bool del)
+{
+	int (*setdel)(struct net_device *dev, struct dcb_app *app);
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_port *other_dp;
+	int err, restore_err;
+
+	if (del)
+		setdel = dcb_ieee_delapp;
+	else
+		setdel = dcb_ieee_setapp;
+
+	dsa_switch_for_each_user_port(other_dp, ds) {
+		struct net_device *user = other_dp->user;
+
+		if (!user || user == dev)
+			continue;
+
+		err = setdel(user, app);
+		if (err)
+			goto err_try_to_restore;
+	}
+
+	return 0;
+
+err_try_to_restore:
+
+	/* Revert logic to restore previous state of app entries */
+	if (!del)
+		setdel = dcb_ieee_delapp;
+	else
+		setdel = dcb_ieee_setapp;
+
+	dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) {
+		struct net_device *user = other_dp->user;
+
+		if (!user || user == dev)
+			continue;
+
+		restore_err = setdel(user, app);
+		if (restore_err)
+			netdev_err(user, "Failed to restore DSCP prio entry configuration\n");
+	}
+
+	return err;
+}
+
 static int __maybe_unused
 dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
 {
@@ -2220,6 +2272,17 @@ dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
 		return err;
 	}
 
+	if (!ds->dscp_prio_mapping_is_global)
+		return 0;
+
+	err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false);
+	if (err) {
+		if (ds->ops->port_del_dscp_prio)
+			ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio);
+		dcb_ieee_delapp(dev, app);
+		return err;
+	}
+
 	return 0;
 }
 
@@ -2290,6 +2353,18 @@ dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
 		return err;
 	}
 
+	if (!ds->dscp_prio_mapping_is_global)
+		return 0;
+
+	err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true);
+	if (err) {
+		if (ds->ops->port_add_dscp_prio)
+			ds->ops->port_add_dscp_prio(ds, port, dscp,
+						    app->priority);
+		dcb_ieee_setapp(dev, app);
+		return err;
+	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From db3efdcf70c752e8a8deb16071d8e693c3ef8746 Mon Sep 17 00:00:00 2001
From: Peilin He <he.peilin@zte.com.cn>
Date: Tue, 7 May 2024 15:41:03 +0800
Subject: net/ipv4: add tracepoint for icmp_send

Introduce a tracepoint for icmp_send, which can help users to get more
detail information conveniently when icmp abnormal events happen.

1. Giving an usecase example:
=============================
When an application experiences packet loss due to an unreachable UDP
destination port, the kernel will send an exception message through the
icmp_send function. By adding a trace point for icmp_send, developers or
system administrators can obtain detailed information about the UDP
packet loss, including the type, code, source address, destination address,
source port, and destination port. This facilitates the trouble-shooting
of UDP packet loss issues especially for those network-service
applications.

2. Operation Instructions:
==========================
Switch to the tracing directory.
        cd /sys/kernel/tracing
Filter for destination port unreachable.
        echo "type==3 && code==3" > events/icmp/icmp_send/filter
Enable trace event.
        echo 1 > events/icmp/icmp_send/enable

3. Result View:
================
 udp_client_erro-11370   [002] ...s.12   124.728002:
 icmp_send: icmp_send: type=3, code=3.
 From 127.0.0.1:41895 to 127.0.0.1:6666 ulen=23
 skbaddr=00000000589b167a

Signed-off-by: Peilin He <he.peilin@zte.com.cn>
Signed-off-by: xu xin <xu.xin16@zte.com.cn>
Reviewed-by: Yunkai Zhang <zhang.yunkai@zte.com.cn>
Cc: Yang Yang <yang.yang29@zte.com.cn>
Cc: Liu Chun <liu.chun2@zte.com.cn>
Cc: Xuexin Jiang <jiang.xuexin@zte.com.cn>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/icmp.h | 67 +++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/icmp.c             |  4 +++
 2 files changed, 71 insertions(+)
 create mode 100644 include/trace/events/icmp.h

(limited to 'include')

diff --git a/include/trace/events/icmp.h b/include/trace/events/icmp.h
new file mode 100644
index 000000000000..31559796949a
--- /dev/null
+++ b/include/trace/events/icmp.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM icmp
+
+#if !defined(_TRACE_ICMP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ICMP_H
+
+#include <linux/icmp.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(icmp_send,
+
+		TP_PROTO(const struct sk_buff *skb, int type, int code),
+
+		TP_ARGS(skb, type, code),
+
+		TP_STRUCT__entry(
+			__field(const void *, skbaddr)
+			__field(int, type)
+			__field(int, code)
+			__array(__u8, saddr, 4)
+			__array(__u8, daddr, 4)
+			__field(__u16, sport)
+			__field(__u16, dport)
+			__field(unsigned short, ulen)
+		),
+
+		TP_fast_assign(
+			struct iphdr *iph = ip_hdr(skb);
+			struct udphdr *uh = udp_hdr(skb);
+			int proto_4 = iph->protocol;
+			__be32 *p32;
+
+			__entry->skbaddr = skb;
+			__entry->type = type;
+			__entry->code = code;
+
+			if (proto_4 != IPPROTO_UDP || (u8 *)uh < skb->head ||
+				(u8 *)uh + sizeof(struct udphdr)
+				> skb_tail_pointer(skb)) {
+				__entry->sport = 0;
+				__entry->dport = 0;
+				__entry->ulen = 0;
+			} else {
+				__entry->sport = ntohs(uh->source);
+				__entry->dport = ntohs(uh->dest);
+				__entry->ulen = ntohs(uh->len);
+			}
+
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = iph->saddr;
+
+			p32 = (__be32 *) __entry->daddr;
+			*p32 = iph->daddr;
+		),
+
+		TP_printk("icmp_send: type=%d, code=%d. From %pI4:%u to %pI4:%u ulen=%d skbaddr=%p",
+			__entry->type, __entry->code,
+			__entry->saddr, __entry->sport, __entry->daddr,
+			__entry->dport, __entry->ulen, __entry->skbaddr)
+);
+
+#endif /* _TRACE_ICMP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 207482d30dc7..ab6d0d98dbc3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -93,6 +93,8 @@
 #include <net/ip_fib.h>
 #include <net/l3mdev.h>
 #include <net/addrconf.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/icmp.h>
 
 /*
  *	Build xmit assembly blocks
@@ -770,6 +772,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
 	if (!fl4.saddr)
 		fl4.saddr = htonl(INADDR_DUMMY);
 
+	trace_icmp_send(skb_in, type, code);
+
 	icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
 ende:
 	ip_rt_put(rt);
-- 
cgit v1.2.3


From 055e09ac54ae9c8396c1086fe06a73e0ce9bdd10 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 17 Apr 2024 23:11:23 +0300
Subject: cpumask: delete unused reset_cpu_possible_mask()

Link: https://lkml.kernel.org/r/20240417201123.2961-1-adobriyan@gmail.com
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cpumask.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1c29947db848..04536a29f10f 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -1017,11 +1017,6 @@ void init_cpu_present(const struct cpumask *src);
 void init_cpu_possible(const struct cpumask *src);
 void init_cpu_online(const struct cpumask *src);
 
-static inline void reset_cpu_possible_mask(void)
-{
-	bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS);
-}
-
 static inline void
 set_cpu_possible(unsigned int cpu, bool possible)
 {
-- 
cgit v1.2.3


From f4af41bf177add167e39e4b0203460b1d0b531f6 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Tue, 9 Apr 2024 12:22:38 +0800
Subject: kexec: fix the unexpected kexec_dprintk() macro

Jiri reported that the current kexec_dprintk() always prints out debugging
message whenever kexec/kdmmp loading is triggered.  That is not wanted.
The debugging message is supposed to be printed out when 'kexec -s -d' is
specified for kexec/kdump loading.

After investigating, the reason is the current kexec_dprintk() takes
printk(KERN_INFO) or printk(KERN_DEBUG) depending on whether '-d' is
specified.  However, distros usually have defaulg log level like below:

 [~]# cat /proc/sys/kernel/printk
 7       4      1       7

So, even though '-d' is not specified, printk(KERN_DEBUG) also always
prints out.  I thought printk(KERN_DEBUG) is equal to pr_debug(), it's
not.

Fix it by changing to use pr_info() instead which are expected to work.

Link: https://lkml.kernel.org/r/20240409042238.1240462-1-bhe@redhat.com
Fixes: cbc2fe9d9cb2 ("kexec_file: add kexec_file flag to control debug printing")
Signed-off-by: Baoquan He <bhe@redhat.com>
Reported-by: Jiri Slaby <jirislaby@kernel.org>
Closes: https://lore.kernel.org/all/4c775fca-5def-4a2d-8437-7130b02722a2@kernel.org
Reviewed-by: Dave Young <dyoung@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 060835bb82d5..f31bd304df45 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -461,10 +461,8 @@ static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) {
 
 extern bool kexec_file_dbg_print;
 
-#define kexec_dprintk(fmt, ...)					\
-	printk("%s" fmt,					\
-	       kexec_file_dbg_print ? KERN_INFO : KERN_DEBUG,	\
-	       ##__VA_ARGS__)
+#define kexec_dprintk(fmt, arg...) \
+        do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0)
 
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
-- 
cgit v1.2.3


From 22bcc915ae910bc823d8351542f6d9e7623fff24 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 23 Apr 2024 22:23:10 +0300
Subject: kfifo: don't use "proxy" headers

Update header inclusions to follow IWYU (Include What You Use) principle.

Link: https://lkml.kernel.org/r/20240423192529.3249134-4-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Alain Volmat <alain.volmat@foss.st.com>
Cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
Cc: Chen-Yu Tsai <wens@csie.org>
Cc: Hans Verkuil <hverkuil-cisco@xs4all.nl>
Cc: Jernej Skrabec <jernej.skrabec@gmail.com>
Cc: Matthias Brugger <matthias.bgg@gmail.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Patrice Chotard <patrice.chotard@foss.st.com>
Cc: Rob Herring <robh@kernel.org>
Cc: Samuel Holland <samuel@sholland.org>
Cc: Sean Wang <sean.wang@mediatek.com>
Cc: Sean Young <sean@mess.org>
Cc: Stefani Seibold <stefani@seibold.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kfifo.h       | 9 +++++++--
 lib/kfifo.c                 | 8 ++++----
 samples/kfifo/dma-example.c | 3 ++-
 3 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index 0b35a41440ff..6b28d642f332 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -36,10 +36,15 @@
  * to lock the reader.
  */
 
-#include <linux/kernel.h>
+#include <linux/array_size.h>
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
-#include <linux/scatterlist.h>
+#include <linux/types.h>
+
+#include <asm/barrier.h>
+#include <asm/errno.h>
+
+struct scatterlist;
 
 struct __kfifo {
 	unsigned int	in;
diff --git a/lib/kfifo.c b/lib/kfifo.c
index 12f5a347aa13..15acdee4a8f3 100644
--- a/lib/kfifo.c
+++ b/lib/kfifo.c
@@ -5,13 +5,13 @@
  * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net>
  */
 
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/export.h>
+#include <linux/kfifo.h>
 #include <linux/log2.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
 #include <linux/uaccess.h>
-#include <linux/kfifo.h>
 
 /*
  * internal helper to calculate the unused elements in a fifo
diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c
index 0cf27483cb36..74fe915b7ffe 100644
--- a/samples/kfifo/dma-example.c
+++ b/samples/kfifo/dma-example.c
@@ -6,8 +6,9 @@
  */
 
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/kfifo.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
 
 /*
  * This module shows how to handle fifo dma operations.
-- 
cgit v1.2.3


From 393fb313a2e150b768e4850658679e2afff431e9 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Mon, 29 Apr 2024 23:02:36 -0700
Subject: watchdog: allow nmi watchdog to use raw perf event

NMI watchdog permanently consumes one hardware counters per CPU on the
system.  For systems that use many hardware counters, this causes more
aggressive time multiplexing of perf events.

OTOH, some CPUs (mostly Intel) support "ref-cycles" event, which is rarely
used.  Add kernel cmdline arg nmi_watchdog=rNNN to configure the watchdog
to use raw event.  For example, on Intel CPUs, we can use "r300" to
configure the watchdog to use ref-cycles event.

If the raw event does not work, fall back to use "cycles".

[akpm@linux-foundation.org: fix kerneldoc]
Link: https://lkml.kernel.org/r/20240430060236.1878002-2-song@kernel.org
Signed-off-by: Song Liu <song@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  5 +--
 include/linux/nmi.h                             |  2 ++
 kernel/watchdog.c                               |  2 ++
 kernel/watchdog_perf.c                          | 46 +++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 902ecd92a29f..1fa79a3d0d1a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3773,10 +3773,12 @@
 			Format: [state][,regs][,debounce][,die]
 
 	nmi_watchdog=	[KNL,BUGS=X86] Debugging features for SMP kernels
-			Format: [panic,][nopanic,][num]
+			Format: [panic,][nopanic,][rNNN,][num]
 			Valid num: 0 or 1
 			0 - turn hardlockup detector in nmi_watchdog off
 			1 - turn hardlockup detector in nmi_watchdog on
+			rNNN - configure the watchdog with raw perf event 0xNNN
+
 			When panic is specified, panic when an NMI watchdog
 			timeout occurs (or 'nopanic' to not panic on an NMI
 			watchdog, if CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is set)
@@ -7464,4 +7466,3 @@
 				memory, and other data can't be written using
 				xmon commands.
 			off	xmon is disabled.
-
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index f53438eae815..a8dfb38c9bb6 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -105,10 +105,12 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs);
 extern void hardlockup_detector_perf_stop(void);
 extern void hardlockup_detector_perf_restart(void);
 extern void hardlockup_detector_perf_cleanup(void);
+extern void hardlockup_config_perf_event(const char *str);
 #else
 static inline void hardlockup_detector_perf_stop(void) { }
 static inline void hardlockup_detector_perf_restart(void) { }
 static inline void hardlockup_detector_perf_cleanup(void) { }
+static inline void hardlockup_config_perf_event(const char *str) { }
 #endif
 
 void watchdog_hardlockup_stop(void);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7f54484de16f..ab0129b15f25 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -80,6 +80,8 @@ next:
 		watchdog_hardlockup_user_enabled = 0;
 	else if (!strncmp(str, "1", 1))
 		watchdog_hardlockup_user_enabled = 1;
+	else if (!strncmp(str, "r", 1))
+		hardlockup_config_perf_event(str + 1);
 	while (*(str++)) {
 		if (*str == ',') {
 			str++;
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 8ea00c4a24b2..5f7d1f0d4268 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -90,6 +90,14 @@ static struct perf_event_attr wd_hw_attr = {
 	.disabled	= 1,
 };
 
+static struct perf_event_attr fallback_wd_hw_attr = {
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES,
+	.size		= sizeof(struct perf_event_attr),
+	.pinned		= 1,
+	.disabled	= 1,
+};
+
 /* Callback function for perf event subsystem */
 static void watchdog_overflow_callback(struct perf_event *event,
 				       struct perf_sample_data *data,
@@ -122,6 +130,13 @@ static int hardlockup_detector_event_create(void)
 	/* Try to register using hardware perf events */
 	evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
 					       watchdog_overflow_callback, NULL);
+	if (IS_ERR(evt)) {
+		wd_attr = &fallback_wd_hw_attr;
+		wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+		evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
+						       watchdog_overflow_callback, NULL);
+	}
+
 	if (IS_ERR(evt)) {
 		pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
 			 PTR_ERR(evt));
@@ -259,3 +274,34 @@ int __init watchdog_hardlockup_probe(void)
 	}
 	return ret;
 }
+
+/**
+ * hardlockup_config_perf_event - Overwrite config of wd_hw_attr.
+ *
+ * @str: number which identifies the raw perf event to use
+ */
+void __init hardlockup_config_perf_event(const char *str)
+{
+	u64 config;
+	char buf[24];
+	char *comma = strchr(str, ',');
+
+	if (!comma) {
+		if (kstrtoull(str, 16, &config))
+			return;
+	} else {
+		unsigned int len = comma - str;
+
+		if (len >= sizeof(buf))
+			return;
+
+		if (strscpy(buf, str, sizeof(buf)) < 0)
+			return;
+		buf[len] = 0;
+		if (kstrtoull(buf, 16, &config))
+			return;
+	}
+
+	wd_hw_attr.type = PERF_TYPE_RAW;
+	wd_hw_attr.config = config;
+}
-- 
cgit v1.2.3


From 9e993b3d722fb452e274e1f8694d8940db183323 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Mon, 6 May 2024 18:13:45 +0200
Subject: ALSA: hda: codec: Reduce CONFIG_PM dependencies

CONFIG_PM is almost mandatory nowadays for real systems, but we have
lots of CONFIG_PM dependent code in snd-hda-codec helper code.

Let's reduce the dependencies of CONFIG_PM now.  The only visible
drawback would be a couple of superfluous trace entries for runtime
PM, but we can live with that.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Link: https://lore.kernel.org/r/20240506161359.6960-3-tiwai@suse.de
---
 include/sound/hda_codec.h      | 11 -----------
 sound/pci/hda/hda_codec.c      | 37 ++++++++++---------------------------
 sound/pci/hda/hda_controller.c |  2 --
 sound/pci/hda/hda_sysfs.c      |  4 ----
 4 files changed, 10 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h
index 9c94ba7c183d..575e55aa08ca 100644
--- a/include/sound/hda_codec.h
+++ b/include/sound/hda_codec.h
@@ -109,11 +109,9 @@ struct hda_codec_ops {
 	void (*unsol_event)(struct hda_codec *codec, unsigned int res);
 	void (*set_power_state)(struct hda_codec *codec, hda_nid_t fg,
 				unsigned int power_state);
-#ifdef CONFIG_PM
 	int (*suspend)(struct hda_codec *codec);
 	int (*resume)(struct hda_codec *codec);
 	int (*check_power_status)(struct hda_codec *codec, hda_nid_t nid);
-#endif
 	void (*stream_pm)(struct hda_codec *codec, hda_nid_t nid, bool on);
 };
 
@@ -259,11 +257,9 @@ struct hda_codec {
 	unsigned int no_stream_clean_at_suspend:1; /* do not clean streams at suspend */
 	unsigned int ctl_dev_id:1; /* old control element id build behaviour */
 
-#ifdef CONFIG_PM
 	unsigned long power_on_acct;
 	unsigned long power_off_acct;
 	unsigned long power_jiffies;
-#endif
 
 	/* filter the requested power state per nid */
 	unsigned int (*power_filter)(struct hda_codec *codec, hda_nid_t nid,
@@ -481,10 +477,8 @@ extern const struct dev_pm_ops hda_codec_driver_pm;
 static inline
 int hda_call_check_power_status(struct hda_codec *codec, hda_nid_t nid)
 {
-#ifdef CONFIG_PM
 	if (codec->patch_ops.check_power_status)
 		return codec->patch_ops.check_power_status(codec, nid);
-#endif
 	return 0;
 }
 
@@ -495,14 +489,9 @@ int hda_call_check_power_status(struct hda_codec *codec, hda_nid_t nid)
 #define snd_hda_power_up_pm(codec)	snd_hdac_power_up_pm(&(codec)->core)
 #define snd_hda_power_down(codec)	snd_hdac_power_down(&(codec)->core)
 #define snd_hda_power_down_pm(codec)	snd_hdac_power_down_pm(&(codec)->core)
-#ifdef CONFIG_PM
 void snd_hda_codec_set_power_save(struct hda_codec *codec, int delay);
 void snd_hda_set_power_save(struct hda_bus *bus, int delay);
 void snd_hda_update_power_acct(struct hda_codec *codec);
-#else
-static inline void snd_hda_codec_set_power_save(struct hda_codec *codec, int delay) {}
-static inline void snd_hda_set_power_save(struct hda_bus *bus, int delay) {}
-#endif
 
 static inline bool hda_codec_need_resume(struct hda_codec *codec)
 {
diff --git a/sound/pci/hda/hda_codec.c b/sound/pci/hda/hda_codec.c
index 2cac337f5263..325e8f0b99a8 100644
--- a/sound/pci/hda/hda_codec.c
+++ b/sound/pci/hda/hda_codec.c
@@ -616,7 +616,6 @@ void snd_hda_shutup_pins(struct hda_codec *codec)
 }
 EXPORT_SYMBOL_GPL(snd_hda_shutup_pins);
 
-#ifdef CONFIG_PM
 /* Restore the pin controls cleared previously via snd_hda_shutup_pins() */
 static void restore_shutup_pins(struct hda_codec *codec)
 {
@@ -634,7 +633,6 @@ static void restore_shutup_pins(struct hda_codec *codec)
 	}
 	codec->pins_shutup = 0;
 }
-#endif
 
 static void hda_jackpoll_work(struct work_struct *work)
 {
@@ -1001,9 +999,7 @@ int snd_hda_codec_device_new(struct hda_bus *bus, struct snd_card *card,
 	codec->card = card;
 	codec->addr = codec_addr;
 
-#ifdef CONFIG_PM
 	codec->power_jiffies = jiffies;
-#endif
 
 	snd_hda_sysfs_init(codec);
 
@@ -1238,7 +1234,6 @@ static void purify_inactive_streams(struct hda_codec *codec)
 	}
 }
 
-#ifdef CONFIG_PM
 /* clean up all streams; called from suspend */
 static void hda_cleanup_all_streams(struct hda_codec *codec)
 {
@@ -1250,7 +1245,6 @@ static void hda_cleanup_all_streams(struct hda_codec *codec)
 			really_cleanup_stream(codec, p);
 	}
 }
-#endif
 
 /*
  * amp access functions
@@ -2858,7 +2852,6 @@ static void hda_exec_init_verbs(struct hda_codec *codec)
 static inline void hda_exec_init_verbs(struct hda_codec *codec) {}
 #endif
 
-#ifdef CONFIG_PM
 /* update the power on/off account with the current jiffies */
 static void update_power_acct(struct hda_codec *codec, bool on)
 {
@@ -2966,9 +2959,6 @@ static int hda_codec_runtime_resume(struct device *dev)
 	return 0;
 }
 
-#endif /* CONFIG_PM */
-
-#ifdef CONFIG_PM_SLEEP
 static int hda_codec_pm_prepare(struct device *dev)
 {
 	struct hda_codec *codec = dev_to_hda_codec(dev);
@@ -3023,22 +3013,19 @@ static int hda_codec_pm_restore(struct device *dev)
 	dev->power.power_state = PMSG_RESTORE;
 	return pm_runtime_force_resume(dev);
 }
-#endif /* CONFIG_PM_SLEEP */
 
 /* referred in hda_bind.c */
 const struct dev_pm_ops hda_codec_driver_pm = {
-#ifdef CONFIG_PM_SLEEP
-	.prepare = hda_codec_pm_prepare,
-	.complete = hda_codec_pm_complete,
-	.suspend = hda_codec_pm_suspend,
-	.resume = hda_codec_pm_resume,
-	.freeze = hda_codec_pm_freeze,
-	.thaw = hda_codec_pm_thaw,
-	.poweroff = hda_codec_pm_suspend,
-	.restore = hda_codec_pm_restore,
-#endif /* CONFIG_PM_SLEEP */
-	SET_RUNTIME_PM_OPS(hda_codec_runtime_suspend, hda_codec_runtime_resume,
-			   NULL)
+	.prepare = pm_sleep_ptr(hda_codec_pm_prepare),
+	.complete = pm_sleep_ptr(hda_codec_pm_complete),
+	.suspend = pm_sleep_ptr(hda_codec_pm_suspend),
+	.resume = pm_sleep_ptr(hda_codec_pm_resume),
+	.freeze = pm_sleep_ptr(hda_codec_pm_freeze),
+	.thaw = pm_sleep_ptr(hda_codec_pm_thaw),
+	.poweroff = pm_sleep_ptr(hda_codec_pm_suspend),
+	.restore = pm_sleep_ptr(hda_codec_pm_restore),
+	.runtime_suspend = pm_ptr(hda_codec_runtime_suspend),
+	.runtime_resume = pm_ptr(hda_codec_runtime_resume),
 };
 
 /* suspend the codec at shutdown; called from driver's shutdown callback */
@@ -3425,7 +3412,6 @@ int snd_hda_add_new_ctls(struct hda_codec *codec,
 }
 EXPORT_SYMBOL_GPL(snd_hda_add_new_ctls);
 
-#ifdef CONFIG_PM
 /**
  * snd_hda_codec_set_power_save - Configure codec's runtime PM
  * @codec: codec device to configure
@@ -3516,7 +3502,6 @@ int snd_hda_check_amp_list_power(struct hda_codec *codec,
 	return 0;
 }
 EXPORT_SYMBOL_GPL(snd_hda_check_amp_list_power);
-#endif
 
 /*
  * input MUX helper
@@ -4060,12 +4045,10 @@ void snd_hda_bus_reset_codecs(struct hda_bus *bus)
 		/* FIXME: maybe a better way needed for forced reset */
 		if (current_work() != &codec->jackpoll_work.work)
 			cancel_delayed_work_sync(&codec->jackpoll_work);
-#ifdef CONFIG_PM
 		if (hda_codec_is_power_on(codec)) {
 			hda_call_codec_suspend(codec);
 			hda_call_codec_resume(codec);
 		}
-#endif
 	}
 }
 
diff --git a/sound/pci/hda/hda_controller.c b/sound/pci/hda/hda_controller.c
index 206306a0eb82..c7142eee9f35 100644
--- a/sound/pci/hda/hda_controller.c
+++ b/sound/pci/hda/hda_controller.c
@@ -1075,11 +1075,9 @@ irqreturn_t azx_interrupt(int irq, void *dev_id)
 	bool active, handled = false;
 	int repeat = 0; /* count for avoiding endless loop */
 
-#ifdef CONFIG_PM
 	if (azx_has_pm_runtime(chip))
 		if (!pm_runtime_active(chip->card->dev))
 			return IRQ_NONE;
-#endif
 
 	spin_lock(&bus->reg_lock);
 
diff --git a/sound/pci/hda/hda_sysfs.c b/sound/pci/hda/hda_sysfs.c
index 69ebc37a4d6f..265fd4737893 100644
--- a/sound/pci/hda/hda_sysfs.c
+++ b/sound/pci/hda/hda_sysfs.c
@@ -26,7 +26,6 @@ struct hda_hint {
 	const char *val;	/* contained in the same alloc as key */
 };
 
-#ifdef CONFIG_PM
 static ssize_t power_on_acct_show(struct device *dev,
 				  struct device_attribute *attr,
 				  char *buf)
@@ -47,7 +46,6 @@ static ssize_t power_off_acct_show(struct device *dev,
 
 static DEVICE_ATTR_RO(power_on_acct);
 static DEVICE_ATTR_RO(power_off_acct);
-#endif /* CONFIG_PM */
 
 #define CODEC_INFO_SHOW(type, field)				\
 static ssize_t type##_show(struct device *dev,			\
@@ -745,10 +743,8 @@ static struct attribute *hda_dev_attrs[] = {
 	&dev_attr_modelname.attr,
 	&dev_attr_init_pin_configs.attr,
 	&dev_attr_driver_pin_configs.attr,
-#ifdef CONFIG_PM
 	&dev_attr_power_on_acct.attr,
 	&dev_attr_power_off_acct.attr,
-#endif
 #ifdef CONFIG_SND_HDA_RECONFIG
 	&dev_attr_init_verbs.attr,
 	&dev_attr_hints.attr,
-- 
cgit v1.2.3


From e8336a63de03eab16927abe1fc686c3ea10b27c6 Mon Sep 17 00:00:00 2001
From: Chaitanya Kumar Borah <chaitanya.kumar.borah@intel.com>
Date: Mon, 6 May 2024 10:55:31 +0530
Subject: ALSA: hda: Add Intel BMG PCI ID and HDMI codec vid

Add HD Audio PCI ID and HDMI codec vendor ID for Intel Battlemage.

Signed-off-by: Chaitanya Kumar Borah <chaitanya.kumar.borah@intel.com>
Link: https://lore.kernel.org/r/20240506052531.1150062-1-chaitanya.kumar.borah@intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/pci_ids.h    | 1 +
 include/sound/hdaudio.h    | 1 +
 sound/pci/hda/hda_intel.c  | 2 ++
 sound/pci/hda/patch_hdmi.c | 1 +
 4 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a0c75e467df3..ae8e66242927 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -3106,6 +3106,7 @@
 #define PCI_DEVICE_ID_INTEL_HDA_CML_S	0xa3f0
 #define PCI_DEVICE_ID_INTEL_HDA_LNL_P	0xa828
 #define PCI_DEVICE_ID_INTEL_S21152BB	0xb152
+#define PCI_DEVICE_ID_INTEL_HDA_BMG	0xe2f7
 #define PCI_DEVICE_ID_INTEL_HDA_CML_R	0xf0c8
 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S	0xf1c8
 
diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h
index 85d2c3d00a27..1d10939e40af 100644
--- a/include/sound/hdaudio.h
+++ b/include/sound/hdaudio.h
@@ -732,6 +732,7 @@ static inline unsigned int snd_array_index(struct snd_array *array, void *ptr)
 			{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HDA_DG2_0) }, \
 			{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HDA_DG2_1) }, \
 			{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HDA_DG2_2) }, \
+			{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HDA_BMG) }, \
 			{ } \
 		}, pci) || HDA_CONTROLLER_IS_HSW(pci))
 
diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c
index 6a3022a87245..a03f17f1328f 100644
--- a/sound/pci/hda/hda_intel.c
+++ b/sound/pci/hda/hda_intel.c
@@ -2505,6 +2505,8 @@ static const struct pci_device_id azx_ids[] = {
 	{ PCI_DEVICE_DATA(INTEL, HDA_RPL_M, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE) },
 	{ PCI_DEVICE_DATA(INTEL, HDA_RPL_PX, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE) },
 	{ PCI_DEVICE_DATA(INTEL, HDA_MTL, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE) },
+	/* Battlemage */
+	{ PCI_DEVICE_DATA(INTEL, HDA_BMG, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE) },
 	/* Lunarlake-P */
 	{ PCI_DEVICE_DATA(INTEL, HDA_LNL_P, AZX_DRIVER_SKL | AZX_DCAPS_INTEL_LNL) },
 	/* Arrow Lake-S */
diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c
index 495d63101186..0a00c5ea42e0 100644
--- a/sound/pci/hda/patch_hdmi.c
+++ b/sound/pci/hda/patch_hdmi.c
@@ -4642,6 +4642,7 @@ HDA_CODEC_ENTRY(0x8086281a, "Jasperlake HDMI",	patch_i915_icl_hdmi),
 HDA_CODEC_ENTRY(0x8086281b, "Elkhartlake HDMI",	patch_i915_icl_hdmi),
 HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_adlp_hdmi),
 HDA_CODEC_ENTRY(0x8086281d, "Meteor Lake HDMI",	patch_i915_adlp_hdmi),
+HDA_CODEC_ENTRY(0x8086281e, "Battlemage HDMI",	patch_i915_adlp_hdmi),
 HDA_CODEC_ENTRY(0x8086281f, "Raptor Lake P HDMI",	patch_i915_adlp_hdmi),
 HDA_CODEC_ENTRY(0x80862820, "Lunar Lake HDMI",	patch_i915_adlp_hdmi),
 HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI",	patch_generic_hdmi),
-- 
cgit v1.2.3


From 4bbf9c3b53e637eb3a14ee27b996300ce88e752a Mon Sep 17 00:00:00 2001
From: Allen Pais <apais@linux.microsoft.com>
Date: Mon, 6 May 2024 19:37:00 +0000
Subject: fs/coredump: Enable dynamic configuration of max file note size

Introduce the capability to dynamically configure the maximum file
note size for ELF core dumps via sysctl.

Why is this being done?
We have observed that during a crash when there are more than 65k mmaps
in memory, the existing fixed limit on the size of the ELF notes section
becomes a bottleneck. The notes section quickly reaches its capacity,
leading to incomplete memory segment information in the resulting coredump.
This truncation compromises the utility of the coredumps, as crucial
information about the memory state at the time of the crash might be
omitted.

This enhancement removes the previous static limit of 4MB, allowing
system administrators to adjust the size based on system-specific
requirements or constraints.

Eg:
$ sysctl -a | grep core_file_note_size_limit
kernel.core_file_note_size_limit = 4194304

$ sysctl -n kernel.core_file_note_size_limit
4194304

$echo 519304 > /proc/sys/kernel/core_file_note_size_limit

$sysctl -n kernel.core_file_note_size_limit
519304

Attempting to write beyond the ceiling value of 16MB
$echo 17194304 > /proc/sys/kernel/core_file_note_size_limit
bash: echo: write error: Invalid argument

Signed-off-by: Vijay Nag <nagvijay@microsoft.com>
Signed-off-by: Allen Pais <apais@linux.microsoft.com>
Link: https://lore.kernel.org/r/20240506193700.7884-1-apais@linux.microsoft.com
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 fs/binfmt_elf.c          |  7 +++++--
 fs/coredump.c            | 17 +++++++++++++++++
 include/linux/coredump.h |  2 ++
 3 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7862962f7a85..b5a25ee49eea 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1567,7 +1567,6 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
 	fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
 }
 
-#define MAX_FILE_NOTE_SIZE (4*1024*1024)
 /*
  * Format of NT_FILE note:
  *
@@ -1595,8 +1594,12 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
 
 	names_ofs = (2 + 3 * count) * sizeof(data[0]);
  alloc:
-	if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */
+	/* paranoia check */
+	if (size >= core_file_note_size_limit) {
+		pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n",
+			      size);
 		return -EINVAL;
+	}
 	size = round_up(size, PAGE_SIZE);
 	/*
 	 * "size" can be 0 here legitimately.
diff --git a/fs/coredump.c b/fs/coredump.c
index be6403b4b14b..317065e3eb9b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -56,10 +56,15 @@
 static bool dump_vma_snapshot(struct coredump_params *cprm);
 static void free_vma_snapshot(struct coredump_params *cprm);
 
+#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
+/* Define a reasonable max cap */
+#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)
+
 static int core_uses_pid;
 static unsigned int core_pipe_limit;
 static char core_pattern[CORENAME_MAX_SIZE] = "core";
 static int core_name_size = CORENAME_MAX_SIZE;
+unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
 
 struct core_name {
 	char *corename;
@@ -998,6 +1003,9 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
 	return error;
 }
 
+static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
+static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
+
 static struct ctl_table coredump_sysctls[] = {
 	{
 		.procname	= "core_uses_pid",
@@ -1020,6 +1028,15 @@ static struct ctl_table coredump_sysctls[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname       = "core_file_note_size_limit",
+		.data           = &core_file_note_size_limit,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= (unsigned int *)&core_file_note_size_min,
+		.extra2		= (unsigned int *)&core_file_note_size_max,
+	},
 };
 
 static int __init init_fs_coredump_sysctls(void)
diff --git a/include/linux/coredump.h b/include/linux/coredump.h
index d3eba4360150..0904ba010341 100644
--- a/include/linux/coredump.h
+++ b/include/linux/coredump.h
@@ -30,6 +30,8 @@ struct coredump_params {
 	struct core_vma_metadata *vma_meta;
 };
 
+extern unsigned int core_file_note_size_limit;
+
 /*
  * These are the only things you should do on a core-file: use only these
  * functions to write out all the necessary info.
-- 
cgit v1.2.3


From 962f1e79e7acfb30207a378894b1bbf6742e6212 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 2 May 2024 09:57:30 -0700
Subject: PCI/CXL: Move CXL Vendor ID to pci_ids.h

Move PCI_DVSEC_VENDOR_ID_CXL in CXL private code to PCI_VENDOR_ID_CXL in
pci_ids.h in order to be utilized in PCI subsystem.

While the CXL Vendor ID (0x1e98) is not listed in the PCI SIG "Member
Companies" database at https://pcisig.com/membership/member-companies, the
SIG has confirmed that it is reserved by CXL.

Link: https://lore.kernel.org/r/20240502165851.1948523-2-dave.jiang@intel.com
Suggested-by: Bjorn Helgaas <helgaas@kernel.org>
Link: https://lore.kernel.org/linux-cxl/20240402172323.GA1818777@bhelgaas/
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
[bhelgaas: update commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/pci.c  | 6 +++---
 drivers/cxl/core/regs.c | 2 +-
 drivers/cxl/cxlpci.h    | 1 -
 drivers/cxl/pci.c       | 2 +-
 drivers/perf/cxl_pmu.c  | 2 +-
 include/linux/pci_ids.h | 2 ++
 6 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 0df09bd79408..c496a9710d62 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -525,7 +525,7 @@ static int cxl_cdat_get_length(struct device *dev,
 	__le32 response[2];
 	int rc;
 
-	rc = pci_doe(doe_mb, PCI_DVSEC_VENDOR_ID_CXL,
+	rc = pci_doe(doe_mb, PCI_VENDOR_ID_CXL,
 		     CXL_DOE_PROTOCOL_TABLE_ACCESS,
 		     &request, sizeof(request),
 		     &response, sizeof(response));
@@ -555,7 +555,7 @@ static int cxl_cdat_read_table(struct device *dev,
 		__le32 request = CDAT_DOE_REQ(entry_handle);
 		int rc;
 
-		rc = pci_doe(doe_mb, PCI_DVSEC_VENDOR_ID_CXL,
+		rc = pci_doe(doe_mb, PCI_VENDOR_ID_CXL,
 			     CXL_DOE_PROTOCOL_TABLE_ACCESS,
 			     &request, sizeof(request),
 			     rsp, sizeof(*rsp) + remaining);
@@ -640,7 +640,7 @@ void read_cdat_data(struct cxl_port *port)
 	if (!pdev)
 		return;
 
-	doe_mb = pci_find_doe_mailbox(pdev, PCI_DVSEC_VENDOR_ID_CXL,
+	doe_mb = pci_find_doe_mailbox(pdev, PCI_VENDOR_ID_CXL,
 				      CXL_DOE_PROTOCOL_TABLE_ACCESS);
 	if (!doe_mb) {
 		dev_dbg(dev, "No CDAT mailbox\n");
diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c
index 372786f80955..da52fc9e234b 100644
--- a/drivers/cxl/core/regs.c
+++ b/drivers/cxl/core/regs.c
@@ -313,7 +313,7 @@ int cxl_find_regblock_instance(struct pci_dev *pdev, enum cxl_regloc_type type,
 		.resource = CXL_RESOURCE_NONE,
 	};
 
-	regloc = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL,
+	regloc = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL,
 					   CXL_DVSEC_REG_LOCATOR);
 	if (!regloc)
 		return -ENXIO;
diff --git a/drivers/cxl/cxlpci.h b/drivers/cxl/cxlpci.h
index 93992a1c8eec..4da07727ab9c 100644
--- a/drivers/cxl/cxlpci.h
+++ b/drivers/cxl/cxlpci.h
@@ -13,7 +13,6 @@
  * "DVSEC" redundancies removed. When obvious, abbreviations may be used.
  */
 #define PCI_DVSEC_HEADER1_LENGTH_MASK	GENMASK(31, 20)
-#define PCI_DVSEC_VENDOR_ID_CXL		0x1E98
 
 /* CXL 2.0 8.1.3: PCIe DVSEC for CXL Device */
 #define CXL_DVSEC_PCIE_DEVICE					0
diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
index 2ff361e756d6..110478573296 100644
--- a/drivers/cxl/pci.c
+++ b/drivers/cxl/pci.c
@@ -817,7 +817,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	cxlds->rcd = is_cxl_restricted(pdev);
 	cxlds->serial = pci_get_dsn(pdev);
 	cxlds->cxl_dvsec = pci_find_dvsec_capability(
-		pdev, PCI_DVSEC_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
+		pdev, PCI_VENDOR_ID_CXL, CXL_DVSEC_PCIE_DEVICE);
 	if (!cxlds->cxl_dvsec)
 		dev_warn(&pdev->dev,
 			 "Device DVSEC not present, skip CXL.mem init\n");
diff --git a/drivers/perf/cxl_pmu.c b/drivers/perf/cxl_pmu.c
index 308c9969642e..a1b742b1a735 100644
--- a/drivers/perf/cxl_pmu.c
+++ b/drivers/perf/cxl_pmu.c
@@ -345,7 +345,7 @@ static ssize_t cxl_pmu_event_sysfs_show(struct device *dev,
 
 /* For CXL spec defined events */
 #define CXL_PMU_EVENT_CXL_ATTR(_name, _gid, _msk)			\
-	CXL_PMU_EVENT_ATTR(_name, PCI_DVSEC_VENDOR_ID_CXL, _gid, _msk)
+	CXL_PMU_EVENT_ATTR(_name, PCI_VENDOR_ID_CXL, _gid, _msk)
 
 static struct attribute *cxl_pmu_event_attrs[] = {
 	CXL_PMU_EVENT_CXL_ATTR(clock_ticks,			CXL_PMU_GID_CLOCK_TICKS, BIT(0)),
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a0c75e467df3..7dfbf6d96b3d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2607,6 +2607,8 @@
 
 #define PCI_VENDOR_ID_ALIBABA		0x1ded
 
+#define PCI_VENDOR_ID_CXL		0x1e98
+
 #define PCI_VENDOR_ID_TEHUTI		0x1fc9
 #define PCI_DEVICE_ID_TEHUTI_3009	0x3009
 #define PCI_DEVICE_ID_TEHUTI_3010	0x3010
-- 
cgit v1.2.3


From 7e89efc6e9e402839643cb297bab14055c547f07 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 2 May 2024 09:57:31 -0700
Subject: PCI: Lock upstream bridge for pci_reset_function()

Fix a long-standing locking gap for missing pci_cfg_access_lock() while
manipulating bridge reset registers and configuration during
pci_reset_bus_function().

If there is an upstream bridge, lock it before locking the device itself.
pci_dev_lock() calls pci_cfg_access_lock(), which blocks the writing of PCI
config space by user space.

Add lockdep assertion via pci_dev->cfg_access_lock to verify
pci_dev->block_cfg_access is set.

Co-developed-by: Dan Williams <dan.j.williams@intel.com>
Link: https://lore.kernel.org/r/20240502165851.1948523-3-dave.jiang@intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
[bhelgaas: commit log]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/access.c    |  4 ++++
 drivers/pci/pci.c       | 13 +++++++++++++
 drivers/pci/probe.c     |  3 +++
 include/linux/lockdep.h |  5 +++++
 include/linux/pci.h     |  2 ++
 5 files changed, 27 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/access.c b/drivers/pci/access.c
index 6449056b57dd..36f10c7f9ef5 100644
--- a/drivers/pci/access.c
+++ b/drivers/pci/access.c
@@ -275,6 +275,8 @@ void pci_cfg_access_lock(struct pci_dev *dev)
 {
 	might_sleep();
 
+	lock_map_acquire(&dev->cfg_access_lock);
+
 	raw_spin_lock_irq(&pci_lock);
 	if (dev->block_cfg_access)
 		pci_wait_cfg(dev);
@@ -329,6 +331,8 @@ void pci_cfg_access_unlock(struct pci_dev *dev)
 	raw_spin_unlock_irqrestore(&pci_lock, flags);
 
 	wake_up_all(&pci_cfg_wait);
+
+	lock_map_release(&dev->cfg_access_lock);
 }
 EXPORT_SYMBOL_GPL(pci_cfg_access_unlock);
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e5f243dd4288..c63142352844 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4879,6 +4879,7 @@ void __weak pcibios_reset_secondary_bus(struct pci_dev *dev)
  */
 int pci_bridge_secondary_bus_reset(struct pci_dev *dev)
 {
+	lock_map_assert_held(&dev->cfg_access_lock);
 	pcibios_reset_secondary_bus(dev);
 
 	return pci_bridge_wait_for_secondary_bus(dev, "bus reset");
@@ -5245,11 +5246,20 @@ void pci_init_reset_methods(struct pci_dev *dev)
  */
 int pci_reset_function(struct pci_dev *dev)
 {
+	struct pci_dev *bridge;
 	int rc;
 
 	if (!pci_reset_supported(dev))
 		return -ENOTTY;
 
+	/*
+	 * If there's no upstream bridge, no locking is needed since there is
+	 * no upstream bridge configuration to hold consistent.
+	 */
+	bridge = pci_upstream_bridge(dev);
+	if (bridge)
+		pci_dev_lock(bridge);
+
 	pci_dev_lock(dev);
 	pci_dev_save_and_disable(dev);
 
@@ -5258,6 +5268,9 @@ int pci_reset_function(struct pci_dev *dev)
 	pci_dev_restore(dev);
 	pci_dev_unlock(dev);
 
+	if (bridge)
+		pci_dev_unlock(bridge);
+
 	return rc;
 }
 EXPORT_SYMBOL_GPL(pci_reset_function);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 1325fbae2f28..a3da776bf986 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2543,6 +2543,9 @@ void pci_device_add(struct pci_dev *dev, struct pci_bus *bus)
 	dev->dev.dma_mask = &dev->dma_mask;
 	dev->dev.dma_parms = &dev->dma_parms;
 	dev->dev.coherent_dma_mask = 0xffffffffull;
+	lockdep_register_key(&dev->cfg_access_key);
+	lockdep_init_map(&dev->cfg_access_lock, dev_name(&dev->dev),
+			 &dev->cfg_access_key, 0);
 
 	dma_set_max_seg_size(&dev->dev, 65536);
 	dma_set_seg_boundary(&dev->dev, 0xffffffff);
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 08b0d1d9d78b..5e51b0de4c4b 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -297,6 +297,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		.wait_type_inner = _wait_type,		\
 		.lock_type = LD_LOCK_WAIT_OVERRIDE, }
 
+#define lock_map_assert_held(l)		\
+	lockdep_assert(lock_is_held(l) != LOCK_STATE_NOT_HELD)
+
 #else /* !CONFIG_LOCKDEP */
 
 static inline void lockdep_init_task(struct task_struct *task)
@@ -388,6 +391,8 @@ extern int lockdep_is_held(const void *);
 #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)	\
 	struct lockdep_map __maybe_unused _name = {}
 
+#define lock_map_assert_held(l)			do { (void)(l); } while (0)
+
 #endif /* !LOCKDEP */
 
 #ifdef CONFIG_PROVE_LOCKING
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 16493426a04f..e4e7b175af54 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -413,6 +413,8 @@ struct pci_dev {
 	struct resource driver_exclusive_resource;	 /* driver exclusive resource ranges */
 
 	bool		match_driver;		/* Skip attaching driver */
+	struct lock_class_key cfg_access_key;
+	struct lockdep_map cfg_access_lock;
 
 	unsigned int	transparent:1;		/* Subtractive decode bridge */
 	unsigned int	io_window:1;		/* Bridge has I/O window */
-- 
cgit v1.2.3


From b1956e2d0713e210a56ae65ad3488ae36f833e76 Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 2 May 2024 09:57:32 -0700
Subject: PCI/CXL: Fail bus reset if upstream CXL Port has SBR masked

Per CXL spec r3.1, sec 8.1.5.2, the Secondary Bus Reset (SBR) bit in the
Bridge Control register of a CXL port has no effect unless the "Unmask SBR"
bit is set.

Return -ENOTTY if we attempt a bus reset on a device below a CXL Port where
"Unmask SBR" is 0.  Otherwise, the bus reset would appear to have succeeded
even though setting the bridge SBR bit had no effect.

Link: https://lore.kernel.org/linux-cxl/20240220203956.GA1502351@bhelgaas/
Link: https://lore.kernel.org/r/20240502165851.1948523-4-dave.jiang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
[bhelgaas: simplify commit log and comments]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/pci.c             | 42 ++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/pci_regs.h |  5 +++++
 2 files changed, 47 insertions(+)

(limited to 'include')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index c63142352844..225cec964f25 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4928,10 +4928,52 @@ static int pci_dev_reset_slot_function(struct pci_dev *dev, bool probe)
 	return pci_reset_hotplug_slot(dev->slot->hotplug, probe);
 }
 
+static u16 cxl_port_dvsec(struct pci_dev *dev)
+{
+	return pci_find_dvsec_capability(dev, PCI_VENDOR_ID_CXL,
+					 PCI_DVSEC_CXL_PORT);
+}
+
+static bool cxl_sbr_masked(struct pci_dev *dev)
+{
+	u16 dvsec, reg;
+	int rc;
+
+	dvsec = cxl_port_dvsec(dev);
+	if (!dvsec)
+		return false;
+
+	rc = pci_read_config_word(dev, dvsec + PCI_DVSEC_CXL_PORT_CTL, &reg);
+	if (rc || PCI_POSSIBLE_ERROR(reg))
+		return false;
+
+	/*
+	 * Per CXL spec r3.1, sec 8.1.5.2, when "Unmask SBR" is 0, the SBR
+	 * bit in Bridge Control has no effect.  When 1, the Port generates
+	 * hot reset when the SBR bit is set to 1.
+	 */
+	if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR)
+		return false;
+
+	return true;
+}
+
 static int pci_reset_bus_function(struct pci_dev *dev, bool probe)
 {
+	struct pci_dev *bridge = pci_upstream_bridge(dev);
 	int rc;
 
+	/*
+	 * If "dev" is below a CXL port that has SBR control masked, SBR
+	 * won't do anything, so return error.
+	 */
+	if (bridge && cxl_sbr_masked(bridge)) {
+		if (probe)
+			return 0;
+
+		return -ENOTTY;
+	}
+
 	rc = pci_dev_reset_slot_function(dev, probe);
 	if (rc != -ENOTTY)
 		return rc;
diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h
index a39193213ff2..6024eb2e9a2f 100644
--- a/include/uapi/linux/pci_regs.h
+++ b/include/uapi/linux/pci_regs.h
@@ -1148,4 +1148,9 @@
 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL		0x00ff0000
 #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX	0xff000000
 
+/* Compute Express Link (CXL r3.1, sec 8.1.5) */
+#define PCI_DVSEC_CXL_PORT				3
+#define PCI_DVSEC_CXL_PORT_CTL				0x0c
+#define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR		0x00000001
+
 #endif /* LINUX_PCI_REGS_H */
-- 
cgit v1.2.3


From 53c49b6e6dd2ebc1d3257ae838e067699229bc8d Mon Sep 17 00:00:00 2001
From: Dave Jiang <dave.jiang@intel.com>
Date: Thu, 2 May 2024 09:57:33 -0700
Subject: PCI/CXL: Add 'cxl_bus' reset method for devices below CXL Ports

By default Secondary Bus Reset (SBR) is masked for CXL Ports (see CXL r3.1,
sec 8.1.5.2).

Add cxl_reset_bus_function() (method "cxl_bus") to set the "Unmask SBR" bit
in the upstream CXL Port before performing the bus reset and restore the
original value afterwards.

This method allows the user to perform a bus reset on a CXL device without
needing to set the "Unmask SBR" bit via a user tool.

Link: https://lore.kernel.org/r/20240502165851.1948523-5-dave.jiang@intel.com
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
[bhelgaas: simplify commit log, invert condition to avoid negation]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/pci/pci.c   | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/pci.h |  2 +-
 2 files changed, 40 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 225cec964f25..13605261a881 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4980,6 +4980,44 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe)
 	return pci_parent_bus_reset(dev, probe);
 }
 
+static int cxl_reset_bus_function(struct pci_dev *dev, bool probe)
+{
+	struct pci_dev *bridge;
+	u16 dvsec, reg, val;
+	int rc;
+
+	bridge = pci_upstream_bridge(dev);
+	if (!bridge)
+		return -ENOTTY;
+
+	dvsec = cxl_port_dvsec(bridge);
+	if (!dvsec)
+		return -ENOTTY;
+
+	if (probe)
+		return 0;
+
+	rc = pci_read_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL, &reg);
+	if (rc)
+		return -ENOTTY;
+
+	if (reg & PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR) {
+		val = reg;
+	} else {
+		val = reg | PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR;
+		pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
+				      val);
+	}
+
+	rc = pci_reset_bus_function(dev, probe);
+
+	if (reg != val)
+		pci_write_config_word(bridge, dvsec + PCI_DVSEC_CXL_PORT_CTL,
+				      reg);
+
+	return rc;
+}
+
 void pci_dev_lock(struct pci_dev *dev)
 {
 	/* block PM suspend, driver probe, etc. */
@@ -5064,6 +5102,7 @@ static const struct pci_reset_fn_method pci_reset_fn_methods[] = {
 	{ pci_af_flr, .name = "af_flr" },
 	{ pci_pm_reset, .name = "pm" },
 	{ pci_reset_bus_function, .name = "bus" },
+	{ cxl_reset_bus_function, .name = "cxl_bus" },
 };
 
 static ssize_t reset_method_show(struct device *dev,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index e4e7b175af54..b06c1c0ec9bd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -51,7 +51,7 @@
 			       PCI_STATUS_PARITY)
 
 /* Number of reset methods used in pci_reset_fn_methods array in pci.c */
-#define PCI_NUM_RESET_METHODS 7
+#define PCI_NUM_RESET_METHODS 8
 
 #define PCI_RESET_PROBE		true
 #define PCI_RESET_DO_RESET	false
-- 
cgit v1.2.3


From 4c5b7294dedbdfae46f77262c0744879bb9b7221 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 3 May 2024 14:43:54 -0400
Subject: closures: closure_sync_timeout()

Add a new variant of closure_sync_timeout() that takes a timeout.

Note that when this returns -ETIME the closure will still be waiting on
something, i.e. it's not safe to return if you've got a stack allocated
closure.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 include/linux/closure.h | 12 ++++++++++++
 lib/closure.c           | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)

(limited to 'include')

diff --git a/include/linux/closure.h b/include/linux/closure.h
index c554c6a08768..99155df162d0 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -194,6 +194,18 @@ static inline void closure_sync(struct closure *cl)
 		__closure_sync(cl);
 }
 
+int __closure_sync_timeout(struct closure *cl, unsigned long timeout);
+
+static inline int closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+#ifdef CONFIG_DEBUG_CLOSURES
+	BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+	return cl->closure_get_happened
+		? __closure_sync_timeout(cl, timeout)
+		: 0;
+}
+
 #ifdef CONFIG_DEBUG_CLOSURES
 
 void closure_debug_create(struct closure *cl);
diff --git a/lib/closure.c b/lib/closure.c
index c16540552d61..07409e9e35a5 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -139,6 +139,43 @@ void __sched __closure_sync(struct closure *cl)
 }
 EXPORT_SYMBOL(__closure_sync);
 
+int __sched __closure_sync_timeout(struct closure *cl, unsigned long timeout)
+{
+	struct closure_syncer s = { .task = current };
+	int ret = 0;
+
+	cl->s = &s;
+	continue_at(cl, closure_sync_fn, NULL);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (s.done)
+			break;
+		if (!timeout) {
+			/*
+			 * Carefully undo the continue_at() - but only if it
+			 * hasn't completed, i.e. the final closure_put() hasn't
+			 * happened yet:
+			 */
+			unsigned old, new, v = atomic_read(&cl->remaining);
+			do {
+				old = v;
+				if (!old || (old & CLOSURE_RUNNING))
+					goto success;
+
+				new = old + CLOSURE_REMAINING_INITIALIZER;
+			} while ((v = atomic_cmpxchg(&cl->remaining, old, new)) != old);
+			ret = -ETIME;
+		}
+
+		timeout = schedule_timeout(timeout);
+	}
+success:
+	__set_current_state(TASK_RUNNING);
+	return ret;
+}
+EXPORT_SYMBOL(__closure_sync_timeout);
+
 #ifdef CONFIG_DEBUG_CLOSURES
 
 static LIST_HEAD(closure_list);
-- 
cgit v1.2.3


From e2f48c48090dea172c0c571101041de64634dae5 Mon Sep 17 00:00:00 2001
From: Petr Vorel <pvorel@suse.cz>
Date: Tue, 7 May 2024 17:37:57 +0200
Subject: bcachefs: Move BCACHEFS_STATFS_MAGIC value to UAPI magic.h

Move BCACHEFS_STATFS_MAGIC value to UAPI <linux/magic.h> under
BCACHEFS_SUPER_MAGIC definition (use common approach for name) and reuse the
definition in bcachefs_format.h BCACHEFS_STATFS_MAGIC.

There are other bcachefs magic definitions: BCACHE_MAGIC, BCHFS_MAGIC,
which use UUID_INIT() and are used only in libbcachefs. Therefore move
only BCACHEFS_STATFS_MAGIC value, which can be used outside of
libbcachefs for f_type field in struct statfs in statfs() or fstatfs().

Suggested-by: Su Yue <glass.su@suse.com>
Signed-off-by: Petr Vorel <pvorel@suse.cz>
Acked-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/bcachefs_format.h | 3 ++-
 include/uapi/linux/magic.h    | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index d5b90439e581..1bebba881d89 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -76,6 +76,7 @@
 #include <asm/byteorder.h>
 #include <linux/kernel.h>
 #include <linux/uuid.h>
+#include <uapi/linux/magic.h>
 #include "vstructs.h"
 
 #ifdef __KERNEL__
@@ -1290,7 +1291,7 @@ enum bch_compression_opts {
 	UUID_INIT(0xc68573f6, 0x66ce, 0x90a9,				\
 		  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
 
-#define BCACHEFS_STATFS_MAGIC		0xca451a4e
+#define BCACHEFS_STATFS_MAGIC		BCACHEFS_SUPER_MAGIC
 
 #define JSET_MAGIC		__cpu_to_le64(0x245235c1a3625032ULL)
 #define BSET_MAGIC		__cpu_to_le64(0x90135c78b99e07f5ULL)
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 1b40a968ba91..bb575f3ab45e 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -37,6 +37,7 @@
 #define HOSTFS_SUPER_MAGIC	0x00c0ffee
 #define OVERLAYFS_SUPER_MAGIC	0x794c7630
 #define FUSE_SUPER_MAGIC	0x65735546
+#define BCACHEFS_SUPER_MAGIC	0xca451a4e
 
 #define MINIX_SUPER_MAGIC	0x137F		/* minix v1 fs, 14 char names */
 #define MINIX_SUPER_MAGIC2	0x138F		/* minix v1 fs, 30 char names */
-- 
cgit v1.2.3


From bc2da26599ed800357f24fca4b7f6a8f35d87dcb Mon Sep 17 00:00:00 2001
From: Conor Dooley <conor.dooley@microchip.com>
Date: Wed, 8 May 2024 22:33:24 +0100
Subject: clk, reset: microchip: mpfs: fix incorrect preprocessor conditions

While moving all the reset code in the PolarFire SoC clock driver to the
reset subsystem, I removed an `#if IS_ENABLED(RESET_CONTROLLER)` from
the driver and moved it to the header, however this was not the correct
thing to do. In the driver such a condition over-eagerly provided a
complete implementation for mpfs_reset_{read,write}() when the reset
subsystem was enabled without the PolarFire SoC reset driver, but in the
header it meant that when the subsystem was enabled and the driver was
not, no implementation for mpfs_reset_controller_register() was
provided. Fix the condition so that the stub implementation of
mpfs_reset_controller_register() is used when the reset driver is
disabled.

Fixes: 098c290a490d ("clock, reset: microchip: move all mpfs reset code to the reset subsystem")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202405082259.44DzHvaN-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202405082200.tBrEs5CZ-lkp@intel.com/
Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
Link: https://lore.kernel.org/r/20240508-unabashed-cheese-8f645b4f69ba@spud
Signed-off-by: Stephen Boyd <sboyd@kernel.org>
---
 include/soc/microchip/mpfs.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/soc/microchip/mpfs.h b/include/soc/microchip/mpfs.h
index d7e612b5e22e..0bd67e10b704 100644
--- a/include/soc/microchip/mpfs.h
+++ b/include/soc/microchip/mpfs.h
@@ -43,11 +43,11 @@ struct mtd_info *mpfs_sys_controller_get_flash(struct mpfs_sys_controller *mpfs_
 #endif /* if IS_ENABLED(CONFIG_POLARFIRE_SOC_SYS_CTRL) */
 
 #if IS_ENABLED(CONFIG_MCHP_CLK_MPFS)
-#if IS_ENABLED(CONFIG_RESET_CONTROLLER)
+#if IS_ENABLED(CONFIG_RESET_POLARFIRE_SOC)
 int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base);
 #else
 static inline int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base) { return 0; }
-#endif /* if IS_ENABLED(CONFIG_RESET_CONTROLLER) */
+#endif /* if IS_ENABLED(CONFIG_RESET_POLARFIRE_SOC) */
 #endif /* if IS_ENABLED(CONFIG_MCHP_CLK_MPFS) */
 
 #endif /* __SOC_MPFS_H__ */
-- 
cgit v1.2.3


From 3b09b2bd0d62de1f359b0c130570711c99c5e80b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 May 2024 13:20:00 +0000
Subject: net: dst_cache: annotate data-races around dst_cache->reset_ts

dst_cache->reset_ts is read or written locklessly,
add READ_ONCE() and WRITE_ONCE() annotations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240507132000.614591-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/dst_cache.h | 2 +-
 net/core/dst_cache.c    | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/dst_cache.h b/include/net/dst_cache.h
index df6622a5fe98..b4a55d2d5e71 100644
--- a/include/net/dst_cache.h
+++ b/include/net/dst_cache.h
@@ -76,7 +76,7 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
  */
 static inline void dst_cache_reset(struct dst_cache *dst_cache)
 {
-	dst_cache->reset_ts = jiffies;
+	WRITE_ONCE(dst_cache->reset_ts, jiffies);
 }
 
 /**
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 0c0bdb058c5b..f9df84a6c4b2 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -47,7 +47,8 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
 	/* the cache already hold a dst reference; it can't go away */
 	dst_hold(dst);
 
-	if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+	if (unlikely(!time_after(idst->refresh_ts,
+				 READ_ONCE(dst_cache->reset_ts)) ||
 		     (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
 		dst_cache_per_cpu_dst_set(idst, NULL, 0);
 		dst_release(dst);
@@ -170,7 +171,7 @@ void dst_cache_reset_now(struct dst_cache *dst_cache)
 	if (!dst_cache->cache)
 		return;
 
-	dst_cache->reset_ts = jiffies;
+	dst_cache_reset(dst_cache);
 	for_each_possible_cpu(i) {
 		struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
 		struct dst_entry *dst = idst->dst;
-- 
cgit v1.2.3


From a0fde7ed05ff020c3e7f410d73ce4f3a72b262d6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 9 May 2024 13:02:39 +0200
Subject: file: add fd_raw cleanup class

So we can also use CLASS(fd_raw, f)(fd) for codepaths where we allow
FMODE_PATH aka O_PATH file descriptors to be used.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/file.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/file.h b/include/linux/file.h
index 169692cb1906..45d0f4800abd 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -84,6 +84,7 @@ static inline void fdput_pos(struct fd f)
 }
 
 DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
+DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)
 
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
-- 
cgit v1.2.3


From d927752f287fe10965612541593468ffcfa9231f Mon Sep 17 00:00:00 2001
From: Wardenjohn <zhangwarden@gmail.com>
Date: Tue, 7 May 2024 13:01:11 +0800
Subject: livepatch: Rename KLP_* to KLP_TRANSITION_*

The original macros of KLP_* is about the state of the transition.
Rename macros of KLP_* to KLP_TRANSITION_* to fix the confusing
description of klp transition state.

Signed-off-by: Wardenjohn <zhangwarden@gmail.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Tested-by: Petr Mladek <pmladek@suse.com>
Acked-by: Josh Poimboeuf <jpoimboe@kernel.org>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Link: https://lore.kernel.org/r/20240507050111.38195-2-zhangwarden@gmail.com
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 include/linux/livepatch.h     |  6 ++---
 init/init_task.c              |  2 +-
 kernel/livepatch/core.c       |  4 ++--
 kernel/livepatch/patch.c      |  4 ++--
 kernel/livepatch/transition.c | 54 +++++++++++++++++++++----------------------
 5 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 9b9b38e89563..51a258c24ff5 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -18,9 +18,9 @@
 #if IS_ENABLED(CONFIG_LIVEPATCH)
 
 /* task patch states */
-#define KLP_UNDEFINED	-1
-#define KLP_UNPATCHED	 0
-#define KLP_PATCHED	 1
+#define KLP_TRANSITION_IDLE		-1
+#define KLP_TRANSITION_UNPATCHED	 0
+#define KLP_TRANSITION_PATCHED		 1
 
 /**
  * struct klp_func - function structure for live patching
diff --git a/init/init_task.c b/init/init_task.c
index 5727d42149c3..9b41f00d30e2 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -202,7 +202,7 @@ struct task_struct init_task
 	.trace_recursion = 0,
 #endif
 #ifdef CONFIG_LIVEPATCH
-	.patch_state	= KLP_UNDEFINED,
+	.patch_state	= KLP_TRANSITION_IDLE,
 #endif
 #ifdef CONFIG_SECURITY
 	.security	= NULL,
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index ecbc9b6aba3a..52426665eecc 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -973,7 +973,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
 	if (klp_transition_patch)
 		return -EBUSY;
 
-	klp_init_transition(patch, KLP_UNPATCHED);
+	klp_init_transition(patch, KLP_TRANSITION_UNPATCHED);
 
 	klp_for_each_object(patch, obj)
 		if (obj->patched)
@@ -1008,7 +1008,7 @@ static int __klp_enable_patch(struct klp_patch *patch)
 
 	pr_notice("enabling patch '%s'\n", patch->mod->name);
 
-	klp_init_transition(patch, KLP_PATCHED);
+	klp_init_transition(patch, KLP_TRANSITION_PATCHED);
 
 	/*
 	 * Enforce the order of the func->transition writes in
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 4152c71507e2..90408500e5a3 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -95,9 +95,9 @@ static void notrace klp_ftrace_handler(unsigned long ip,
 
 		patch_state = current->patch_state;
 
-		WARN_ON_ONCE(patch_state == KLP_UNDEFINED);
+		WARN_ON_ONCE(patch_state == KLP_TRANSITION_IDLE);
 
-		if (patch_state == KLP_UNPATCHED) {
+		if (patch_state == KLP_TRANSITION_UNPATCHED) {
 			/*
 			 * Use the previously patched version of the function.
 			 * If no previous patches exist, continue with the
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index e54c3d60a904..ba069459c101 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -23,7 +23,7 @@ static DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
 
 struct klp_patch *klp_transition_patch;
 
-static int klp_target_state = KLP_UNDEFINED;
+static int klp_target_state = KLP_TRANSITION_IDLE;
 
 static unsigned int klp_signals_cnt;
 
@@ -96,16 +96,16 @@ static void klp_complete_transition(void)
 
 	pr_debug("'%s': completing %s transition\n",
 		 klp_transition_patch->mod->name,
-		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+		 klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
 
-	if (klp_transition_patch->replace && klp_target_state == KLP_PATCHED) {
+	if (klp_transition_patch->replace && klp_target_state == KLP_TRANSITION_PATCHED) {
 		klp_unpatch_replaced_patches(klp_transition_patch);
 		klp_discard_nops(klp_transition_patch);
 	}
 
-	if (klp_target_state == KLP_UNPATCHED) {
+	if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
 		/*
-		 * All tasks have transitioned to KLP_UNPATCHED so we can now
+		 * All tasks have transitioned to KLP_TRANSITION_UNPATCHED so we can now
 		 * remove the new functions from the func_stack.
 		 */
 		klp_unpatch_objects(klp_transition_patch);
@@ -123,36 +123,36 @@ static void klp_complete_transition(void)
 		klp_for_each_func(obj, func)
 			func->transition = false;
 
-	/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
-	if (klp_target_state == KLP_PATCHED)
+	/* Prevent klp_ftrace_handler() from seeing KLP_TRANSITION_IDLE state */
+	if (klp_target_state == KLP_TRANSITION_PATCHED)
 		klp_synchronize_transition();
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, task) {
 		WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
-		task->patch_state = KLP_UNDEFINED;
+		task->patch_state = KLP_TRANSITION_IDLE;
 	}
 	read_unlock(&tasklist_lock);
 
 	for_each_possible_cpu(cpu) {
 		task = idle_task(cpu);
 		WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING));
-		task->patch_state = KLP_UNDEFINED;
+		task->patch_state = KLP_TRANSITION_IDLE;
 	}
 
 	klp_for_each_object(klp_transition_patch, obj) {
 		if (!klp_is_object_loaded(obj))
 			continue;
-		if (klp_target_state == KLP_PATCHED)
+		if (klp_target_state == KLP_TRANSITION_PATCHED)
 			klp_post_patch_callback(obj);
-		else if (klp_target_state == KLP_UNPATCHED)
+		else if (klp_target_state == KLP_TRANSITION_UNPATCHED)
 			klp_post_unpatch_callback(obj);
 	}
 
 	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
-		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+		  klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
 
-	klp_target_state = KLP_UNDEFINED;
+	klp_target_state = KLP_TRANSITION_IDLE;
 	klp_transition_patch = NULL;
 }
 
@@ -164,13 +164,13 @@ static void klp_complete_transition(void)
  */
 void klp_cancel_transition(void)
 {
-	if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
+	if (WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_PATCHED))
 		return;
 
 	pr_debug("'%s': canceling patching transition, going to unpatch\n",
 		 klp_transition_patch->mod->name);
 
-	klp_target_state = KLP_UNPATCHED;
+	klp_target_state = KLP_TRANSITION_UNPATCHED;
 	klp_complete_transition();
 }
 
@@ -218,7 +218,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
 	struct klp_ops *ops;
 	int i;
 
-	if (klp_target_state == KLP_UNPATCHED) {
+	if (klp_target_state == KLP_TRANSITION_UNPATCHED) {
 		 /*
 		  * Check for the to-be-unpatched function
 		  * (the func itself).
@@ -455,7 +455,7 @@ void klp_try_complete_transition(void)
 	struct klp_patch *patch;
 	bool complete = true;
 
-	WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
+	WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE);
 
 	/*
 	 * Try to switch the tasks to the target patch state by walking their
@@ -532,11 +532,11 @@ void klp_start_transition(void)
 	struct task_struct *g, *task;
 	unsigned int cpu;
 
-	WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
+	WARN_ON_ONCE(klp_target_state == KLP_TRANSITION_IDLE);
 
 	pr_notice("'%s': starting %s transition\n",
 		  klp_transition_patch->mod->name,
-		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+		  klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
 
 	/*
 	 * Mark all normal tasks as needing a patch state update.  They'll
@@ -578,7 +578,7 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	struct klp_func *func;
 	int initial_state = !state;
 
-	WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED);
+	WARN_ON_ONCE(klp_target_state != KLP_TRANSITION_IDLE);
 
 	klp_transition_patch = patch;
 
@@ -589,7 +589,7 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	klp_target_state = state;
 
 	pr_debug("'%s': initializing %s transition\n", patch->mod->name,
-		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+		 klp_target_state == KLP_TRANSITION_PATCHED ? "patching" : "unpatching");
 
 	/*
 	 * Initialize all tasks to the initial patch state to prepare them for
@@ -597,7 +597,7 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 */
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, task) {
-		WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
+		WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE);
 		task->patch_state = initial_state;
 	}
 	read_unlock(&tasklist_lock);
@@ -607,19 +607,19 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 */
 	for_each_possible_cpu(cpu) {
 		task = idle_task(cpu);
-		WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED);
+		WARN_ON_ONCE(task->patch_state != KLP_TRANSITION_IDLE);
 		task->patch_state = initial_state;
 	}
 
 	/*
 	 * Enforce the order of the task->patch_state initializations and the
 	 * func->transition updates to ensure that klp_ftrace_handler() doesn't
-	 * see a func in transition with a task->patch_state of KLP_UNDEFINED.
+	 * see a func in transition with a task->patch_state of KLP_TRANSITION_IDLE.
 	 *
 	 * Also enforce the order of the klp_target_state write and future
 	 * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
 	 * __klp_sched_try_switch() don't set a task->patch_state to
-	 * KLP_UNDEFINED.
+	 * KLP_TRANSITION_IDLE.
 	 */
 	smp_wmb();
 
@@ -652,7 +652,7 @@ void klp_reverse_transition(void)
 
 	pr_debug("'%s': reversing transition from %s\n",
 		 klp_transition_patch->mod->name,
-		 klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+		 klp_target_state == KLP_TRANSITION_PATCHED ? "patching to unpatching" :
 						   "unpatching to patching");
 
 	/*
@@ -741,7 +741,7 @@ void klp_force_transition(void)
 		klp_update_patch_state(idle_task(cpu));
 
 	/* Set forced flag for patches being removed. */
-	if (klp_target_state == KLP_UNPATCHED)
+	if (klp_target_state == KLP_TRANSITION_UNPATCHED)
 		klp_transition_patch->forced = true;
 	else if (klp_transition_patch->replace) {
 		klp_for_each_patch(patch) {
-- 
cgit v1.2.3


From 078760d950016f5982751f5512e69f26ad8feb31 Mon Sep 17 00:00:00 2001
From: Ye Bin <yebin10@huawei.com>
Date: Sun, 7 Apr 2024 14:53:54 +0800
Subject: jbd2: use shrink_type type instead of bool type for
 __jbd2_journal_clean_checkpoint_list()

"enum shrink_type" can clearly express the meaning of the parameter of
__jbd2_journal_clean_checkpoint_list(), and there is no need to use the
bool type.

Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://lore.kernel.org/r/20240407065355.1528580-2-yebin10@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 16 +++++++++-------
 fs/jbd2/commit.c     |  2 +-
 include/linux/jbd2.h |  4 +++-
 3 files changed, 13 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c97e64c4784..80c0ab98bc63 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -337,8 +337,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 
 /* Checkpoint list management */
 
-enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
-
 /*
  * journal_shrink_one_cp_list
  *
@@ -472,21 +470,25 @@ out:
  * journal_clean_checkpoint_list
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
- * If 'destroy' is set, release all buffers unconditionally.
+ * If 'type' is SHRINK_DESTROY, release all buffers unconditionally. If 'type'
+ * is SHRINK_BUSY_STOP, will stop release buffers if encounters a busy buffer.
+ * To avoid wasting CPU cycles scanning the buffer list in some cases, don't
+ * pass SHRINK_BUSY_SKIP 'type' for this function.
  *
  * Called with j_list_lock held.
  */
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy)
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal,
+					  enum shrink_type type)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
-	enum shrink_type type;
 	bool released;
 
+	WARN_ON_ONCE(type == SHRINK_BUSY_SKIP);
+
 	transaction = journal->j_checkpoint_transactions;
 	if (!transaction)
 		return;
 
-	type = destroy ? SHRINK_DESTROY : SHRINK_BUSY_STOP;
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
@@ -527,7 +529,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal)
 			spin_unlock(&journal->j_list_lock);
 			break;
 		}
-		__jbd2_journal_clean_checkpoint_list(journal, true);
+		__jbd2_journal_clean_checkpoint_list(journal, SHRINK_DESTROY);
 		spin_unlock(&journal->j_list_lock);
 		cond_resched();
 	}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 78a9d08ae9f8..b341c396311b 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * frees some memory
 	 */
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_clean_checkpoint_list(journal, false);
+	__jbd2_journal_clean_checkpoint_list(journal, SHRINK_BUSY_STOP);
 	spin_unlock(&journal->j_list_lock);
 
 	jbd2_debug(3, "JBD2: commit phase 1\n");
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 971f3e826e15..58a961999d70 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1434,7 +1434,9 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 extern void jbd2_journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
+enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
+
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum shrink_type type);
 unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
 int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
-- 
cgit v1.2.3


From 26770a717cac57041d9414725e3e01dd19b08dd2 Mon Sep 17 00:00:00 2001
From: Ye Bin <yebin10@huawei.com>
Date: Sun, 7 Apr 2024 14:53:55 +0800
Subject: jbd2: add prefix 'jbd2' for 'shrink_type'

As 'shrink_type' is exported. The module prefix 'jbd2' is added to
distinguish from memory reclamation.

Signed-off-by: Ye Bin <yebin10@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Link: https://lore.kernel.org/r/20240407065355.1528580-3-yebin10@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/checkpoint.c | 22 +++++++++++-----------
 fs/jbd2/commit.c     |  2 +-
 include/linux/jbd2.h |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 80c0ab98bc63..951f78634adf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -348,7 +348,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
  * Called with j_list_lock held.
  */
 static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
-						enum shrink_type type,
+						enum jbd2_shrink_type type,
 						bool *released)
 {
 	struct journal_head *last_jh;
@@ -365,12 +365,12 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
 		jh = next_jh;
 		next_jh = jh->b_cpnext;
 
-		if (type == SHRINK_DESTROY) {
+		if (type == JBD2_SHRINK_DESTROY) {
 			ret = __jbd2_journal_remove_checkpoint(jh);
 		} else {
 			ret = jbd2_journal_try_remove_checkpoint(jh);
 			if (ret < 0) {
-				if (type == SHRINK_BUSY_SKIP)
+				if (type == JBD2_SHRINK_BUSY_SKIP)
 					continue;
 				break;
 			}
@@ -437,7 +437,7 @@ again:
 		tid = transaction->t_tid;
 
 		freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list,
-						   SHRINK_BUSY_SKIP, &released);
+						   JBD2_SHRINK_BUSY_SKIP, &released);
 		nr_freed += freed;
 		(*nr_to_scan) -= min(*nr_to_scan, freed);
 		if (*nr_to_scan == 0)
@@ -470,20 +470,20 @@ out:
  * journal_clean_checkpoint_list
  *
  * Find all the written-back checkpoint buffers in the journal and release them.
- * If 'type' is SHRINK_DESTROY, release all buffers unconditionally. If 'type'
- * is SHRINK_BUSY_STOP, will stop release buffers if encounters a busy buffer.
- * To avoid wasting CPU cycles scanning the buffer list in some cases, don't
- * pass SHRINK_BUSY_SKIP 'type' for this function.
+ * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If
+ * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a
+ * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some
+ * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function.
  *
  * Called with j_list_lock held.
  */
 void __jbd2_journal_clean_checkpoint_list(journal_t *journal,
-					  enum shrink_type type)
+					  enum jbd2_shrink_type type)
 {
 	transaction_t *transaction, *last_transaction, *next_transaction;
 	bool released;
 
-	WARN_ON_ONCE(type == SHRINK_BUSY_SKIP);
+	WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP);
 
 	transaction = journal->j_checkpoint_transactions;
 	if (!transaction)
@@ -529,7 +529,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal)
 			spin_unlock(&journal->j_list_lock);
 			break;
 		}
-		__jbd2_journal_clean_checkpoint_list(journal, SHRINK_DESTROY);
+		__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY);
 		spin_unlock(&journal->j_list_lock);
 		cond_resched();
 	}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b341c396311b..75ea4e9a5cab 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * frees some memory
 	 */
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_clean_checkpoint_list(journal, SHRINK_BUSY_STOP);
+	__jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP);
 	spin_unlock(&journal->j_list_lock);
 
 	jbd2_debug(3, "JBD2: commit phase 1\n");
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 58a961999d70..7479f64c0939 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1434,9 +1434,9 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 extern void jbd2_journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
-enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
+enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP};
 
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum shrink_type type);
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type);
 unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
 int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);
-- 
cgit v1.2.3


From 3d16af0b4cfac4b2c3b238e2ec37b38c2f316978 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Wed, 3 Apr 2024 09:21:17 +0200
Subject: crypto: mxs-dcp: Add support for hardware-bound keys

DCP (Data Co-Processor) is able to derive private keys for a fused
random seed, which can be referenced by handle but not accessed by
the CPU. Similarly, DCP is able to store arbitrary keys in four
dedicated key slots located in its secure memory area (internal SRAM).
These keys can be used to perform AES encryption.

Expose these derived keys and key slots through the crypto API via their
handle. The main purpose is to add DCP-backed trusted keys. Other
use cases are possible too (see similar existing paes implementations),
but these should carefully be evaluated as e.g. enabling AF_ALG will
give userspace full access to use keys. In scenarios with untrustworthy
userspace, this will enable en-/decryption oracles.

Co-developed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Co-developed-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Signed-off-by: David Gstir <david@sigma-star.at>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/crypto/mxs-dcp.c | 104 ++++++++++++++++++++++++++++++++++++++++++-----
 include/soc/fsl/dcp.h    |  20 +++++++++
 2 files changed, 113 insertions(+), 11 deletions(-)
 create mode 100644 include/soc/fsl/dcp.h

(limited to 'include')

diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c
index 2b3ebe0db3a6..057d73c370b7 100644
--- a/drivers/crypto/mxs-dcp.c
+++ b/drivers/crypto/mxs-dcp.c
@@ -15,6 +15,7 @@
 #include <linux/platform_device.h>
 #include <linux/stmp_device.h>
 #include <linux/clk.h>
+#include <soc/fsl/dcp.h>
 
 #include <crypto/aes.h>
 #include <crypto/sha1.h>
@@ -101,6 +102,7 @@ struct dcp_async_ctx {
 	struct crypto_skcipher		*fallback;
 	unsigned int			key_len;
 	uint8_t				key[AES_KEYSIZE_128];
+	bool				key_referenced;
 };
 
 struct dcp_aes_req_ctx {
@@ -155,6 +157,7 @@ static struct dcp *global_sdcp;
 #define MXS_DCP_CONTROL0_HASH_TERM		(1 << 13)
 #define MXS_DCP_CONTROL0_HASH_INIT		(1 << 12)
 #define MXS_DCP_CONTROL0_PAYLOAD_KEY		(1 << 11)
+#define MXS_DCP_CONTROL0_OTP_KEY		(1 << 10)
 #define MXS_DCP_CONTROL0_CIPHER_ENCRYPT		(1 << 8)
 #define MXS_DCP_CONTROL0_CIPHER_INIT		(1 << 9)
 #define MXS_DCP_CONTROL0_ENABLE_HASH		(1 << 6)
@@ -168,6 +171,8 @@ static struct dcp *global_sdcp;
 #define MXS_DCP_CONTROL1_CIPHER_MODE_ECB	(0 << 4)
 #define MXS_DCP_CONTROL1_CIPHER_SELECT_AES128	(0 << 0)
 
+#define MXS_DCP_CONTROL1_KEY_SELECT_SHIFT	8
+
 static int mxs_dcp_start_dma(struct dcp_async_ctx *actx)
 {
 	int dma_err;
@@ -224,13 +229,16 @@ static int mxs_dcp_run_aes(struct dcp_async_ctx *actx,
 	struct dcp *sdcp = global_sdcp;
 	struct dcp_dma_desc *desc = &sdcp->coh->desc[actx->chan];
 	struct dcp_aes_req_ctx *rctx = skcipher_request_ctx(req);
+	bool key_referenced = actx->key_referenced;
 	int ret;
 
-	key_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_key,
-				  2 * AES_KEYSIZE_128, DMA_TO_DEVICE);
-	ret = dma_mapping_error(sdcp->dev, key_phys);
-	if (ret)
-		return ret;
+	if (!key_referenced) {
+		key_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_key,
+					  2 * AES_KEYSIZE_128, DMA_TO_DEVICE);
+		ret = dma_mapping_error(sdcp->dev, key_phys);
+		if (ret)
+			return ret;
+	}
 
 	src_phys = dma_map_single(sdcp->dev, sdcp->coh->aes_in_buf,
 				  DCP_BUF_SZ, DMA_TO_DEVICE);
@@ -255,8 +263,12 @@ static int mxs_dcp_run_aes(struct dcp_async_ctx *actx,
 		    MXS_DCP_CONTROL0_INTERRUPT |
 		    MXS_DCP_CONTROL0_ENABLE_CIPHER;
 
-	/* Payload contains the key. */
-	desc->control0 |= MXS_DCP_CONTROL0_PAYLOAD_KEY;
+	if (key_referenced)
+		/* Set OTP key bit to select the key via KEY_SELECT. */
+		desc->control0 |= MXS_DCP_CONTROL0_OTP_KEY;
+	else
+		/* Payload contains the key. */
+		desc->control0 |= MXS_DCP_CONTROL0_PAYLOAD_KEY;
 
 	if (rctx->enc)
 		desc->control0 |= MXS_DCP_CONTROL0_CIPHER_ENCRYPT;
@@ -270,6 +282,9 @@ static int mxs_dcp_run_aes(struct dcp_async_ctx *actx,
 	else
 		desc->control1 |= MXS_DCP_CONTROL1_CIPHER_MODE_CBC;
 
+	if (key_referenced)
+		desc->control1 |= sdcp->coh->aes_key[0] << MXS_DCP_CONTROL1_KEY_SELECT_SHIFT;
+
 	desc->next_cmd_addr = 0;
 	desc->source = src_phys;
 	desc->destination = dst_phys;
@@ -284,9 +299,9 @@ aes_done_run:
 err_dst:
 	dma_unmap_single(sdcp->dev, src_phys, DCP_BUF_SZ, DMA_TO_DEVICE);
 err_src:
-	dma_unmap_single(sdcp->dev, key_phys, 2 * AES_KEYSIZE_128,
-			 DMA_TO_DEVICE);
-
+	if (!key_referenced)
+		dma_unmap_single(sdcp->dev, key_phys, 2 * AES_KEYSIZE_128,
+				 DMA_TO_DEVICE);
 	return ret;
 }
 
@@ -453,7 +468,7 @@ static int mxs_dcp_aes_enqueue(struct skcipher_request *req, int enc, int ecb)
 	struct dcp_aes_req_ctx *rctx = skcipher_request_ctx(req);
 	int ret;
 
-	if (unlikely(actx->key_len != AES_KEYSIZE_128))
+	if (unlikely(actx->key_len != AES_KEYSIZE_128 && !actx->key_referenced))
 		return mxs_dcp_block_fallback(req, enc);
 
 	rctx->enc = enc;
@@ -500,6 +515,7 @@ static int mxs_dcp_aes_setkey(struct crypto_skcipher *tfm, const u8 *key,
 	 * there can still be an operation in progress.
 	 */
 	actx->key_len = len;
+	actx->key_referenced = false;
 	if (len == AES_KEYSIZE_128) {
 		memcpy(actx->key, key, len);
 		return 0;
@@ -516,6 +532,32 @@ static int mxs_dcp_aes_setkey(struct crypto_skcipher *tfm, const u8 *key,
 	return crypto_skcipher_setkey(actx->fallback, key, len);
 }
 
+static int mxs_dcp_aes_setrefkey(struct crypto_skcipher *tfm, const u8 *key,
+				 unsigned int len)
+{
+	struct dcp_async_ctx *actx = crypto_skcipher_ctx(tfm);
+
+	if (len != DCP_PAES_KEYSIZE)
+		return -EINVAL;
+
+	switch (key[0]) {
+	case DCP_PAES_KEY_SLOT0:
+	case DCP_PAES_KEY_SLOT1:
+	case DCP_PAES_KEY_SLOT2:
+	case DCP_PAES_KEY_SLOT3:
+	case DCP_PAES_KEY_UNIQUE:
+	case DCP_PAES_KEY_OTP:
+		memcpy(actx->key, key, len);
+		actx->key_len = len;
+		actx->key_referenced = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int mxs_dcp_aes_fallback_init_tfm(struct crypto_skcipher *tfm)
 {
 	const char *name = crypto_tfm_alg_name(crypto_skcipher_tfm(tfm));
@@ -539,6 +581,13 @@ static void mxs_dcp_aes_fallback_exit_tfm(struct crypto_skcipher *tfm)
 	crypto_free_skcipher(actx->fallback);
 }
 
+static int mxs_dcp_paes_init_tfm(struct crypto_skcipher *tfm)
+{
+	crypto_skcipher_set_reqsize(tfm, sizeof(struct dcp_aes_req_ctx));
+
+	return 0;
+}
+
 /*
  * Hashing (SHA1/SHA256)
  */
@@ -889,6 +938,39 @@ static struct skcipher_alg dcp_aes_algs[] = {
 		.ivsize			= AES_BLOCK_SIZE,
 		.init			= mxs_dcp_aes_fallback_init_tfm,
 		.exit			= mxs_dcp_aes_fallback_exit_tfm,
+	}, {
+		.base.cra_name		= "ecb(paes)",
+		.base.cra_driver_name	= "ecb-paes-dcp",
+		.base.cra_priority	= 401,
+		.base.cra_alignmask	= 15,
+		.base.cra_flags		= CRYPTO_ALG_ASYNC | CRYPTO_ALG_INTERNAL,
+		.base.cra_blocksize	= AES_BLOCK_SIZE,
+		.base.cra_ctxsize	= sizeof(struct dcp_async_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= DCP_PAES_KEYSIZE,
+		.max_keysize		= DCP_PAES_KEYSIZE,
+		.setkey			= mxs_dcp_aes_setrefkey,
+		.encrypt		= mxs_dcp_aes_ecb_encrypt,
+		.decrypt		= mxs_dcp_aes_ecb_decrypt,
+		.init			= mxs_dcp_paes_init_tfm,
+	}, {
+		.base.cra_name		= "cbc(paes)",
+		.base.cra_driver_name	= "cbc-paes-dcp",
+		.base.cra_priority	= 401,
+		.base.cra_alignmask	= 15,
+		.base.cra_flags		= CRYPTO_ALG_ASYNC | CRYPTO_ALG_INTERNAL,
+		.base.cra_blocksize	= AES_BLOCK_SIZE,
+		.base.cra_ctxsize	= sizeof(struct dcp_async_ctx),
+		.base.cra_module	= THIS_MODULE,
+
+		.min_keysize		= DCP_PAES_KEYSIZE,
+		.max_keysize		= DCP_PAES_KEYSIZE,
+		.setkey			= mxs_dcp_aes_setrefkey,
+		.encrypt		= mxs_dcp_aes_cbc_encrypt,
+		.decrypt		= mxs_dcp_aes_cbc_decrypt,
+		.ivsize			= AES_BLOCK_SIZE,
+		.init			= mxs_dcp_paes_init_tfm,
 	},
 };
 
diff --git a/include/soc/fsl/dcp.h b/include/soc/fsl/dcp.h
new file mode 100644
index 000000000000..3ec335d8ca8b
--- /dev/null
+++ b/include/soc/fsl/dcp.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 sigma star gmbh
+ *
+ * Specifies paes key slot handles for NXP's DCP (Data Co-Processor) to be used
+ * with the crypto_skcipher_setkey().
+ */
+
+#ifndef MXS_DCP_H
+#define MXS_DCP_H
+
+#define DCP_PAES_KEYSIZE 1
+#define DCP_PAES_KEY_SLOT0 0x00
+#define DCP_PAES_KEY_SLOT1 0x01
+#define DCP_PAES_KEY_SLOT2 0x02
+#define DCP_PAES_KEY_SLOT3 0x03
+#define DCP_PAES_KEY_UNIQUE 0xfe
+#define DCP_PAES_KEY_OTP 0xff
+
+#endif /* MXS_DCP_H */
-- 
cgit v1.2.3


From 2e8a0f40a39cc253002f21c54e1b5b995e5ec510 Mon Sep 17 00:00:00 2001
From: David Gstir <david@sigma-star.at>
Date: Wed, 3 Apr 2024 09:21:19 +0200
Subject: KEYS: trusted: Introduce NXP DCP-backed trusted keys

DCP (Data Co-Processor) is the little brother of NXP's CAAM IP.
Beside of accelerated crypto operations, it also offers support for
hardware-bound keys. Using this feature it is possible to implement a blob
mechanism similar to what CAAM offers. Unlike on CAAM, constructing and
parsing the blob has to happen in software (i.e. the kernel).

The software-based blob format used by DCP trusted keys encrypts
the payload using AES-128-GCM with a freshly generated random key and nonce.
The random key itself is AES-128-ECB encrypted using the DCP unique
or OTP key.

The DCP trusted key blob format is:
/*
 * struct dcp_blob_fmt - DCP BLOB format.
 *
 * @fmt_version: Format version, currently being %1
 * @blob_key: Random AES 128 key which is used to encrypt @payload,
 *            @blob_key itself is encrypted with OTP or UNIQUE device key in
 *            AES-128-ECB mode by DCP.
 * @nonce: Random nonce used for @payload encryption.
 * @payload_len: Length of the plain text @payload.
 * @payload: The payload itself, encrypted using AES-128-GCM and @blob_key,
 *           GCM auth tag of size AES_BLOCK_SIZE is attached at the end of it.
 *
 * The total size of a DCP BLOB is sizeof(struct dcp_blob_fmt) + @payload_len +
 * AES_BLOCK_SIZE.
 */
struct dcp_blob_fmt {
	__u8 fmt_version;
	__u8 blob_key[AES_KEYSIZE_128];
	__u8 nonce[AES_KEYSIZE_128];
	__le32 payload_len;
	__u8 payload[];
} __packed;

By default the unique key is used. It is also possible to use the
OTP key. While the unique key should be unique it is not documented how
this key is derived. Therefore selection the OTP key is supported as
well via the use_otp_key module parameter.

Co-developed-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Richard Weinberger <richard@nod.at>
Co-developed-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Signed-off-by: David Oberhollenzer <david.oberhollenzer@sigma-star.at>
Signed-off-by: David Gstir <david@sigma-star.at>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/keys/trusted_dcp.h                |  11 ++
 security/keys/trusted-keys/Kconfig        |   8 +
 security/keys/trusted-keys/Makefile       |   2 +
 security/keys/trusted-keys/trusted_core.c |   6 +-
 security/keys/trusted-keys/trusted_dcp.c  | 313 ++++++++++++++++++++++++++++++
 5 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 include/keys/trusted_dcp.h
 create mode 100644 security/keys/trusted-keys/trusted_dcp.c

(limited to 'include')

diff --git a/include/keys/trusted_dcp.h b/include/keys/trusted_dcp.h
new file mode 100644
index 000000000000..9aaa42075b40
--- /dev/null
+++ b/include/keys/trusted_dcp.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 sigma star gmbh
+ */
+
+#ifndef TRUSTED_DCP_H
+#define TRUSTED_DCP_H
+
+extern struct trusted_key_ops dcp_trusted_key_ops;
+
+#endif
diff --git a/security/keys/trusted-keys/Kconfig b/security/keys/trusted-keys/Kconfig
index 553dc117f385..1fb8aa001995 100644
--- a/security/keys/trusted-keys/Kconfig
+++ b/security/keys/trusted-keys/Kconfig
@@ -39,6 +39,14 @@ config TRUSTED_KEYS_CAAM
 	  Enable use of NXP's Cryptographic Accelerator and Assurance Module
 	  (CAAM) as trusted key backend.
 
+config TRUSTED_KEYS_DCP
+	bool "DCP-based trusted keys"
+	depends on CRYPTO_DEV_MXS_DCP >= TRUSTED_KEYS
+	default y
+	select HAVE_TRUSTED_KEYS
+	help
+	  Enable use of NXP's DCP (Data Co-Processor) as trusted key backend.
+
 if !HAVE_TRUSTED_KEYS
 	comment "No trust source selected!"
 endif
diff --git a/security/keys/trusted-keys/Makefile b/security/keys/trusted-keys/Makefile
index 735aa0bc08ef..f0f3b27f688b 100644
--- a/security/keys/trusted-keys/Makefile
+++ b/security/keys/trusted-keys/Makefile
@@ -14,3 +14,5 @@ trusted-$(CONFIG_TRUSTED_KEYS_TPM) += tpm2key.asn1.o
 trusted-$(CONFIG_TRUSTED_KEYS_TEE) += trusted_tee.o
 
 trusted-$(CONFIG_TRUSTED_KEYS_CAAM) += trusted_caam.o
+
+trusted-$(CONFIG_TRUSTED_KEYS_DCP) += trusted_dcp.o
diff --git a/security/keys/trusted-keys/trusted_core.c b/security/keys/trusted-keys/trusted_core.c
index fee1ab2c734d..5113aeae5628 100644
--- a/security/keys/trusted-keys/trusted_core.c
+++ b/security/keys/trusted-keys/trusted_core.c
@@ -10,6 +10,7 @@
 #include <keys/trusted-type.h>
 #include <keys/trusted_tee.h>
 #include <keys/trusted_caam.h>
+#include <keys/trusted_dcp.h>
 #include <keys/trusted_tpm.h>
 #include <linux/capability.h>
 #include <linux/err.h>
@@ -30,7 +31,7 @@ MODULE_PARM_DESC(rng, "Select trusted key RNG");
 
 static char *trusted_key_source;
 module_param_named(source, trusted_key_source, charp, 0);
-MODULE_PARM_DESC(source, "Select trusted keys source (tpm, tee or caam)");
+MODULE_PARM_DESC(source, "Select trusted keys source (tpm, tee, caam or dcp)");
 
 static const struct trusted_key_source trusted_key_sources[] = {
 #if defined(CONFIG_TRUSTED_KEYS_TPM)
@@ -42,6 +43,9 @@ static const struct trusted_key_source trusted_key_sources[] = {
 #if defined(CONFIG_TRUSTED_KEYS_CAAM)
 	{ "caam", &trusted_key_caam_ops },
 #endif
+#if defined(CONFIG_TRUSTED_KEYS_DCP)
+	{ "dcp", &dcp_trusted_key_ops },
+#endif
 };
 
 DEFINE_STATIC_CALL_NULL(trusted_key_seal, *trusted_key_sources[0].ops->seal);
diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c
new file mode 100644
index 000000000000..16c44aafeab3
--- /dev/null
+++ b/security/keys/trusted-keys/trusted_dcp.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 sigma star gmbh
+ */
+
+#include <crypto/aead.h>
+#include <crypto/aes.h>
+#include <crypto/algapi.h>
+#include <crypto/gcm.h>
+#include <crypto/skcipher.h>
+#include <keys/trusted-type.h>
+#include <linux/key-type.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <soc/fsl/dcp.h>
+
+#define DCP_BLOB_VERSION 1
+#define DCP_BLOB_AUTHLEN 16
+
+/**
+ * struct dcp_blob_fmt - DCP BLOB format.
+ *
+ * @fmt_version: Format version, currently being %1.
+ * @blob_key: Random AES 128 key which is used to encrypt @payload,
+ *            @blob_key itself is encrypted with OTP or UNIQUE device key in
+ *            AES-128-ECB mode by DCP.
+ * @nonce: Random nonce used for @payload encryption.
+ * @payload_len: Length of the plain text @payload.
+ * @payload: The payload itself, encrypted using AES-128-GCM and @blob_key,
+ *           GCM auth tag of size DCP_BLOB_AUTHLEN is attached at the end of it.
+ *
+ * The total size of a DCP BLOB is sizeof(struct dcp_blob_fmt) + @payload_len +
+ * DCP_BLOB_AUTHLEN.
+ */
+struct dcp_blob_fmt {
+	__u8 fmt_version;
+	__u8 blob_key[AES_KEYSIZE_128];
+	__u8 nonce[AES_KEYSIZE_128];
+	__le32 payload_len;
+	__u8 payload[];
+} __packed;
+
+static bool use_otp_key;
+module_param_named(dcp_use_otp_key, use_otp_key, bool, 0);
+MODULE_PARM_DESC(dcp_use_otp_key, "Use OTP instead of UNIQUE key for sealing");
+
+static bool skip_zk_test;
+module_param_named(dcp_skip_zk_test, skip_zk_test, bool, 0);
+MODULE_PARM_DESC(dcp_skip_zk_test, "Don't test whether device keys are zero'ed");
+
+static unsigned int calc_blob_len(unsigned int payload_len)
+{
+	return sizeof(struct dcp_blob_fmt) + payload_len + DCP_BLOB_AUTHLEN;
+}
+
+static int do_dcp_crypto(u8 *in, u8 *out, bool do_encrypt)
+{
+	struct skcipher_request *req = NULL;
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_skcipher *tfm;
+	u8 paes_key[DCP_PAES_KEYSIZE];
+	DECLARE_CRYPTO_WAIT(wait);
+	int res = 0;
+
+	if (use_otp_key)
+		paes_key[0] = DCP_PAES_KEY_OTP;
+	else
+		paes_key[0] = DCP_PAES_KEY_UNIQUE;
+
+	tfm = crypto_alloc_skcipher("ecb-paes-dcp", CRYPTO_ALG_INTERNAL,
+				    CRYPTO_ALG_INTERNAL);
+	if (IS_ERR(tfm)) {
+		res = PTR_ERR(tfm);
+		tfm = NULL;
+		goto out;
+	}
+
+	req = skcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		res = -ENOMEM;
+		goto out;
+	}
+
+	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG |
+				      CRYPTO_TFM_REQ_MAY_SLEEP,
+				      crypto_req_done, &wait);
+	res = crypto_skcipher_setkey(tfm, paes_key, sizeof(paes_key));
+	if (res < 0)
+		goto out;
+
+	sg_init_one(&src_sg, in, AES_KEYSIZE_128);
+	sg_init_one(&dst_sg, out, AES_KEYSIZE_128);
+	skcipher_request_set_crypt(req, &src_sg, &dst_sg, AES_KEYSIZE_128,
+				   NULL);
+
+	if (do_encrypt)
+		res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
+	else
+		res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
+
+out:
+	skcipher_request_free(req);
+	crypto_free_skcipher(tfm);
+
+	return res;
+}
+
+static int do_aead_crypto(u8 *in, u8 *out, size_t len, u8 *key, u8 *nonce,
+			  bool do_encrypt)
+{
+	struct aead_request *aead_req = NULL;
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_aead *aead;
+	int ret;
+
+	aead = crypto_alloc_aead("gcm(aes)", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(aead)) {
+		ret = PTR_ERR(aead);
+		goto out;
+	}
+
+	ret = crypto_aead_setauthsize(aead, DCP_BLOB_AUTHLEN);
+	if (ret < 0) {
+		pr_err("Can't set crypto auth tag len: %d\n", ret);
+		goto free_aead;
+	}
+
+	aead_req = aead_request_alloc(aead, GFP_KERNEL);
+	if (!aead_req) {
+		ret = -ENOMEM;
+		goto free_aead;
+	}
+
+	sg_init_one(&src_sg, in, len);
+	if (do_encrypt) {
+		/*
+		 * If we encrypt our buffer has extra space for the auth tag.
+		 */
+		sg_init_one(&dst_sg, out, len + DCP_BLOB_AUTHLEN);
+	} else {
+		sg_init_one(&dst_sg, out, len);
+	}
+
+	aead_request_set_crypt(aead_req, &src_sg, &dst_sg, len, nonce);
+	aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL,
+				  NULL);
+	aead_request_set_ad(aead_req, 0);
+
+	if (crypto_aead_setkey(aead, key, AES_KEYSIZE_128)) {
+		pr_err("Can't set crypto AEAD key\n");
+		ret = -EINVAL;
+		goto free_req;
+	}
+
+	if (do_encrypt)
+		ret = crypto_aead_encrypt(aead_req);
+	else
+		ret = crypto_aead_decrypt(aead_req);
+
+free_req:
+	aead_request_free(aead_req);
+free_aead:
+	crypto_free_aead(aead);
+out:
+	return ret;
+}
+
+static int decrypt_blob_key(u8 *key)
+{
+	return do_dcp_crypto(key, key, false);
+}
+
+static int encrypt_blob_key(u8 *key)
+{
+	return do_dcp_crypto(key, key, true);
+}
+
+static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob)
+{
+	struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob;
+	int blen, ret;
+
+	blen = calc_blob_len(p->key_len);
+	if (blen > MAX_BLOB_SIZE)
+		return -E2BIG;
+
+	b->fmt_version = DCP_BLOB_VERSION;
+	get_random_bytes(b->nonce, AES_KEYSIZE_128);
+	get_random_bytes(b->blob_key, AES_KEYSIZE_128);
+
+	ret = do_aead_crypto(p->key, b->payload, p->key_len, b->blob_key,
+			     b->nonce, true);
+	if (ret) {
+		pr_err("Unable to encrypt blob payload: %i\n", ret);
+		return ret;
+	}
+
+	ret = encrypt_blob_key(b->blob_key);
+	if (ret) {
+		pr_err("Unable to encrypt blob key: %i\n", ret);
+		return ret;
+	}
+
+	b->payload_len = get_unaligned_le32(&p->key_len);
+	p->blob_len = blen;
+	return 0;
+}
+
+static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob)
+{
+	struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob;
+	int blen, ret;
+
+	if (b->fmt_version != DCP_BLOB_VERSION) {
+		pr_err("DCP blob has bad version: %i, expected %i\n",
+		       b->fmt_version, DCP_BLOB_VERSION);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	p->key_len = le32_to_cpu(b->payload_len);
+	blen = calc_blob_len(p->key_len);
+	if (blen != p->blob_len) {
+		pr_err("DCP blob has bad length: %i != %i\n", blen,
+		       p->blob_len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = decrypt_blob_key(b->blob_key);
+	if (ret) {
+		pr_err("Unable to decrypt blob key: %i\n", ret);
+		goto out;
+	}
+
+	ret = do_aead_crypto(b->payload, p->key, p->key_len + DCP_BLOB_AUTHLEN,
+			     b->blob_key, b->nonce, false);
+	if (ret) {
+		pr_err("Unwrap of DCP payload failed: %i\n", ret);
+		goto out;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int test_for_zero_key(void)
+{
+	/*
+	 * Encrypting a plaintext of all 0x55 bytes will yield
+	 * this ciphertext in case the DCP test key is used.
+	 */
+	static const u8 bad[] = {0x9a, 0xda, 0xe0, 0x54, 0xf6, 0x3d, 0xfa, 0xff,
+				 0x5e, 0xa1, 0x8e, 0x45, 0xed, 0xf6, 0xea, 0x6f};
+	void *buf = NULL;
+	int ret = 0;
+
+	if (skip_zk_test)
+		goto out;
+
+	buf = kmalloc(AES_BLOCK_SIZE, GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	memset(buf, 0x55, AES_BLOCK_SIZE);
+
+	ret = do_dcp_crypto(buf, buf, true);
+	if (ret)
+		goto out;
+
+	if (memcmp(buf, bad, AES_BLOCK_SIZE) == 0) {
+		pr_warn("Device neither in secure nor trusted mode!\n");
+		ret = -EINVAL;
+	}
+out:
+	kfree(buf);
+	return ret;
+}
+
+static int trusted_dcp_init(void)
+{
+	int ret;
+
+	if (use_otp_key)
+		pr_info("Using DCP OTP key\n");
+
+	ret = test_for_zero_key();
+	if (ret) {
+		pr_warn("Test for zero'ed keys failed: %i\n", ret);
+
+		return -EINVAL;
+	}
+
+	return register_key_type(&key_type_trusted);
+}
+
+static void trusted_dcp_exit(void)
+{
+	unregister_key_type(&key_type_trusted);
+}
+
+struct trusted_key_ops dcp_trusted_key_ops = {
+	.exit = trusted_dcp_exit,
+	.init = trusted_dcp_init,
+	.seal = trusted_dcp_seal,
+	.unseal = trusted_dcp_unseal,
+	.migratable = 0,
+};
-- 
cgit v1.2.3


From 6802f9347993a534797847f627c27f4334067945 Mon Sep 17 00:00:00 2001
From: Kyle Meyer <kyle.meyer@hpe.com>
Date: Wed, 10 Apr 2024 16:33:10 -0500
Subject: cpumask: Add for_each_cpu_from()

Add for_each_cpu_from() as a generic cpumask macro.

for_each_cpu_from() is the same as for_each_cpu(), except it starts at
@cpu instead of zero.

Signed-off-by: Kyle Meyer <kyle.meyer@hpe.com>
Acked-by: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/cpumask.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 1c29947db848..d75060fbd058 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -368,6 +368,16 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
 #define for_each_cpu_or(cpu, mask1, mask2)				\
 	for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
 
+/**
+ * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask.
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask: the cpumask pointer
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_from(cpu, mask)				\
+	for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)
+
 /**
  * cpumask_any_but - return a "random" in a cpumask, but not this one.
  * @mask: the cpumask to search
-- 
cgit v1.2.3


From efe3a85eab78b6cc02bdfd16aec4410111ea69c0 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Fri, 3 May 2024 12:12:00 -0700
Subject: Compiler Attributes: Add __always_used macro

In some cases like performance benchmarking, we need to call a
function, but don't need to read the returned value. If compiler
recognizes the function as pure or const, it can remove the function
invocation, which is not what we want.

To prevent that, the common practice is assigning the return value to
a temporary static volatile variable. From compiler's point of view,
the variable is unused because never read back after been assigned.
To make sure the variable is always emitted, we provide a __used
attribute. This works with GCC, but clang still emits
Wunused-but-set-variable. To suppress that warning, we need to teach
clang to do that with the 'unused' attribute.

Nathan Chancellor explained that in details:

  While having used and unused attributes together might look unusual,
  reading the GCC attribute manual makes it seem like these attributes
  fulfill similar yet different roles, __unused__ prevents any unused
  warnings while __used__ forces the variable to be emitted. A strict
  reading of that does not make it seem like __used__ implies disabling
  unused warnings

The compiler documentation makes it clear what happens behind the 'used'
and 'unused' attributes, but the chosen names may confuse readers if
such combination catches an eye in a random code.

This patch adds __always_used macro, which combines both attributes
and comments on what happens for those interested in details.

Suggested-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202405030808.UsoMKFNP-lkp@intel.com/
Acked-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/compiler_attributes.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
index 8bdf6e0918c1..32284cd26d52 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -361,6 +361,19 @@
  */
 #define __used                          __attribute__((__used__))
 
+/*
+ * The __used attribute guarantees that the attributed variable will be
+ * always emitted by a compiler. It doesn't prevent the compiler from
+ * throwing 'unused' warnings when it can't detect how the variable is
+ * actually used. It's a compiler implementation details either emit
+ * the warning in that case or not.
+ *
+ * The combination of both 'used' and 'unused' attributes ensures that
+ * the variable would be emitted, and will not trigger 'unused' warnings.
+ * The attribute is applicable for functions, static and global variables.
+ */
+#define __always_used			__used __maybe_unused
+
 /*
  *   gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-warn_005funused_005fresult-function-attribute
  * clang: https://clang.llvm.org/docs/AttributeReference.html#nodiscard-warn-unused-result
-- 
cgit v1.2.3


From 1c2aa5619348f7573d6f2269e04fd1dac8eddc47 Mon Sep 17 00:00:00 2001
From: Kuan-Wei Chiu <visitorckw@gmail.com>
Date: Thu, 2 May 2024 17:24:43 +0800
Subject: bitops: Optimize fns() for improved performance

The current fns() repeatedly uses __ffs() to find the index of the
least significant bit and then clears the corresponding bit using
__clear_bit(). The method for clearing the least significant bit can be
optimized by using word &= word - 1 instead.

Typically, the execution time of one __ffs() plus one __clear_bit() is
longer than that of a bitwise AND operation and a subtraction. To
improve performance, the loop for clearing the least significant bit
has been replaced with word &= word - 1, followed by a single __ffs()
operation to obtain the answer. This change reduces the number of
__ffs() iterations from n to just one, enhancing overall performance.

This modification significantly accelerates the fns() function in the
test_bitops benchmark, improving its speed by approximately 7.6 times.
Additionally, it enhances the performance of find_nth_bit() in the
find_bit benchmark by approximately 26%.

Before:
test_bitops: fns:             58033164 ns
find_nth_bit:                  4254313 ns,  16525 iterations

After:
test_bitops: fns:              7637268 ns
find_nth_bit:                  3362863 ns,  16501 iterations

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe..57ecef354f47 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -254,16 +254,10 @@ static inline unsigned long __ffs64(u64 word)
  */
 static inline unsigned long fns(unsigned long word, unsigned int n)
 {
-	unsigned int bit;
+	while (word && n--)
+		word &= word - 1;
 
-	while (word) {
-		bit = __ffs(word);
-		if (n-- == 0)
-			return bit;
-		__clear_bit(bit, &word);
-	}
-
-	return BITS_PER_LONG;
+	return word ? __ffs(word) : BITS_PER_LONG;
 }
 
 /**
-- 
cgit v1.2.3


From 0b2811ba11b04353033237359c9d042eb0cdc1c1 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Thu, 2 May 2024 10:12:56 -0700
Subject: bitmap: relax find_nth_bit() limitation on return value

The function claims to return the bitmap size, if Nth bit doesn't exist.
This rule is violated in inline case because the fns() that is used
there doesn't know anything about size of the bitmap.

So, relax this requirement to '>= size', and make the outline
implementation a bit cheaper.

All in-tree kernel users of find_nth_bit() are safe against that.

Reported-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Closes: https://lore.kernel.org/all/Zi50cAgR8nZvgLa3@yury-ThinkPad/T/#m6da806a0525e74dcc91f35e5f20766ed4e853e8a
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/find.h | 2 +-
 lib/find_bit.c       | 2 +-
 lib/test_bitmap.c    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/find.h b/include/linux/find.h
index c69598e383c1..02751e43d448 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -220,7 +220,7 @@ unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
  *	 idx = find_first_bit(addr, size);
  *
  * Returns the bit number of the N'th set bit.
- * If no such, returns @size.
+ * If no such, returns >= @size.
  */
 static inline
 unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 32f99e9a670e..0bddfc3ff248 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -87,7 +87,7 @@ out:										\
 	if (sz % BITS_PER_LONG)							\
 		tmp = (FETCH) & BITMAP_LAST_WORD_MASK(sz);			\
 found:										\
-	sz = min(idx * BITS_PER_LONG + fns(tmp, nr), sz);			\
+	sz = idx * BITS_PER_LONG + fns(tmp, nr);				\
 out:										\
 	sz;									\
 })
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 6b2b33579f56..088838f829c9 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -244,7 +244,7 @@ static void __init test_find_nth_bit(void)
 	expect_eq_uint(60,  find_nth_bit(bmap, 64 * 3, 5));
 	expect_eq_uint(80,  find_nth_bit(bmap, 64 * 3, 6));
 	expect_eq_uint(123, find_nth_bit(bmap, 64 * 3, 7));
-	expect_eq_uint(64 * 3, find_nth_bit(bmap, 64 * 3, 8));
+	expect_eq_uint(0,   !!(find_nth_bit(bmap, 64 * 3, 8) < 64 * 3));
 
 	expect_eq_uint(10,  find_nth_bit(bmap, 64 * 3 - 1, 0));
 	expect_eq_uint(20,  find_nth_bit(bmap, 64 * 3 - 1, 1));
@@ -254,7 +254,7 @@ static void __init test_find_nth_bit(void)
 	expect_eq_uint(60,  find_nth_bit(bmap, 64 * 3 - 1, 5));
 	expect_eq_uint(80,  find_nth_bit(bmap, 64 * 3 - 1, 6));
 	expect_eq_uint(123, find_nth_bit(bmap, 64 * 3 - 1, 7));
-	expect_eq_uint(64 * 3 - 1, find_nth_bit(bmap, 64 * 3 - 1, 8));
+	expect_eq_uint(0,   !!(find_nth_bit(bmap, 64 * 3 - 1, 8) < 64 * 3 - 1));
 
 	for_each_set_bit(bit, exp1, EXP1_IN_BITS) {
 		b = find_nth_bit(exp1, EXP1_IN_BITS, cnt++);
-- 
cgit v1.2.3


From a6016aac5252da9d22a4dc0b98121b0acdf6d2f5 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Thu, 9 May 2024 16:46:16 +0200
Subject: dma: fix DMA sync for drivers not calling dma_set_mask*()

There are several reports that the DMA sync shortcut broke non-coherent
devices.
dev->dma_need_sync is false after the &device allocation and if a driver
didn't call dma_set_mask*(), it will still be false even if the device
is not DMA-coherent and thus needs synchronizing. Due to historical
reasons, there's still a lot of drivers not calling it.
Invert the boolean, so that the sync will be performed by default and
the shortcut will be enabled only when calling dma_set_mask*().

Reported-by: Steven Price <steven.price@arm.com>
Closes: https://lore.kernel.org/lkml/010686f5-3049-46a1-8230-7752a1b433ff@arm.com
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Closes: https://lore.kernel.org/lkml/46160534-5003-4809-a408-6b3a3f4921e9@samsung.com
Fixes: f406c8e4b770. ("dma: avoid redundant calls for sync operations")
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Tested-by: Steven Price <steven.price@arm.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
---
 include/linux/device.h      |  4 ++--
 include/linux/dma-map-ops.h |  4 ++--
 include/linux/dma-mapping.h |  2 +-
 kernel/dma/mapping.c        | 10 +++++-----
 kernel/dma/swiotlb.c        |  2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index ed95b829f05b..d4b50accff26 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -691,7 +691,7 @@ struct device_physical_location {
  *		and optionall (if the coherent mask is large enough) also
  *		for dma allocations.  This flag is managed by the dma ops
  *		instance from ->dma_supported.
- * @dma_need_sync: The device needs performing DMA sync operations.
+ * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -805,7 +805,7 @@ struct device {
 	bool			dma_ops_bypass : 1;
 #endif
 #ifdef CONFIG_DMA_NEED_SYNC
-	bool			dma_need_sync:1;
+	bool			dma_skip_sync:1;
 #endif
 };
 
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 4893cb89cb52..5217b922d29f 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -280,8 +280,8 @@ static inline void dma_reset_need_sync(struct device *dev)
 {
 #ifdef CONFIG_DMA_NEED_SYNC
 	/* Reset it only once so that the function can be called on hotpath */
-	if (unlikely(!dev->dma_need_sync))
-		dev->dma_need_sync = true;
+	if (unlikely(dev->dma_skip_sync))
+		dev->dma_skip_sync = false;
 #endif
 }
 
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index eb4e15893b6c..f693aafe221f 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -295,7 +295,7 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr);
 static inline bool dma_dev_need_sync(const struct device *dev)
 {
 	/* Always call DMA sync operations when debugging is enabled */
-	return dev->dma_need_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG);
+	return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG);
 }
 
 static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 3524bc92c37f..3f77c3f8d16d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -392,7 +392,7 @@ bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr)
 
 	if (dma_map_direct(dev, ops))
 		/*
-		 * dma_need_sync could've been reset on first SWIOTLB buffer
+		 * dma_skip_sync could've been reset on first SWIOTLB buffer
 		 * mapping, but @dma_addr is not necessary an SWIOTLB buffer.
 		 * In this case, fall back to more granular check.
 		 */
@@ -407,20 +407,20 @@ static void dma_setup_need_sync(struct device *dev)
 
 	if (dma_map_direct(dev, ops) || (ops->flags & DMA_F_CAN_SKIP_SYNC))
 		/*
-		 * dma_need_sync will be reset to %true on first SWIOTLB buffer
+		 * dma_skip_sync will be reset to %false on first SWIOTLB buffer
 		 * mapping, if any. During the device initialization, it's
 		 * enough to check only for the DMA coherence.
 		 */
-		dev->dma_need_sync = !dev_is_dma_coherent(dev);
+		dev->dma_skip_sync = dev_is_dma_coherent(dev);
 	else if (!ops->sync_single_for_device && !ops->sync_single_for_cpu &&
 		 !ops->sync_sg_for_device && !ops->sync_sg_for_cpu)
 		/*
 		 * Synchronization is not possible when none of DMA sync ops
 		 * is set.
 		 */
-		dev->dma_need_sync = false;
+		dev->dma_skip_sync = true;
 	else
-		dev->dma_need_sync = true;
+		dev->dma_skip_sync = false;
 }
 #else /* !CONFIG_DMA_NEED_SYNC */
 static inline void dma_setup_need_sync(struct device *dev) { }
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index ae3e593eaadb..068134697cf1 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1409,7 +1409,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
 	}
 
 	/*
-	 * If dma_need_sync wasn't set, reset it on first SWIOTLB buffer
+	 * If dma_skip_sync was set, reset it on first SWIOTLB buffer
 	 * mapping to always sync SWIOTLB buffers.
 	 */
 	dma_reset_need_sync(dev);
-- 
cgit v1.2.3


From 7dcc758cca432510f77b2fe1077be2314bc3785b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 7 May 2024 14:06:15 -0600
Subject: io_uring/net: add IORING_ACCEPT_DONTWAIT flag

This allows the caller to perform a non-blocking attempt, similarly to
how recvmsg has MSG_DONTWAIT. If set, and we get -EAGAIN on a connection
attempt, propagate the result to userspace rather than arm poll and
wait for a retry.

Suggested-by: Norman Maurer <norman_maurer@apple.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  1 +
 io_uring/net.c                | 15 +++++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index f093cb2300d9..4a645d15516f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -379,6 +379,7 @@ enum io_uring_op {
  * accept flags stored in sqe->ioprio
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)
+#define IORING_ACCEPT_DONTWAIT	(1U << 1)
 
 /*
  * IORING_OP_MSG_RING command types, stored in sqe->addr
diff --git a/io_uring/net.c b/io_uring/net.c
index b0bf8471ecb7..7861bc8fe8b1 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -28,6 +28,7 @@ struct io_accept {
 	struct sockaddr __user		*addr;
 	int __user			*addr_len;
 	int				flags;
+	int				iou_flags;
 	u32				file_slot;
 	unsigned long			nofile;
 };
@@ -1489,7 +1490,6 @@ void io_sendrecv_fail(struct io_kiocb *req)
 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
-	unsigned flags;
 
 	if (sqe->len || sqe->buf_index)
 		return -EINVAL;
@@ -1498,15 +1498,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
 	accept->flags = READ_ONCE(sqe->accept_flags);
 	accept->nofile = rlimit(RLIMIT_NOFILE);
-	flags = READ_ONCE(sqe->ioprio);
-	if (flags & ~IORING_ACCEPT_MULTISHOT)
+	accept->iou_flags = READ_ONCE(sqe->ioprio);
+	if (accept->iou_flags & ~(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT))
 		return -EINVAL;
 
 	accept->file_slot = READ_ONCE(sqe->file_index);
 	if (accept->file_slot) {
 		if (accept->flags & SOCK_CLOEXEC)
 			return -EINVAL;
-		if (flags & IORING_ACCEPT_MULTISHOT &&
+		if (accept->iou_flags & IORING_ACCEPT_MULTISHOT &&
 		    accept->file_slot != IORING_FILE_INDEX_ALLOC)
 			return -EINVAL;
 	}
@@ -1514,8 +1514,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
 		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
-	if (flags & IORING_ACCEPT_MULTISHOT)
+	if (accept->iou_flags & IORING_ACCEPT_MULTISHOT)
 		req->flags |= REQ_F_APOLL_MULTISHOT;
+	if (accept->iou_flags & IORING_ACCEPT_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
 	return 0;
 }
 
@@ -1540,7 +1542,8 @@ retry:
 		if (!fixed)
 			put_unused_fd(fd);
 		ret = PTR_ERR(file);
-		if (ret == -EAGAIN && force_nonblock) {
+		if (ret == -EAGAIN && force_nonblock &&
+		    !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) {
 			/*
 			 * if it's multishot and polled, we don't need to
 			 * return EAGAIN to arm the poll infra since it
-- 
cgit v1.2.3


From d3da8e98592693811c14c31f05380f378411fea1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 8 May 2024 08:17:50 -0600
Subject: io_uring/net: add IORING_ACCEPT_POLL_FIRST flag

Similarly to how polling first is supported for receive, it makes sense
to provide the same for accept. An accept operation does a lot of
expensive setup, like allocating an fd, a socket/inode, etc. If no
connection request is already pending, this is wasted and will just be
cleaned up and freed, only to retry via the usual poll trigger.

Add IORING_ACCEPT_POLL_FIRST, which tells accept to only initiate the
accept request if poll says we have something to accept.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h | 1 +
 io_uring/net.c                | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4a645d15516f..6dbac55f8686 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -380,6 +380,7 @@ enum io_uring_op {
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)
 #define IORING_ACCEPT_DONTWAIT	(1U << 1)
+#define IORING_ACCEPT_POLL_FIRST	(1U << 2)
 
 /*
  * IORING_OP_MSG_RING command types, stored in sqe->addr
diff --git a/io_uring/net.c b/io_uring/net.c
index 7861bc8fe8b1..070dea9a4eda 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1487,6 +1487,9 @@ void io_sendrecv_fail(struct io_kiocb *req)
 		req->cqe.flags |= IORING_CQE_F_MORE;
 }
 
+#define ACCEPT_FLAGS	(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \
+			 IORING_ACCEPT_POLL_FIRST)
+
 int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
@@ -1499,7 +1502,7 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	accept->flags = READ_ONCE(sqe->accept_flags);
 	accept->nofile = rlimit(RLIMIT_NOFILE);
 	accept->iou_flags = READ_ONCE(sqe->ioprio);
-	if (accept->iou_flags & ~(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT))
+	if (accept->iou_flags & ~ACCEPT_FLAGS)
 		return -EINVAL;
 
 	accept->file_slot = READ_ONCE(sqe->file_index);
@@ -1530,6 +1533,10 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *file;
 	int ret, fd;
 
+	if (!(req->flags & REQ_F_POLLED) &&
+	    accept->iou_flags & IORING_ACCEPT_POLL_FIRST)
+		return -EAGAIN;
+
 retry:
 	if (!fixed) {
 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
-- 
cgit v1.2.3


From ea89a742daf4317038fbab6776d36726dd7a1e2a Mon Sep 17 00:00:00 2001
From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Date: Fri, 3 May 2024 09:03:59 -0500
Subject: ALSA/ASoC: include: clarify Copyright information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason a number of files included the "All rights reserved"
statement. Good old copy-paste made sure this mistake proliferated.

Remove the "All rights reserved" in all Intel-copyright to align with
internal guidance.

Reviewed-by: Cezary Rojewski <cezary.rojewski@intel.com>
Signed-off-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Reviewed-by: Péter Ujfalusi <peter.ujfalusi@linux.intel.com>
Link: https://lore.kernel.org/r/20240503140359.259762-10-pierre-louis.bossart@linux.intel.com
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/sound/hda-mlink.h             | 2 +-
 include/sound/soc-acpi-intel-match.h  | 2 +-
 include/sound/soc-acpi.h              | 2 +-
 include/sound/sof.h                   | 2 +-
 include/sound/sof/channel_map.h       | 2 +-
 include/sound/sof/control.h           | 2 +-
 include/sound/sof/dai-intel.h         | 2 +-
 include/sound/sof/dai.h               | 2 +-
 include/sound/sof/debug.h             | 2 +-
 include/sound/sof/ext_manifest.h      | 2 +-
 include/sound/sof/ext_manifest4.h     | 2 +-
 include/sound/sof/header.h            | 2 +-
 include/sound/sof/info.h              | 2 +-
 include/sound/sof/ipc4/header.h       | 2 +-
 include/sound/sof/pm.h                | 2 +-
 include/sound/sof/stream.h            | 2 +-
 include/sound/sof/topology.h          | 2 +-
 include/sound/sof/trace.h             | 2 +-
 include/sound/sof/xtensa.h            | 2 +-
 include/trace/events/sof.h            | 2 +-
 include/trace/events/sof_intel.h      | 2 +-
 include/uapi/sound/intel/avs/tokens.h | 2 +-
 include/uapi/sound/sof/abi.h          | 2 +-
 include/uapi/sound/sof/fw.h           | 2 +-
 include/uapi/sound/sof/header.h       | 2 +-
 include/uapi/sound/sof/tokens.h       | 2 +-
 26 files changed, 26 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/sound/hda-mlink.h b/include/sound/hda-mlink.h
index d849d9b24f13..9ced94686ce3 100644
--- a/include/sound/hda-mlink.h
+++ b/include/sound/hda-mlink.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2022-2023 Intel Corporation. All rights reserved.
+ * Copyright(c) 2022-2023 Intel Corporation
  */
 
 struct hdac_bus;
diff --git a/include/sound/soc-acpi-intel-match.h b/include/sound/soc-acpi-intel-match.h
index 845e7608ac37..4843b57798f6 100644
--- a/include/sound/soc-acpi-intel-match.h
+++ b/include/sound/soc-acpi-intel-match.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright (C) 2017, Intel Corporation. All rights reserved.
+ * Copyright (C) 2017, Intel Corporation
  */
 
 #ifndef __LINUX_SND_SOC_ACPI_INTEL_MATCH_H
diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index 23d6d6bfb073..8967d07a2736 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only
  *
- * Copyright (C) 2013-15, Intel Corporation. All rights reserved.
+ * Copyright (C) 2013-15, Intel Corporation
  */
 
 #ifndef __LINUX_SND_SOC_ACPI_H
diff --git a/include/sound/sof.h b/include/sound/sof.h
index 05213bb515a3..cde058777832 100644
--- a/include/sound/sof.h
+++ b/include/sound/sof.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  *
  * Author: Liam Girdwood <liam.r.girdwood@linux.intel.com>
  */
diff --git a/include/sound/sof/channel_map.h b/include/sound/sof/channel_map.h
index d363f0ca6979..2a177a1938e9 100644
--- a/include/sound/sof/channel_map.h
+++ b/include/sound/sof/channel_map.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2019 Intel Corporation. All rights reserved.
+ * Copyright(c) 2019 Intel Corporation
  */
 
 #ifndef __IPC_CHANNEL_MAP_H__
diff --git a/include/sound/sof/control.h b/include/sound/sof/control.h
index 983d374fe511..8d3300dd9d8c 100644
--- a/include/sound/sof/control.h
+++ b/include/sound/sof/control.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_CONTROL_H__
diff --git a/include/sound/sof/dai-intel.h b/include/sound/sof/dai-intel.h
index 5b93b7292f5e..e4e710b43d2d 100644
--- a/include/sound/sof/dai-intel.h
+++ b/include/sound/sof/dai-intel.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_DAI_INTEL_H__
diff --git a/include/sound/sof/dai.h b/include/sound/sof/dai.h
index 0764a80c17a9..36809f712723 100644
--- a/include/sound/sof/dai.h
+++ b/include/sound/sof/dai.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_DAI_H__
diff --git a/include/sound/sof/debug.h b/include/sound/sof/debug.h
index 38693e3fb514..8b308d7e5eee 100644
--- a/include/sound/sof/debug.h
+++ b/include/sound/sof/debug.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2020 Intel Corporation. All rights reserved.
+ * Copyright(c) 2020 Intel Corporation
  *
  * Author: Karol Trzcinski <karolx.trzcinski@linux.intel.com>
  */
diff --git a/include/sound/sof/ext_manifest.h b/include/sound/sof/ext_manifest.h
index 2a7e055584f9..fc0231d04a94 100644
--- a/include/sound/sof/ext_manifest.h
+++ b/include/sound/sof/ext_manifest.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2020 Intel Corporation. All rights reserved.
+ * Copyright(c) 2020 Intel Corporation
  */
 
 /*
diff --git a/include/sound/sof/ext_manifest4.h b/include/sound/sof/ext_manifest4.h
index ec97edcbbfc3..4e1d379d96ad 100644
--- a/include/sound/sof/ext_manifest4.h
+++ b/include/sound/sof/ext_manifest4.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2022 Intel Corporation. All rights reserved.
+ * Copyright(c) 2022 Intel Corporation
  */
 
 /*
diff --git a/include/sound/sof/header.h b/include/sound/sof/header.h
index b22e925c70e2..4e406dc22f31 100644
--- a/include/sound/sof/header.h
+++ b/include/sound/sof/header.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_HEADER_H__
diff --git a/include/sound/sof/info.h b/include/sound/sof/info.h
index 75193850ead0..08400fbe5402 100644
--- a/include/sound/sof/info.h
+++ b/include/sound/sof/info.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_INFO_H__
diff --git a/include/sound/sof/ipc4/header.h b/include/sound/sof/ipc4/header.h
index 1eb538e18236..0c0cf47946b1 100644
--- a/include/sound/sof/ipc4/header.h
+++ b/include/sound/sof/ipc4/header.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2022 Intel Corporation. All rights reserved.
+ * Copyright(c) 2022 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_IPC4_HEADER_H__
diff --git a/include/sound/sof/pm.h b/include/sound/sof/pm.h
index 366aa6ec442b..df55bfe58304 100644
--- a/include/sound/sof/pm.h
+++ b/include/sound/sof/pm.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_PM_H__
diff --git a/include/sound/sof/stream.h b/include/sound/sof/stream.h
index 9377113f13e4..1bb25487d120 100644
--- a/include/sound/sof/stream.h
+++ b/include/sound/sof/stream.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_STREAM_H__
diff --git a/include/sound/sof/topology.h b/include/sound/sof/topology.h
index b3ca886fa28f..3ba086f61983 100644
--- a/include/sound/sof/topology.h
+++ b/include/sound/sof/topology.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_TOPOLOGY_H__
diff --git a/include/sound/sof/trace.h b/include/sound/sof/trace.h
index 25ea99f62d37..ac2ebb6fb38f 100644
--- a/include/sound/sof/trace.h
+++ b/include/sound/sof/trace.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_TRACE_H__
diff --git a/include/sound/sof/xtensa.h b/include/sound/sof/xtensa.h
index 87a07e520415..ef70f8e266a2 100644
--- a/include/sound/sof/xtensa.h
+++ b/include/sound/sof/xtensa.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_SOUND_SOF_XTENSA_H__
diff --git a/include/trace/events/sof.h b/include/trace/events/sof.h
index 21c2a1efb9f6..cd8e6844cca2 100644
--- a/include/trace/events/sof.h
+++ b/include/trace/events/sof.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright(c) 2022 Intel Corporation. All rights reserved.
+ * Copyright(c) 2022 Intel Corporation
  *
  * Author: Noah Klayman <noah.klayman@intel.com>
  */
diff --git a/include/trace/events/sof_intel.h b/include/trace/events/sof_intel.h
index 2a77f9d26c0b..4cac5fef99a2 100644
--- a/include/trace/events/sof_intel.h
+++ b/include/trace/events/sof_intel.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright(c) 2022 Intel Corporation. All rights reserved.
+ * Copyright(c) 2022 Intel Corporation
  *
  * Author: Noah Klayman <noah.klayman@intel.com>
  */
diff --git a/include/uapi/sound/intel/avs/tokens.h b/include/uapi/sound/intel/avs/tokens.h
index 4beca03405c0..3e3fb258dd54 100644
--- a/include/uapi/sound/intel/avs/tokens.h
+++ b/include/uapi/sound/intel/avs/tokens.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
- * Copyright(c) 2021 Intel Corporation. All rights reserved.
+ * Copyright(c) 2021 Intel Corporation
  *
  * Authors: Cezary Rojewski <cezary.rojewski@intel.com>
  *          Amadeusz Slawinski <amadeuszx.slawinski@linux.intel.com>
diff --git a/include/uapi/sound/sof/abi.h b/include/uapi/sound/sof/abi.h
index 45c657c3919e..937ed9408c23 100644
--- a/include/uapi/sound/sof/abi.h
+++ b/include/uapi/sound/sof/abi.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 /**
diff --git a/include/uapi/sound/sof/fw.h b/include/uapi/sound/sof/fw.h
index e9f697467a86..fcfa71faf239 100644
--- a/include/uapi/sound/sof/fw.h
+++ b/include/uapi/sound/sof/fw.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 /*
diff --git a/include/uapi/sound/sof/header.h b/include/uapi/sound/sof/header.h
index cb3c1ace69e3..228d4c3eb225 100644
--- a/include/uapi/sound/sof/header.h
+++ b/include/uapi/sound/sof/header.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  */
 
 #ifndef __INCLUDE_UAPI_SOUND_SOF_USER_HEADER_H__
diff --git a/include/uapi/sound/sof/tokens.h b/include/uapi/sound/sof/tokens.h
index 6bf00c09d30d..0a246bc218d3 100644
--- a/include/uapi/sound/sof/tokens.h
+++ b/include/uapi/sound/sof/tokens.h
@@ -3,7 +3,7 @@
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
  *
- * Copyright(c) 2018 Intel Corporation. All rights reserved.
+ * Copyright(c) 2018 Intel Corporation
  * Author: Liam Girdwood <liam.r.girdwood@linux.intel.com>
  *         Keyon Jie <yang.jie@linux.intel.com>
  */
-- 
cgit v1.2.3


From cf792e903affdf585f20fea41ea4f9b4eac124ab Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 29 Apr 2024 16:27:50 -0400
Subject: tpm: Remove unused tpm_buf_tag()

The helper function has no call sites. Thus, remove it.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 4ee9d13749ad..6588ca87cf93 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -358,13 +358,6 @@ static inline u32 tpm_buf_length(struct tpm_buf *buf)
 	return be32_to_cpu(head->length);
 }
 
-static inline u16 tpm_buf_tag(struct tpm_buf *buf)
-{
-	struct tpm_header *head = (struct tpm_header *)buf->data;
-
-	return be16_to_cpu(head->tag);
-}
-
 static inline void tpm_buf_append(struct tpm_buf *buf,
 				  const unsigned char *new_data,
 				  unsigned int new_len)
-- 
cgit v1.2.3


From 4f0feb5463cc6783c9145e707f93c54e7bb1112e Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 29 Apr 2024 16:27:51 -0400
Subject: tpm: Remove tpm_send()

Open code the last remaining call site for tpm_send().

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-interface.c          | 25 -------------------------
 include/linux/tpm.h                       |  5 -----
 security/keys/trusted-keys/trusted_tpm1.c | 14 ++++++++++++--
 3 files changed, 12 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 757336324c90..f940045a014e 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -342,31 +342,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(tpm_pcr_extend);
 
-/**
- * tpm_send - send a TPM command
- * @chip:	a &struct tpm_chip instance, %NULL for the default chip
- * @cmd:	a TPM command buffer
- * @buflen:	the length of the TPM command buffer
- *
- * Return: same as with tpm_transmit_cmd()
- */
-int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen)
-{
-	struct tpm_buf buf;
-	int rc;
-
-	chip = tpm_find_get_ops(chip);
-	if (!chip)
-		return -ENODEV;
-
-	buf.data = cmd;
-	rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to a send a command");
-
-	tpm_put_ops(chip);
-	return rc;
-}
-EXPORT_SYMBOL_GPL(tpm_send);
-
 int tpm_auto_startup(struct tpm_chip *chip)
 {
 	int rc;
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 6588ca87cf93..d9d645e9c52c 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -422,7 +422,6 @@ extern int tpm_pcr_read(struct tpm_chip *chip, u32 pcr_idx,
 			struct tpm_digest *digest);
 extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 			  struct tpm_digest *digests);
-extern int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen);
 extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max);
 extern struct tpm_chip *tpm_default_chip(void);
 void tpm2_flush_context(struct tpm_chip *chip, u32 handle);
@@ -443,10 +442,6 @@ static inline int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 	return -ENODEV;
 }
 
-static inline int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen)
-{
-	return -ENODEV;
-}
 static inline int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max)
 {
 	return -ENODEV;
diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c
index aa108bea6739..37bce84eef99 100644
--- a/security/keys/trusted-keys/trusted_tpm1.c
+++ b/security/keys/trusted-keys/trusted_tpm1.c
@@ -356,17 +356,27 @@ out:
  */
 int trusted_tpm_send(unsigned char *cmd, size_t buflen)
 {
+	struct tpm_buf buf;
 	int rc;
 
 	if (!chip)
 		return -ENODEV;
 
+	rc = tpm_try_get_ops(chip);
+	if (rc)
+		return rc;
+
+	buf.flags = 0;
+	buf.data = cmd;
 	dump_tpm_buf(cmd);
-	rc = tpm_send(chip, cmd, buflen);
+	rc = tpm_transmit_cmd(chip, &buf, 4, "sending data");
 	dump_tpm_buf(cmd);
+
 	if (rc > 0)
-		/* Can't return positive return codes values to keyctl */
+		/* TPM error */
 		rc = -EPERM;
+
+	tpm_put_ops(chip);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(trusted_tpm_send);
-- 
cgit v1.2.3


From 17d89b2e2f76e8ae129467665ea7462401f37aa4 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:27:52 -0400
Subject: tpm: Move buffer handling from static inlines to real functions

separate out the tpm_buf_... handling functions from static inlines in
tpm.h and move them to their own tpm-buf.c file.  This is a precursor
to adding new functions for other TPM type handling because the amount
of code will grow from the current 70 lines in tpm.h to about 200
lines when the additions are done.  200 lines of inline functions is a
bit too much to keep in a header file.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/Makefile  |  1 +
 drivers/char/tpm/tpm-buf.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/tpm.h        | 80 +++++-------------------------------------
 3 files changed, 97 insertions(+), 71 deletions(-)
 create mode 100644 drivers/char/tpm/tpm-buf.c

(limited to 'include')

diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile
index 0222b1ddb310..ad3594e383e1 100644
--- a/drivers/char/tpm/Makefile
+++ b/drivers/char/tpm/Makefile
@@ -15,6 +15,7 @@ tpm-y += tpm-sysfs.o
 tpm-y += eventlog/common.o
 tpm-y += eventlog/tpm1.o
 tpm-y += eventlog/tpm2.o
+tpm-y += tpm-buf.o
 
 tpm-$(CONFIG_ACPI) += tpm_ppi.o eventlog/acpi.o
 tpm-$(CONFIG_EFI) += eventlog/efi.o
diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
new file mode 100644
index 000000000000..96cee41d5b9c
--- /dev/null
+++ b/drivers/char/tpm/tpm-buf.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handling of TPM command and other buffers.
+ */
+
+#include <linux/module.h>
+#include <linux/tpm.h>
+
+int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal)
+{
+	buf->data = (u8 *)__get_free_page(GFP_KERNEL);
+	if (!buf->data)
+		return -ENOMEM;
+
+	buf->flags = 0;
+	tpm_buf_reset(buf, tag, ordinal);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tpm_buf_init);
+
+void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+
+	head->tag = cpu_to_be16(tag);
+	head->length = cpu_to_be32(sizeof(*head));
+	head->ordinal = cpu_to_be32(ordinal);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_reset);
+
+void tpm_buf_destroy(struct tpm_buf *buf)
+{
+	free_page((unsigned long)buf->data);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_destroy);
+
+u32 tpm_buf_length(struct tpm_buf *buf)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+
+	return be32_to_cpu(head->length);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_length);
+
+void tpm_buf_append(struct tpm_buf *buf,
+		    const unsigned char *new_data,
+		    unsigned int new_len)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+	u32 len = tpm_buf_length(buf);
+
+	/* Return silently if overflow has already happened. */
+	if (buf->flags & TPM_BUF_OVERFLOW)
+		return;
+
+	if ((len + new_len) > PAGE_SIZE) {
+		WARN(1, "tpm_buf: overflow\n");
+		buf->flags |= TPM_BUF_OVERFLOW;
+		return;
+	}
+
+	memcpy(&buf->data[len], new_data, new_len);
+	head->length = cpu_to_be32(len + new_len);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_append);
+
+void tpm_buf_append_u8(struct tpm_buf *buf, const u8 value)
+{
+	tpm_buf_append(buf, &value, 1);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_append_u8);
+
+void tpm_buf_append_u16(struct tpm_buf *buf, const u16 value)
+{
+	__be16 value2 = cpu_to_be16(value);
+
+	tpm_buf_append(buf, (u8 *)&value2, 2);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_append_u16);
+
+void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value)
+{
+	__be32 value2 = cpu_to_be32(value);
+
+	tpm_buf_append(buf, (u8 *)&value2, 4);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_append_u32);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index d9d645e9c52c..bb0e8718a432 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -326,77 +326,15 @@ struct tpm2_hash {
 	unsigned int tpm_id;
 };
 
-static inline void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal)
-{
-	struct tpm_header *head = (struct tpm_header *)buf->data;
-
-	head->tag = cpu_to_be16(tag);
-	head->length = cpu_to_be32(sizeof(*head));
-	head->ordinal = cpu_to_be32(ordinal);
-}
-
-static inline int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal)
-{
-	buf->data = (u8 *)__get_free_page(GFP_KERNEL);
-	if (!buf->data)
-		return -ENOMEM;
-
-	buf->flags = 0;
-	tpm_buf_reset(buf, tag, ordinal);
-	return 0;
-}
-
-static inline void tpm_buf_destroy(struct tpm_buf *buf)
-{
-	free_page((unsigned long)buf->data);
-}
-
-static inline u32 tpm_buf_length(struct tpm_buf *buf)
-{
-	struct tpm_header *head = (struct tpm_header *)buf->data;
-
-	return be32_to_cpu(head->length);
-}
-
-static inline void tpm_buf_append(struct tpm_buf *buf,
-				  const unsigned char *new_data,
-				  unsigned int new_len)
-{
-	struct tpm_header *head = (struct tpm_header *)buf->data;
-	u32 len = tpm_buf_length(buf);
-
-	/* Return silently if overflow has already happened. */
-	if (buf->flags & TPM_BUF_OVERFLOW)
-		return;
-
-	if ((len + new_len) > PAGE_SIZE) {
-		WARN(1, "tpm_buf: overflow\n");
-		buf->flags |= TPM_BUF_OVERFLOW;
-		return;
-	}
-
-	memcpy(&buf->data[len], new_data, new_len);
-	head->length = cpu_to_be32(len + new_len);
-}
-
-static inline void tpm_buf_append_u8(struct tpm_buf *buf, const u8 value)
-{
-	tpm_buf_append(buf, &value, 1);
-}
-
-static inline void tpm_buf_append_u16(struct tpm_buf *buf, const u16 value)
-{
-	__be16 value2 = cpu_to_be16(value);
-
-	tpm_buf_append(buf, (u8 *) &value2, 2);
-}
-
-static inline void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value)
-{
-	__be32 value2 = cpu_to_be32(value);
-
-	tpm_buf_append(buf, (u8 *) &value2, 4);
-}
+int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal);
+void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal);
+void tpm_buf_destroy(struct tpm_buf *buf);
+u32 tpm_buf_length(struct tpm_buf *buf);
+void tpm_buf_append(struct tpm_buf *buf, const unsigned char *new_data,
+		    unsigned int new_len);
+void tpm_buf_append_u8(struct tpm_buf *buf, const u8 value);
+void tpm_buf_append_u16(struct tpm_buf *buf, const u16 value);
+void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value);
 
 /*
  * Check if TPM device is in the firmware upgrade mode.
-- 
cgit v1.2.3


From 37e2ee16d67d17926085cbd0500dd78ca747e59c Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 29 Apr 2024 16:27:53 -0400
Subject: tpm: Update struct tpm_buf documentation comments

Remove deprecated portions and document enum values.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/linux/tpm.h | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index bb0e8718a432..0a8c1351adc2 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -297,15 +297,14 @@ struct tpm_header {
 	};
 } __packed;
 
-/* A string buffer type for constructing TPM commands. This is based on the
- * ideas of string buffer code in security/keys/trusted.h but is heap based
- * in order to keep the stack usage minimal.
- */
-
 enum tpm_buf_flags {
+	/* the capacity exceeded: */
 	TPM_BUF_OVERFLOW	= BIT(0),
 };
 
+/*
+ * A string buffer type for constructing TPM commands.
+ */
 struct tpm_buf {
 	unsigned int flags;
 	u8 *data;
-- 
cgit v1.2.3


From e1b72e1b11109bd81577950538a17bc0428e647f Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 29 Apr 2024 16:27:54 -0400
Subject: tpm: Store the length of the tpm_buf data separately.

TPM2B buffers, or sized buffers, have a two byte header, which contains the
length of the payload as a 16-bit big-endian number, without counting in
the space taken by the header. This differs from encoding in the TPM header
where the length includes also the bytes taken by the header.

Unbound the length of a tpm_buf from the value stored to the TPM command
header. A separate encoding and decoding step so that different buffer
types can be supported, with variant header format and length encoding.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-buf.c                | 49 ++++++++++++++++++++++++-------
 drivers/char/tpm/tpm-interface.c          |  1 +
 include/keys/trusted_tpm.h                |  2 --
 include/linux/tpm.h                       |  6 ++--
 security/keys/trusted-keys/trusted_tpm1.c |  9 +++---
 5 files changed, 47 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index 96cee41d5b9c..2f2710ca6f6d 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -3,25 +3,44 @@
  * Handling of TPM command and other buffers.
  */
 
+#include <linux/tpm_command.h>
 #include <linux/module.h>
 #include <linux/tpm.h>
 
+/**
+ * tpm_buf_init() - Allocate and initialize a TPM command
+ * @buf:	A &tpm_buf
+ * @tag:	TPM_TAG_RQU_COMMAND, TPM2_ST_NO_SESSIONS or TPM2_ST_SESSIONS
+ * @ordinal:	A command ordinal
+ *
+ * Return: 0 or -ENOMEM
+ */
 int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal)
 {
 	buf->data = (u8 *)__get_free_page(GFP_KERNEL);
 	if (!buf->data)
 		return -ENOMEM;
 
-	buf->flags = 0;
 	tpm_buf_reset(buf, tag, ordinal);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(tpm_buf_init);
 
+/**
+ * tpm_buf_reset() - Initialize a TPM command
+ * @buf:	A &tpm_buf
+ * @tag:	TPM_TAG_RQU_COMMAND, TPM2_ST_NO_SESSIONS or TPM2_ST_SESSIONS
+ * @ordinal:	A command ordinal
+ */
 void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal)
 {
 	struct tpm_header *head = (struct tpm_header *)buf->data;
 
+	WARN_ON(tag != TPM_TAG_RQU_COMMAND && tag != TPM2_ST_NO_SESSIONS &&
+		tag != TPM2_ST_SESSIONS && tag != 0);
+
+	buf->flags = 0;
+	buf->length = sizeof(*head);
 	head->tag = cpu_to_be16(tag);
 	head->length = cpu_to_be32(sizeof(*head));
 	head->ordinal = cpu_to_be32(ordinal);
@@ -34,33 +53,41 @@ void tpm_buf_destroy(struct tpm_buf *buf)
 }
 EXPORT_SYMBOL_GPL(tpm_buf_destroy);
 
+/**
+ * tpm_buf_length() - Return the number of bytes consumed by the data
+ * @buf:	A &tpm_buf
+ *
+ * Return: The number of bytes consumed by the buffer
+ */
 u32 tpm_buf_length(struct tpm_buf *buf)
 {
-	struct tpm_header *head = (struct tpm_header *)buf->data;
-
-	return be32_to_cpu(head->length);
+	return buf->length;
 }
 EXPORT_SYMBOL_GPL(tpm_buf_length);
 
-void tpm_buf_append(struct tpm_buf *buf,
-		    const unsigned char *new_data,
-		    unsigned int new_len)
+/**
+ * tpm_buf_append() - Append data to an initialized buffer
+ * @buf:	A &tpm_buf
+ * @new_data:	A data blob
+ * @new_length:	Size of the appended data
+ */
+void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length)
 {
 	struct tpm_header *head = (struct tpm_header *)buf->data;
-	u32 len = tpm_buf_length(buf);
 
 	/* Return silently if overflow has already happened. */
 	if (buf->flags & TPM_BUF_OVERFLOW)
 		return;
 
-	if ((len + new_len) > PAGE_SIZE) {
+	if ((buf->length + new_length) > PAGE_SIZE) {
 		WARN(1, "tpm_buf: overflow\n");
 		buf->flags |= TPM_BUF_OVERFLOW;
 		return;
 	}
 
-	memcpy(&buf->data[len], new_data, new_len);
-	head->length = cpu_to_be32(len + new_len);
+	memcpy(&buf->data[buf->length], new_data, new_length);
+	buf->length += new_length;
+	head->length = cpu_to_be32(buf->length);
 }
 EXPORT_SYMBOL_GPL(tpm_buf_append);
 
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index f940045a014e..5da134f12c9a 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -232,6 +232,7 @@ ssize_t tpm_transmit_cmd(struct tpm_chip *chip, struct tpm_buf *buf,
 	if (len < min_rsp_body_length + TPM_HEADER_SIZE)
 		return -EFAULT;
 
+	buf->length = len;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(tpm_transmit_cmd);
diff --git a/include/keys/trusted_tpm.h b/include/keys/trusted_tpm.h
index 7769b726863a..a088b33fd0e3 100644
--- a/include/keys/trusted_tpm.h
+++ b/include/keys/trusted_tpm.h
@@ -6,8 +6,6 @@
 #include <linux/tpm_command.h>
 
 /* implementation specific TPM constants */
-#define MAX_BUF_SIZE			1024
-#define TPM_GETRANDOM_SIZE		14
 #define TPM_SIZE_OFFSET			2
 #define TPM_RETURN_OFFSET		6
 #define TPM_DATA_OFFSET			10
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 0a8c1351adc2..1d7b39b5c383 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -306,7 +306,8 @@ enum tpm_buf_flags {
  * A string buffer type for constructing TPM commands.
  */
 struct tpm_buf {
-	unsigned int flags;
+	u32 flags;
+	u32 length;
 	u8 *data;
 };
 
@@ -329,8 +330,7 @@ int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal);
 void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal);
 void tpm_buf_destroy(struct tpm_buf *buf);
 u32 tpm_buf_length(struct tpm_buf *buf);
-void tpm_buf_append(struct tpm_buf *buf, const unsigned char *new_data,
-		    unsigned int new_len);
+void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length);
 void tpm_buf_append_u8(struct tpm_buf *buf, const u8 value);
 void tpm_buf_append_u16(struct tpm_buf *buf, const u16 value);
 void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value);
diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c
index 37bce84eef99..89c9798d1800 100644
--- a/security/keys/trusted-keys/trusted_tpm1.c
+++ b/security/keys/trusted-keys/trusted_tpm1.c
@@ -367,6 +367,7 @@ int trusted_tpm_send(unsigned char *cmd, size_t buflen)
 		return rc;
 
 	buf.flags = 0;
+	buf.length = buflen;
 	buf.data = cmd;
 	dump_tpm_buf(cmd);
 	rc = tpm_transmit_cmd(chip, &buf, 4, "sending data");
@@ -417,7 +418,7 @@ static int osap(struct tpm_buf *tb, struct osapsess *s,
 	tpm_buf_append_u32(tb, handle);
 	tpm_buf_append(tb, ononce, TPM_NONCE_SIZE);
 
-	ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE);
+	ret = trusted_tpm_send(tb->data, tb->length);
 	if (ret < 0)
 		return ret;
 
@@ -441,7 +442,7 @@ int oiap(struct tpm_buf *tb, uint32_t *handle, unsigned char *nonce)
 		return -ENODEV;
 
 	tpm_buf_reset(tb, TPM_TAG_RQU_COMMAND, TPM_ORD_OIAP);
-	ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE);
+	ret = trusted_tpm_send(tb->data, tb->length);
 	if (ret < 0)
 		return ret;
 
@@ -553,7 +554,7 @@ static int tpm_seal(struct tpm_buf *tb, uint16_t keytype,
 	tpm_buf_append_u8(tb, cont);
 	tpm_buf_append(tb, td->pubauth, SHA1_DIGEST_SIZE);
 
-	ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE);
+	ret = trusted_tpm_send(tb->data, tb->length);
 	if (ret < 0)
 		goto out;
 
@@ -644,7 +645,7 @@ static int tpm_unseal(struct tpm_buf *tb,
 	tpm_buf_append_u8(tb, cont);
 	tpm_buf_append(tb, authdata2, SHA1_DIGEST_SIZE);
 
-	ret = trusted_tpm_send(tb->data, MAX_BUF_SIZE);
+	ret = trusted_tpm_send(tb->data, tb->length);
 	if (ret < 0) {
 		pr_info("authhmac failed (%d)\n", ret);
 		return ret;
-- 
cgit v1.2.3


From d926ee92e84146ff85877b15d9ac0c7cd7c422c6 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 29 Apr 2024 16:27:55 -0400
Subject: tpm: TPM2B formatted buffers

Declare tpm_buf_init_sized() and tpm_buf_reset_sized() for creating TPM2B
formatted buffers. These buffers are also known as sized buffers in the
specifications and literature.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-buf.c | 38 +++++++++++++++++++++++++++++++++++---
 include/linux/tpm.h        |  4 ++++
 2 files changed, 39 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index 2f2710ca6f6d..d679dc1cecc7 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -47,6 +47,36 @@ void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal)
 }
 EXPORT_SYMBOL_GPL(tpm_buf_reset);
 
+/**
+ * tpm_buf_init_sized() - Allocate and initialize a sized (TPM2B) buffer
+ * @buf:	A @tpm_buf
+ *
+ * Return: 0 or -ENOMEM
+ */
+int tpm_buf_init_sized(struct tpm_buf *buf)
+{
+	buf->data = (u8 *)__get_free_page(GFP_KERNEL);
+	if (!buf->data)
+		return -ENOMEM;
+
+	tpm_buf_reset_sized(buf);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(tpm_buf_init_sized);
+
+/**
+ * tpm_buf_reset_sized() - Initialize a sized buffer
+ * @buf:	A &tpm_buf
+ */
+void tpm_buf_reset_sized(struct tpm_buf *buf)
+{
+	buf->flags = TPM_BUF_TPM2B;
+	buf->length = 2;
+	buf->data[0] = 0;
+	buf->data[1] = 0;
+}
+EXPORT_SYMBOL_GPL(tpm_buf_reset_sized);
+
 void tpm_buf_destroy(struct tpm_buf *buf)
 {
 	free_page((unsigned long)buf->data);
@@ -73,8 +103,6 @@ EXPORT_SYMBOL_GPL(tpm_buf_length);
  */
 void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length)
 {
-	struct tpm_header *head = (struct tpm_header *)buf->data;
-
 	/* Return silently if overflow has already happened. */
 	if (buf->flags & TPM_BUF_OVERFLOW)
 		return;
@@ -87,7 +115,11 @@ void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length)
 
 	memcpy(&buf->data[buf->length], new_data, new_length);
 	buf->length += new_length;
-	head->length = cpu_to_be32(buf->length);
+
+	if (buf->flags & TPM_BUF_TPM2B)
+		((__be16 *)buf->data)[0] = cpu_to_be16(buf->length - 2);
+	else
+		((struct tpm_header *)buf->data)->length = cpu_to_be32(buf->length);
 }
 EXPORT_SYMBOL_GPL(tpm_buf_append);
 
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 1d7b39b5c383..715db4a91c1f 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -300,6 +300,8 @@ struct tpm_header {
 enum tpm_buf_flags {
 	/* the capacity exceeded: */
 	TPM_BUF_OVERFLOW	= BIT(0),
+	/* TPM2B format: */
+	TPM_BUF_TPM2B		= BIT(1),
 };
 
 /*
@@ -328,6 +330,8 @@ struct tpm2_hash {
 
 int tpm_buf_init(struct tpm_buf *buf, u16 tag, u32 ordinal);
 void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal);
+int tpm_buf_init_sized(struct tpm_buf *buf);
+void tpm_buf_reset_sized(struct tpm_buf *buf);
 void tpm_buf_destroy(struct tpm_buf *buf);
 u32 tpm_buf_length(struct tpm_buf *buf);
 void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length);
-- 
cgit v1.2.3


From acd5eb4f50241c5e3c2e0056c0e2151295796416 Mon Sep 17 00:00:00 2001
From: Jarkko Sakkinen <jarkko@kernel.org>
Date: Mon, 29 Apr 2024 16:27:56 -0400
Subject: tpm: Add tpm_buf_read_{u8,u16,u32}

Declare reader functions for the instances of struct tpm_buf. If the read
goes out of boundary, TPM_BUF_BOUNDARY_ERROR is set, and subsequent read
will do nothing.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-buf.c | 79 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/tpm.h        |  5 +++
 2 files changed, 83 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index d679dc1cecc7..cc14fa3fd8c2 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -108,7 +108,7 @@ void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length)
 		return;
 
 	if ((buf->length + new_length) > PAGE_SIZE) {
-		WARN(1, "tpm_buf: overflow\n");
+		WARN(1, "tpm_buf: write overflow\n");
 		buf->flags |= TPM_BUF_OVERFLOW;
 		return;
 	}
@@ -144,3 +144,80 @@ void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value)
 	tpm_buf_append(buf, (u8 *)&value2, 4);
 }
 EXPORT_SYMBOL_GPL(tpm_buf_append_u32);
+
+/**
+ * tpm_buf_read() - Read from a TPM buffer
+ * @buf:	&tpm_buf instance
+ * @offset:	offset within the buffer
+ * @count:	the number of bytes to read
+ * @output:	the output buffer
+ */
+static void tpm_buf_read(struct tpm_buf *buf, off_t *offset, size_t count, void *output)
+{
+	off_t next_offset;
+
+	/* Return silently if overflow has already happened. */
+	if (buf->flags & TPM_BUF_BOUNDARY_ERROR)
+		return;
+
+	next_offset = *offset + count;
+	if (next_offset > buf->length) {
+		WARN(1, "tpm_buf: read out of boundary\n");
+		buf->flags |= TPM_BUF_BOUNDARY_ERROR;
+		return;
+	}
+
+	memcpy(output, &buf->data[*offset], count);
+	*offset = next_offset;
+}
+
+/**
+ * tpm_buf_read_u8() - Read 8-bit word from a TPM buffer
+ * @buf:	&tpm_buf instance
+ * @offset:	offset within the buffer
+ *
+ * Return: next 8-bit word
+ */
+u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset)
+{
+	u8 value;
+
+	tpm_buf_read(buf, offset, sizeof(value), &value);
+
+	return value;
+}
+EXPORT_SYMBOL_GPL(tpm_buf_read_u8);
+
+/**
+ * tpm_buf_read_u16() - Read 16-bit word from a TPM buffer
+ * @buf:	&tpm_buf instance
+ * @offset:	offset within the buffer
+ *
+ * Return: next 16-bit word
+ */
+u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset)
+{
+	u16 value;
+
+	tpm_buf_read(buf, offset, sizeof(value), &value);
+
+	return be16_to_cpu(value);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_read_u16);
+
+/**
+ * tpm_buf_read_u32() - Read 32-bit word from a TPM buffer
+ * @buf:	&tpm_buf instance
+ * @offset:	offset within the buffer
+ *
+ * Return: next 32-bit word
+ */
+u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset)
+{
+	u32 value;
+
+	tpm_buf_read(buf, offset, sizeof(value), &value);
+
+	return be32_to_cpu(value);
+}
+EXPORT_SYMBOL_GPL(tpm_buf_read_u32);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 715db4a91c1f..e8172f81c562 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -302,6 +302,8 @@ enum tpm_buf_flags {
 	TPM_BUF_OVERFLOW	= BIT(0),
 	/* TPM2B format: */
 	TPM_BUF_TPM2B		= BIT(1),
+	/* read out of boundary: */
+	TPM_BUF_BOUNDARY_ERROR	= BIT(2),
 };
 
 /*
@@ -338,6 +340,9 @@ void tpm_buf_append(struct tpm_buf *buf, const u8 *new_data, u16 new_length);
 void tpm_buf_append_u8(struct tpm_buf *buf, const u8 value);
 void tpm_buf_append_u16(struct tpm_buf *buf, const u16 value);
 void tpm_buf_append_u32(struct tpm_buf *buf, const u32 value);
+u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset);
+u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset);
+u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset);
 
 /*
  * Check if TPM device is in the firmware upgrade mode.
-- 
cgit v1.2.3


From f135440447af5156de91272ee52ccedcf0796e94 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 29 Apr 2024 16:27:58 -0400
Subject: crypto: lib - implement library version of AES in CFB mode

Implement AES in CFB mode using the existing, mostly constant-time
generic AES library implementation. This will be used by the TPM code
to encrypt communications with TPM hardware, which is often a discrete
component connected using sniffable wires or traces.

While a CFB template does exist, using a skcipher is a major pain for
non-performance critical synchronous crypto where the algorithm is known
at compile time and the data is in contiguous buffers with valid kernel
virtual addresses.

Tested-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Link: https://lore.kernel.org/all/20230216201410.15010-1-James.Bottomley@HansenPartnership.com/
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 include/crypto/aes.h |   5 +
 lib/crypto/Kconfig   |   5 +
 lib/crypto/Makefile  |   3 +
 lib/crypto/aescfb.c  | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 270 insertions(+)
 create mode 100644 lib/crypto/aescfb.c

(limited to 'include')

diff --git a/include/crypto/aes.h b/include/crypto/aes.h
index 2090729701ab..9339da7c20a8 100644
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -87,4 +87,9 @@ void aes_decrypt(const struct crypto_aes_ctx *ctx, u8 *out, const u8 *in);
 extern const u8 crypto_aes_sbox[];
 extern const u8 crypto_aes_inv_sbox[];
 
+void aescfb_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+		    int len, const u8 iv[AES_BLOCK_SIZE]);
+void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+		    int len, const u8 iv[AES_BLOCK_SIZE]);
+
 #endif
diff --git a/lib/crypto/Kconfig b/lib/crypto/Kconfig
index 45436bfc6dff..b01253cac70a 100644
--- a/lib/crypto/Kconfig
+++ b/lib/crypto/Kconfig
@@ -8,6 +8,11 @@ config CRYPTO_LIB_UTILS
 config CRYPTO_LIB_AES
 	tristate
 
+config CRYPTO_LIB_AESCFB
+	tristate
+	select CRYPTO_LIB_AES
+	select CRYPTO_LIB_UTILS
+
 config CRYPTO_LIB_AESGCM
 	tristate
 	select CRYPTO_LIB_AES
diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile
index 8d1446c2be71..969baab8c805 100644
--- a/lib/crypto/Makefile
+++ b/lib/crypto/Makefile
@@ -10,6 +10,9 @@ obj-$(CONFIG_CRYPTO_LIB_CHACHA_GENERIC)		+= libchacha.o
 obj-$(CONFIG_CRYPTO_LIB_AES)			+= libaes.o
 libaes-y					:= aes.o
 
+obj-$(CONFIG_CRYPTO_LIB_AESCFB)			+= libaescfb.o
+libaescfb-y					:= aescfb.o
+
 obj-$(CONFIG_CRYPTO_LIB_AESGCM)			+= libaesgcm.o
 libaesgcm-y					:= aesgcm.o
 
diff --git a/lib/crypto/aescfb.c b/lib/crypto/aescfb.c
new file mode 100644
index 000000000000..749dc1258a44
--- /dev/null
+++ b/lib/crypto/aescfb.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Minimal library implementation of AES in CFB mode
+ *
+ * Copyright 2023 Google LLC
+ */
+
+#include <linux/module.h>
+
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+
+#include <asm/irqflags.h>
+
+static void aescfb_encrypt_block(const struct crypto_aes_ctx *ctx, void *dst,
+				 const void *src)
+{
+	unsigned long flags;
+
+	/*
+	 * In AES-CFB, the AES encryption operates on known 'plaintext' (the IV
+	 * and ciphertext), making it susceptible to timing attacks on the
+	 * encryption key. The AES library already mitigates this risk to some
+	 * extent by pulling the entire S-box into the caches before doing any
+	 * substitutions, but this strategy is more effective when running with
+	 * interrupts disabled.
+	 */
+	local_irq_save(flags);
+	aes_encrypt(ctx, dst, src);
+	local_irq_restore(flags);
+}
+
+/**
+ * aescfb_encrypt - Perform AES-CFB encryption on a block of data
+ *
+ * @ctx:	The AES-CFB key schedule
+ * @dst:	Pointer to the ciphertext output buffer
+ * @src:	Pointer the plaintext (may equal @dst for encryption in place)
+ * @len:	The size in bytes of the plaintext and ciphertext.
+ * @iv:		The initialization vector (IV) to use for this block of data
+ */
+void aescfb_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+		    int len, const u8 iv[AES_BLOCK_SIZE])
+{
+	u8 ks[AES_BLOCK_SIZE];
+	const u8 *v = iv;
+
+	while (len > 0) {
+		aescfb_encrypt_block(ctx, ks, v);
+		crypto_xor_cpy(dst, src, ks, min(len, AES_BLOCK_SIZE));
+		v = dst;
+
+		dst += AES_BLOCK_SIZE;
+		src += AES_BLOCK_SIZE;
+		len -= AES_BLOCK_SIZE;
+	}
+
+	memzero_explicit(ks, sizeof(ks));
+}
+EXPORT_SYMBOL(aescfb_encrypt);
+
+/**
+ * aescfb_decrypt - Perform AES-CFB decryption on a block of data
+ *
+ * @ctx:	The AES-CFB key schedule
+ * @dst:	Pointer to the plaintext output buffer
+ * @src:	Pointer the ciphertext (may equal @dst for decryption in place)
+ * @len:	The size in bytes of the plaintext and ciphertext.
+ * @iv:		The initialization vector (IV) to use for this block of data
+ */
+void aescfb_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
+		    int len, const u8 iv[AES_BLOCK_SIZE])
+{
+	u8 ks[2][AES_BLOCK_SIZE];
+
+	aescfb_encrypt_block(ctx, ks[0], iv);
+
+	for (int i = 0; len > 0; i ^= 1) {
+		if (len > AES_BLOCK_SIZE)
+			/*
+			 * Generate the keystream for the next block before
+			 * performing the XOR, as that may update in place and
+			 * overwrite the ciphertext.
+			 */
+			aescfb_encrypt_block(ctx, ks[!i], src);
+
+		crypto_xor_cpy(dst, src, ks[i], min(len, AES_BLOCK_SIZE));
+
+		dst += AES_BLOCK_SIZE;
+		src += AES_BLOCK_SIZE;
+		len -= AES_BLOCK_SIZE;
+	}
+
+	memzero_explicit(ks, sizeof(ks));
+}
+EXPORT_SYMBOL(aescfb_decrypt);
+
+MODULE_DESCRIPTION("Generic AES-CFB library");
+MODULE_AUTHOR("Ard Biesheuvel <ardb@kernel.org>");
+MODULE_LICENSE("GPL");
+
+#ifndef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
+
+/*
+ * Test code below. Vectors taken from crypto/testmgr.h
+ */
+
+static struct {
+	u8	ptext[64];
+	u8	ctext[64];
+
+	u8	key[AES_MAX_KEY_SIZE];
+	u8	iv[AES_BLOCK_SIZE];
+
+	int	klen;
+	int	len;
+} const aescfb_tv[] __initconst = {
+	{ /* From NIST SP800-38A */
+		.key    = "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+			  "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+			  "\xc8\xa6\x45\x37\xa0\xb3\xa9\x3f"
+			  "\xcd\xe3\xcd\xad\x9f\x1c\xe5\x8b"
+			  "\x26\x75\x1f\x67\xa3\xcb\xb1\x40"
+			  "\xb1\x80\x8c\xf1\x87\xa4\xf4\xdf"
+			  "\xc0\x4b\x05\x35\x7c\x5d\x1c\x0e"
+			  "\xea\xc4\xc6\x6f\x9f\xf7\xf2\xe6",
+		.len	= 64,
+	}, {
+		.key	= "\x8e\x73\xb0\xf7\xda\x0e\x64\x52"
+			  "\xc8\x10\xf3\x2b\x80\x90\x79\xe5"
+			  "\x62\xf8\xea\xd2\x52\x2c\x6b\x7b",
+		.klen	= 24,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xcd\xc8\x0d\x6f\xdd\xf1\x8c\xab"
+			  "\x34\xc2\x59\x09\xc9\x9a\x41\x74"
+			  "\x67\xce\x7f\x7f\x81\x17\x36\x21"
+			  "\x96\x1a\x2b\x70\x17\x1d\x3d\x7a"
+			  "\x2e\x1e\x8a\x1d\xd5\x9b\x88\xb1"
+			  "\xc8\xe6\x0f\xed\x1e\xfa\xc4\xc9"
+			  "\xc0\x5f\x9f\x9c\xa9\x83\x4f\xa0"
+			  "\x42\xae\x8f\xba\x58\x4b\x09\xff",
+		.len	= 64,
+	}, {
+		.key	= "\x60\x3d\xeb\x10\x15\xca\x71\xbe"
+			  "\x2b\x73\xae\xf0\x85\x7d\x77\x81"
+			  "\x1f\x35\x2c\x07\x3b\x61\x08\xd7"
+			  "\x2d\x98\x10\xa3\x09\x14\xdf\xf4",
+		.klen	= 32,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae\x2d\x8a\x57\x1e\x03\xac\x9c"
+			  "\x9e\xb7\x6f\xac\x45\xaf\x8e\x51"
+			  "\x30\xc8\x1c\x46\xa3\x5c\xe4\x11"
+			  "\xe5\xfb\xc1\x19\x1a\x0a\x52\xef"
+			  "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17"
+			  "\xad\x2b\x41\x7b\xe6\x6c\x37\x10",
+		.ctext	= "\xdc\x7e\x84\xbf\xda\x79\x16\x4b"
+			  "\x7e\xcd\x84\x86\x98\x5d\x38\x60"
+			  "\x39\xff\xed\x14\x3b\x28\xb1\xc8"
+			  "\x32\x11\x3c\x63\x31\xe5\x40\x7b"
+			  "\xdf\x10\x13\x24\x15\xe5\x4b\x92"
+			  "\xa1\x3e\xd0\xa8\x26\x7a\xe2\xf9"
+			  "\x75\xa3\x85\x74\x1a\xb9\xce\xf8"
+			  "\x20\x31\x62\x3d\x55\xb1\xe4\x71",
+		.len	= 64,
+	}, { /* > 16 bytes, not a multiple of 16 bytes */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f\x96"
+			  "\xe9\x3d\x7e\x11\x73\x93\x17\x2a"
+			  "\xae",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad\x20"
+			  "\x33\x34\x49\xf8\xe8\x3c\xfb\x4a"
+			  "\xc8",
+		.len	= 17,
+	}, { /* < 16 bytes */
+		.key	= "\x2b\x7e\x15\x16\x28\xae\xd2\xa6"
+			  "\xab\xf7\x15\x88\x09\xcf\x4f\x3c",
+		.klen	= 16,
+		.iv	= "\x00\x01\x02\x03\x04\x05\x06\x07"
+			  "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
+		.ptext	= "\x6b\xc1\xbe\xe2\x2e\x40\x9f",
+		.ctext	= "\x3b\x3f\xd9\x2e\xb7\x2d\xad",
+		.len	= 7,
+	},
+};
+
+static int __init libaescfb_init(void)
+{
+	for (int i = 0; i < ARRAY_SIZE(aescfb_tv); i++) {
+		struct crypto_aes_ctx ctx;
+		u8 buf[64];
+
+		if (aes_expandkey(&ctx, aescfb_tv[i].key, aescfb_tv[i].klen)) {
+			pr_err("aes_expandkey() failed on vector %d\n", i);
+			return -ENODEV;
+		}
+
+		aescfb_encrypt(&ctx, buf, aescfb_tv[i].ptext, aescfb_tv[i].len,
+			       aescfb_tv[i].iv);
+		if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
+			pr_err("aescfb_encrypt() #1 failed on vector %d\n", i);
+			return -ENODEV;
+		}
+
+		/* decrypt in place */
+		aescfb_decrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
+		if (memcmp(buf, aescfb_tv[i].ptext, aescfb_tv[i].len)) {
+			pr_err("aescfb_decrypt() failed on vector %d\n", i);
+			return -ENODEV;
+		}
+
+		/* encrypt in place */
+		aescfb_encrypt(&ctx, buf, buf, aescfb_tv[i].len, aescfb_tv[i].iv);
+		if (memcmp(buf, aescfb_tv[i].ctext, aescfb_tv[i].len)) {
+			pr_err("aescfb_encrypt() #2 failed on vector %d\n", i);
+
+			return -ENODEV;
+		}
+
+	}
+	return 0;
+}
+module_init(libaescfb_init);
+
+static void __exit libaescfb_exit(void)
+{
+}
+module_exit(libaescfb_exit);
+#endif
-- 
cgit v1.2.3


From 11189d6828ba4aa77916fdf4fcaebc304b349b51 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:27:59 -0400
Subject: tpm: add buffer function to point to returned parameters

Replace all instances of &buf.data[TPM_HEADER_SIZE] with a new
function tpm_buf_parameters() because encryption sessions change
where the return parameters are located in the buffer since if a
return session is present they're 4 bytes beyond the header with those
4 bytes giving the parameter length.  If there is no return session,
then they're in the usual place immediately after the header.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Stefan Berger <stefanb@linux.ibm.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-buf.c | 28 ++++++++++++++++++++++++++++
 include/linux/tpm.h        |  2 ++
 2 files changed, 30 insertions(+)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index cc14fa3fd8c2..6e4595cc98bc 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -221,3 +221,31 @@ u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset)
 	return be32_to_cpu(value);
 }
 EXPORT_SYMBOL_GPL(tpm_buf_read_u32);
+
+static u16 tpm_buf_tag(struct tpm_buf *buf)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+
+	return be16_to_cpu(head->tag);
+}
+
+/**
+ * tpm_buf_parameters - return the TPM response parameters area of the tpm_buf
+ * @buf: tpm_buf to use
+ *
+ * Where the parameters are located depends on the tag of a TPM
+ * command (it's immediately after the header for TPM_ST_NO_SESSIONS
+ * or 4 bytes after for TPM_ST_SESSIONS). Evaluate this and return a
+ * pointer to the first byte of the parameters area.
+ *
+ * @return: pointer to parameters area
+ */
+u8 *tpm_buf_parameters(struct tpm_buf *buf)
+{
+	int offset = TPM_HEADER_SIZE;
+
+	if (tpm_buf_tag(buf) == TPM2_ST_SESSIONS)
+		offset += 4;
+
+	return &buf->data[offset];
+}
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index e8172f81c562..6be263509e81 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -344,6 +344,8 @@ u8 tpm_buf_read_u8(struct tpm_buf *buf, off_t *offset);
 u16 tpm_buf_read_u16(struct tpm_buf *buf, off_t *offset);
 u32 tpm_buf_read_u32(struct tpm_buf *buf, off_t *offset);
 
+u8 *tpm_buf_parameters(struct tpm_buf *buf);
+
 /*
  * Check if TPM device is in the firmware upgrade mode.
  */
-- 
cgit v1.2.3


From d2add27cf2b823a8c1f8caf7ff10c98070df71f5 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:28:01 -0400
Subject: tpm: Add NULL primary creation

The session handling code uses a "salted" session, meaning a session
whose salt is encrypted to the public part of another TPM key so an
observer cannot obtain it (and thus deduce the session keys).  This
patch creates and context saves in the tpm_chip area the primary key
of the NULL hierarchy for this purpose.

[jarkko@kernel.org: fixed documentation errors]
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/Kconfig         |  11 ++
 drivers/char/tpm/Makefile        |   1 +
 drivers/char/tpm/tpm.h           |  10 ++
 drivers/char/tpm/tpm2-cmd.c      |   5 +
 drivers/char/tpm/tpm2-sessions.c | 322 +++++++++++++++++++++++++++++++++++++++
 include/linux/tpm.h              |  69 +++++++++
 6 files changed, 418 insertions(+)
 create mode 100644 drivers/char/tpm/tpm2-sessions.c

(limited to 'include')

diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig
index 852bb9344788..ad37b55f5ff9 100644
--- a/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@ -27,6 +27,17 @@ menuconfig TCG_TPM
 
 if TCG_TPM
 
+config TCG_TPM2_HMAC
+	bool "Use HMAC and encrypted transactions on the TPM bus"
+	default y
+	help
+	  Setting this causes us to deploy a scheme which uses request
+	  and response HMACs in addition to encryption for
+	  communicating with the TPM to prevent or detect bus snooping
+	  and interposer attacks (see tpm-security.rst).  Saying Y
+	  here adds some encryption overhead to all kernel to TPM
+	  transactions.
+
 config HW_RANDOM_TPM
 	bool "TPM HW Random Number Generator support"
 	depends on TCG_TPM && HW_RANDOM && !(TCG_TPM=y && HW_RANDOM=m)
diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile
index ad3594e383e1..4c695b0388f3 100644
--- a/drivers/char/tpm/Makefile
+++ b/drivers/char/tpm/Makefile
@@ -17,6 +17,7 @@ tpm-y += eventlog/tpm1.o
 tpm-y += eventlog/tpm2.o
 tpm-y += tpm-buf.o
 
+tpm-$(CONFIG_TCG_TPM2_HMAC) += tpm2-sessions.o
 tpm-$(CONFIG_ACPI) += tpm_ppi.o eventlog/acpi.o
 tpm-$(CONFIG_EFI) += eventlog/efi.o
 tpm-$(CONFIG_OF) += eventlog/of.o
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index cbc9d1e2974d..6b8b9956ba69 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -321,4 +321,14 @@ void tpm_bios_log_setup(struct tpm_chip *chip);
 void tpm_bios_log_teardown(struct tpm_chip *chip);
 int tpm_dev_common_init(void);
 void tpm_dev_common_exit(void);
+
+#ifdef CONFIG_TCG_TPM2_HMAC
+int tpm2_sessions_init(struct tpm_chip *chip);
+#else
+static inline int tpm2_sessions_init(struct tpm_chip *chip)
+{
+	return 0;
+}
+#endif
+
 #endif
diff --git a/drivers/char/tpm/tpm2-cmd.c b/drivers/char/tpm/tpm2-cmd.c
index 93545be190a5..b0e72fb563d9 100644
--- a/drivers/char/tpm/tpm2-cmd.c
+++ b/drivers/char/tpm/tpm2-cmd.c
@@ -759,6 +759,11 @@ int tpm2_auto_startup(struct tpm_chip *chip)
 		rc = 0;
 	}
 
+	if (rc)
+		goto out;
+
+	rc = tpm2_sessions_init(chip);
+
 out:
 	/*
 	 * Infineon TPM in field upgrade mode will return no data for the number
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
new file mode 100644
index 000000000000..78b94097114d
--- /dev/null
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 James.Bottomley@HansenPartnership.com
+ *
+ */
+
+#include "tpm.h"
+#include <asm/unaligned.h>
+
+/**
+ * tpm2_parse_create_primary() - parse the data returned from TPM_CC_CREATE_PRIMARY
+ *
+ * @chip:	The TPM the primary was created under
+ * @buf:	The response buffer from the chip
+ * @handle:	pointer to be filled in with the return handle of the primary
+ * @hierarchy:	The hierarchy the primary was created for
+ *
+ * Return:
+ * * 0		- OK
+ * * -errno	- A system error
+ * * TPM_RC	- A TPM error
+ */
+static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf,
+				     u32 *handle, u32 hierarchy)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+	off_t offset_r = TPM_HEADER_SIZE, offset_t;
+	u16 len = TPM_HEADER_SIZE;
+	u32 total_len = be32_to_cpu(head->length);
+	u32 val, param_len;
+
+	*handle = tpm_buf_read_u32(buf, &offset_r);
+	param_len = tpm_buf_read_u32(buf, &offset_r);
+	/*
+	 * param_len doesn't include the header, but all the other
+	 * lengths and offsets do, so add it to parm len to make
+	 * the comparisons easier
+	 */
+	param_len += TPM_HEADER_SIZE;
+
+	if (param_len + 8 > total_len)
+		return -EINVAL;
+	len = tpm_buf_read_u16(buf, &offset_r);
+	offset_t = offset_r;
+	/* now we have the public area, compute the name of the object */
+	put_unaligned_be16(TPM_ALG_SHA256, chip->null_key_name);
+	sha256(&buf->data[offset_r], len, chip->null_key_name + 2);
+
+	/* validate the public key */
+	val = tpm_buf_read_u16(buf, &offset_t);
+
+	/* key type (must be what we asked for) */
+	if (val != TPM_ALG_ECC)
+		return -EINVAL;
+	val = tpm_buf_read_u16(buf, &offset_t);
+
+	/* name algorithm */
+	if (val != TPM_ALG_SHA256)
+		return -EINVAL;
+	val = tpm_buf_read_u32(buf, &offset_t);
+
+	/* object properties */
+	if (val != TPM2_OA_TMPL)
+		return -EINVAL;
+
+	/* auth policy (empty) */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != 0)
+		return -EINVAL;
+
+	/* symmetric key parameters */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != TPM_ALG_AES)
+		return -EINVAL;
+
+	/* symmetric key length */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != AES_KEY_BITS)
+		return -EINVAL;
+
+	/* symmetric encryption scheme */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != TPM_ALG_CFB)
+		return -EINVAL;
+
+	/* signing scheme */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != TPM_ALG_NULL)
+		return -EINVAL;
+
+	/* ECC Curve */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != TPM2_ECC_NIST_P256)
+		return -EINVAL;
+
+	/* KDF Scheme */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != TPM_ALG_NULL)
+		return -EINVAL;
+
+	/* extract public key (x and y points) */
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != EC_PT_SZ)
+		return -EINVAL;
+	memcpy(chip->null_ec_key_x, &buf->data[offset_t], val);
+	offset_t += val;
+	val = tpm_buf_read_u16(buf, &offset_t);
+	if (val != EC_PT_SZ)
+		return -EINVAL;
+	memcpy(chip->null_ec_key_y, &buf->data[offset_t], val);
+	offset_t += val;
+
+	/* original length of the whole TPM2B */
+	offset_r += len;
+
+	/* should have exactly consumed the TPM2B public structure */
+	if (offset_t != offset_r)
+		return -EINVAL;
+	if (offset_r > param_len)
+		return -EINVAL;
+
+	/* creation data (skip) */
+	len = tpm_buf_read_u16(buf, &offset_r);
+	offset_r += len;
+	if (offset_r > param_len)
+		return -EINVAL;
+
+	/* creation digest (must be sha256) */
+	len = tpm_buf_read_u16(buf, &offset_r);
+	offset_r += len;
+	if (len != SHA256_DIGEST_SIZE || offset_r > param_len)
+		return -EINVAL;
+
+	/* TPMT_TK_CREATION follows */
+	/* tag, must be TPM_ST_CREATION (0x8021) */
+	val = tpm_buf_read_u16(buf, &offset_r);
+	if (val != TPM2_ST_CREATION || offset_r > param_len)
+		return -EINVAL;
+
+	/* hierarchy */
+	val = tpm_buf_read_u32(buf, &offset_r);
+	if (val != hierarchy || offset_r > param_len)
+		return -EINVAL;
+
+	/* the ticket digest HMAC (might not be sha256) */
+	len = tpm_buf_read_u16(buf, &offset_r);
+	offset_r += len;
+	if (offset_r > param_len)
+		return -EINVAL;
+
+	/*
+	 * finally we have the name, which is a sha256 digest plus a 2
+	 * byte algorithm type
+	 */
+	len = tpm_buf_read_u16(buf, &offset_r);
+	if (offset_r + len != param_len + 8)
+		return -EINVAL;
+	if (len != SHA256_DIGEST_SIZE + 2)
+		return -EINVAL;
+
+	if (memcmp(chip->null_key_name, &buf->data[offset_r],
+		   SHA256_DIGEST_SIZE + 2) != 0) {
+		dev_err(&chip->dev, "NULL Seed name comparison failed\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * tpm2_create_primary() - create a primary key using a fixed P-256 template
+ *
+ * @chip:      the TPM chip to create under
+ * @hierarchy: The hierarchy handle to create under
+ * @handle:    The returned volatile handle on success
+ *
+ * For platforms that might not have a persistent primary, this can be
+ * used to create one quickly on the fly (it uses Elliptic Curve not
+ * RSA, so even slow TPMs can create one fast).  The template uses the
+ * TCG mandated H one for non-endorsement ECC primaries, i.e. P-256
+ * elliptic curve (the only current one all TPM2s are required to
+ * have) a sha256 name hash and no policy.
+ *
+ * Return:
+ * * 0		- OK
+ * * -errno	- A system error
+ * * TPM_RC	- A TPM error
+ */
+static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy,
+			       u32 *handle)
+{
+	int rc;
+	struct tpm_buf buf;
+	struct tpm_buf template;
+
+	rc = tpm_buf_init(&buf, TPM2_ST_SESSIONS, TPM2_CC_CREATE_PRIMARY);
+	if (rc)
+		return rc;
+
+	rc = tpm_buf_init_sized(&template);
+	if (rc) {
+		tpm_buf_destroy(&buf);
+		return rc;
+	}
+
+	/*
+	 * create the template.  Note: in order for userspace to
+	 * verify the security of the system, it will have to create
+	 * and certify this NULL primary, meaning all the template
+	 * parameters will have to be identical, so conform exactly to
+	 * the TCG TPM v2.0 Provisioning Guidance for the SRK ECC
+	 * key H template (H has zero size unique points)
+	 */
+
+	/* key type */
+	tpm_buf_append_u16(&template, TPM_ALG_ECC);
+
+	/* name algorithm */
+	tpm_buf_append_u16(&template, TPM_ALG_SHA256);
+
+	/* object properties */
+	tpm_buf_append_u32(&template, TPM2_OA_TMPL);
+
+	/* sauth policy (empty) */
+	tpm_buf_append_u16(&template, 0);
+
+	/* BEGIN parameters: key specific; for ECC*/
+
+	/* symmetric algorithm */
+	tpm_buf_append_u16(&template, TPM_ALG_AES);
+
+	/* bits for symmetric algorithm */
+	tpm_buf_append_u16(&template, AES_KEY_BITS);
+
+	/* algorithm mode (must be CFB) */
+	tpm_buf_append_u16(&template, TPM_ALG_CFB);
+
+	/* scheme (NULL means any scheme) */
+	tpm_buf_append_u16(&template, TPM_ALG_NULL);
+
+	/* ECC Curve ID */
+	tpm_buf_append_u16(&template, TPM2_ECC_NIST_P256);
+
+	/* KDF Scheme */
+	tpm_buf_append_u16(&template, TPM_ALG_NULL);
+
+	/* unique: key specific; for ECC it is two zero size points */
+	tpm_buf_append_u16(&template, 0);
+	tpm_buf_append_u16(&template, 0);
+
+	/* END parameters */
+
+	/* primary handle */
+	tpm_buf_append_u32(&buf, hierarchy);
+	tpm_buf_append_empty_auth(&buf, TPM2_RS_PW);
+
+	/* sensitive create size is 4 for two empty buffers */
+	tpm_buf_append_u16(&buf, 4);
+
+	/* sensitive create auth data (empty) */
+	tpm_buf_append_u16(&buf, 0);
+
+	/* sensitive create sensitive data (empty) */
+	tpm_buf_append_u16(&buf, 0);
+
+	/* the public template */
+	tpm_buf_append(&buf, template.data, template.length);
+	tpm_buf_destroy(&template);
+
+	/* outside info (empty) */
+	tpm_buf_append_u16(&buf, 0);
+
+	/* creation PCR (none) */
+	tpm_buf_append_u32(&buf, 0);
+
+	rc = tpm_transmit_cmd(chip, &buf, 0,
+			      "attempting to create NULL primary");
+
+	if (rc == TPM2_RC_SUCCESS)
+		rc = tpm2_parse_create_primary(chip, &buf, handle, hierarchy);
+
+	tpm_buf_destroy(&buf);
+
+	return rc;
+}
+
+static int tpm2_create_null_primary(struct tpm_chip *chip)
+{
+	u32 null_key;
+	int rc;
+
+	rc = tpm2_create_primary(chip, TPM2_RH_NULL, &null_key);
+
+	if (rc == TPM2_RC_SUCCESS) {
+		unsigned int offset = 0; /* dummy offset for null key context */
+
+		rc = tpm2_save_context(chip, null_key, chip->null_key_context,
+				       sizeof(chip->null_key_context), &offset);
+		tpm2_flush_context(chip, null_key);
+	}
+
+	return rc;
+}
+
+/**
+ * tpm2_sessions_init() - start of day initialization for the sessions code
+ * @chip: TPM chip
+ *
+ * Derive and context save the null primary and allocate memory in the
+ * struct tpm_chip for the authorizations.
+ */
+int tpm2_sessions_init(struct tpm_chip *chip)
+{
+	int rc;
+
+	rc = tpm2_create_null_primary(chip);
+	if (rc)
+		dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc);
+
+	return rc;
+}
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 6be263509e81..bc8c9a350e23 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -23,6 +23,7 @@
 #include <linux/fs.h>
 #include <linux/highmem.h>
 #include <crypto/hash_info.h>
+#include <crypto/aes.h>
 
 #define TPM_DIGEST_SIZE 20	/* Max TPM v1.2 PCR size */
 #define TPM_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE
@@ -35,12 +36,15 @@ struct trusted_key_options;
 enum tpm_algorithms {
 	TPM_ALG_ERROR		= 0x0000,
 	TPM_ALG_SHA1		= 0x0004,
+	TPM_ALG_AES		= 0x0006,
 	TPM_ALG_KEYEDHASH	= 0x0008,
 	TPM_ALG_SHA256		= 0x000B,
 	TPM_ALG_SHA384		= 0x000C,
 	TPM_ALG_SHA512		= 0x000D,
 	TPM_ALG_NULL		= 0x0010,
 	TPM_ALG_SM3_256		= 0x0012,
+	TPM_ALG_ECC		= 0x0023,
+	TPM_ALG_CFB		= 0x0043,
 };
 
 /*
@@ -49,6 +53,11 @@ enum tpm_algorithms {
  */
 #define TPM_MAX_HASHES	5
 
+enum tpm2_curves {
+	TPM2_ECC_NONE		= 0x0000,
+	TPM2_ECC_NIST_P256	= 0x0003,
+};
+
 struct tpm_digest {
 	u16 alg_id;
 	u8 digest[TPM_MAX_DIGEST_SIZE];
@@ -116,6 +125,20 @@ struct tpm_chip_seqops {
 	const struct seq_operations *seqops;
 };
 
+/* fixed define for the curve we use which is NIST_P256 */
+#define EC_PT_SZ	32
+
+/*
+ * fixed define for the size of a name.  This is actually HASHALG size
+ * plus 2, so 32 for SHA256
+ */
+#define TPM2_NAME_SIZE	34
+
+/*
+ * The maximum size for an object context
+ */
+#define TPM2_MAX_CONTEXT_SIZE 4096
+
 struct tpm_chip {
 	struct device dev;
 	struct device devs;
@@ -170,6 +193,17 @@ struct tpm_chip {
 
 	/* active locality */
 	int locality;
+
+#ifdef CONFIG_TCG_TPM2_HMAC
+	/* details for communication security via sessions */
+
+	/* saved context for NULL seed */
+	u8 null_key_context[TPM2_MAX_CONTEXT_SIZE];
+	 /* name of NULL seed */
+	u8 null_key_name[TPM2_NAME_SIZE];
+	u8 null_ec_key_x[EC_PT_SZ];
+	u8 null_ec_key_y[EC_PT_SZ];
+#endif
 };
 
 #define TPM_HEADER_SIZE		10
@@ -194,6 +228,7 @@ enum tpm2_timeouts {
 enum tpm2_structures {
 	TPM2_ST_NO_SESSIONS	= 0x8001,
 	TPM2_ST_SESSIONS	= 0x8002,
+	TPM2_ST_CREATION	= 0x8021,
 };
 
 /* Indicates from what layer of the software stack the error comes from */
@@ -243,6 +278,7 @@ enum tpm2_command_codes {
 };
 
 enum tpm2_permanent_handles {
+	TPM2_RH_NULL		= 0x40000007,
 	TPM2_RS_PW		= 0x40000009,
 };
 
@@ -318,9 +354,28 @@ struct tpm_buf {
 enum tpm2_object_attributes {
 	TPM2_OA_FIXED_TPM		= BIT(1),
 	TPM2_OA_FIXED_PARENT		= BIT(4),
+	TPM2_OA_SENSITIVE_DATA_ORIGIN	= BIT(5),
 	TPM2_OA_USER_WITH_AUTH		= BIT(6),
+	TPM2_OA_NO_DA			= BIT(10),
+	TPM2_OA_RESTRICTED		= BIT(16),
+	TPM2_OA_DECRYPT			= BIT(17),
 };
 
+/*
+ * definitions for the canonical template.  These are mandated
+ * by the TCG key template documents
+ */
+
+#define AES_KEY_BYTES	AES_KEYSIZE_128
+#define AES_KEY_BITS	(AES_KEY_BYTES*8)
+#define TPM2_OA_TMPL	(TPM2_OA_NO_DA |			\
+			 TPM2_OA_FIXED_TPM |			\
+			 TPM2_OA_FIXED_PARENT |			\
+			 TPM2_OA_SENSITIVE_DATA_ORIGIN |	\
+			 TPM2_OA_USER_WITH_AUTH |		\
+			 TPM2_OA_DECRYPT |			\
+			 TPM2_OA_RESTRICTED)
+
 enum tpm2_session_attributes {
 	TPM2_SA_CONTINUE_SESSION	= BIT(0),
 };
@@ -373,6 +428,16 @@ extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
 extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max);
 extern struct tpm_chip *tpm_default_chip(void);
 void tpm2_flush_context(struct tpm_chip *chip, u32 handle);
+
+static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle)
+{
+	/* simple authorization for empty auth */
+	tpm_buf_append_u32(buf, 9);		/* total length of auth */
+	tpm_buf_append_u32(buf, handle);
+	tpm_buf_append_u16(buf, 0);		/* nonce len */
+	tpm_buf_append_u8(buf, 0);		/* attributes */
+	tpm_buf_append_u16(buf, 0);		/* hmac len */
+}
 #else
 static inline int tpm_is_tpm2(struct tpm_chip *chip)
 {
@@ -399,5 +464,9 @@ static inline struct tpm_chip *tpm_default_chip(void)
 {
 	return NULL;
 }
+
+static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle)
+{
+}
 #endif
 #endif
-- 
cgit v1.2.3


From 699e3efd6c645c741ea4d6d58282c56b6d108cf7 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:28:03 -0400
Subject: tpm: Add HMAC session start and end functions

Add session  based HMAC  authentication plus parameter  decryption and
response encryption  using AES. The  basic design is to  segregate all
the nasty crypto, hash and hmac code into tpm2-sessions.c and export a
usable API.  The API first of all starts off by gaining a session with
tpm2_start_auth_session() which  initiates a session with  the TPM and
allocates  an  opaque  tpm2_auth   structure  to  handle  the  session
parameters.  The  design is that  session use will be  single threaded
from start to finish under the ops lock, so the tpm2_auth structure is
stored in struct tpm2_chip to simpify the externally visible API.

The session can be ended with tpm2_end_auth_session() which is
designed only to be used in error legs.  Ordinarily the further
session API (future patches) will end or continue the session
appropriately without having to call this.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> # crypto API parts
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/Kconfig         |   2 +
 drivers/char/tpm/tpm-buf.c       |   1 +
 drivers/char/tpm/tpm-chip.c      |   3 +
 drivers/char/tpm/tpm2-sessions.c | 285 +++++++++++++++++++++++++++++++++++++++
 include/linux/tpm.h              |  34 +++++
 5 files changed, 325 insertions(+)

(limited to 'include')

diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig
index c3996786f9bf..e63a6a17793c 100644
--- a/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@ -30,6 +30,8 @@ if TCG_TPM
 config TCG_TPM2_HMAC
 	bool "Use HMAC and encrypted transactions on the TPM bus"
 	default y
+	select CRYPTO_ECDH
+	select CRYPTO_LIB_AESCFB
 	select CRYPTO_LIB_SHA256
 	help
 	  Setting this causes us to deploy a scheme which uses request
diff --git a/drivers/char/tpm/tpm-buf.c b/drivers/char/tpm/tpm-buf.c
index 6e4595cc98bc..647c6ca92ac3 100644
--- a/drivers/char/tpm/tpm-buf.c
+++ b/drivers/char/tpm/tpm-buf.c
@@ -44,6 +44,7 @@ void tpm_buf_reset(struct tpm_buf *buf, u16 tag, u32 ordinal)
 	head->tag = cpu_to_be16(tag);
 	head->length = cpu_to_be32(sizeof(*head));
 	head->ordinal = cpu_to_be32(ordinal);
+	buf->handles = 0;
 }
 EXPORT_SYMBOL_GPL(tpm_buf_reset);
 
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 42b1062e33cd..d93937326b2e 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -275,6 +275,9 @@ static void tpm_dev_release(struct device *dev)
 	kfree(chip->work_space.context_buf);
 	kfree(chip->work_space.session_buf);
 	kfree(chip->allocated_banks);
+#ifdef CONFIG_TCG_TPM2_HMAC
+	kfree(chip->auth);
+#endif
 	kfree(chip);
 }
 
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index fd121dea614c..1c8d50a501d7 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -3,13 +3,101 @@
 /*
  * Copyright (C) 2018 James.Bottomley@HansenPartnership.com
  *
+ * Cryptographic helper routines for handling TPM2 sessions for
+ * authorization HMAC and request response encryption.
+ *
+ * The idea is to ensure that every TPM command is HMAC protected by a
+ * session, meaning in-flight tampering would be detected and in
+ * addition all sensitive inputs and responses should be encrypted.
+ *
+ * The basic way this works is to use a TPM feature called salted
+ * sessions where a random secret used in session construction is
+ * encrypted to the public part of a known TPM key.  The problem is we
+ * have no known keys, so initially a primary Elliptic Curve key is
+ * derived from the NULL seed (we use EC because most TPMs generate
+ * these keys much faster than RSA ones).  The curve used is NIST_P256
+ * because that's now mandated to be present in 'TCG TPM v2.0
+ * Provisioning Guidance'
+ *
+ * Threat problems: the initial TPM2_CreatePrimary is not (and cannot
+ * be) session protected, so a clever Man in the Middle could return a
+ * public key they control to this command and from there intercept
+ * and decode all subsequent session based transactions.  The kernel
+ * cannot mitigate this threat but, after boot, userspace can get
+ * proof this has not happened by asking the TPM to certify the NULL
+ * key.  This certification would chain back to the TPM Endorsement
+ * Certificate and prove the NULL seed primary had not been tampered
+ * with and thus all sessions must have been cryptographically secure.
+ * To assist with this, the initial NULL seed public key name is made
+ * available in a sysfs file.
+ *
+ * Use of these functions:
+ *
+ * The design is all the crypto, hash and hmac gunk is confined in this
+ * file and never needs to be seen even by the kernel internal user.  To
+ * the user there's an init function tpm2_sessions_init() that needs to
+ * be called once per TPM which generates the NULL seed primary key.
+ *
+ * These are the usage functions:
+ *
+ * tpm2_start_auth_session() which allocates the opaque auth structure
+ *	and gets a session from the TPM.  This must be called before
+ *	any of the following functions.  The session is protected by a
+ *	session_key which is derived from a random salt value
+ *	encrypted to the NULL seed.
+ * tpm2_end_auth_session() kills the session and frees the resources.
+ *	Under normal operation this function is done by
+ *	tpm_buf_check_hmac_response(), so this is only to be used on
+ *	error legs where the latter is not executed.
  */
 
 #include "tpm.h"
+#include <linux/random.h>
+#include <linux/scatterlist.h>
 #include <asm/unaligned.h>
+#include <crypto/kpp.h>
+#include <crypto/ecdh.h>
 #include <crypto/hash.h>
 #include <crypto/hmac.h>
 
+/*
+ * This is the structure that carries all the auth information (like
+ * session handle, nonces, session key and auth) from use to use it is
+ * designed to be opaque to anything outside.
+ */
+struct tpm2_auth {
+	u32 handle;
+	/*
+	 * This has two meanings: before tpm_buf_fill_hmac_session()
+	 * it marks the offset in the buffer of the start of the
+	 * sessions (i.e. after all the handles).  Once the buffer has
+	 * been filled it markes the session number of our auth
+	 * session so we can find it again in the response buffer.
+	 *
+	 * The two cases are distinguished because the first offset
+	 * must always be greater than TPM_HEADER_SIZE and the second
+	 * must be less than or equal to 5.
+	 */
+	u32 session;
+	/*
+	 * the size here is variable and set by the size of our_nonce
+	 * which must be between 16 and the name hash length. we set
+	 * the maximum sha256 size for the greatest protection
+	 */
+	u8 our_nonce[SHA256_DIGEST_SIZE];
+	u8 tpm_nonce[SHA256_DIGEST_SIZE];
+	/*
+	 * the salt is only used across the session command/response
+	 * after that it can be used as a scratch area
+	 */
+	union {
+		u8 salt[EC_PT_SZ];
+		/* scratch for key + IV */
+		u8 scratch[AES_KEY_BYTES + AES_BLOCK_SIZE];
+	};
+	u8 session_key[SHA256_DIGEST_SIZE];
+};
+
 /*
  * It turns out the crypto hmac(sha256) is hard for us to consume
  * because it assumes a fixed key and the TPM seems to change the key
@@ -113,6 +201,199 @@ static void tpm2_KDFe(u8 z[EC_PT_SZ], const char *str, u8 *pt_u, u8 *pt_v,
 	sha256_final(&sctx, out);
 }
 
+static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
+{
+	struct crypto_kpp *kpp;
+	struct kpp_request *req;
+	struct scatterlist s[2], d[1];
+	struct ecdh p = {0};
+	u8 encoded_key[EC_PT_SZ], *x, *y;
+	unsigned int buf_len;
+
+	/* secret is two sized points */
+	tpm_buf_append_u16(buf, (EC_PT_SZ + 2)*2);
+	/*
+	 * we cheat here and append uninitialized data to form
+	 * the points.  All we care about is getting the two
+	 * co-ordinate pointers, which will be used to overwrite
+	 * the uninitialized data
+	 */
+	tpm_buf_append_u16(buf, EC_PT_SZ);
+	x = &buf->data[tpm_buf_length(buf)];
+	tpm_buf_append(buf, encoded_key, EC_PT_SZ);
+	tpm_buf_append_u16(buf, EC_PT_SZ);
+	y = &buf->data[tpm_buf_length(buf)];
+	tpm_buf_append(buf, encoded_key, EC_PT_SZ);
+	sg_init_table(s, 2);
+	sg_set_buf(&s[0], x, EC_PT_SZ);
+	sg_set_buf(&s[1], y, EC_PT_SZ);
+
+	kpp = crypto_alloc_kpp("ecdh-nist-p256", CRYPTO_ALG_INTERNAL, 0);
+	if (IS_ERR(kpp)) {
+		dev_err(&chip->dev, "crypto ecdh allocation failed\n");
+		return;
+	}
+
+	buf_len = crypto_ecdh_key_len(&p);
+	if (sizeof(encoded_key) < buf_len) {
+		dev_err(&chip->dev, "salt buffer too small needs %d\n",
+			buf_len);
+		goto out;
+	}
+	crypto_ecdh_encode_key(encoded_key, buf_len, &p);
+	/* this generates a random private key */
+	crypto_kpp_set_secret(kpp, encoded_key, buf_len);
+
+	/* salt is now the public point of this private key */
+	req = kpp_request_alloc(kpp, GFP_KERNEL);
+	if (!req)
+		goto out;
+	kpp_request_set_input(req, NULL, 0);
+	kpp_request_set_output(req, s, EC_PT_SZ*2);
+	crypto_kpp_generate_public_key(req);
+	/*
+	 * we're not done: now we have to compute the shared secret
+	 * which is our private key multiplied by the tpm_key public
+	 * point, we actually only take the x point and discard the y
+	 * point and feed it through KDFe to get the final secret salt
+	 */
+	sg_set_buf(&s[0], chip->null_ec_key_x, EC_PT_SZ);
+	sg_set_buf(&s[1], chip->null_ec_key_y, EC_PT_SZ);
+	kpp_request_set_input(req, s, EC_PT_SZ*2);
+	sg_init_one(d, chip->auth->salt, EC_PT_SZ);
+	kpp_request_set_output(req, d, EC_PT_SZ);
+	crypto_kpp_compute_shared_secret(req);
+	kpp_request_free(req);
+
+	/*
+	 * pass the shared secret through KDFe for salt. Note salt
+	 * area is used both for input shared secret and output salt.
+	 * This works because KDFe fully consumes the secret before it
+	 * writes the salt
+	 */
+	tpm2_KDFe(chip->auth->salt, "SECRET", x, chip->null_ec_key_x,
+		  chip->auth->salt);
+
+ out:
+	crypto_free_kpp(kpp);
+}
+/**
+ * tpm2_end_auth_session() - kill the allocated auth session
+ * @chip: the TPM chip structure
+ *
+ * ends the session started by tpm2_start_auth_session and frees all
+ * the resources.  Under normal conditions,
+ * tpm_buf_check_hmac_response() will correctly end the session if
+ * required, so this function is only for use in error legs that will
+ * bypass the normal invocation of tpm_buf_check_hmac_response().
+ */
+void tpm2_end_auth_session(struct tpm_chip *chip)
+{
+	tpm2_flush_context(chip, chip->auth->handle);
+	memzero_explicit(chip->auth, sizeof(*chip->auth));
+}
+EXPORT_SYMBOL(tpm2_end_auth_session);
+
+static int tpm2_parse_start_auth_session(struct tpm2_auth *auth,
+					 struct tpm_buf *buf)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+	u32 tot_len = be32_to_cpu(head->length);
+	off_t offset = TPM_HEADER_SIZE;
+	u32 val;
+
+	/* we're starting after the header so adjust the length */
+	tot_len -= TPM_HEADER_SIZE;
+
+	/* should have handle plus nonce */
+	if (tot_len != 4 + 2 + sizeof(auth->tpm_nonce))
+		return -EINVAL;
+
+	auth->handle = tpm_buf_read_u32(buf, &offset);
+	val = tpm_buf_read_u16(buf, &offset);
+	if (val != sizeof(auth->tpm_nonce))
+		return -EINVAL;
+	memcpy(auth->tpm_nonce, &buf->data[offset], sizeof(auth->tpm_nonce));
+	/* now compute the session key from the nonces */
+	tpm2_KDFa(auth->salt, sizeof(auth->salt), "ATH", auth->tpm_nonce,
+		  auth->our_nonce, sizeof(auth->session_key),
+		  auth->session_key);
+
+	return 0;
+}
+
+/**
+ * tpm2_start_auth_session() - create a HMAC authentication session with the TPM
+ * @chip: the TPM chip structure to create the session with
+ *
+ * This function loads the NULL seed from its saved context and starts
+ * an authentication session on the null seed, fills in the
+ * @chip->auth structure to contain all the session details necessary
+ * for performing the HMAC, encrypt and decrypt operations and
+ * returns.  The NULL seed is flushed before this function returns.
+ *
+ * Return: zero on success or actual error encountered.
+ */
+int tpm2_start_auth_session(struct tpm_chip *chip)
+{
+	struct tpm_buf buf;
+	struct tpm2_auth *auth = chip->auth;
+	int rc;
+	/* null seed context has no offset, but we must provide one */
+	unsigned int offset = 0;
+	u32 nullkey;
+
+	rc = tpm2_load_context(chip, chip->null_key_context, &offset,
+			       &nullkey);
+	if (rc)
+		goto out;
+
+	auth->session = TPM_HEADER_SIZE;
+
+	rc = tpm_buf_init(&buf, TPM2_ST_NO_SESSIONS, TPM2_CC_START_AUTH_SESS);
+	if (rc)
+		goto out;
+
+	/* salt key handle */
+	tpm_buf_append_u32(&buf, nullkey);
+	/* bind key handle */
+	tpm_buf_append_u32(&buf, TPM2_RH_NULL);
+	/* nonce caller */
+	get_random_bytes(auth->our_nonce, sizeof(auth->our_nonce));
+	tpm_buf_append_u16(&buf, sizeof(auth->our_nonce));
+	tpm_buf_append(&buf, auth->our_nonce, sizeof(auth->our_nonce));
+
+	/* append encrypted salt and squirrel away unencrypted in auth */
+	tpm_buf_append_salt(&buf, chip);
+	/* session type (HMAC, audit or policy) */
+	tpm_buf_append_u8(&buf, TPM2_SE_HMAC);
+
+	/* symmetric encryption parameters */
+	/* symmetric algorithm */
+	tpm_buf_append_u16(&buf, TPM_ALG_AES);
+	/* bits for symmetric algorithm */
+	tpm_buf_append_u16(&buf, AES_KEY_BITS);
+	/* symmetric algorithm mode (must be CFB) */
+	tpm_buf_append_u16(&buf, TPM_ALG_CFB);
+	/* hash algorithm for session */
+	tpm_buf_append_u16(&buf, TPM_ALG_SHA256);
+
+	rc = tpm_transmit_cmd(chip, &buf, 0, "start auth session");
+	tpm2_flush_context(chip, nullkey);
+
+	if (rc == TPM2_RC_SUCCESS)
+		rc = tpm2_parse_start_auth_session(auth, &buf);
+
+	tpm_buf_destroy(&buf);
+
+	if (rc)
+		goto out;
+
+ out:
+	return rc;
+}
+EXPORT_SYMBOL(tpm2_start_auth_session);
+
 /**
  * tpm2_parse_create_primary() - parse the data returned from TPM_CC_CREATE_PRIMARY
  *
@@ -423,5 +704,9 @@ int tpm2_sessions_init(struct tpm_chip *chip)
 	if (rc)
 		dev_err(&chip->dev, "TPM: security failed (NULL seed derivation): %d\n", rc);
 
+	chip->auth = kmalloc(sizeof(*chip->auth), GFP_KERNEL);
+	if (!chip->auth)
+		return -ENOMEM;
+
 	return rc;
 }
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index bc8c9a350e23..81b5a70ff80d 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -31,6 +31,14 @@
 struct tpm_chip;
 struct trusted_key_payload;
 struct trusted_key_options;
+/* opaque structure, holds auth session parameters like the session key */
+struct tpm2_auth;
+
+enum tpm2_session_types {
+	TPM2_SE_HMAC	= 0x00,
+	TPM2_SE_POLICY	= 0x01,
+	TPM2_SE_TRIAL	= 0x02,
+};
 
 /* if you add a new hash to this, increment TPM_MAX_HASHES below */
 enum tpm_algorithms {
@@ -203,6 +211,7 @@ struct tpm_chip {
 	u8 null_key_name[TPM2_NAME_SIZE];
 	u8 null_ec_key_x[EC_PT_SZ];
 	u8 null_ec_key_y[EC_PT_SZ];
+	struct tpm2_auth *auth;
 #endif
 };
 
@@ -266,6 +275,7 @@ enum tpm2_command_codes {
 	TPM2_CC_CONTEXT_LOAD	        = 0x0161,
 	TPM2_CC_CONTEXT_SAVE	        = 0x0162,
 	TPM2_CC_FLUSH_CONTEXT	        = 0x0165,
+	TPM2_CC_START_AUTH_SESS		= 0x0176,
 	TPM2_CC_VERIFY_SIGNATURE        = 0x0177,
 	TPM2_CC_GET_CAPABILITY	        = 0x017A,
 	TPM2_CC_GET_RANDOM	        = 0x017B,
@@ -349,16 +359,21 @@ struct tpm_buf {
 	u32 flags;
 	u32 length;
 	u8 *data;
+	u8 handles;
 };
 
 enum tpm2_object_attributes {
 	TPM2_OA_FIXED_TPM		= BIT(1),
+	TPM2_OA_ST_CLEAR		= BIT(2),
 	TPM2_OA_FIXED_PARENT		= BIT(4),
 	TPM2_OA_SENSITIVE_DATA_ORIGIN	= BIT(5),
 	TPM2_OA_USER_WITH_AUTH		= BIT(6),
+	TPM2_OA_ADMIN_WITH_POLICY	= BIT(7),
 	TPM2_OA_NO_DA			= BIT(10),
+	TPM2_OA_ENCRYPTED_DUPLICATION	= BIT(11),
 	TPM2_OA_RESTRICTED		= BIT(16),
 	TPM2_OA_DECRYPT			= BIT(17),
+	TPM2_OA_SIGN			= BIT(18),
 };
 
 /*
@@ -378,6 +393,11 @@ enum tpm2_object_attributes {
 
 enum tpm2_session_attributes {
 	TPM2_SA_CONTINUE_SESSION	= BIT(0),
+	TPM2_SA_AUDIT_EXCLUSIVE		= BIT(1),
+	TPM2_SA_AUDIT_RESET		= BIT(3),
+	TPM2_SA_DECRYPT			= BIT(5),
+	TPM2_SA_ENCRYPT			= BIT(6),
+	TPM2_SA_AUDIT			= BIT(7),
 };
 
 struct tpm2_hash {
@@ -469,4 +489,18 @@ static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle)
 {
 }
 #endif
+#ifdef CONFIG_TCG_TPM2_HMAC
+
+int tpm2_start_auth_session(struct tpm_chip *chip);
+void tpm2_end_auth_session(struct tpm_chip *chip);
+#else
+static inline int tpm2_start_auth_session(struct tpm_chip *chip)
+{
+	return 0;
+}
+static inline void tpm2_end_auth_session(struct tpm_chip *chip)
+{
+}
+#endif	/* CONFIG_TCG_TPM2_HMAC */
+
 #endif
-- 
cgit v1.2.3


From d0a25bb961e6e5650083a4f15768e3075f7d8db7 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:28:04 -0400
Subject: tpm: Add HMAC session name/handle append

Add tpm2_append_name() for appending to the handle area of the TPM
command.  When TPM_BUS_SECURITY is enabled and HMAC sessions are in
use this adds the standard u32 handle to the buffer but additionally
records the name of the object which must be used as part of the HMAC
computation.  The name of certain object types (volatile and permanent
handles and NV indexes) is a hash of the public area of the object.
Since this hash is not known ahead of time, it must be requested from
the TPM using TPM2_ReadPublic() (which cannot be HMAC protected, but
if an interposer lies about it, the HMAC check will fail and the
problem will be detected).

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> # crypto API parts
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm2-sessions.c | 129 +++++++++++++++++++++++++++++++++++++++
 include/linux/tpm.h              |  26 ++++++++
 2 files changed, 155 insertions(+)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 1c8d50a501d7..203a3b3cee67 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -49,6 +49,11 @@
  *	Under normal operation this function is done by
  *	tpm_buf_check_hmac_response(), so this is only to be used on
  *	error legs where the latter is not executed.
+ * tpm_buf_append_name() to add a handle to the buffer.  This must be
+ *	used in place of the usual tpm_buf_append_u32() for adding
+ *	handles because handles have to be processed specially when
+ *	calculating the HMAC.  In particular, for NV, volatile and
+ *	permanent objects you now need to provide the name.
  */
 
 #include "tpm.h"
@@ -60,6 +65,9 @@
 #include <crypto/hash.h>
 #include <crypto/hmac.h>
 
+/* maximum number of names the TPM must remember for authorization */
+#define AUTH_MAX_NAMES	3
+
 /*
  * This is the structure that carries all the auth information (like
  * session handle, nonces, session key and auth) from use to use it is
@@ -96,8 +104,31 @@ struct tpm2_auth {
 		u8 scratch[AES_KEY_BYTES + AES_BLOCK_SIZE];
 	};
 	u8 session_key[SHA256_DIGEST_SIZE];
+
+	/*
+	 * memory for three authorization handles.  We know them by
+	 * handle, but they are part of the session by name, which
+	 * we must compute and remember
+	 */
+	u32 name_h[AUTH_MAX_NAMES];
+	u8 name[AUTH_MAX_NAMES][2 + SHA512_DIGEST_SIZE];
 };
 
+/*
+ * Name Size based on TPM algorithm (assumes no hash bigger than 255)
+ */
+static u8 name_size(const u8 *name)
+{
+	static u8 size_map[] = {
+		[TPM_ALG_SHA1] = SHA1_DIGEST_SIZE,
+		[TPM_ALG_SHA256] = SHA256_DIGEST_SIZE,
+		[TPM_ALG_SHA384] = SHA384_DIGEST_SIZE,
+		[TPM_ALG_SHA512] = SHA512_DIGEST_SIZE,
+	};
+	u16 alg = get_unaligned_be16(name);
+	return size_map[alg] + 2;
+}
+
 /*
  * It turns out the crypto hmac(sha256) is hard for us to consume
  * because it assumes a fixed key and the TPM seems to change the key
@@ -277,6 +308,104 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
  out:
 	crypto_free_kpp(kpp);
 }
+
+static int tpm2_parse_read_public(char *name, struct tpm_buf *buf)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+	off_t offset = TPM_HEADER_SIZE;
+	u32 tot_len = be32_to_cpu(head->length);
+	u32 val;
+
+	/* we're starting after the header so adjust the length */
+	tot_len -= TPM_HEADER_SIZE;
+
+	/* skip public */
+	val = tpm_buf_read_u16(buf, &offset);
+	if (val > tot_len)
+		return -EINVAL;
+	offset += val;
+	/* name */
+	val = tpm_buf_read_u16(buf, &offset);
+	if (val != name_size(&buf->data[offset]))
+		return -EINVAL;
+	memcpy(name, &buf->data[offset], val);
+	/* forget the rest */
+	return 0;
+}
+
+static int tpm2_read_public(struct tpm_chip *chip, u32 handle, char *name)
+{
+	struct tpm_buf buf;
+	int rc;
+
+	rc = tpm_buf_init(&buf, TPM2_ST_NO_SESSIONS, TPM2_CC_READ_PUBLIC);
+	if (rc)
+		return rc;
+
+	tpm_buf_append_u32(&buf, handle);
+	rc = tpm_transmit_cmd(chip, &buf, 0, "read public");
+	if (rc == TPM2_RC_SUCCESS)
+		rc = tpm2_parse_read_public(name, &buf);
+
+	tpm_buf_destroy(&buf);
+
+	return rc;
+}
+
+/**
+ * tpm_buf_append_name() - add a handle area to the buffer
+ * @chip: the TPM chip structure
+ * @buf: The buffer to be appended
+ * @handle: The handle to be appended
+ * @name: The name of the handle (may be NULL)
+ *
+ * In order to compute session HMACs, we need to know the names of the
+ * objects pointed to by the handles.  For most objects, this is simply
+ * the actual 4 byte handle or an empty buf (in these cases @name
+ * should be NULL) but for volatile objects, permanent objects and NV
+ * areas, the name is defined as the hash (according to the name
+ * algorithm which should be set to sha256) of the public area to
+ * which the two byte algorithm id has been appended.  For these
+ * objects, the @name pointer should point to this.  If a name is
+ * required but @name is NULL, then TPM2_ReadPublic() will be called
+ * on the handle to obtain the name.
+ *
+ * As with most tpm_buf operations, success is assumed because failure
+ * will be caused by an incorrect programming model and indicated by a
+ * kernel message.
+ */
+void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
+			 u32 handle, u8 *name)
+{
+	enum tpm2_mso_type mso = tpm2_handle_mso(handle);
+	struct tpm2_auth *auth = chip->auth;
+	int slot;
+
+	slot = (tpm_buf_length(buf) - TPM_HEADER_SIZE)/4;
+	if (slot >= AUTH_MAX_NAMES) {
+		dev_err(&chip->dev, "TPM: too many handles\n");
+		return;
+	}
+	WARN(auth->session != tpm_buf_length(buf),
+	     "name added in wrong place\n");
+	tpm_buf_append_u32(buf, handle);
+	auth->session += 4;
+
+	if (mso == TPM2_MSO_PERSISTENT ||
+	    mso == TPM2_MSO_VOLATILE ||
+	    mso == TPM2_MSO_NVRAM) {
+		if (!name)
+			tpm2_read_public(chip, handle, auth->name[slot]);
+	} else {
+		if (name)
+			dev_err(&chip->dev, "TPM: Handle does not require name but one is specified\n");
+	}
+
+	auth->name_h[slot] = handle;
+	if (name)
+		memcpy(auth->name[slot], name, name_size(name));
+}
+EXPORT_SYMBOL(tpm_buf_append_name);
 /**
  * tpm2_end_auth_session() - kill the allocated auth session
  * @chip: the TPM chip structure
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 81b5a70ff80d..31c2065fcd35 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -275,6 +275,7 @@ enum tpm2_command_codes {
 	TPM2_CC_CONTEXT_LOAD	        = 0x0161,
 	TPM2_CC_CONTEXT_SAVE	        = 0x0162,
 	TPM2_CC_FLUSH_CONTEXT	        = 0x0165,
+	TPM2_CC_READ_PUBLIC		= 0x0173,
 	TPM2_CC_START_AUTH_SESS		= 0x0176,
 	TPM2_CC_VERIFY_SIGNATURE        = 0x0177,
 	TPM2_CC_GET_CAPABILITY	        = 0x017A,
@@ -292,6 +293,21 @@ enum tpm2_permanent_handles {
 	TPM2_RS_PW		= 0x40000009,
 };
 
+/* Most Significant Octet for key types  */
+enum tpm2_mso_type {
+	TPM2_MSO_NVRAM		= 0x01,
+	TPM2_MSO_SESSION	= 0x02,
+	TPM2_MSO_POLICY		= 0x03,
+	TPM2_MSO_PERMANENT	= 0x40,
+	TPM2_MSO_VOLATILE	= 0x80,
+	TPM2_MSO_PERSISTENT	= 0x81,
+};
+
+static inline enum tpm2_mso_type tpm2_handle_mso(u32 handle)
+{
+	return handle >> 24;
+}
+
 enum tpm2_capabilities {
 	TPM2_CAP_HANDLES	= 1,
 	TPM2_CAP_COMMANDS	= 2,
@@ -492,6 +508,8 @@ static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle)
 #ifdef CONFIG_TCG_TPM2_HMAC
 
 int tpm2_start_auth_session(struct tpm_chip *chip);
+void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
+			 u32 handle, u8 *name);
 void tpm2_end_auth_session(struct tpm_chip *chip);
 #else
 static inline int tpm2_start_auth_session(struct tpm_chip *chip)
@@ -501,6 +519,14 @@ static inline int tpm2_start_auth_session(struct tpm_chip *chip)
 static inline void tpm2_end_auth_session(struct tpm_chip *chip)
 {
 }
+static inline void tpm_buf_append_name(struct tpm_chip *chip,
+				       struct tpm_buf *buf,
+				       u32 handle, u8 *name)
+{
+	tpm_buf_append_u32(buf, handle);
+	/* count the number of handles in the upper bits of flags */
+	buf->handles++;
+}
 #endif	/* CONFIG_TCG_TPM2_HMAC */
 
 #endif
-- 
cgit v1.2.3


From 1085b8276bb4239daa7008f0dcd5c973e4bd690f Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:28:05 -0400
Subject: tpm: Add the rest of the session HMAC API

The final pieces of the HMAC API are for manipulating the session area
of the command.  To add an authentication HMAC session
tpm_buf_append_hmac_session() is called where tpm2_append_auth() would
go. If a non empty password is passed in, this is correctly added to
the HMAC to prove knowledge of it without revealing it.  Note that if
the session is only used to encrypt or decrypt parameters (no
authentication) then tpm_buf_append_hmac_session_opt() must be used
instead.  This functions identically to tpm_buf_append_hmac_session()
when TPM_BUS_SECURITY is enabled, but differently when it isn't,
because effectively nothing is appended to the session area.

Next the parameters should be filled in for the command and finally
tpm_buf_fill_hmac_session() is called immediately prior to transmitting
the command which computes the correct HMAC and places it in the
command at the session location in the tpm buffer

Finally, after tpm_transmit_cmd() is called,
tpm_buf_check_hmac_response() is called to check that the returned
HMAC matched and collect the new state for the next use of the
session, if any.

The features of the session are controlled by the session attributes
set in tpm_buf_append_hmac_session().  If TPM2_SA_CONTINUE_SESSION is
not specified, the session will be flushed and the tpm2_auth structure
freed in tpm_buf_check_hmac_response(); otherwise the session may be
used again.  Parameter encryption is specified by or'ing the flag
TPM2_SA_DECRYPT and response encryption by or'ing the flag
TPM2_SA_ENCRYPT.  the various encryptions will be taken care of by
tpm_buf_fill_hmac_session() and tpm_buf_check_hmac_response()
respectively.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> # crypto API parts
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm2-sessions.c | 400 +++++++++++++++++++++++++++++++++++++++
 include/linux/tpm.h              |  69 +++++++
 2 files changed, 469 insertions(+)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index 203a3b3cee67..dc50f635700c 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -54,6 +54,18 @@
  *	handles because handles have to be processed specially when
  *	calculating the HMAC.  In particular, for NV, volatile and
  *	permanent objects you now need to provide the name.
+ * tpm_buf_append_hmac_session() which appends the hmac session to the
+ *	buf in the same way tpm_buf_append_auth does().
+ * tpm_buf_fill_hmac_session() This calculates the correct hash and
+ *	places it in the buffer.  It must be called after the complete
+ *	command buffer is finalized so it can fill in the correct HMAC
+ *	based on the parameters.
+ * tpm_buf_check_hmac_response() which checks the session response in
+ *	the buffer and calculates what it should be.  If there's a
+ *	mismatch it will log a warning and return an error.  If
+ *	tpm_buf_append_hmac_session() did not specify
+ *	TPM_SA_CONTINUE_SESSION then the session will be closed (if it
+ *	hasn't been consumed) and the auth structure freed.
  */
 
 #include "tpm.h"
@@ -103,7 +115,23 @@ struct tpm2_auth {
 		/* scratch for key + IV */
 		u8 scratch[AES_KEY_BYTES + AES_BLOCK_SIZE];
 	};
+	/*
+	 * the session key and passphrase are the same size as the
+	 * name digest (sha256 again).  The session key is constant
+	 * for the use of the session and the passphrase can change
+	 * with every invocation.
+	 *
+	 * Note: these fields must be adjacent and in this order
+	 * because several HMAC/KDF schemes use the combination of the
+	 * session_key and passphrase.
+	 */
 	u8 session_key[SHA256_DIGEST_SIZE];
+	u8 passphrase[SHA256_DIGEST_SIZE];
+	int passphrase_len;
+	struct crypto_aes_ctx aes_ctx;
+	/* saved session attributes: */
+	u8 attrs;
+	__be32 ordinal;
 
 	/*
 	 * memory for three authorization handles.  We know them by
@@ -309,6 +337,230 @@ static void tpm_buf_append_salt(struct tpm_buf *buf, struct tpm_chip *chip)
 	crypto_free_kpp(kpp);
 }
 
+/**
+ * tpm_buf_append_hmac_session() - Append a TPM session element
+ * @chip: the TPM chip structure
+ * @buf: The buffer to be appended
+ * @attributes: The session attributes
+ * @passphrase: The session authority (NULL if none)
+ * @passphrase_len: The length of the session authority (0 if none)
+ *
+ * This fills in a session structure in the TPM command buffer, except
+ * for the HMAC which cannot be computed until the command buffer is
+ * complete.  The type of session is controlled by the @attributes,
+ * the main ones of which are TPM2_SA_CONTINUE_SESSION which means the
+ * session won't terminate after tpm_buf_check_hmac_response(),
+ * TPM2_SA_DECRYPT which means this buffers first parameter should be
+ * encrypted with a session key and TPM2_SA_ENCRYPT, which means the
+ * response buffer's first parameter needs to be decrypted (confusing,
+ * but the defines are written from the point of view of the TPM).
+ *
+ * Any session appended by this command must be finalized by calling
+ * tpm_buf_fill_hmac_session() otherwise the HMAC will be incorrect
+ * and the TPM will reject the command.
+ *
+ * As with most tpm_buf operations, success is assumed because failure
+ * will be caused by an incorrect programming model and indicated by a
+ * kernel message.
+ */
+void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
+				 u8 attributes, u8 *passphrase,
+				 int passphrase_len)
+{
+	u8 nonce[SHA256_DIGEST_SIZE];
+	u32 len;
+	struct tpm2_auth *auth = chip->auth;
+
+	/*
+	 * The Architecture Guide requires us to strip trailing zeros
+	 * before computing the HMAC
+	 */
+	while (passphrase && passphrase_len > 0
+	       && passphrase[passphrase_len - 1] == '\0')
+		passphrase_len--;
+
+	auth->attrs = attributes;
+	auth->passphrase_len = passphrase_len;
+	if (passphrase_len)
+		memcpy(auth->passphrase, passphrase, passphrase_len);
+
+	if (auth->session != tpm_buf_length(buf)) {
+		/* we're not the first session */
+		len = get_unaligned_be32(&buf->data[auth->session]);
+		if (4 + len + auth->session != tpm_buf_length(buf)) {
+			WARN(1, "session length mismatch, cannot append");
+			return;
+		}
+
+		/* add our new session */
+		len += 9 + 2 * SHA256_DIGEST_SIZE;
+		put_unaligned_be32(len, &buf->data[auth->session]);
+	} else {
+		tpm_buf_append_u32(buf, 9 + 2 * SHA256_DIGEST_SIZE);
+	}
+
+	/* random number for our nonce */
+	get_random_bytes(nonce, sizeof(nonce));
+	memcpy(auth->our_nonce, nonce, sizeof(nonce));
+	tpm_buf_append_u32(buf, auth->handle);
+	/* our new nonce */
+	tpm_buf_append_u16(buf, SHA256_DIGEST_SIZE);
+	tpm_buf_append(buf, nonce, SHA256_DIGEST_SIZE);
+	tpm_buf_append_u8(buf, auth->attrs);
+	/* and put a placeholder for the hmac */
+	tpm_buf_append_u16(buf, SHA256_DIGEST_SIZE);
+	tpm_buf_append(buf, nonce, SHA256_DIGEST_SIZE);
+}
+EXPORT_SYMBOL(tpm_buf_append_hmac_session);
+
+/**
+ * tpm_buf_fill_hmac_session() - finalize the session HMAC
+ * @chip: the TPM chip structure
+ * @buf: The buffer to be appended
+ *
+ * This command must not be called until all of the parameters have
+ * been appended to @buf otherwise the computed HMAC will be
+ * incorrect.
+ *
+ * This function computes and fills in the session HMAC using the
+ * session key and, if TPM2_SA_DECRYPT was specified, computes the
+ * encryption key and encrypts the first parameter of the command
+ * buffer with it.
+ *
+ * As with most tpm_buf operations, success is assumed because failure
+ * will be caused by an incorrect programming model and indicated by a
+ * kernel message.
+ */
+void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf)
+{
+	u32 cc, handles, val;
+	struct tpm2_auth *auth = chip->auth;
+	int i;
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+	off_t offset_s = TPM_HEADER_SIZE, offset_p;
+	u8 *hmac = NULL;
+	u32 attrs;
+	u8 cphash[SHA256_DIGEST_SIZE];
+	struct sha256_state sctx;
+
+	/* save the command code in BE format */
+	auth->ordinal = head->ordinal;
+
+	cc = be32_to_cpu(head->ordinal);
+
+	i = tpm2_find_cc(chip, cc);
+	if (i < 0) {
+		dev_err(&chip->dev, "Command 0x%x not found in TPM\n", cc);
+		return;
+	}
+	attrs = chip->cc_attrs_tbl[i];
+
+	handles = (attrs >> TPM2_CC_ATTR_CHANDLES) & GENMASK(2, 0);
+
+	/*
+	 * just check the names, it's easy to make mistakes.  This
+	 * would happen if someone added a handle via
+	 * tpm_buf_append_u32() instead of tpm_buf_append_name()
+	 */
+	for (i = 0; i < handles; i++) {
+		u32 handle = tpm_buf_read_u32(buf, &offset_s);
+
+		if (auth->name_h[i] != handle) {
+			dev_err(&chip->dev, "TPM: handle %d wrong for name\n",
+				  i);
+			return;
+		}
+	}
+	/* point offset_s to the start of the sessions */
+	val = tpm_buf_read_u32(buf, &offset_s);
+	/* point offset_p to the start of the parameters */
+	offset_p = offset_s + val;
+	for (i = 1; offset_s < offset_p; i++) {
+		u32 handle = tpm_buf_read_u32(buf, &offset_s);
+		u16 len;
+		u8 a;
+
+		/* nonce (already in auth) */
+		len = tpm_buf_read_u16(buf, &offset_s);
+		offset_s += len;
+
+		a = tpm_buf_read_u8(buf, &offset_s);
+
+		len = tpm_buf_read_u16(buf, &offset_s);
+		if (handle == auth->handle && auth->attrs == a) {
+			hmac = &buf->data[offset_s];
+			/*
+			 * save our session number so we know which
+			 * session in the response belongs to us
+			 */
+			auth->session = i;
+		}
+
+		offset_s += len;
+	}
+	if (offset_s != offset_p) {
+		dev_err(&chip->dev, "TPM session length is incorrect\n");
+		return;
+	}
+	if (!hmac) {
+		dev_err(&chip->dev, "TPM could not find HMAC session\n");
+		return;
+	}
+
+	/* encrypt before HMAC */
+	if (auth->attrs & TPM2_SA_DECRYPT) {
+		u16 len;
+
+		/* need key and IV */
+		tpm2_KDFa(auth->session_key, SHA256_DIGEST_SIZE
+			  + auth->passphrase_len, "CFB", auth->our_nonce,
+			  auth->tpm_nonce, AES_KEY_BYTES + AES_BLOCK_SIZE,
+			  auth->scratch);
+
+		len = tpm_buf_read_u16(buf, &offset_p);
+		aes_expandkey(&auth->aes_ctx, auth->scratch, AES_KEY_BYTES);
+		aescfb_encrypt(&auth->aes_ctx, &buf->data[offset_p],
+			       &buf->data[offset_p], len,
+			       auth->scratch + AES_KEY_BYTES);
+		/* reset p to beginning of parameters for HMAC */
+		offset_p -= 2;
+	}
+
+	sha256_init(&sctx);
+	/* ordinal is already BE */
+	sha256_update(&sctx, (u8 *)&head->ordinal, sizeof(head->ordinal));
+	/* add the handle names */
+	for (i = 0; i < handles; i++) {
+		enum tpm2_mso_type mso = tpm2_handle_mso(auth->name_h[i]);
+
+		if (mso == TPM2_MSO_PERSISTENT ||
+		    mso == TPM2_MSO_VOLATILE ||
+		    mso == TPM2_MSO_NVRAM) {
+			sha256_update(&sctx, auth->name[i],
+				      name_size(auth->name[i]));
+		} else {
+			__be32 h = cpu_to_be32(auth->name_h[i]);
+
+			sha256_update(&sctx, (u8 *)&h, 4);
+		}
+	}
+	if (offset_s != tpm_buf_length(buf))
+		sha256_update(&sctx, &buf->data[offset_s],
+			      tpm_buf_length(buf) - offset_s);
+	sha256_final(&sctx, cphash);
+
+	/* now calculate the hmac */
+	tpm2_hmac_init(&sctx, auth->session_key, sizeof(auth->session_key)
+		       + auth->passphrase_len);
+	sha256_update(&sctx, cphash, sizeof(cphash));
+	sha256_update(&sctx, auth->our_nonce, sizeof(auth->our_nonce));
+	sha256_update(&sctx, auth->tpm_nonce, sizeof(auth->tpm_nonce));
+	sha256_update(&sctx, &auth->attrs, 1);
+	tpm2_hmac_final(&sctx, auth->session_key, sizeof(auth->session_key)
+			+ auth->passphrase_len, hmac);
+}
+EXPORT_SYMBOL(tpm_buf_fill_hmac_session);
+
 static int tpm2_parse_read_public(char *name, struct tpm_buf *buf)
 {
 	struct tpm_header *head = (struct tpm_header *)buf->data;
@@ -406,6 +658,154 @@ void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
 		memcpy(auth->name[slot], name, name_size(name));
 }
 EXPORT_SYMBOL(tpm_buf_append_name);
+
+/**
+ * tpm_buf_check_hmac_response() - check the TPM return HMAC for correctness
+ * @chip: the TPM chip structure
+ * @buf: the original command buffer (which now contains the response)
+ * @rc: the return code from tpm_transmit_cmd
+ *
+ * If @rc is non zero, @buf may not contain an actual return, so @rc
+ * is passed through as the return and the session cleaned up and
+ * de-allocated if required (this is required if
+ * TPM2_SA_CONTINUE_SESSION was not specified as a session flag).
+ *
+ * If @rc is zero, the response HMAC is computed against the returned
+ * @buf and matched to the TPM one in the session area.  If there is a
+ * mismatch, an error is logged and -EINVAL returned.
+ *
+ * The reason for this is that the command issue and HMAC check
+ * sequence should look like:
+ *
+ *	rc = tpm_transmit_cmd(...);
+ *	rc = tpm_buf_check_hmac_response(&buf, auth, rc);
+ *	if (rc)
+ *		...
+ *
+ * Which is easily layered into the current contrl flow.
+ *
+ * Returns: 0 on success or an error.
+ */
+int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
+				int rc)
+{
+	struct tpm_header *head = (struct tpm_header *)buf->data;
+	struct tpm2_auth *auth = chip->auth;
+	off_t offset_s, offset_p;
+	u8 rphash[SHA256_DIGEST_SIZE];
+	u32 attrs;
+	struct sha256_state sctx;
+	u16 tag = be16_to_cpu(head->tag);
+	u32 cc = be32_to_cpu(auth->ordinal);
+	int parm_len, len, i, handles;
+
+	if (auth->session >= TPM_HEADER_SIZE) {
+		WARN(1, "tpm session not filled correctly\n");
+		goto out;
+	}
+
+	if (rc != 0)
+		/* pass non success rc through and close the session */
+		goto out;
+
+	rc = -EINVAL;
+	if (tag != TPM2_ST_SESSIONS) {
+		dev_err(&chip->dev, "TPM: HMAC response check has no sessions tag\n");
+		goto out;
+	}
+
+	i = tpm2_find_cc(chip, cc);
+	if (i < 0)
+		goto out;
+	attrs = chip->cc_attrs_tbl[i];
+	handles = (attrs >> TPM2_CC_ATTR_RHANDLE) & 1;
+
+	/* point to area beyond handles */
+	offset_s = TPM_HEADER_SIZE + handles * 4;
+	parm_len = tpm_buf_read_u32(buf, &offset_s);
+	offset_p = offset_s;
+	offset_s += parm_len;
+	/* skip over any sessions before ours */
+	for (i = 0; i < auth->session - 1; i++) {
+		len = tpm_buf_read_u16(buf, &offset_s);
+		offset_s += len + 1;
+		len = tpm_buf_read_u16(buf, &offset_s);
+		offset_s += len;
+	}
+	/* TPM nonce */
+	len = tpm_buf_read_u16(buf, &offset_s);
+	if (offset_s + len > tpm_buf_length(buf))
+		goto out;
+	if (len != SHA256_DIGEST_SIZE)
+		goto out;
+	memcpy(auth->tpm_nonce, &buf->data[offset_s], len);
+	offset_s += len;
+	attrs = tpm_buf_read_u8(buf, &offset_s);
+	len = tpm_buf_read_u16(buf, &offset_s);
+	if (offset_s + len != tpm_buf_length(buf))
+		goto out;
+	if (len != SHA256_DIGEST_SIZE)
+		goto out;
+	/*
+	 * offset_s points to the HMAC. now calculate comparison, beginning
+	 * with rphash
+	 */
+	sha256_init(&sctx);
+	/* yes, I know this is now zero, but it's what the standard says */
+	sha256_update(&sctx, (u8 *)&head->return_code,
+		      sizeof(head->return_code));
+	/* ordinal is already BE */
+	sha256_update(&sctx, (u8 *)&auth->ordinal, sizeof(auth->ordinal));
+	sha256_update(&sctx, &buf->data[offset_p], parm_len);
+	sha256_final(&sctx, rphash);
+
+	/* now calculate the hmac */
+	tpm2_hmac_init(&sctx, auth->session_key, sizeof(auth->session_key)
+		       + auth->passphrase_len);
+	sha256_update(&sctx, rphash, sizeof(rphash));
+	sha256_update(&sctx, auth->tpm_nonce, sizeof(auth->tpm_nonce));
+	sha256_update(&sctx, auth->our_nonce, sizeof(auth->our_nonce));
+	sha256_update(&sctx, &auth->attrs, 1);
+	/* we're done with the rphash, so put our idea of the hmac there */
+	tpm2_hmac_final(&sctx, auth->session_key, sizeof(auth->session_key)
+			+ auth->passphrase_len, rphash);
+	if (memcmp(rphash, &buf->data[offset_s], SHA256_DIGEST_SIZE) == 0) {
+		rc = 0;
+	} else {
+		dev_err(&chip->dev, "TPM: HMAC check failed\n");
+		goto out;
+	}
+
+	/* now do response decryption */
+	if (auth->attrs & TPM2_SA_ENCRYPT) {
+		/* need key and IV */
+		tpm2_KDFa(auth->session_key, SHA256_DIGEST_SIZE
+			  + auth->passphrase_len, "CFB", auth->tpm_nonce,
+			  auth->our_nonce, AES_KEY_BYTES + AES_BLOCK_SIZE,
+			  auth->scratch);
+
+		len = tpm_buf_read_u16(buf, &offset_p);
+		aes_expandkey(&auth->aes_ctx, auth->scratch, AES_KEY_BYTES);
+		aescfb_decrypt(&auth->aes_ctx, &buf->data[offset_p],
+			       &buf->data[offset_p], len,
+			       auth->scratch + AES_KEY_BYTES);
+	}
+
+ out:
+	if ((auth->attrs & TPM2_SA_CONTINUE_SESSION) == 0) {
+		if (rc)
+			/* manually close the session if it wasn't consumed */
+			tpm2_flush_context(chip, auth->handle);
+		memzero_explicit(auth, sizeof(*auth));
+	} else {
+		/* reset for next use  */
+		auth->session = TPM_HEADER_SIZE;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(tpm_buf_check_hmac_response);
+
 /**
  * tpm2_end_auth_session() - kill the allocated auth session
  * @chip: the TPM chip structure
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index 31c2065fcd35..dd4d6a6158c4 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -510,8 +510,25 @@ static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle)
 int tpm2_start_auth_session(struct tpm_chip *chip);
 void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf,
 			 u32 handle, u8 *name);
+void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf,
+				 u8 attributes, u8 *passphrase,
+				 int passphraselen);
+static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
+						   struct tpm_buf *buf,
+						   u8 attributes,
+						   u8 *passphrase,
+						   int passphraselen)
+{
+	tpm_buf_append_hmac_session(chip, buf, attributes, passphrase,
+				    passphraselen);
+}
+void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf);
+int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf,
+				int rc);
 void tpm2_end_auth_session(struct tpm_chip *chip);
 #else
+#include <asm/unaligned.h>
+
 static inline int tpm2_start_auth_session(struct tpm_chip *chip)
 {
 	return 0;
@@ -527,6 +544,58 @@ static inline void tpm_buf_append_name(struct tpm_chip *chip,
 	/* count the number of handles in the upper bits of flags */
 	buf->handles++;
 }
+static inline void tpm_buf_append_hmac_session(struct tpm_chip *chip,
+					       struct tpm_buf *buf,
+					       u8 attributes, u8 *passphrase,
+					       int passphraselen)
+{
+	/* offset tells us where the sessions area begins */
+	int offset = buf->handles * 4 + TPM_HEADER_SIZE;
+	u32 len = 9 + passphraselen;
+
+	if (tpm_buf_length(buf) != offset) {
+		/* not the first session so update the existing length */
+		len += get_unaligned_be32(&buf->data[offset]);
+		put_unaligned_be32(len, &buf->data[offset]);
+	} else {
+		tpm_buf_append_u32(buf, len);
+	}
+	/* auth handle */
+	tpm_buf_append_u32(buf, TPM2_RS_PW);
+	/* nonce */
+	tpm_buf_append_u16(buf, 0);
+	/* attributes */
+	tpm_buf_append_u8(buf, 0);
+	/* passphrase */
+	tpm_buf_append_u16(buf, passphraselen);
+	tpm_buf_append(buf, passphrase, passphraselen);
+}
+static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip,
+						   struct tpm_buf *buf,
+						   u8 attributes,
+						   u8 *passphrase,
+						   int passphraselen)
+{
+	int offset = buf->handles * 4 + TPM_HEADER_SIZE;
+	struct tpm_header *head = (struct tpm_header *) buf->data;
+
+	/*
+	 * if the only sessions are optional, the command tag
+	 * must change to TPM2_ST_NO_SESSIONS
+	 */
+	if (tpm_buf_length(buf) == offset)
+		head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS);
+}
+static inline void tpm_buf_fill_hmac_session(struct tpm_chip *chip,
+					     struct tpm_buf *buf)
+{
+}
+static inline int tpm_buf_check_hmac_response(struct tpm_chip *chip,
+					      struct tpm_buf *buf,
+					      int rc)
+{
+	return rc;
+}
 #endif	/* CONFIG_TCG_TPM2_HMAC */
 
 #endif
-- 
cgit v1.2.3


From eb24c9788cd90db397b3e41322aff4a5557623b4 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Mon, 29 Apr 2024 16:28:11 -0400
Subject: tpm: disable the TPM if NULL name changes

Update tpm2_load_context() to return -EINVAL on integrity failures and
use this as a signal when loading the NULL context that something
might be wrong.  If the signal fails, check the name of the NULL
primary against the one stored in the chip data and if there is a
mismatch disable the TPM because it is likely to have suffered a reset
attack.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
---
 drivers/char/tpm/tpm-chip.c      |  3 ++
 drivers/char/tpm/tpm2-sessions.c | 77 +++++++++++++++++++++++++++++++---------
 drivers/char/tpm/tpm2-space.c    |  3 ++
 include/linux/tpm.h              |  4 ++-
 4 files changed, 70 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index d93937326b2e..854546000c92 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -158,6 +158,9 @@ int tpm_try_get_ops(struct tpm_chip *chip)
 {
 	int rc = -EIO;
 
+	if (chip->flags & TPM_CHIP_FLAG_DISABLE)
+		return rc;
+
 	get_device(&chip->dev);
 
 	down_read(&chip->ops_sem);
diff --git a/drivers/char/tpm/tpm2-sessions.c b/drivers/char/tpm/tpm2-sessions.c
index dc50f635700c..ea8860661876 100644
--- a/drivers/char/tpm/tpm2-sessions.c
+++ b/drivers/char/tpm/tpm2-sessions.c
@@ -80,6 +80,9 @@
 /* maximum number of names the TPM must remember for authorization */
 #define AUTH_MAX_NAMES	3
 
+static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy,
+			       u32 *handle, u8 *name);
+
 /*
  * This is the structure that carries all the auth information (like
  * session handle, nonces, session key and auth) from use to use it is
@@ -851,6 +854,37 @@ static int tpm2_parse_start_auth_session(struct tpm2_auth *auth,
 	return 0;
 }
 
+static int tpm2_load_null(struct tpm_chip *chip, u32 *null_key)
+{
+	int rc;
+	unsigned int offset = 0; /* dummy offset for null seed context */
+	u8 name[SHA256_DIGEST_SIZE + 2];
+
+	rc = tpm2_load_context(chip, chip->null_key_context, &offset,
+			       null_key);
+	if (rc != -EINVAL)
+		return rc;
+
+	/* an integrity failure may mean the TPM has been reset */
+	dev_err(&chip->dev, "NULL key integrity failure!\n");
+	/* check the null name against what we know */
+	tpm2_create_primary(chip, TPM2_RH_NULL, NULL, name);
+	if (memcmp(name, chip->null_key_name, sizeof(name)) == 0)
+		/* name unchanged, assume transient integrity failure */
+		return rc;
+	/*
+	 * Fatal TPM failure: the NULL seed has actually changed, so
+	 * the TPM must have been illegally reset.  All in-kernel TPM
+	 * operations will fail because the NULL primary can't be
+	 * loaded to salt the sessions, but disable the TPM anyway so
+	 * userspace programmes can't be compromised by it.
+	 */
+	dev_err(&chip->dev, "NULL name has changed, disabling TPM due to interference\n");
+	chip->flags |= TPM_CHIP_FLAG_DISABLE;
+
+	return rc;
+}
+
 /**
  * tpm2_start_auth_session() - create a HMAC authentication session with the TPM
  * @chip: the TPM chip structure to create the session with
@@ -868,12 +902,9 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
 	struct tpm_buf buf;
 	struct tpm2_auth *auth = chip->auth;
 	int rc;
-	/* null seed context has no offset, but we must provide one */
-	unsigned int offset = 0;
-	u32 nullkey;
+	u32 null_key;
 
-	rc = tpm2_load_context(chip, chip->null_key_context, &offset,
-			       &nullkey);
+	rc = tpm2_load_null(chip, &null_key);
 	if (rc)
 		goto out;
 
@@ -884,7 +915,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
 		goto out;
 
 	/* salt key handle */
-	tpm_buf_append_u32(&buf, nullkey);
+	tpm_buf_append_u32(&buf, null_key);
 	/* bind key handle */
 	tpm_buf_append_u32(&buf, TPM2_RH_NULL);
 	/* nonce caller */
@@ -908,7 +939,7 @@ int tpm2_start_auth_session(struct tpm_chip *chip)
 	tpm_buf_append_u16(&buf, TPM_ALG_SHA256);
 
 	rc = tpm_transmit_cmd(chip, &buf, 0, "start auth session");
-	tpm2_flush_context(chip, nullkey);
+	tpm2_flush_context(chip, null_key);
 
 	if (rc == TPM2_RC_SUCCESS)
 		rc = tpm2_parse_start_auth_session(auth, &buf);
@@ -930,6 +961,7 @@ EXPORT_SYMBOL(tpm2_start_auth_session);
  * @buf:	The response buffer from the chip
  * @handle:	pointer to be filled in with the return handle of the primary
  * @hierarchy:	The hierarchy the primary was created for
+ * @name:	pointer to be filled in with the primary key name
  *
  * Return:
  * * 0		- OK
@@ -937,15 +969,20 @@ EXPORT_SYMBOL(tpm2_start_auth_session);
  * * TPM_RC	- A TPM error
  */
 static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf,
-				     u32 *handle, u32 hierarchy)
+				     u32 *handle, u32 hierarchy, u8 *name)
 {
 	struct tpm_header *head = (struct tpm_header *)buf->data;
 	off_t offset_r = TPM_HEADER_SIZE, offset_t;
 	u16 len = TPM_HEADER_SIZE;
 	u32 total_len = be32_to_cpu(head->length);
-	u32 val, param_len;
+	u32 val, param_len, keyhandle;
+
+	keyhandle = tpm_buf_read_u32(buf, &offset_r);
+	if (handle)
+		*handle = keyhandle;
+	else
+		tpm2_flush_context(chip, keyhandle);
 
-	*handle = tpm_buf_read_u32(buf, &offset_r);
 	param_len = tpm_buf_read_u32(buf, &offset_r);
 	/*
 	 * param_len doesn't include the header, but all the other
@@ -958,9 +995,14 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf,
 		return -EINVAL;
 	len = tpm_buf_read_u16(buf, &offset_r);
 	offset_t = offset_r;
-	/* now we have the public area, compute the name of the object */
-	put_unaligned_be16(TPM_ALG_SHA256, chip->null_key_name);
-	sha256(&buf->data[offset_r], len, chip->null_key_name + 2);
+	if (name) {
+		/*
+		 * now we have the public area, compute the name of
+		 * the object
+		 */
+		put_unaligned_be16(TPM_ALG_SHA256, name);
+		sha256(&buf->data[offset_r], len, name + 2);
+	}
 
 	/* validate the public key */
 	val = tpm_buf_read_u16(buf, &offset_t);
@@ -1089,6 +1131,7 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf,
  * @chip:      the TPM chip to create under
  * @hierarchy: The hierarchy handle to create under
  * @handle:    The returned volatile handle on success
+ * @name:      The name of the returned key
  *
  * For platforms that might not have a persistent primary, this can be
  * used to create one quickly on the fly (it uses Elliptic Curve not
@@ -1103,7 +1146,7 @@ static int tpm2_parse_create_primary(struct tpm_chip *chip, struct tpm_buf *buf,
  * * TPM_RC	- A TPM error
  */
 static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy,
-			       u32 *handle)
+			       u32 *handle, u8 *name)
 {
 	int rc;
 	struct tpm_buf buf;
@@ -1193,7 +1236,8 @@ static int tpm2_create_primary(struct tpm_chip *chip, u32 hierarchy,
 			      "attempting to create NULL primary");
 
 	if (rc == TPM2_RC_SUCCESS)
-		rc = tpm2_parse_create_primary(chip, &buf, handle, hierarchy);
+		rc = tpm2_parse_create_primary(chip, &buf, handle, hierarchy,
+					       name);
 
 	tpm_buf_destroy(&buf);
 
@@ -1205,7 +1249,8 @@ static int tpm2_create_null_primary(struct tpm_chip *chip)
 	u32 null_key;
 	int rc;
 
-	rc = tpm2_create_primary(chip, TPM2_RH_NULL, &null_key);
+	rc = tpm2_create_primary(chip, TPM2_RH_NULL, &null_key,
+				 chip->null_key_name);
 
 	if (rc == TPM2_RC_SUCCESS) {
 		unsigned int offset = 0; /* dummy offset for null key context */
diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c
index 24479a81c23c..4892d491da8d 100644
--- a/drivers/char/tpm/tpm2-space.c
+++ b/drivers/char/tpm/tpm2-space.c
@@ -105,6 +105,9 @@ int tpm2_load_context(struct tpm_chip *chip, u8 *buf,
 		*handle = 0;
 		tpm_buf_destroy(&tbuf);
 		return -ENOENT;
+	} else if (tpm2_rc_value(rc) == TPM2_RC_INTEGRITY) {
+		tpm_buf_destroy(&tbuf);
+		return -EINVAL;
 	} else if (rc > 0) {
 		dev_warn(&chip->dev, "%s: failed with a TPM error 0x%04X\n",
 			 __func__, rc);
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index dd4d6a6158c4..c17e4efbb2e5 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -248,6 +248,7 @@ enum tpm2_return_codes {
 	TPM2_RC_SUCCESS		= 0x0000,
 	TPM2_RC_HASH		= 0x0083, /* RC_FMT1 */
 	TPM2_RC_HANDLE		= 0x008B,
+	TPM2_RC_INTEGRITY	= 0x009F,
 	TPM2_RC_INITIALIZE	= 0x0100, /* RC_VER1 */
 	TPM2_RC_FAILURE		= 0x0101,
 	TPM2_RC_DISABLED	= 0x0120,
@@ -346,6 +347,7 @@ enum tpm_chip_flags {
 	TPM_CHIP_FLAG_FIRMWARE_UPGRADE		= BIT(7),
 	TPM_CHIP_FLAG_SUSPENDED			= BIT(8),
 	TPM_CHIP_FLAG_HWRNG_DISABLED		= BIT(9),
+	TPM_CHIP_FLAG_DISABLE			= BIT(10),
 };
 
 #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev)
@@ -447,7 +449,7 @@ static inline bool tpm_is_firmware_upgrade(struct tpm_chip *chip)
 
 static inline u32 tpm2_rc_value(u32 rc)
 {
-	return (rc & BIT(7)) ? rc & 0xff : rc;
+	return (rc & BIT(7)) ? rc & 0xbf : rc;
 }
 
 #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE)
-- 
cgit v1.2.3


From 383eed2de529287337d9153a5084d4291a7c69f2 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 7 May 2024 16:41:40 +0000
Subject: tcp: get rid of twsk_unique()

DCCP is going away soon, and had no twsk_unique() method.

We can directly call tcp_twsk_unique() for TCP sockets.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20240507164140.940547-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/timewait_sock.h | 9 ---------
 net/ipv4/inet_hashtables.c  | 3 ++-
 net/ipv4/tcp_ipv4.c         | 1 -
 net/ipv6/inet6_hashtables.c | 4 +++-
 net/ipv6/tcp_ipv6.c         | 1 -
 5 files changed, 5 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h
index 74d2b463cc95..62b3e9f2aed4 100644
--- a/include/net/timewait_sock.h
+++ b/include/net/timewait_sock.h
@@ -15,18 +15,9 @@ struct timewait_sock_ops {
 	struct kmem_cache	*twsk_slab;
 	char		*twsk_slab_name;
 	unsigned int	twsk_obj_size;
-	int		(*twsk_unique)(struct sock *sk,
-				       struct sock *sktw, void *twp);
 	void		(*twsk_destructor)(struct sock *sk);
 };
 
-static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
-{
-	if (sk->sk_prot->twsk_prot->twsk_unique != NULL)
-		return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp);
-	return 0;
-}
-
 static inline void twsk_destructor(struct sock *sk)
 {
 	if (sk->sk_prot->twsk_prot->twsk_destructor != NULL)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index cf88eca5f1b4..48d0d494185b 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -565,7 +565,8 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
 		if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
 			if (sk2->sk_state == TCP_TIME_WAIT) {
 				tw = inet_twsk(sk2);
-				if (twsk_unique(sk, sk2, twp))
+				if (sk->sk_protocol == IPPROTO_TCP &&
+				    tcp_twsk_unique(sk, sk2, twp))
 					break;
 			}
 			goto not_unique;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95e3d28b83b8..108a438dc247 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2437,7 +2437,6 @@ do_time_wait:
 
 static struct timewait_sock_ops tcp_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
-	.twsk_unique	= tcp_twsk_unique,
 	.twsk_destructor= tcp_twsk_destructor,
 };
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 2e81383b663b..6db71bb1cd30 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -21,6 +21,7 @@
 #include <net/secure_seq.h>
 #include <net/ip.h>
 #include <net/sock_reuseport.h>
+#include <net/tcp.h>
 
 u32 inet6_ehashfn(const struct net *net,
 		  const struct in6_addr *laddr, const u16 lport,
@@ -289,7 +290,8 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 				       dif, sdif))) {
 			if (sk2->sk_state == TCP_TIME_WAIT) {
 				tw = inet_twsk(sk2);
-				if (twsk_unique(sk, sk2, twp))
+				if (sk->sk_protocol == IPPROTO_TCP &&
+				    tcp_twsk_unique(sk, sk2, twp))
 					break;
 			}
 			goto not_unique;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 37201c4fb393..7f6693e794bd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2049,7 +2049,6 @@ void tcp_v6_early_demux(struct sk_buff *skb)
 
 static struct timewait_sock_ops tcp6_timewait_sock_ops = {
 	.twsk_obj_size	= sizeof(struct tcp6_timewait_sock),
-	.twsk_unique	= tcp_twsk_unique,
 	.twsk_destructor = tcp_twsk_destructor,
 };
 
-- 
cgit v1.2.3


From c62b758bae6af16fee94f556091fa74883a96b1e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 9 May 2024 13:04:24 +0200
Subject: fcntl: add F_DUPFD_QUERY fcntl()

Often userspace needs to know whether two file descriptors refer to the
same struct file. For example, systemd uses this to filter out duplicate
file descriptors in it's file descriptor store (cf. [1]) and vulkan uses
it to compare dma-buf fds (cf. [2]).

The only api we provided for this was kcmp() but that's not generally
available or might be disallowed because it is way more powerful (allows
ordering of file pointers, operates on non-current task) etc. So give
userspace a simple way of comparing two file descriptors for sameness
adding a new fcntl() F_DUDFD_QUERY.

Link: https://github.com/systemd/systemd/blob/a4f0e0da3573a10bc5404142be8799418760b1d1/src/basic/fd-util.c#L517 [1]
Link: https://gitlab.freedesktop.org/wlroots/wlroots/-/blob/master/render/vulkan/texture.c#L490 [2]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[brauner: commit message]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fcntl.c                 | 20 ++++++++++++++++++++
 include/uapi/linux/fcntl.h | 14 ++++++++------
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 54cc85d3338e..300e5d9ad913 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -327,6 +327,22 @@ static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
 	return 0;
 }
 
+/* Is the file descriptor a dup of the file? */
+static long f_dupfd_query(int fd, struct file *filp)
+{
+	CLASS(fd_raw, f)(fd);
+
+	/*
+	 * We can do the 'fdput()' immediately, as the only thing that
+	 * matters is the pointer value which isn't changed by the fdput.
+	 *
+	 * Technically we didn't need a ref at all, and 'fdget()' was
+	 * overkill, but given our lockless file pointer lookup, the
+	 * alternatives are complicated.
+	 */
+	return f.file == filp;
+}
+
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		struct file *filp)
 {
@@ -342,6 +358,9 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	case F_DUPFD_CLOEXEC:
 		err = f_dupfd(argi, filp, O_CLOEXEC);
 		break;
+	case F_DUPFD_QUERY:
+		err = f_dupfd_query(argi, filp);
+		break;
 	case F_GETFD:
 		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
 		break;
@@ -446,6 +465,7 @@ static int check_fcntl_cmd(unsigned cmd)
 	switch (cmd) {
 	case F_DUPFD:
 	case F_DUPFD_CLOEXEC:
+	case F_DUPFD_QUERY:
 	case F_GETFD:
 	case F_SETFD:
 	case F_GETFL:
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 282e90aeb163..c0bcc185fa48 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -8,6 +8,14 @@
 #define F_SETLEASE	(F_LINUX_SPECIFIC_BASE + 0)
 #define F_GETLEASE	(F_LINUX_SPECIFIC_BASE + 1)
 
+/*
+ * Request nofications on a directory.
+ * See below for events that may be notified.
+ */
+#define F_NOTIFY	(F_LINUX_SPECIFIC_BASE + 2)
+
+#define F_DUPFD_QUERY	(F_LINUX_SPECIFIC_BASE + 3)
+
 /*
  * Cancel a blocking posix lock; internal use only until we expose an
  * asynchronous lock api to userspace:
@@ -17,12 +25,6 @@
 /* Create a file descriptor with FD_CLOEXEC set. */
 #define F_DUPFD_CLOEXEC	(F_LINUX_SPECIFIC_BASE + 6)
 
-/*
- * Request nofications on a directory.
- * See below for events that may be notified.
- */
-#define F_NOTIFY	(F_LINUX_SPECIFIC_BASE+2)
-
 /*
  * Set and get of pipe page size array
  */
-- 
cgit v1.2.3


From fa23e0d4b756d25829e124d6b670a4c6bbd4bf7e Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 8 May 2024 14:52:47 +0200
Subject: netfilter: nf_tables: allow clone callbacks to sleep

Sven Auhagen reports transaction failures with following error:
  ./main.nft:13:1-26: Error: Could not process rule: Cannot allocate memory
  percpu: allocation failed, size=16 align=8 atomic=1, atomic alloc failed, no space left

This points to failing pcpu allocation with GFP_ATOMIC flag.
However, transactions happen from user context and are allowed to sleep.

One case where we can call into percpu allocator with GFP_ATOMIC is
nft_counter expression.

Normally this happens from control plane, so this could use GFP_KERNEL
instead.  But one use case, element insertion from packet path,
needs to use GFP_ATOMIC allocations (nft_dynset expression).

At this time, .clone callbacks always use GFP_ATOMIC for this reason.

Add gfp_t argument to the .clone function and pass GFP_KERNEL or
GFP_ATOMIC flag depending on context, this allows all clone memory
allocations to sleep for the normal (transaction) case.

Cc: Sven Auhagen <sven.auhagen@voleatech.de>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  4 ++--
 net/netfilter/nf_tables_api.c     |  8 ++++----
 net/netfilter/nft_connlimit.c     |  4 ++--
 net/netfilter/nft_counter.c       |  4 ++--
 net/netfilter/nft_dynset.c        |  2 +-
 net/netfilter/nft_last.c          |  4 ++--
 net/netfilter/nft_limit.c         | 14 ++++++++------
 net/netfilter/nft_quota.c         |  4 ++--
 8 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3f1ed467f951..2796153b03da 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -416,7 +416,7 @@ struct nft_expr_info;
 
 int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
 			 struct nft_expr_info *info);
-int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src);
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp);
 void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
 int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
 		  const struct nft_expr *expr, bool reset);
@@ -935,7 +935,7 @@ struct nft_expr_ops {
 						struct nft_regs *regs,
 						const struct nft_pktinfo *pkt);
 	int				(*clone)(struct nft_expr *dst,
-						 const struct nft_expr *src);
+						 const struct nft_expr *src, gfp_t gfp);
 	unsigned int			size;
 
 	int				(*init)(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index a7f54eb68d9a..be3b4c90d2ed 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3333,7 +3333,7 @@ err_expr_parse:
 	return ERR_PTR(err);
 }
 
-int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
 {
 	int err;
 
@@ -3341,7 +3341,7 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
 		return -EINVAL;
 
 	dst->ops = src->ops;
-	err = src->ops->clone(dst, src);
+	err = src->ops->clone(dst, src, gfp);
 	if (err < 0)
 		return err;
 
@@ -6525,7 +6525,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
 		if (!expr)
 			goto err_expr;
 
-		err = nft_expr_clone(expr, set->exprs[i]);
+		err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT);
 		if (err < 0) {
 			kfree(expr);
 			goto err_expr;
@@ -6564,7 +6564,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
 
 	for (i = 0; i < num_exprs; i++) {
 		expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
-		err = nft_expr_clone(expr, expr_array[i]);
+		err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
 		if (err < 0)
 			goto err_elem_expr_setup;
 
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index de9d1980df69..92b984fa8175 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -210,12 +210,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx,
 	nft_connlimit_do_destroy(ctx, priv);
 }
 
-static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
 {
 	struct nft_connlimit *priv_dst = nft_expr_priv(dst);
 	struct nft_connlimit *priv_src = nft_expr_priv(src);
 
-	priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
+	priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp);
 	if (!priv_dst->list)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index dccc68a5135a..291ed2026367 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -226,7 +226,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
 	nft_counter_do_destroy(priv);
 }
 
-static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
 {
 	struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
 	struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
@@ -236,7 +236,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
 
 	nft_counter_fetch(priv, &total);
 
-	cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
+	cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp);
 	if (cpu_stats == NULL)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index c09dba57354c..b4ada3ab2167 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv,
 
 	for (i = 0; i < priv->num_exprs; i++) {
 		expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
-		if (nft_expr_clone(expr, priv->expr_array[i]) < 0)
+		if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0)
 			return -1;
 
 		elem_expr->size += priv->expr_array[i]->ops->size;
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 8e6d7eaf9dc8..de1b6066bfa8 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -102,12 +102,12 @@ static void nft_last_destroy(const struct nft_ctx *ctx,
 	kfree(priv->last);
 }
 
-static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
 {
 	struct nft_last_priv *priv_dst = nft_expr_priv(dst);
 	struct nft_last_priv *priv_src = nft_expr_priv(src);
 
-	priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
+	priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp);
 	if (!priv_dst->last)
 		return -ENOMEM;
 
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index cefa25e0dbb0..21d26b79b460 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -150,7 +150,7 @@ static void nft_limit_destroy(const struct nft_ctx *ctx,
 }
 
 static int nft_limit_clone(struct nft_limit_priv *priv_dst,
-			   const struct nft_limit_priv *priv_src)
+			   const struct nft_limit_priv *priv_src, gfp_t gfp)
 {
 	priv_dst->tokens_max = priv_src->tokens_max;
 	priv_dst->rate = priv_src->rate;
@@ -158,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst,
 	priv_dst->burst = priv_src->burst;
 	priv_dst->invert = priv_src->invert;
 
-	priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC);
+	priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp);
 	if (!priv_dst->limit)
 		return -ENOMEM;
 
@@ -223,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
 	nft_limit_destroy(ctx, &priv->limit);
 }
 
-static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src,
+				gfp_t gfp)
 {
 	struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
 	struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
 
 	priv_dst->cost = priv_src->cost;
 
-	return nft_limit_clone(&priv_dst->limit, &priv_src->limit);
+	return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp);
 }
 
 static struct nft_expr_type nft_limit_type;
@@ -281,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
 	nft_limit_destroy(ctx, priv);
 }
 
-static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src,
+				 gfp_t gfp)
 {
 	struct nft_limit_priv *priv_dst = nft_expr_priv(dst);
 	struct nft_limit_priv *priv_src = nft_expr_priv(src);
 
-	return nft_limit_clone(priv_dst, priv_src);
+	return nft_limit_clone(priv_dst, priv_src, gfp);
 }
 
 static const struct nft_expr_ops nft_limit_bytes_ops = {
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 3ba12a7471b0..9b2d7463d3d3 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -233,7 +233,7 @@ static void nft_quota_destroy(const struct nft_ctx *ctx,
 	return nft_quota_do_destroy(ctx, priv);
 }
 
-static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
 {
 	struct nft_quota *priv_dst = nft_expr_priv(dst);
 	struct nft_quota *priv_src = nft_expr_priv(src);
@@ -241,7 +241,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
 	priv_dst->quota = priv_src->quota;
 	priv_dst->flags = priv_src->flags;
 
-	priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
+	priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp);
 	if (!priv_dst->consumed)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 76d7570734c198e09f369faa35ebdacfeec3448e Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Wed, 8 May 2024 14:38:09 +0300
Subject: usb: phy: tegra: Replace of_gpio.h by proper one

of_gpio.h is deprecated and subject to remove.
The driver doesn't use it directly, replace it
with what is really being used.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240508113809.926155-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/usb/tegra_usb_phy.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/usb/tegra_usb_phy.h b/include/linux/usb/tegra_usb_phy.h
index 46e73584b6e6..e6c14f2b1f9b 100644
--- a/include/linux/usb/tegra_usb_phy.h
+++ b/include/linux/usb/tegra_usb_phy.h
@@ -7,11 +7,12 @@
 #define __TEGRA_USB_PHY_H
 
 #include <linux/clk.h>
-#include <linux/gpio.h>
 #include <linux/regmap.h>
 #include <linux/reset.h>
 #include <linux/usb/otg.h>
 
+struct gpio_desc;
+
 /*
  * utmi_pll_config_in_car_module: true if the UTMI PLL configuration registers
  *     should be set up by clk-tegra, false if by the PHY code
-- 
cgit v1.2.3


From deb1e496a83557896fe0cca0b8af01c2a97c0dc6 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 10 May 2024 11:50:28 +0800
Subject: io_uring: support to inject result for NOP

Support to inject result for NOP so that we can inject failure from
userspace. It is very helpful for covering failure handling code in
io_uring core change.

With nop flags, it becomes possible to add more test features on NOP in
future.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://lore.kernel.org/r/20240510035031.78874-3-ming.lei@redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/io_uring.h |  8 ++++++++
 io_uring/nop.c                | 26 +++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 6dbac55f8686..994bf7af0efe 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -72,6 +72,7 @@ struct io_uring_sqe {
 		__u32		waitid_flags;
 		__u32		futex_flags;
 		__u32		install_fd_flags;
+		__u32		nop_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -407,6 +408,13 @@ enum io_uring_msg_ring_flags {
  */
 #define IORING_FIXED_FD_NO_CLOEXEC	(1U << 0)
 
+/*
+ * IORING_OP_NOP flags (sqe->nop_flags)
+ *
+ * IORING_NOP_INJECT_RESULT	Inject result from sqe->result
+ */
+#define IORING_NOP_INJECT_RESULT	(1U << 0)
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 1a4e312dfe51..a5bcf3d6984f 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -10,18 +10,34 @@
 #include "io_uring.h"
 #include "nop.h"
 
+struct io_nop {
+	/* NOTE: kiocb has the file as the first member, so don't do it here */
+	struct file     *file;
+	int             result;
+};
+
 int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
-	if (READ_ONCE(sqe->rw_flags))
+	unsigned int flags;
+	struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
+
+	flags = READ_ONCE(sqe->nop_flags);
+	if (flags & ~IORING_NOP_INJECT_RESULT)
 		return -EINVAL;
+
+	if (flags & IORING_NOP_INJECT_RESULT)
+		nop->result = READ_ONCE(sqe->len);
+	else
+		nop->result = 0;
 	return 0;
 }
 
-/*
- * IORING_OP_NOP just posts a completion event, nothing else.
- */
 int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 {
-	io_req_set_res(req, 0, 0);
+	struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop);
+
+	if (nop->result < 0)
+		req_set_fail(req);
+	io_req_set_res(req, nop->result, 0);
 	return IOU_OK;
 }
-- 
cgit v1.2.3


From 0e640f0a47d8426eab1fb9c03f0af898dfe810b8 Mon Sep 17 00:00:00 2001
From: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Date: Fri, 10 May 2024 16:48:28 +0530
Subject: x86/amd_nb: Add new PCI IDs for AMD family 0x1a

Add the new PCI Device IDs to the MISC IDs list to support new
generation of AMD 1Ah family 70h Models of processors.

  [ bp: Massage commit message. ]

Signed-off-by: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Link: https://lore.kernel.org/r/20240510111829.969501-1-Shyam-sundar.S-k@amd.com
---
 arch/x86/kernel/amd_nb.c | 1 +
 include/linux/pci_ids.h  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 5bf5f9fc5753..3cf156f70859 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -95,6 +95,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI200_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_MI300_DF_F3) },
 	{}
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a0c75e467df3..c547d1d4feb1 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -580,6 +580,7 @@
 #define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb
 #define PCI_DEVICE_ID_AMD_1AH_M00H_DF_F3 0x12c3
 #define PCI_DEVICE_ID_AMD_1AH_M20H_DF_F3 0x16fb
+#define PCI_DEVICE_ID_AMD_1AH_M70H_DF_F3 0x12bb
 #define PCI_DEVICE_ID_AMD_MI200_DF_F3	0x14d3
 #define PCI_DEVICE_ID_AMD_MI300_DF_F3	0x152b
 #define PCI_DEVICE_ID_AMD_VANGOGH_USB	0x163a
-- 
cgit v1.2.3


From 5549eeedcdd6ab156e90622449bb0f1df5a616a4 Mon Sep 17 00:00:00 2001
From: Andreas Kemnade <andreas@kemnade.info>
Date: Tue, 2 Apr 2024 13:17:00 +0200
Subject: mfd: rohm-bd71828: Add power off functionality

Since the chip can power off the system, add the corresponding
functionality.
Based on https://github.com/kobolabs/Kobo-Reader/raw/master/hw/imx6sll-clara2e/kernel.tar.bz2

Signed-off-by: Andreas Kemnade <andreas@kemnade.info>
Acked-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/20240402111700.494004-3-andreas@kemnade.info
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/mfd/rohm-bd71828.c       | 36 +++++++++++++++++++++++++++++++++++-
 include/linux/mfd/rohm-bd71828.h |  3 +++
 2 files changed, 38 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/mfd/rohm-bd71828.c b/drivers/mfd/rohm-bd71828.c
index 2f3826c7eef4..5b4290f116fc 100644
--- a/drivers/mfd/rohm-bd71828.c
+++ b/drivers/mfd/rohm-bd71828.c
@@ -464,6 +464,27 @@ static int set_clk_mode(struct device *dev, struct regmap *regmap,
 				  OUT32K_MODE_CMOS);
 }
 
+static struct i2c_client *bd71828_dev;
+static void bd71828_power_off(void)
+{
+	while (true) {
+		s32 val;
+
+		/* We are not allowed to sleep, so do not use regmap involving mutexes here. */
+		val = i2c_smbus_read_byte_data(bd71828_dev, BD71828_REG_PS_CTRL_1);
+		if (val >= 0)
+			i2c_smbus_write_byte_data(bd71828_dev,
+						  BD71828_REG_PS_CTRL_1,
+						  BD71828_MASK_STATE_HBNT | (u8)val);
+		mdelay(500);
+	}
+}
+
+static void bd71828_remove_poweroff(void *data)
+{
+	pm_power_off = NULL;
+}
+
 static int bd71828_i2c_probe(struct i2c_client *i2c)
 {
 	struct regmap_irq_chip_data *irq_data;
@@ -542,7 +563,20 @@ static int bd71828_i2c_probe(struct i2c_client *i2c)
 	ret = devm_mfd_add_devices(&i2c->dev, PLATFORM_DEVID_AUTO, mfd, cells,
 				   NULL, 0, regmap_irq_get_domain(irq_data));
 	if (ret)
-		dev_err_probe(&i2c->dev, ret, "Failed to create subdevices\n");
+		return	dev_err_probe(&i2c->dev, ret, "Failed to create subdevices\n");
+
+	if (of_device_is_system_power_controller(i2c->dev.of_node) &&
+	    chip_type == ROHM_CHIP_TYPE_BD71828) {
+		if (!pm_power_off) {
+			bd71828_dev = i2c;
+			pm_power_off = bd71828_power_off;
+			ret = devm_add_action_or_reset(&i2c->dev,
+						       bd71828_remove_poweroff,
+						       NULL);
+		} else {
+			dev_warn(&i2c->dev, "Poweroff callback already assigned\n");
+		}
+	}
 
 	return ret;
 }
diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h
index 3b5f3a7db4bd..9776fde1262d 100644
--- a/include/linux/mfd/rohm-bd71828.h
+++ b/include/linux/mfd/rohm-bd71828.h
@@ -4,6 +4,7 @@
 #ifndef __LINUX_MFD_BD71828_H__
 #define __LINUX_MFD_BD71828_H__
 
+#include <linux/bits.h>
 #include <linux/mfd/rohm-generic.h>
 #include <linux/mfd/rohm-shared.h>
 
@@ -41,6 +42,8 @@ enum {
 #define BD71828_REG_PS_CTRL_2		0x05
 #define BD71828_REG_PS_CTRL_3		0x06
 
+#define BD71828_MASK_STATE_HBNT		BIT(1)
+
 //#define BD71828_REG_SWRESET		0x06
 #define BD71828_MASK_RUN_LVL_CTRL	0x30
 
-- 
cgit v1.2.3


From 74c6317df04bbfbb82ffed9dbb530e4075c7abed Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Tue, 2 Apr 2024 14:49:25 -0400
Subject: mfd: intel-m10-bmc: Change staging size to a variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The secure update driver does a sanity-check of the image size in
comparison to the size of the staging area in FLASH. Instead of
hard-wiring M10BMC_STAGING_SIZE, move the staging size to the
m10bmc_csr_map structure to make the size assignment more flexible.

Co-developed-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Russ Weight <russell.h.weight@intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Peter Colberg <peter.colberg@intel.com>
Reviewed-by: Xu Yilun <yilun.xu@intel.com>
Link: https://lore.kernel.org/r/20240402184925.1065932-1-peter.colberg@intel.com
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/fpga/intel-m10-bmc-sec-update.c | 3 ++-
 drivers/mfd/intel-m10-bmc-pmci.c        | 1 +
 drivers/mfd/intel-m10-bmc-spi.c         | 1 +
 include/linux/mfd/intel-m10-bmc.h       | 1 +
 4 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/fpga/intel-m10-bmc-sec-update.c b/drivers/fpga/intel-m10-bmc-sec-update.c
index 89851b133709..7ac9f9f5af12 100644
--- a/drivers/fpga/intel-m10-bmc-sec-update.c
+++ b/drivers/fpga/intel-m10-bmc-sec-update.c
@@ -529,11 +529,12 @@ static enum fw_upload_err m10bmc_sec_prepare(struct fw_upload *fwl,
 					     const u8 *data, u32 size)
 {
 	struct m10bmc_sec *sec = fwl->dd_handle;
+	const struct m10bmc_csr_map *csr_map = sec->m10bmc->info->csr_map;
 	u32 ret;
 
 	sec->cancel_request = false;
 
-	if (!size || size > M10BMC_STAGING_SIZE)
+	if (!size || size > csr_map->staging_size)
 		return FW_UPLOAD_ERR_INVALID_SIZE;
 
 	if (sec->m10bmc->flash_bulk_ops)
diff --git a/drivers/mfd/intel-m10-bmc-pmci.c b/drivers/mfd/intel-m10-bmc-pmci.c
index 0392ef8b57d8..698c5933938b 100644
--- a/drivers/mfd/intel-m10-bmc-pmci.c
+++ b/drivers/mfd/intel-m10-bmc-pmci.c
@@ -370,6 +370,7 @@ static const struct m10bmc_csr_map m10bmc_n6000_csr_map = {
 	.pr_reh_addr = M10BMC_N6000_PR_REH_ADDR,
 	.pr_magic = M10BMC_N6000_PR_PROG_MAGIC,
 	.rsu_update_counter = M10BMC_N6000_STAGING_FLASH_COUNT,
+	.staging_size = M10BMC_STAGING_SIZE,
 };
 
 static const struct intel_m10bmc_platform_info m10bmc_pmci_n6000 = {
diff --git a/drivers/mfd/intel-m10-bmc-spi.c b/drivers/mfd/intel-m10-bmc-spi.c
index cbeb7de9e041..d64d28199df6 100644
--- a/drivers/mfd/intel-m10-bmc-spi.c
+++ b/drivers/mfd/intel-m10-bmc-spi.c
@@ -109,6 +109,7 @@ static const struct m10bmc_csr_map m10bmc_n3000_csr_map = {
 	.pr_reh_addr = M10BMC_N3000_PR_REH_ADDR,
 	.pr_magic = M10BMC_N3000_PR_PROG_MAGIC,
 	.rsu_update_counter = M10BMC_N3000_STAGING_FLASH_COUNT,
+	.staging_size = M10BMC_STAGING_SIZE,
 };
 
 static struct mfd_cell m10bmc_d5005_subdevs[] = {
diff --git a/include/linux/mfd/intel-m10-bmc.h b/include/linux/mfd/intel-m10-bmc.h
index ee66c9751003..988f1cd90032 100644
--- a/include/linux/mfd/intel-m10-bmc.h
+++ b/include/linux/mfd/intel-m10-bmc.h
@@ -205,6 +205,7 @@ struct m10bmc_csr_map {
 	unsigned int pr_reh_addr;
 	unsigned int pr_magic;
 	unsigned int rsu_update_counter;
+	unsigned int staging_size;
 };
 
 /**
-- 
cgit v1.2.3


From de8f4d97bb4af59b15a807401427d25aa830fe53 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Fri, 12 Apr 2024 09:02:55 +0300
Subject: mfd: bd71828: Remove commented code lines

Some commented out code was accidentally left in the header.

Clean up commented out macros.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://lore.kernel.org/r/ZhjOj_4AUgC4Iwh_@drtxq0yyyyyyyyyyyyyby-3.rev.dnainternet.fi
Signed-off-by: Lee Jones <lee@kernel.org>
---
 include/linux/mfd/rohm-bd71828.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mfd/rohm-bd71828.h b/include/linux/mfd/rohm-bd71828.h
index 9776fde1262d..ce786c96404a 100644
--- a/include/linux/mfd/rohm-bd71828.h
+++ b/include/linux/mfd/rohm-bd71828.h
@@ -44,7 +44,6 @@ enum {
 
 #define BD71828_MASK_STATE_HBNT		BIT(1)
 
-//#define BD71828_REG_SWRESET		0x06
 #define BD71828_MASK_RUN_LVL_CTRL	0x30
 
 /* Regulator control masks */
@@ -136,7 +135,6 @@ enum {
 #define BD71828_REG_LDO5_VOLT		0x43
 #define BD71828_REG_LDO5_VOLT_OPT	0x42
 #define BD71828_REG_LDO6_EN		0x44
-//#define BD71828_REG_LDO6_VOLT		0x4
 #define BD71828_REG_LDO7_EN		0x45
 #define BD71828_REG_LDO7_VOLT		0x46
 
-- 
cgit v1.2.3


From b303ab78f8b76f3322632912f539f557c552a1bd Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Fri, 29 Mar 2024 14:38:39 +0100
Subject: backlight: lp8788: Drop support for platform data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The backlight driver supports getting passed platform data. However this
isn't used. This allows to remove quite some dead code from the driver
because bl->pdata is always NULL, and so bl->mode is always
LP8788_BL_REGISTER_ONLY.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Reviewed-by: Daniel Thompson <daniel.thompson@linaro.org>
Link: https://lore.kernel.org/r/20240329133839.550065-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Lee Jones <lee@kernel.org>
---
 drivers/video/backlight/lp8788_bl.c | 151 ++----------------------------------
 include/linux/mfd/lp8788.h          |  36 ---------
 2 files changed, 8 insertions(+), 179 deletions(-)

(limited to 'include')

diff --git a/drivers/video/backlight/lp8788_bl.c b/drivers/video/backlight/lp8788_bl.c
index 31f97230ee50..0b7663519fa5 100644
--- a/drivers/video/backlight/lp8788_bl.c
+++ b/drivers/video/backlight/lp8788_bl.c
@@ -12,7 +12,6 @@
 #include <linux/mfd/lp8788.h>
 #include <linux/module.h>
 #include <linux/platform_device.h>
-#include <linux/pwm.h>
 #include <linux/slab.h>
 
 /* Register address */
@@ -31,149 +30,40 @@
 #define MAX_BRIGHTNESS			127
 #define DEFAULT_BL_NAME			"lcd-backlight"
 
-struct lp8788_bl_config {
-	enum lp8788_bl_ctrl_mode bl_mode;
-	enum lp8788_bl_dim_mode dim_mode;
-	enum lp8788_bl_full_scale_current full_scale;
-	enum lp8788_bl_ramp_step rise_time;
-	enum lp8788_bl_ramp_step fall_time;
-	enum pwm_polarity pwm_pol;
-};
-
 struct lp8788_bl {
 	struct lp8788 *lp;
 	struct backlight_device *bl_dev;
-	struct lp8788_backlight_platform_data *pdata;
-	enum lp8788_bl_ctrl_mode mode;
-	struct pwm_device *pwm;
-};
-
-static struct lp8788_bl_config default_bl_config = {
-	.bl_mode    = LP8788_BL_REGISTER_ONLY,
-	.dim_mode   = LP8788_DIM_EXPONENTIAL,
-	.full_scale = LP8788_FULLSCALE_1900uA,
-	.rise_time  = LP8788_RAMP_8192us,
-	.fall_time  = LP8788_RAMP_8192us,
-	.pwm_pol    = PWM_POLARITY_NORMAL,
 };
 
-static inline bool is_brightness_ctrl_by_pwm(enum lp8788_bl_ctrl_mode mode)
-{
-	return mode == LP8788_BL_COMB_PWM_BASED;
-}
-
-static inline bool is_brightness_ctrl_by_register(enum lp8788_bl_ctrl_mode mode)
-{
-	return mode == LP8788_BL_REGISTER_ONLY ||
-		mode == LP8788_BL_COMB_REGISTER_BASED;
-}
-
 static int lp8788_backlight_configure(struct lp8788_bl *bl)
 {
-	struct lp8788_backlight_platform_data *pdata = bl->pdata;
-	struct lp8788_bl_config *cfg = &default_bl_config;
 	int ret;
 	u8 val;
 
-	/*
-	 * Update chip configuration if platform data exists,
-	 * otherwise use the default settings.
-	 */
-	if (pdata) {
-		cfg->bl_mode    = pdata->bl_mode;
-		cfg->dim_mode   = pdata->dim_mode;
-		cfg->full_scale = pdata->full_scale;
-		cfg->rise_time  = pdata->rise_time;
-		cfg->fall_time  = pdata->fall_time;
-		cfg->pwm_pol    = pdata->pwm_pol;
-	}
-
 	/* Brightness ramp up/down */
-	val = (cfg->rise_time << LP8788_BL_RAMP_RISE_SHIFT) | cfg->fall_time;
+	val = (LP8788_RAMP_8192us << LP8788_BL_RAMP_RISE_SHIFT) | LP8788_RAMP_8192us;
 	ret = lp8788_write_byte(bl->lp, LP8788_BL_RAMP, val);
 	if (ret)
 		return ret;
 
 	/* Fullscale current setting */
-	val = (cfg->full_scale << LP8788_BL_FULLSCALE_SHIFT) |
-		(cfg->dim_mode << LP8788_BL_DIM_MODE_SHIFT);
+	val = (LP8788_FULLSCALE_1900uA << LP8788_BL_FULLSCALE_SHIFT) |
+		(LP8788_DIM_EXPONENTIAL << LP8788_BL_DIM_MODE_SHIFT);
 
 	/* Brightness control mode */
-	switch (cfg->bl_mode) {
-	case LP8788_BL_REGISTER_ONLY:
-		val |= LP8788_BL_EN;
-		break;
-	case LP8788_BL_COMB_PWM_BASED:
-	case LP8788_BL_COMB_REGISTER_BASED:
-		val |= LP8788_BL_EN | LP8788_BL_PWM_INPUT_EN |
-			(cfg->pwm_pol << LP8788_BL_PWM_POLARITY_SHIFT);
-		break;
-	default:
-		dev_err(bl->lp->dev, "invalid mode: %d\n", cfg->bl_mode);
-		return -EINVAL;
-	}
-
-	bl->mode = cfg->bl_mode;
+	val |= LP8788_BL_EN;
 
 	return lp8788_write_byte(bl->lp, LP8788_BL_CONFIG, val);
 }
 
-static void lp8788_pwm_ctrl(struct lp8788_bl *bl, int br, int max_br)
-{
-	unsigned int period;
-	unsigned int duty;
-	struct device *dev;
-	struct pwm_device *pwm;
-
-	if (!bl->pdata)
-		return;
-
-	period = bl->pdata->period_ns;
-	duty = br * period / max_br;
-	dev = bl->lp->dev;
-
-	/* request PWM device with the consumer name */
-	if (!bl->pwm) {
-		pwm = devm_pwm_get(dev, LP8788_DEV_BACKLIGHT);
-		if (IS_ERR(pwm)) {
-			dev_err(dev, "can not get PWM device\n");
-			return;
-		}
-
-		bl->pwm = pwm;
-
-		/*
-		 * FIXME: pwm_apply_args() should be removed when switching to
-		 * the atomic PWM API.
-		 */
-		pwm_apply_args(pwm);
-	}
-
-	pwm_config(bl->pwm, duty, period);
-	if (duty)
-		pwm_enable(bl->pwm);
-	else
-		pwm_disable(bl->pwm);
-}
-
 static int lp8788_bl_update_status(struct backlight_device *bl_dev)
 {
 	struct lp8788_bl *bl = bl_get_data(bl_dev);
-	enum lp8788_bl_ctrl_mode mode = bl->mode;
 
 	if (bl_dev->props.state & BL_CORE_SUSPENDED)
 		bl_dev->props.brightness = 0;
 
-	if (is_brightness_ctrl_by_pwm(mode)) {
-		int brt = bl_dev->props.brightness;
-		int max = bl_dev->props.max_brightness;
-
-		lp8788_pwm_ctrl(bl, brt, max);
-	} else if (is_brightness_ctrl_by_register(mode)) {
-		u8 brt = bl_dev->props.brightness;
-
-		lp8788_write_byte(bl->lp, LP8788_BL_BRIGHTNESS, brt);
-	}
+	lp8788_write_byte(bl->lp, LP8788_BL_BRIGHTNESS, bl_dev->props.brightness);
 
 	return 0;
 }
@@ -187,30 +77,16 @@ static int lp8788_backlight_register(struct lp8788_bl *bl)
 {
 	struct backlight_device *bl_dev;
 	struct backlight_properties props;
-	struct lp8788_backlight_platform_data *pdata = bl->pdata;
-	int init_brt;
-	char *name;
 
 	memset(&props, 0, sizeof(struct backlight_properties));
 	props.type = BACKLIGHT_PLATFORM;
 	props.max_brightness = MAX_BRIGHTNESS;
 
 	/* Initial brightness */
-	if (pdata)
-		init_brt = min_t(int, pdata->initial_brightness,
-				props.max_brightness);
-	else
-		init_brt = 0;
-
-	props.brightness = init_brt;
+	props.brightness = 0;
 
 	/* Backlight device name */
-	if (!pdata || !pdata->name)
-		name = DEFAULT_BL_NAME;
-	else
-		name = pdata->name;
-
-	bl_dev = backlight_device_register(name, bl->lp->dev, bl,
+	bl_dev = backlight_device_register(DEFAULT_BL_NAME, bl->lp->dev, bl,
 				       &lp8788_bl_ops, &props);
 	if (IS_ERR(bl_dev))
 		return PTR_ERR(bl_dev);
@@ -230,16 +106,7 @@ static void lp8788_backlight_unregister(struct lp8788_bl *bl)
 static ssize_t lp8788_get_bl_ctl_mode(struct device *dev,
 				     struct device_attribute *attr, char *buf)
 {
-	struct lp8788_bl *bl = dev_get_drvdata(dev);
-	enum lp8788_bl_ctrl_mode mode = bl->mode;
-	char *strmode;
-
-	if (is_brightness_ctrl_by_pwm(mode))
-		strmode = "PWM based";
-	else if (is_brightness_ctrl_by_register(mode))
-		strmode = "Register based";
-	else
-		strmode = "Invalid mode";
+	const char *strmode = "Register based";
 
 	return scnprintf(buf, PAGE_SIZE, "%s\n", strmode);
 }
@@ -266,8 +133,6 @@ static int lp8788_backlight_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	bl->lp = lp;
-	if (lp->pdata)
-		bl->pdata = lp->pdata->bl_pdata;
 
 	platform_set_drvdata(pdev, bl);
 
diff --git a/include/linux/mfd/lp8788.h b/include/linux/mfd/lp8788.h
index 51b47966a04d..fd17bec2a33e 100644
--- a/include/linux/mfd/lp8788.h
+++ b/include/linux/mfd/lp8788.h
@@ -11,7 +11,6 @@
 #define __MFD_LP8788_H__
 
 #include <linux/irqdomain.h>
-#include <linux/pwm.h>
 #include <linux/regmap.h>
 
 #define LP8788_DEV_BUCK		"lp8788-buck"
@@ -87,12 +86,6 @@ enum lp8788_charger_event {
 	CHARGER_DETECTED,
 };
 
-enum lp8788_bl_ctrl_mode {
-	LP8788_BL_REGISTER_ONLY,
-	LP8788_BL_COMB_PWM_BASED,	/* PWM + I2C, changed by PWM input */
-	LP8788_BL_COMB_REGISTER_BASED,	/* PWM + I2C, changed by I2C */
-};
-
 enum lp8788_bl_dim_mode {
 	LP8788_DIM_EXPONENTIAL,
 	LP8788_DIM_LINEAR,
@@ -201,31 +194,6 @@ struct lp8788_charger_platform_data {
 				enum lp8788_charger_event event);
 };
 
-/*
- * struct lp8788_backlight_platform_data
- * @name                  : backlight driver name. (default: "lcd-backlight")
- * @initial_brightness    : initial value of backlight brightness
- * @bl_mode               : brightness control by pwm or lp8788 register
- * @dim_mode              : dimming mode selection
- * @full_scale            : full scale current setting
- * @rise_time             : brightness ramp up step time
- * @fall_time             : brightness ramp down step time
- * @pwm_pol               : pwm polarity setting when bl_mode is pwm based
- * @period_ns             : platform specific pwm period value. unit is nano.
-			    Only valid when bl_mode is LP8788_BL_COMB_PWM_BASED
- */
-struct lp8788_backlight_platform_data {
-	char *name;
-	int initial_brightness;
-	enum lp8788_bl_ctrl_mode bl_mode;
-	enum lp8788_bl_dim_mode dim_mode;
-	enum lp8788_bl_full_scale_current full_scale;
-	enum lp8788_bl_ramp_step rise_time;
-	enum lp8788_bl_ramp_step fall_time;
-	enum pwm_polarity pwm_pol;
-	unsigned int period_ns;
-};
-
 /*
  * struct lp8788_led_platform_data
  * @name         : led driver name. (default: "keyboard-backlight")
@@ -267,7 +235,6 @@ struct lp8788_vib_platform_data {
  * @buck2_dvs    : configurations for buck2 dvs
  * @chg_pdata    : platform data for charger driver
  * @alarm_sel    : rtc alarm selection (1 or 2)
- * @bl_pdata     : configurable data for backlight driver
  * @led_pdata    : configurable data for led driver
  * @vib_pdata    : configurable data for vibrator driver
  * @adc_pdata    : iio map data for adc driver
@@ -289,9 +256,6 @@ struct lp8788_platform_data {
 	/* rtc alarm */
 	enum lp8788_alarm_sel alarm_sel;
 
-	/* backlight */
-	struct lp8788_backlight_platform_data *bl_pdata;
-
 	/* current sinks */
 	struct lp8788_led_platform_data *led_pdata;
 	struct lp8788_vib_platform_data *vib_pdata;
-- 
cgit v1.2.3


From 991b6bdf1b009832256f8bc3035d4bcba664657b Mon Sep 17 00:00:00 2001
From: Zhiguo Niu <zhiguo.niu@unisoc.com>
Date: Fri, 26 Apr 2024 20:01:29 +0800
Subject: f2fs: fix some ambiguous comments

After commit d7e9a9037de2 ("f2fs: Support Block Size == Page Size"),
Some comments are confused and just correct with block size is 4KB.

Signed-off-by: Zhiguo Niu <zhiguo.niu@unisoc.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 include/linux/f2fs_fs.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index a357287eac1e..41d1d71c36ff 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -394,7 +394,8 @@ struct f2fs_nat_block {
 
 /*
  * F2FS uses 4 bytes to represent block address. As a result, supported size of
- * disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments.
+ * disk is 16 TB for a 4K page size and 64 TB for a 16K page size and it equals
+ * to 16 * 1024 * 1024 / 2 segments.
  */
 #define F2FS_MAX_SEGMENT       ((16 * 1024 * 1024) / 2)
 
@@ -424,8 +425,10 @@ struct f2fs_sit_block {
 /*
  * For segment summary
  *
- * One summary block contains exactly 512 summary entries, which represents
- * exactly one segment by default. Not allow to change the basic units.
+ * One summary block with 4KB size contains exactly 512 summary entries, which
+ * represents exactly one segment with 2MB size.
+ * Similarly, in the case of block with 16KB size, it represents one segment with 8MB size.
+ * Not allow to change the basic units.
  *
  * NOTE: For initializing fields, you must use set_summary
  *
@@ -556,6 +559,7 @@ typedef __le32	f2fs_hash_t;
 
 /*
  * space utilization of regular dentry and inline dentry (w/o extra reservation)
+ * when block size is 4KB.
  *		regular dentry		inline dentry (def)	inline dentry (min)
  * bitmap	1 * 27 = 27		1 * 23 = 23		1 * 1 = 1
  * reserved	1 * 3 = 3		1 * 7 = 7		1 * 1 = 1
-- 
cgit v1.2.3


From 7172dc93d621d5dc302d007e95ddd1311ec64283 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima <kuniyu@amazon.com>
Date: Wed, 8 May 2024 10:11:50 -0700
Subject: af_unix: Add dead flag to struct scm_fp_list.

Commit 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges()
during GC.") fixed use-after-free by avoid accessing edge->successor while
GC is in progress.

However, there could be a small race window where another process could
call unix_del_edges() while gc_in_progress is true and __skb_queue_purge()
is on the way.

So, we need another marker for struct scm_fp_list which indicates if the
skb is garbage-collected.

This patch adds dead flag in struct scm_fp_list and set it true before
calling __skb_queue_purge().

Fixes: 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() during GC.")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Link: https://lore.kernel.org/r/20240508171150.50601-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/scm.h  |  1 +
 net/core/scm.c     |  1 +
 net/unix/garbage.c | 14 ++++++++++----
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/net/scm.h b/include/net/scm.h
index bbc5527809d1..0d35c7c77a74 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -33,6 +33,7 @@ struct scm_fp_list {
 	short			max;
 #ifdef CONFIG_UNIX
 	bool			inflight;
+	bool			dead;
 	struct list_head	vertices;
 	struct unix_edge	*edges;
 #endif
diff --git a/net/core/scm.c b/net/core/scm.c
index 5763f3320358..4f6a14babe5a 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -91,6 +91,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		fpl->user = NULL;
 #if IS_ENABLED(CONFIG_UNIX)
 		fpl->inflight = false;
+		fpl->dead = false;
 		fpl->edges = NULL;
 		INIT_LIST_HEAD(&fpl->vertices);
 #endif
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d76450133e4f..1f8b8cdfcdc8 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -158,13 +158,11 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
 	unix_update_graph(unix_edge_successor(edge));
 }
 
-static bool gc_in_progress;
-
 static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
 {
 	struct unix_vertex *vertex = edge->predecessor->vertex;
 
-	if (!gc_in_progress)
+	if (!fpl->dead)
 		unix_update_graph(unix_edge_successor(edge));
 
 	list_del(&edge->vertex_entry);
@@ -240,7 +238,7 @@ void unix_del_edges(struct scm_fp_list *fpl)
 		unix_del_edge(fpl, edge);
 	} while (i < fpl->count_unix);
 
-	if (!gc_in_progress) {
+	if (!fpl->dead) {
 		receiver = fpl->edges[0].successor;
 		receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
 	}
@@ -559,9 +557,12 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
 	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
 }
 
+static bool gc_in_progress;
+
 static void __unix_gc(struct work_struct *work)
 {
 	struct sk_buff_head hitlist;
+	struct sk_buff *skb;
 
 	spin_lock(&unix_gc_lock);
 
@@ -579,6 +580,11 @@ static void __unix_gc(struct work_struct *work)
 
 	spin_unlock(&unix_gc_lock);
 
+	skb_queue_walk(&hitlist, skb) {
+		if (UNIXCB(skb).fp)
+			UNIXCB(skb).fp->dead = true;
+	}
+
 	__skb_queue_purge(&hitlist);
 skip_gc:
 	WRITE_ONCE(gc_in_progress, false);
-- 
cgit v1.2.3


From 485d65e1357123a697c591a5aeb773994b247ad7 Mon Sep 17 00:00:00 2001
From: Akiva Goldberger <agoldberger@nvidia.com>
Date: Thu, 9 May 2024 14:29:50 +0300
Subject: net/mlx5: Add a timeout to acquire the command queue semaphore

Prevent forced completion handling on an entry that has not yet been
assigned an index, causing an out of bounds access on idx = -22.
Instead of waiting indefinitely for the sem, blocking flow now waits for
index to be allocated or a sem acquisition timeout before beginning the
timer for FW completion.

Kernel log example:
mlx5_core 0000:06:00.0: wait_func_handle_exec_timeout:1128:(pid 185911): cmd[-22]: CREATE_UCTX(0xa04) No done completion

Fixes: 8e715cd613a1 ("net/mlx5: Set command entry semaphore up once got index free")
Signed-off-by: Akiva Goldberger <agoldberger@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Link: https://lore.kernel.org/r/20240509112951.590184-5-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 41 +++++++++++++++++++++------
 include/linux/mlx5/driver.h                   |  1 +
 2 files changed, 33 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index 4957412ff1f6..511e7fee39ac 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -969,19 +969,32 @@ static void cmd_work_handler(struct work_struct *work)
 	bool poll_cmd = ent->polling;
 	struct mlx5_cmd_layout *lay;
 	struct mlx5_core_dev *dev;
-	unsigned long cb_timeout;
-	struct semaphore *sem;
+	unsigned long timeout;
 	unsigned long flags;
 	int alloc_ret;
 	int cmd_mode;
 
+	complete(&ent->handling);
+
 	dev = container_of(cmd, struct mlx5_core_dev, cmd);
-	cb_timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
+	timeout = msecs_to_jiffies(mlx5_tout_ms(dev, CMD));
 
-	complete(&ent->handling);
-	sem = ent->page_queue ? &cmd->vars.pages_sem : &cmd->vars.sem;
-	down(sem);
 	if (!ent->page_queue) {
+		if (down_timeout(&cmd->vars.sem, timeout)) {
+			mlx5_core_warn(dev, "%s(0x%x) timed out while waiting for a slot.\n",
+				       mlx5_command_str(ent->op), ent->op);
+			if (ent->callback) {
+				ent->callback(-EBUSY, ent->context);
+				mlx5_free_cmd_msg(dev, ent->out);
+				free_msg(dev, ent->in);
+				cmd_ent_put(ent);
+			} else {
+				ent->ret = -EBUSY;
+				complete(&ent->done);
+			}
+			complete(&ent->slotted);
+			return;
+		}
 		alloc_ret = cmd_alloc_index(cmd, ent);
 		if (alloc_ret < 0) {
 			mlx5_core_err_rl(dev, "failed to allocate command entry\n");
@@ -994,10 +1007,11 @@ static void cmd_work_handler(struct work_struct *work)
 				ent->ret = -EAGAIN;
 				complete(&ent->done);
 			}
-			up(sem);
+			up(&cmd->vars.sem);
 			return;
 		}
 	} else {
+		down(&cmd->vars.pages_sem);
 		ent->idx = cmd->vars.max_reg_cmds;
 		spin_lock_irqsave(&cmd->alloc_lock, flags);
 		clear_bit(ent->idx, &cmd->vars.bitmask);
@@ -1005,6 +1019,8 @@ static void cmd_work_handler(struct work_struct *work)
 		spin_unlock_irqrestore(&cmd->alloc_lock, flags);
 	}
 
+	complete(&ent->slotted);
+
 	lay = get_inst(cmd, ent->idx);
 	ent->lay = lay;
 	memset(lay, 0, sizeof(*lay));
@@ -1023,7 +1039,7 @@ static void cmd_work_handler(struct work_struct *work)
 	ent->ts1 = ktime_get_ns();
 	cmd_mode = cmd->mode;
 
-	if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, cb_timeout))
+	if (ent->callback && schedule_delayed_work(&ent->cb_timeout_work, timeout))
 		cmd_ent_get(ent);
 	set_bit(MLX5_CMD_ENT_STATE_PENDING_COMP, &ent->state);
 
@@ -1143,6 +1159,9 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
 		ent->ret = -ECANCELED;
 		goto out_err;
 	}
+
+	wait_for_completion(&ent->slotted);
+
 	if (cmd->mode == CMD_MODE_POLLING || ent->polling)
 		wait_for_completion(&ent->done);
 	else if (!wait_for_completion_timeout(&ent->done, timeout))
@@ -1157,6 +1176,9 @@ out_err:
 	} else if (err == -ECANCELED) {
 		mlx5_core_warn(dev, "%s(0x%x) canceled on out of queue timeout.\n",
 			       mlx5_command_str(ent->op), ent->op);
+	} else if (err == -EBUSY) {
+		mlx5_core_warn(dev, "%s(0x%x) timeout while waiting for command semaphore.\n",
+			       mlx5_command_str(ent->op), ent->op);
 	}
 	mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n",
 		      err, deliv_status_to_str(ent->status), ent->status);
@@ -1208,6 +1230,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
 	ent->polling = force_polling;
 
 	init_completion(&ent->handling);
+	init_completion(&ent->slotted);
 	if (!callback)
 		init_completion(&ent->done);
 
@@ -1225,7 +1248,7 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
 		return 0; /* mlx5_cmd_comp_handler() will put(ent) */
 
 	err = wait_func(dev, ent);
-	if (err == -ETIMEDOUT || err == -ECANCELED)
+	if (err == -ETIMEDOUT || err == -ECANCELED || err == -EBUSY)
 		goto out_free;
 
 	ds = ent->ts2 - ent->ts1;
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bf9324a31ae9..80452bd98253 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -862,6 +862,7 @@ struct mlx5_cmd_work_ent {
 	void		       *context;
 	int			idx;
 	struct completion	handling;
+	struct completion	slotted;
 	struct completion	done;
 	struct mlx5_cmd        *cmd;
 	struct work_struct	work;
-- 
cgit v1.2.3


From 51474ab44abf907023a8a875e799b07de461e466 Mon Sep 17 00:00:00 2001
From: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Date: Sat, 11 May 2024 11:54:30 +0300
Subject: drm/bridge: aux-hpd-bridge: correct devm_drm_dp_hpd_bridge_add() stub

If CONFIG_DRM_AUX_HPD_BRIDGE is not enabled, the aux-bridge.h header
provides a stub for the bridge's functions. Correct the arguments list
of one of those stubs to match the argument list of the non-stubbed
function.

Fixes: e5ca263508f7 ("drm/bridge: aux-hpd: separate allocation and registration")
Reported-by: kernel test robot <lkp@intel.com>
Cc: stable <stable@kernel.org>
Closes: https://lore.kernel.org/oe-kbuild-all/202405110428.TMCfb1Ut-lkp@intel.com/
Cc: Johan Hovold <johan+linaro@kernel.org>
Signed-off-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Reviewed-by: Johan Hovold <johan+linaro@kernel.org>
Link: https://lore.kernel.org/r/20240511-fix-aux-hpd-stubs-v1-1-98dae71dfaec@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/drm/bridge/aux-bridge.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/drm/bridge/aux-bridge.h b/include/drm/bridge/aux-bridge.h
index 4453906105ca..c2f5a855512f 100644
--- a/include/drm/bridge/aux-bridge.h
+++ b/include/drm/bridge/aux-bridge.h
@@ -33,7 +33,7 @@ static inline struct auxiliary_device *devm_drm_dp_hpd_bridge_alloc(struct devic
 	return NULL;
 }
 
-static inline int devm_drm_dp_hpd_bridge_add(struct auxiliary_device *adev)
+static inline int devm_drm_dp_hpd_bridge_add(struct device *dev, struct auxiliary_device *adev)
 {
 	return 0;
 }
-- 
cgit v1.2.3


From 76edc534cc289308130272a2ac28694fc9b72a03 Mon Sep 17 00:00:00 2001
From: Xiu Jianfeng <xiujianfeng@huawei.com>
Date: Thu, 9 May 2024 03:26:28 +0000
Subject: memcg, oom: cleanup unused memcg_oom_gfp_mask and memcg_oom_order

Since commit 857f21397f71 ("memcg, oom: remove unnecessary check in
mem_cgroup_oom_synchronize()"), memcg_oom_gfp_mask and memcg_oom_order are
no longer used any more.

Link: https://lkml.kernel.org/r/20240509032628.1217652-1-xiujianfeng@huawei.com
Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Benjamin Segall <bsegall@google.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt (Google) <rostedt@goodmis.org>
Cc: Valentin Schneider <vschneid@redhat.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/sched.h | 2 --
 mm/memcontrol.c       | 2 --
 2 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4118b3f959c3..427de5e4754b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1448,8 +1448,6 @@ struct task_struct {
 
 #ifdef CONFIG_MEMCG
 	struct mem_cgroup		*memcg_in_oom;
-	gfp_t				memcg_oom_gfp_mask;
-	int				memcg_oom_order;
 
 	/* Number of pages to reclaim on returning to userland: */
 	unsigned int			memcg_nr_pages_over_high;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eef02a59b8c9..7fad15b2290c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2188,8 +2188,6 @@ static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 		if (current->in_user_fault) {
 			css_get(&memcg->css);
 			current->memcg_in_oom = memcg;
-			current->memcg_oom_gfp_mask = mask;
-			current->memcg_oom_order = order;
 		}
 		return false;
 	}
-- 
cgit v1.2.3


From 33580d667bb20e00356fd06500f5197ef1baa1f5 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Tue, 7 May 2024 23:24:54 +0900
Subject: nilfs2: use __field_struct() for a bitwise field

As one can see in include/trace/stages/stage4_event_fields.h, the
implementation of __field() uses the is_signed_type() macro.  As one can
see in commit dcf8e5633e2e ("tracing: Define the is_signed_type() macro
once"), there has been an attempt to not make is_signed_type() trigger
sparse warnings for bitwise types.

Despite that change, sparse complains when passing a bitwise type to
is_signed_type().  The reason is that in its definition below, an
inequality comparison will be made against bitwise types, which are random
collections of bits (the casts to bitwise types themselves are
semantically valid and not problematic):

 #define is_signed_type(type) (((type)(-1)) < (__force type)1)

So, as a workaround, follow the example of <trace/events/initcall.h> and
suppress the following sparse warnings by changing __field() into
__field_struct() that doesn't use is_signed_type():

 fs/nilfs2/segment.c: note: in included file (through
   include/trace/trace_events.h, include/trace/define_trace.h,
   include/trace/events/nilfs2.h):
 ./include/trace/events/nilfs2.h:191:1: warning: cast to restricted
   blk_opf_t
 ./include/trace/events/nilfs2.h:191:1: warning: restricted blk_opf_t
   degrades to integer
 ./include/trace/events/nilfs2.h:191:1: warning: restricted blk_opf_t
   degrades to integer

[konishi.ryusuke: describe the reason for the warnings per Linus's explanation]
  Link: https://lkml.kernel.org/r/20240507222041.4876-1-konishi.ryusuke@gmail.com
Link: https://lkml.kernel.org/r/20240507142454.3344-1-konishi.ryusuke@gmail.com
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202401092241.I4mm9OWl-lkp@intel.com/
Reported-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
Closes: https://lore.kernel.org/all/20240430080019.4242-2-konishi.ryusuke@gmail.com/
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/trace/events/nilfs2.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
index 8efc6236f57c..8880c11733dd 100644
--- a/include/trace/events/nilfs2.h
+++ b/include/trace/events/nilfs2.h
@@ -200,7 +200,11 @@ TRACE_EVENT(nilfs2_mdt_submit_block,
 		    __field(struct inode *, inode)
 		    __field(unsigned long, ino)
 		    __field(unsigned long, blkoff)
-		    __field(enum req_op, mode)
+		    /*
+		     * Use field_struct() to avoid is_signed_type() on the
+		     * bitwise type enum req_op.
+		     */
+		    __field_struct(enum req_op, mode)
 	    ),
 
 	    TP_fast_assign(
-- 
cgit v1.2.3


From 2ddec2c80b4402c293c7e6e0881cecaaf77e8cec Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Thu, 2 May 2024 15:18:52 +0000
Subject: riscv, bpf: inline bpf_get_smp_processor_id()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inline the calls to bpf_get_smp_processor_id() in the riscv bpf jit.

RISCV saves the pointer to the CPU's task_struct in the TP (thread
pointer) register. This makes it trivial to get the CPU's processor id.
As thread_info is the first member of task_struct, we can read the
processor id from TP + offsetof(struct thread_info, cpu).

          RISCV64 JIT output for `call bpf_get_smp_processor_id`
	  ======================================================

                Before                           After
               --------                         -------

         auipc   t1,0x848c                  ld    a5,32(tp)
         jalr    604(t1)
         mv      a5,a0

Benchmark using [1] on Qemu.

./benchs/run_bench_trigger.sh glob-arr-inc arr-inc hash-inc

+---------------+------------------+------------------+--------------+
|      Name     |     Before       |       After      |   % change   |
|---------------+------------------+------------------+--------------|
| glob-arr-inc  | 1.077 ± 0.006M/s | 1.336 ± 0.010M/s |   + 24.04%   |
| arr-inc       | 1.078 ± 0.002M/s | 1.332 ± 0.015M/s |   + 23.56%   |
| hash-inc      | 0.494 ± 0.004M/s | 0.653 ± 0.001M/s |   + 32.18%   |
+---------------+------------------+------------------+--------------+

NOTE: This benchmark includes changes from this patch and the previous
      patch that implemented the per-cpu insn.

[1] https://github.com/anakryiko/linux/commit/8dec900975ef

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Björn Töpel <bjorn@kernel.org>
Link: https://lore.kernel.org/r/20240502151854.9810-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 arch/riscv/net/bpf_jit_comp64.c | 26 ++++++++++++++++++++++++++
 include/linux/filter.h          |  1 +
 kernel/bpf/core.c               | 11 +++++++++++
 kernel/bpf/verifier.c           |  4 ++++
 4 files changed, 42 insertions(+)

(limited to 'include')

diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 1f0159963b3e..a46ec7fb4489 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -1493,6 +1493,22 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
 		bool fixed_addr;
 		u64 addr;
 
+		/* Inline calls to bpf_get_smp_processor_id()
+		 *
+		 * RV_REG_TP holds the address of the current CPU's task_struct and thread_info is
+		 * at offset 0 in task_struct.
+		 * Load cpu from thread_info:
+		 *     Set R0 to ((struct thread_info *)(RV_REG_TP))->cpu
+		 *
+		 * This replicates the implementation of raw_smp_processor_id() on RISCV
+		 */
+		if (insn->src_reg == 0 && insn->imm == BPF_FUNC_get_smp_processor_id) {
+			/* Load current CPU number in R0 */
+			emit_ld(bpf_to_rv_reg(BPF_REG_0, ctx), offsetof(struct thread_info, cpu),
+				RV_REG_TP, ctx);
+			break;
+		}
+
 		mark_call(ctx);
 		ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
 					    &addr, &fixed_addr);
@@ -2062,3 +2078,13 @@ bool bpf_jit_supports_percpu_insn(void)
 {
 	return true;
 }
+
+bool bpf_jit_inlines_helper_call(s32 imm)
+{
+	switch (imm) {
+	case BPF_FUNC_get_smp_processor_id:
+		return true;
+	default:
+		return false;
+	}
+}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7a27f19bf44d..3e19bb62ed1a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -993,6 +993,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
+bool bpf_jit_inlines_helper_call(s32 imm);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_percpu_insn(void);
 bool bpf_jit_supports_kfunc_call(void);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 99b8b1c9a248..aa59af9f9bd9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2941,6 +2941,17 @@ bool __weak bpf_jit_needs_zext(void)
 	return false;
 }
 
+/* Return true if the JIT inlines the call to the helper corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_helper_call(s32 imm)
+{
+	return false;
+}
+
 /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
 bool __weak bpf_jit_supports_subprog_tailcalls(void)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9e3aba08984e..1658ca4136a3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -19996,6 +19996,10 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Skip inlining the helper call if the JIT does it. */
+		if (bpf_jit_inlines_helper_call(insn->imm))
+			goto next_insn;
+
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
-- 
cgit v1.2.3


From b25f7415eb4108aa32dd3e74289d7f997090708f Mon Sep 17 00:00:00 2001
From: Günther Noack <gnoack@google.com>
Date: Fri, 19 Apr 2024 16:11:12 +0000
Subject: landlock: Add IOCTL access right for character and block devices
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces the LANDLOCK_ACCESS_FS_IOCTL_DEV right
and increments the Landlock ABI version to 5.

This access right applies to device-custom IOCTL commands
when they are invoked on block or character device files.

Like the truncate right, this right is associated with a file
descriptor at the time of open(2), and gets respected even when the
file descriptor is used outside of the thread which it was originally
opened in.

Therefore, a newly enabled Landlock policy does not apply to file
descriptors which are already open.

If the LANDLOCK_ACCESS_FS_IOCTL_DEV right is handled, only a small
number of safe IOCTL commands will be permitted on newly opened device
files.  These include FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC, as well
as other IOCTL commands for regular files which are implemented in
fs/ioctl.c.

Noteworthy scenarios which require special attention:

TTY devices are often passed into a process from the parent process,
and so a newly enabled Landlock policy does not retroactively apply to
them automatically.  In the past, TTY devices have often supported
IOCTL commands like TIOCSTI and some TIOCLINUX subcommands, which were
letting callers control the TTY input buffer (and simulate
keypresses).  This should be restricted to CAP_SYS_ADMIN programs on
modern kernels though.

Known limitations:

The LANDLOCK_ACCESS_FS_IOCTL_DEV access right is a coarse-grained
control over IOCTL commands.

Landlock users may use path-based restrictions in combination with
their knowledge about the file system layout to control what IOCTLs
can be done.

Cc: Paul Moore <paul@paul-moore.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Günther Noack <gnoack@google.com>
Link: https://lore.kernel.org/r/20240419161122.2023765-2-gnoack@google.com
Signed-off-by: Mickaël Salaün <mic@digikod.net>
---
 include/uapi/linux/landlock.h                |  38 ++++-
 security/landlock/fs.c                       | 225 ++++++++++++++++++++++++++-
 security/landlock/limits.h                   |   2 +-
 security/landlock/syscalls.c                 |   2 +-
 tools/testing/selftests/landlock/base_test.c |   2 +-
 tools/testing/selftests/landlock/fs_test.c   |   5 +-
 6 files changed, 258 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h
index 25c8d7677539..68625e728f43 100644
--- a/include/uapi/linux/landlock.h
+++ b/include/uapi/linux/landlock.h
@@ -128,7 +128,7 @@ struct landlock_net_port_attr {
  * files and directories.  Files or directories opened before the sandboxing
  * are not subject to these restrictions.
  *
- * A file can only receive these access rights:
+ * The following access rights apply only to files:
  *
  * - %LANDLOCK_ACCESS_FS_EXECUTE: Execute a file.
  * - %LANDLOCK_ACCESS_FS_WRITE_FILE: Open a file with write access. Note that
@@ -138,12 +138,13 @@ struct landlock_net_port_attr {
  * - %LANDLOCK_ACCESS_FS_READ_FILE: Open a file with read access.
  * - %LANDLOCK_ACCESS_FS_TRUNCATE: Truncate a file with :manpage:`truncate(2)`,
  *   :manpage:`ftruncate(2)`, :manpage:`creat(2)`, or :manpage:`open(2)` with
- *   ``O_TRUNC``. Whether an opened file can be truncated with
- *   :manpage:`ftruncate(2)` is determined during :manpage:`open(2)`, in the
- *   same way as read and write permissions are checked during
- *   :manpage:`open(2)` using %LANDLOCK_ACCESS_FS_READ_FILE and
- *   %LANDLOCK_ACCESS_FS_WRITE_FILE. This access right is available since the
- *   third version of the Landlock ABI.
+ *   ``O_TRUNC``.  This access right is available since the third version of the
+ *   Landlock ABI.
+ *
+ * Whether an opened file can be truncated with :manpage:`ftruncate(2)` or used
+ * with `ioctl(2)` is determined during :manpage:`open(2)`, in the same way as
+ * read and write permissions are checked during :manpage:`open(2)` using
+ * %LANDLOCK_ACCESS_FS_READ_FILE and %LANDLOCK_ACCESS_FS_WRITE_FILE.
  *
  * A directory can receive access rights related to files or directories.  The
  * following access right is applied to the directory itself, and the
@@ -198,13 +199,33 @@ struct landlock_net_port_attr {
  *   If multiple requirements are not met, the ``EACCES`` error code takes
  *   precedence over ``EXDEV``.
  *
+ * The following access right applies both to files and directories:
+ *
+ * - %LANDLOCK_ACCESS_FS_IOCTL_DEV: Invoke :manpage:`ioctl(2)` commands on an opened
+ *   character or block device.
+ *
+ *   This access right applies to all `ioctl(2)` commands implemented by device
+ *   drivers.  However, the following common IOCTL commands continue to be
+ *   invokable independent of the %LANDLOCK_ACCESS_FS_IOCTL_DEV right:
+ *
+ *   * IOCTL commands targeting file descriptors (``FIOCLEX``, ``FIONCLEX``),
+ *   * IOCTL commands targeting file descriptions (``FIONBIO``, ``FIOASYNC``),
+ *   * IOCTL commands targeting file systems (``FIFREEZE``, ``FITHAW``,
+ *     ``FIGETBSZ``, ``FS_IOC_GETFSUUID``, ``FS_IOC_GETFSSYSFSPATH``)
+ *   * Some IOCTL commands which do not make sense when used with devices, but
+ *     whose implementations are safe and return the right error codes
+ *     (``FS_IOC_FIEMAP``, ``FICLONE``, ``FICLONERANGE``, ``FIDEDUPERANGE``)
+ *
+ *   This access right is available since the fifth version of the Landlock
+ *   ABI.
+ *
  * .. warning::
  *
  *   It is currently not possible to restrict some file-related actions
  *   accessible through these syscall families: :manpage:`chdir(2)`,
  *   :manpage:`stat(2)`, :manpage:`flock(2)`, :manpage:`chmod(2)`,
  *   :manpage:`chown(2)`, :manpage:`setxattr(2)`, :manpage:`utime(2)`,
- *   :manpage:`ioctl(2)`, :manpage:`fcntl(2)`, :manpage:`access(2)`.
+ *   :manpage:`fcntl(2)`, :manpage:`access(2)`.
  *   Future Landlock evolutions will enable to restrict them.
  */
 /* clang-format off */
@@ -223,6 +244,7 @@ struct landlock_net_port_attr {
 #define LANDLOCK_ACCESS_FS_MAKE_SYM			(1ULL << 12)
 #define LANDLOCK_ACCESS_FS_REFER			(1ULL << 13)
 #define LANDLOCK_ACCESS_FS_TRUNCATE			(1ULL << 14)
+#define LANDLOCK_ACCESS_FS_IOCTL_DEV			(1ULL << 15)
 /* clang-format on */
 
 /**
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index c15559432d3d..22d8b7c28074 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -5,8 +5,11 @@
  * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
  * Copyright © 2018-2020 ANSSI
  * Copyright © 2021-2022 Microsoft Corporation
+ * Copyright © 2022 Günther Noack <gnoack3000@gmail.com>
+ * Copyright © 2023-2024 Google LLC
  */
 
+#include <asm/ioctls.h>
 #include <kunit/test.h>
 #include <linux/atomic.h>
 #include <linux/bitops.h>
@@ -14,6 +17,7 @@
 #include <linux/compiler_types.h>
 #include <linux/dcache.h>
 #include <linux/err.h>
+#include <linux/falloc.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -29,6 +33,7 @@
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 #include <linux/workqueue.h>
+#include <uapi/linux/fiemap.h>
 #include <uapi/linux/landlock.h>
 
 #include "common.h"
@@ -84,6 +89,160 @@ static const struct landlock_object_underops landlock_fs_underops = {
 	.release = release_inode
 };
 
+/* IOCTL helpers */
+
+/**
+ * is_masked_device_ioctl - Determine whether an IOCTL command is always
+ * permitted with Landlock for device files.  These commands can not be
+ * restricted on device files by enforcing a Landlock policy.
+ *
+ * @cmd: The IOCTL command that is supposed to be run.
+ *
+ * By default, any IOCTL on a device file requires the
+ * LANDLOCK_ACCESS_FS_IOCTL_DEV right.  However, we blanket-permit some
+ * commands, if:
+ *
+ * 1. The command is implemented in fs/ioctl.c's do_vfs_ioctl(),
+ *    not in f_ops->unlocked_ioctl() or f_ops->compat_ioctl().
+ *
+ * 2. The command is harmless when invoked on devices.
+ *
+ * We also permit commands that do not make sense for devices, but where the
+ * do_vfs_ioctl() implementation returns a more conventional error code.
+ *
+ * Any new IOCTL commands that are implemented in fs/ioctl.c's do_vfs_ioctl()
+ * should be considered for inclusion here.
+ *
+ * Returns: true if the IOCTL @cmd can not be restricted with Landlock for
+ * device files.
+ */
+static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd)
+{
+	switch (cmd) {
+	/*
+	 * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's
+	 * close-on-exec and the file's buffered-IO and async flags.  These
+	 * operations are also available through fcntl(2), and are
+	 * unconditionally permitted in Landlock.
+	 */
+	case FIOCLEX:
+	case FIONCLEX:
+	case FIONBIO:
+	case FIOASYNC:
+	/*
+	 * FIOQSIZE queries the size of a regular file, directory, or link.
+	 *
+	 * We still permit it, because it always returns -ENOTTY for
+	 * other file types.
+	 */
+	case FIOQSIZE:
+	/*
+	 * FIFREEZE and FITHAW freeze and thaw the file system which the
+	 * given file belongs to.  Requires CAP_SYS_ADMIN.
+	 *
+	 * These commands operate on the file system's superblock rather
+	 * than on the file itself.  The same operations can also be
+	 * done through any other file or directory on the same file
+	 * system, so it is safe to permit these.
+	 */
+	case FIFREEZE:
+	case FITHAW:
+	/*
+	 * FS_IOC_FIEMAP queries information about the allocation of
+	 * blocks within a file.
+	 *
+	 * This IOCTL command only makes sense for regular files and is
+	 * not implemented by devices. It is harmless to permit.
+	 */
+	case FS_IOC_FIEMAP:
+	/*
+	 * FIGETBSZ queries the file system's block size for a file or
+	 * directory.
+	 *
+	 * This command operates on the file system's superblock rather
+	 * than on the file itself.  The same operation can also be done
+	 * through any other file or directory on the same file system,
+	 * so it is safe to permit it.
+	 */
+	case FIGETBSZ:
+	/*
+	 * FICLONE, FICLONERANGE and FIDEDUPERANGE make files share
+	 * their underlying storage ("reflink") between source and
+	 * destination FDs, on file systems which support that.
+	 *
+	 * These IOCTL commands only apply to regular files
+	 * and are harmless to permit for device files.
+	 */
+	case FICLONE:
+	case FICLONERANGE:
+	case FIDEDUPERANGE:
+	/*
+	 * FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH both operate on
+	 * the file system superblock, not on the specific file, so
+	 * these operations are available through any other file on the
+	 * same file system as well.
+	 */
+	case FS_IOC_GETFSUUID:
+	case FS_IOC_GETFSSYSFSPATH:
+		return true;
+
+	/*
+	 * FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and
+	 * FS_IOC_FSSETXATTR are forwarded to device implementations.
+	 */
+
+	/*
+	 * file_ioctl() commands (FIBMAP, FS_IOC_RESVSP, FS_IOC_RESVSP64,
+	 * FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and FS_IOC_ZERO_RANGE) are
+	 * forwarded to device implementations, so not permitted.
+	 */
+
+	/* Other commands are guarded by the access right. */
+	default:
+		return false;
+	}
+}
+
+/*
+ * is_masked_device_ioctl_compat - same as the helper above, but checking the
+ * "compat" IOCTL commands.
+ *
+ * The IOCTL commands with special handling in compat-mode should behave the
+ * same as their non-compat counterparts.
+ */
+static __attribute_const__ bool
+is_masked_device_ioctl_compat(const unsigned int cmd)
+{
+	switch (cmd) {
+	/* FICLONE is permitted, same as in the non-compat variant. */
+	case FICLONE:
+		return true;
+
+#if defined(CONFIG_X86_64)
+	/*
+	 * FS_IOC_RESVSP_32, FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32,
+	 * FS_IOC_UNRESVSP64_32, FS_IOC_ZERO_RANGE_32: not blanket-permitted,
+	 * for consistency with their non-compat variants.
+	 */
+	case FS_IOC_RESVSP_32:
+	case FS_IOC_RESVSP64_32:
+	case FS_IOC_UNRESVSP_32:
+	case FS_IOC_UNRESVSP64_32:
+	case FS_IOC_ZERO_RANGE_32:
+#endif
+
+	/*
+	 * FS_IOC32_GETFLAGS, FS_IOC32_SETFLAGS are forwarded to their device
+	 * implementations.
+	 */
+	case FS_IOC32_GETFLAGS:
+	case FS_IOC32_SETFLAGS:
+		return false;
+	default:
+		return is_masked_device_ioctl(cmd);
+	}
+}
+
 /* Ruleset management */
 
 static struct landlock_object *get_inode_object(struct inode *const inode)
@@ -148,7 +307,8 @@ retry:
 	LANDLOCK_ACCESS_FS_EXECUTE | \
 	LANDLOCK_ACCESS_FS_WRITE_FILE | \
 	LANDLOCK_ACCESS_FS_READ_FILE | \
-	LANDLOCK_ACCESS_FS_TRUNCATE)
+	LANDLOCK_ACCESS_FS_TRUNCATE | \
+	LANDLOCK_ACCESS_FS_IOCTL_DEV)
 /* clang-format on */
 
 /*
@@ -1332,11 +1492,18 @@ static int hook_file_alloc_security(struct file *const file)
 	return 0;
 }
 
+static bool is_device(const struct file *const file)
+{
+	const struct inode *inode = file_inode(file);
+
+	return S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode);
+}
+
 static int hook_file_open(struct file *const file)
 {
 	layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
-	access_mask_t open_access_request, full_access_request, allowed_access;
-	const access_mask_t optional_access = LANDLOCK_ACCESS_FS_TRUNCATE;
+	access_mask_t open_access_request, full_access_request, allowed_access,
+		optional_access;
 	const struct landlock_ruleset *const dom =
 		get_fs_domain(landlock_cred(file->f_cred)->domain);
 
@@ -1354,6 +1521,10 @@ static int hook_file_open(struct file *const file)
 	 * We look up more access than what we immediately need for open(), so
 	 * that we can later authorize operations on opened files.
 	 */
+	optional_access = LANDLOCK_ACCESS_FS_TRUNCATE;
+	if (is_device(file))
+		optional_access |= LANDLOCK_ACCESS_FS_IOCTL_DEV;
+
 	full_access_request = open_access_request | optional_access;
 
 	if (is_access_to_paths_allowed(
@@ -1410,6 +1581,52 @@ static int hook_file_truncate(struct file *const file)
 	return -EACCES;
 }
 
+static int hook_file_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	access_mask_t allowed_access = landlock_file(file)->allowed_access;
+
+	/*
+	 * It is the access rights at the time of opening the file which
+	 * determine whether IOCTL can be used on the opened file later.
+	 *
+	 * The access right is attached to the opened file in hook_file_open().
+	 */
+	if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV)
+		return 0;
+
+	if (!is_device(file))
+		return 0;
+
+	if (is_masked_device_ioctl(cmd))
+		return 0;
+
+	return -EACCES;
+}
+
+static int hook_file_ioctl_compat(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	access_mask_t allowed_access = landlock_file(file)->allowed_access;
+
+	/*
+	 * It is the access rights at the time of opening the file which
+	 * determine whether IOCTL can be used on the opened file later.
+	 *
+	 * The access right is attached to the opened file in hook_file_open().
+	 */
+	if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV)
+		return 0;
+
+	if (!is_device(file))
+		return 0;
+
+	if (is_masked_device_ioctl_compat(cmd))
+		return 0;
+
+	return -EACCES;
+}
+
 static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(inode_free_security, hook_inode_free_security),
 
@@ -1432,6 +1649,8 @@ static struct security_hook_list landlock_hooks[] __ro_after_init = {
 	LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security),
 	LSM_HOOK_INIT(file_open, hook_file_open),
 	LSM_HOOK_INIT(file_truncate, hook_file_truncate),
+	LSM_HOOK_INIT(file_ioctl, hook_file_ioctl),
+	LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat),
 };
 
 __init void landlock_add_fs_hooks(void)
diff --git a/security/landlock/limits.h b/security/landlock/limits.h
index 93c9c6f91556..20fdb5ff3514 100644
--- a/security/landlock/limits.h
+++ b/security/landlock/limits.h
@@ -18,7 +18,7 @@
 #define LANDLOCK_MAX_NUM_LAYERS		16
 #define LANDLOCK_MAX_NUM_RULES		U32_MAX
 
-#define LANDLOCK_LAST_ACCESS_FS		LANDLOCK_ACCESS_FS_TRUNCATE
+#define LANDLOCK_LAST_ACCESS_FS		LANDLOCK_ACCESS_FS_IOCTL_DEV
 #define LANDLOCK_MASK_ACCESS_FS		((LANDLOCK_LAST_ACCESS_FS << 1) - 1)
 #define LANDLOCK_NUM_ACCESS_FS		__const_hweight64(LANDLOCK_MASK_ACCESS_FS)
 #define LANDLOCK_SHIFT_ACCESS_FS	0
diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c
index 6788e73b6681..03b470f5a85a 100644
--- a/security/landlock/syscalls.c
+++ b/security/landlock/syscalls.c
@@ -149,7 +149,7 @@ static const struct file_operations ruleset_fops = {
 	.write = fop_dummy_write,
 };
 
-#define LANDLOCK_ABI_VERSION 4
+#define LANDLOCK_ABI_VERSION 5
 
 /**
  * sys_landlock_create_ruleset - Create a new ruleset
diff --git a/tools/testing/selftests/landlock/base_test.c b/tools/testing/selftests/landlock/base_test.c
index a6f89aaea77d..3c1e9f35b531 100644
--- a/tools/testing/selftests/landlock/base_test.c
+++ b/tools/testing/selftests/landlock/base_test.c
@@ -75,7 +75,7 @@ TEST(abi_version)
 	const struct landlock_ruleset_attr ruleset_attr = {
 		.handled_access_fs = LANDLOCK_ACCESS_FS_READ_FILE,
 	};
-	ASSERT_EQ(4, landlock_create_ruleset(NULL, 0,
+	ASSERT_EQ(5, landlock_create_ruleset(NULL, 0,
 					     LANDLOCK_CREATE_RULESET_VERSION));
 
 	ASSERT_EQ(-1, landlock_create_ruleset(&ruleset_attr, 0,
diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c
index 27744524df51..17b00e6c778d 100644
--- a/tools/testing/selftests/landlock/fs_test.c
+++ b/tools/testing/selftests/landlock/fs_test.c
@@ -536,9 +536,10 @@ TEST_F_FORK(layout1, inval)
 	LANDLOCK_ACCESS_FS_EXECUTE | \
 	LANDLOCK_ACCESS_FS_WRITE_FILE | \
 	LANDLOCK_ACCESS_FS_READ_FILE | \
-	LANDLOCK_ACCESS_FS_TRUNCATE)
+	LANDLOCK_ACCESS_FS_TRUNCATE | \
+	LANDLOCK_ACCESS_FS_IOCTL_DEV)
 
-#define ACCESS_LAST LANDLOCK_ACCESS_FS_TRUNCATE
+#define ACCESS_LAST LANDLOCK_ACCESS_FS_IOCTL_DEV
 
 #define ACCESS_ALL ( \
 	ACCESS_FILE | \
-- 
cgit v1.2.3


From e3193f0fbd6d83510ff6879ac248f42a7c0fefe7 Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Thu, 2 May 2024 18:51:55 +0200
Subject: drm/panthor: Make sure the tiler initial/max chunks are consistent

It doesn't make sense to have a maximum number of chunks smaller than
the initial number of chunks attached to the context.

Fix the uAPI header to reflect the new constraint, and mention the
undocumented "initial_chunk_count > 0" constraint while at it.

v3:
- Add R-b

v2:
- Fix the check

Fixes: 9cca48fa4f89 ("drm/panthor: Add the heap logical block")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240502165158.1458959-3-boris.brezillon@collabora.com
---
 drivers/gpu/drm/panthor/panthor_heap.c | 3 +++
 include/uapi/drm/panthor_drm.h         | 8 ++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/panthor/panthor_heap.c b/drivers/gpu/drm/panthor/panthor_heap.c
index c3c0ba744937..3be86ec383d6 100644
--- a/drivers/gpu/drm/panthor/panthor_heap.c
+++ b/drivers/gpu/drm/panthor/panthor_heap.c
@@ -281,6 +281,9 @@ int panthor_heap_create(struct panthor_heap_pool *pool,
 	if (initial_chunk_count == 0)
 		return -EINVAL;
 
+	if (initial_chunk_count > max_chunks)
+		return -EINVAL;
+
 	if (hweight32(chunk_size) != 1 ||
 	    chunk_size < SZ_256K || chunk_size > SZ_2M)
 		return -EINVAL;
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index dadb05ab1235..5db80a0682d5 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -895,13 +895,17 @@ struct drm_panthor_tiler_heap_create {
 	/** @vm_id: VM ID the tiler heap should be mapped to */
 	__u32 vm_id;
 
-	/** @initial_chunk_count: Initial number of chunks to allocate. */
+	/** @initial_chunk_count: Initial number of chunks to allocate. Must be at least one. */
 	__u32 initial_chunk_count;
 
 	/** @chunk_size: Chunk size. Must be a power of two at least 256KB large. */
 	__u32 chunk_size;
 
-	/** @max_chunks: Maximum number of chunks that can be allocated. */
+	/**
+	 * @max_chunks: Maximum number of chunks that can be allocated.
+	 *
+	 * Must be at least @initial_chunk_count.
+	 */
 	__u32 max_chunks;
 
 	/**
-- 
cgit v1.2.3


From 69a429905ceccad547e4a532b08f9d32c7f3422a Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Thu, 2 May 2024 18:51:56 +0200
Subject: drm/panthor: Relax the constraints on the tiler chunk size

The field used to store the chunk size if 12 bits wide, and the encoding
is chunk_size = chunk_header.chunk_size << 12, which gives us a
theoretical [4k:8M] range. This range is further limited by
implementation constraints, and all known implementations seem to
impose a [128k:8M] range, so do the same here.

We also relax the power-of-two constraint, which doesn't seem to
exist on v10. This will allow userspace to fine-tune initial/max
tiler memory on memory-constrained devices.

v4:
- Actually fix the range in the kerneldoc

v3:
- Add R-bs
- Fix valid range in the kerneldoc

v2:
- Turn the power-of-two constraint into a page-aligned constraint to allow
  fine-tune of the initial/max heap memory size
- Fix the panthor_heap_create() kerneldoc

Fixes: 9cca48fa4f89 ("drm/panthor: Add the heap logical block")
Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240502165158.1458959-4-boris.brezillon@collabora.com
---
 drivers/gpu/drm/panthor/panthor_heap.c | 8 ++++----
 include/uapi/drm/panthor_drm.h         | 6 +++++-
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/panthor/panthor_heap.c b/drivers/gpu/drm/panthor/panthor_heap.c
index 3be86ec383d6..b0fc5b9ee847 100644
--- a/drivers/gpu/drm/panthor/panthor_heap.c
+++ b/drivers/gpu/drm/panthor/panthor_heap.c
@@ -253,8 +253,8 @@ int panthor_heap_destroy(struct panthor_heap_pool *pool, u32 handle)
  * @pool: Pool to instantiate the heap context from.
  * @initial_chunk_count: Number of chunk allocated at initialization time.
  * Must be at least 1.
- * @chunk_size: The size of each chunk. Must be a power of two between 256k
- * and 2M.
+ * @chunk_size: The size of each chunk. Must be page-aligned and lie in the
+ * [128k:8M] range.
  * @max_chunks: Maximum number of chunks that can be allocated.
  * @target_in_flight: Maximum number of in-flight render passes.
  * @heap_ctx_gpu_va: Pointer holding the GPU address of the allocated heap
@@ -284,8 +284,8 @@ int panthor_heap_create(struct panthor_heap_pool *pool,
 	if (initial_chunk_count > max_chunks)
 		return -EINVAL;
 
-	if (hweight32(chunk_size) != 1 ||
-	    chunk_size < SZ_256K || chunk_size > SZ_2M)
+	if (!IS_ALIGNED(chunk_size, PAGE_SIZE) ||
+	    chunk_size < SZ_128K || chunk_size > SZ_8M)
 		return -EINVAL;
 
 	down_read(&pool->lock);
diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index 5db80a0682d5..b8220d2e698f 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -898,7 +898,11 @@ struct drm_panthor_tiler_heap_create {
 	/** @initial_chunk_count: Initial number of chunks to allocate. Must be at least one. */
 	__u32 initial_chunk_count;
 
-	/** @chunk_size: Chunk size. Must be a power of two at least 256KB large. */
+	/**
+	 * @chunk_size: Chunk size.
+	 *
+	 * Must be page-aligned and lie in the [128k:8M] range.
+	 */
 	__u32 chunk_size;
 
 	/**
-- 
cgit v1.2.3


From 591eafcd46e09a2468ecf5cdceea676ac72d84bc Mon Sep 17 00:00:00 2001
From: Boris Brezillon <boris.brezillon@collabora.com>
Date: Thu, 2 May 2024 18:51:58 +0200
Subject: drm/panthor: Document drm_panthor_tiler_heap_destroy::handle validity
 constraints

Make sure the user is aware that drm_panthor_tiler_heap_destroy::handle
must be a handle previously returned by
DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE.

v4:
- Add Steve's R-b

v3:
- New patch

Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Steven Price <steven.price@arm.com>
Reviewed-by: Liviu Dudau <liviu.dudau@arm.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240502165158.1458959-6-boris.brezillon@collabora.com
---
 include/uapi/drm/panthor_drm.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/drm/panthor_drm.h b/include/uapi/drm/panthor_drm.h
index b8220d2e698f..aaed8e12ad0b 100644
--- a/include/uapi/drm/panthor_drm.h
+++ b/include/uapi/drm/panthor_drm.h
@@ -939,7 +939,11 @@ struct drm_panthor_tiler_heap_create {
  * struct drm_panthor_tiler_heap_destroy - Arguments passed to DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY
  */
 struct drm_panthor_tiler_heap_destroy {
-	/** @handle: Handle of the tiler heap to destroy */
+	/**
+	 * @handle: Handle of the tiler heap to destroy.
+	 *
+	 * Must be a valid heap handle returned by DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE.
+	 */
 	__u32 handle;
 
 	/** @pad: Padding field, MBZ. */
-- 
cgit v1.2.3


From 95feb3160eef0caa6018e175a5560b816aee8e79 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 21 Mar 2024 19:44:07 +0000
Subject: VFIO: Add the SPR_DSA and SPR_IAX devices to the denylist

Due to an erratum with the SPR_DSA and SPR_IAX devices, it is not secure to assign
these devices to virtual machines. Add the PCI IDs of these devices to the VFIO
denylist to ensure that this is handled appropriately by the VFIO subsystem.

The SPR_DSA and SPR_IAX devices are on-SOC devices for the Sapphire Rapids
(and related) family of products that perform data movement and compression.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 drivers/dma/idxd/registers.h | 3 ---
 drivers/vfio/pci/vfio_pci.c  | 2 ++
 include/linux/pci_ids.h      | 2 ++
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/drivers/dma/idxd/registers.h b/drivers/dma/idxd/registers.h
index 315c004f58e4..e16dbf9ab324 100644
--- a/drivers/dma/idxd/registers.h
+++ b/drivers/dma/idxd/registers.h
@@ -6,9 +6,6 @@
 #include <uapi/linux/idxd.h>
 
 /* PCI Config */
-#define PCI_DEVICE_ID_INTEL_DSA_SPR0	0x0b25
-#define PCI_DEVICE_ID_INTEL_IAX_SPR0	0x0cfe
-
 #define DEVICE_VERSION_1		0x100
 #define DEVICE_VERSION_2		0x200
 
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cb5b7f865d58..e727941f589d 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
 		case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
 		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
 		case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
+		case PCI_DEVICE_ID_INTEL_DSA_SPR0:
+		case PCI_DEVICE_ID_INTEL_IAX_SPR0:
 			return true;
 		default:
 			return false;
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c547d1d4feb1..4beb29907c2b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2687,8 +2687,10 @@
 #define PCI_DEVICE_ID_INTEL_I960	0x0960
 #define PCI_DEVICE_ID_INTEL_I960RM	0x0962
 #define PCI_DEVICE_ID_INTEL_HDA_HSW_0	0x0a0c
+#define PCI_DEVICE_ID_INTEL_DSA_SPR0	0x0b25
 #define PCI_DEVICE_ID_INTEL_HDA_HSW_2	0x0c0c
 #define PCI_DEVICE_ID_INTEL_CENTERTON_ILB	0x0c60
+#define PCI_DEVICE_ID_INTEL_IAX_SPR0	0x0cfe
 #define PCI_DEVICE_ID_INTEL_HDA_HSW_3	0x0d0c
 #define PCI_DEVICE_ID_INTEL_HDA_BYT	0x0f04
 #define PCI_DEVICE_ID_INTEL_SST_BYT	0x0f28
-- 
cgit v1.2.3


From fec1982d70721c0062758861fec7e4e9d1103fb6 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Thu, 18 Apr 2024 22:55:39 +0200
Subject: i2c: mux: Remove class argument from i2c_mux_add_adapter()

99a741aa7a2d ("i2c: mux: gpio: remove support for class-based device
instantiation") removed the last call to i2c_mux_add_adapter() with a
non-null class argument. Therefore the class argument can be removed.

Note: Class-based device instantiation is a legacy mechanism which
shouldn't be used in new code, so we can rule out that this argument
may be needed again in the future.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Reviewed-by: Thomas Zimmermann <tzimmermann@suse.de>
Acked-by: Peter Rosin <peda@axentia.se>
Reviewed-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Acked-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Acked-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
---
 drivers/gpu/drm/bridge/sii902x.c           |  2 +-
 drivers/i2c/i2c-mux.c                      | 24 +-----------------------
 drivers/i2c/muxes/i2c-arb-gpio-challenge.c |  2 +-
 drivers/i2c/muxes/i2c-mux-gpio.c           |  2 +-
 drivers/i2c/muxes/i2c-mux-gpmux.c          |  2 +-
 drivers/i2c/muxes/i2c-mux-ltc4306.c        |  2 +-
 drivers/i2c/muxes/i2c-mux-mlxcpld.c        |  2 +-
 drivers/i2c/muxes/i2c-mux-pca9541.c        |  2 +-
 drivers/i2c/muxes/i2c-mux-pca954x.c        |  2 +-
 drivers/i2c/muxes/i2c-mux-pinctrl.c        |  2 +-
 drivers/i2c/muxes/i2c-mux-reg.c            |  2 +-
 drivers/iio/gyro/mpu3050-i2c.c             |  2 +-
 drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c  |  2 +-
 drivers/media/dvb-frontends/af9013.c       |  2 +-
 drivers/media/dvb-frontends/lgdt3306a.c    |  2 +-
 drivers/media/dvb-frontends/m88ds3103.c    |  2 +-
 drivers/media/dvb-frontends/rtl2830.c      |  2 +-
 drivers/media/dvb-frontends/rtl2832.c      |  2 +-
 drivers/media/dvb-frontends/si2168.c       |  2 +-
 drivers/media/i2c/max9286.c                |  2 +-
 drivers/media/usb/cx231xx/cx231xx-i2c.c    |  5 +----
 drivers/of/unittest.c                      |  2 +-
 include/linux/i2c-mux.h                    |  3 +--
 23 files changed, 23 insertions(+), 49 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/bridge/sii902x.c b/drivers/gpu/drm/bridge/sii902x.c
index 8f84e98249c7..2fbeda9025bf 100644
--- a/drivers/gpu/drm/bridge/sii902x.c
+++ b/drivers/gpu/drm/bridge/sii902x.c
@@ -1092,7 +1092,7 @@ static int sii902x_init(struct sii902x *sii902x)
 	}
 
 	sii902x->i2cmux->priv = sii902x;
-	ret = i2c_mux_add_adapter(sii902x->i2cmux, 0, 0, 0);
+	ret = i2c_mux_add_adapter(sii902x->i2cmux, 0, 0);
 	if (ret)
 		goto err_unreg_audio;
 
diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c
index 57ff09f18c37..fda72e8be885 100644
--- a/drivers/i2c/i2c-mux.c
+++ b/drivers/i2c/i2c-mux.c
@@ -127,19 +127,6 @@ static u32 i2c_mux_functionality(struct i2c_adapter *adap)
 	return parent->algo->functionality(parent);
 }
 
-/* Return all parent classes, merged */
-static unsigned int i2c_mux_parent_classes(struct i2c_adapter *parent)
-{
-	unsigned int class = 0;
-
-	do {
-		class |= parent->class;
-		parent = i2c_parent_is_i2c_adapter(parent);
-	} while (parent);
-
-	return class;
-}
-
 static void i2c_mux_lock_bus(struct i2c_adapter *adapter, unsigned int flags)
 {
 	struct i2c_mux_priv *priv = adapter->algo_data;
@@ -281,8 +268,7 @@ static const struct i2c_lock_operations i2c_parent_lock_ops = {
 };
 
 int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
-			u32 force_nr, u32 chan_id,
-			unsigned int class)
+			u32 force_nr, u32 chan_id)
 {
 	struct i2c_adapter *parent = muxc->parent;
 	struct i2c_mux_priv *priv;
@@ -340,14 +326,6 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
 	else
 		priv->adap.lock_ops = &i2c_parent_lock_ops;
 
-	/* Sanity check on class */
-	if (i2c_mux_parent_classes(parent) & class & ~I2C_CLASS_DEPRECATED)
-		dev_err(&parent->dev,
-			"Segment %d behind mux can't share classes with ancestors\n",
-			chan_id);
-	else
-		priv->adap.class = class;
-
 	/*
 	 * Try to populate the mux adapter's of_node, expands to
 	 * nothing if !CONFIG_OF.
diff --git a/drivers/i2c/muxes/i2c-arb-gpio-challenge.c b/drivers/i2c/muxes/i2c-arb-gpio-challenge.c
index 24168e9f7df4..7aa6e795d833 100644
--- a/drivers/i2c/muxes/i2c-arb-gpio-challenge.c
+++ b/drivers/i2c/muxes/i2c-arb-gpio-challenge.c
@@ -167,7 +167,7 @@ static int i2c_arbitrator_probe(struct platform_device *pdev)
 	}
 
 	/* Actually add the mux adapter */
-	ret = i2c_mux_add_adapter(muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(muxc, 0, 0);
 	if (ret)
 		i2c_put_adapter(muxc->parent);
 
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index 0fbb33a3d518..d6bbb8b68333 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -207,7 +207,7 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
 	for (i = 0; i < mux->data.n_values; i++) {
 		u32 nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
 
-		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], 0);
+		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i]);
 		if (ret)
 			goto add_adapter_failed;
 	}
diff --git a/drivers/i2c/muxes/i2c-mux-gpmux.c b/drivers/i2c/muxes/i2c-mux-gpmux.c
index 8305661e1253..10d63307b14d 100644
--- a/drivers/i2c/muxes/i2c-mux-gpmux.c
+++ b/drivers/i2c/muxes/i2c-mux-gpmux.c
@@ -124,7 +124,7 @@ static int i2c_mux_probe(struct platform_device *pdev)
 			goto err_children;
 		}
 
-		ret = i2c_mux_add_adapter(muxc, 0, chan, 0);
+		ret = i2c_mux_add_adapter(muxc, 0, chan);
 		if (ret)
 			goto err_children;
 	}
diff --git a/drivers/i2c/muxes/i2c-mux-ltc4306.c b/drivers/i2c/muxes/i2c-mux-ltc4306.c
index 23766d853e76..19a7c370946d 100644
--- a/drivers/i2c/muxes/i2c-mux-ltc4306.c
+++ b/drivers/i2c/muxes/i2c-mux-ltc4306.c
@@ -279,7 +279,7 @@ static int ltc4306_probe(struct i2c_client *client)
 
 	/* Now create an adapter for each channel */
 	for (num = 0; num < chip->nchans; num++) {
-		ret = i2c_mux_add_adapter(muxc, 0, num, 0);
+		ret = i2c_mux_add_adapter(muxc, 0, num);
 		if (ret) {
 			i2c_mux_del_adapters(muxc);
 			return ret;
diff --git a/drivers/i2c/muxes/i2c-mux-mlxcpld.c b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
index 4c6ed1d58c79..3f06aa3331a7 100644
--- a/drivers/i2c/muxes/i2c-mux-mlxcpld.c
+++ b/drivers/i2c/muxes/i2c-mux-mlxcpld.c
@@ -154,7 +154,7 @@ static int mlxcpld_mux_probe(struct platform_device *pdev)
 
 	/* Create an adapter for each channel. */
 	for (num = 0; num < pdata->num_adaps; num++) {
-		err = i2c_mux_add_adapter(muxc, 0, pdata->chan_ids[num], 0);
+		err = i2c_mux_add_adapter(muxc, 0, pdata->chan_ids[num]);
 		if (err)
 			goto virt_reg_failed;
 	}
diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c
index ce0fb69249a8..e28694d991fb 100644
--- a/drivers/i2c/muxes/i2c-mux-pca9541.c
+++ b/drivers/i2c/muxes/i2c-mux-pca9541.c
@@ -314,7 +314,7 @@ static int pca9541_probe(struct i2c_client *client)
 
 	i2c_set_clientdata(client, muxc);
 
-	ret = i2c_mux_add_adapter(muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(muxc, 0, 0);
 	if (ret)
 		return ret;
 
diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c
index c3f4ff08ac38..6f84018258c4 100644
--- a/drivers/i2c/muxes/i2c-mux-pca954x.c
+++ b/drivers/i2c/muxes/i2c-mux-pca954x.c
@@ -644,7 +644,7 @@ static int pca954x_probe(struct i2c_client *client)
 
 	/* Now create an adapter for each channel */
 	for (num = 0; num < data->chip->nchans; num++) {
-		ret = i2c_mux_add_adapter(muxc, 0, num, 0);
+		ret = i2c_mux_add_adapter(muxc, 0, num);
 		if (ret)
 			goto fail_cleanup;
 	}
diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c
index 6ebca7bfd8a2..02aaf0781e9c 100644
--- a/drivers/i2c/muxes/i2c-mux-pinctrl.c
+++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c
@@ -151,7 +151,7 @@ static int i2c_mux_pinctrl_probe(struct platform_device *pdev)
 
 	/* Do not add any adapter for the idle state (if it's there at all). */
 	for (i = 0; i < num_names - !!muxc->deselect; i++) {
-		ret = i2c_mux_add_adapter(muxc, 0, i, 0);
+		ret = i2c_mux_add_adapter(muxc, 0, i);
 		if (ret)
 			goto err_del_adapter;
 	}
diff --git a/drivers/i2c/muxes/i2c-mux-reg.c b/drivers/i2c/muxes/i2c-mux-reg.c
index 8489971babd3..ef765fcd33f5 100644
--- a/drivers/i2c/muxes/i2c-mux-reg.c
+++ b/drivers/i2c/muxes/i2c-mux-reg.c
@@ -213,7 +213,7 @@ static int i2c_mux_reg_probe(struct platform_device *pdev)
 	for (i = 0; i < mux->data.n_values; i++) {
 		nr = mux->data.base_nr ? (mux->data.base_nr + i) : 0;
 
-		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i], 0);
+		ret = i2c_mux_add_adapter(muxc, nr, mux->data.values[i]);
 		if (ret)
 			goto err_del_mux_adapters;
 	}
diff --git a/drivers/iio/gyro/mpu3050-i2c.c b/drivers/iio/gyro/mpu3050-i2c.c
index 52b6feed2637..29ecfa6fd633 100644
--- a/drivers/iio/gyro/mpu3050-i2c.c
+++ b/drivers/iio/gyro/mpu3050-i2c.c
@@ -72,7 +72,7 @@ static int mpu3050_i2c_probe(struct i2c_client *client)
 	else {
 		mpu3050->i2cmux->priv = mpu3050;
 		/* Ignore failure, not critical */
-		i2c_mux_add_adapter(mpu3050->i2cmux, 0, 0, 0);
+		i2c_mux_add_adapter(mpu3050->i2cmux, 0, 0);
 	}
 
 	return 0;
diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c
index 410ea39fd495..0e03137fb3d4 100644
--- a/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c
+++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_i2c.c
@@ -142,7 +142,7 @@ static int inv_mpu_probe(struct i2c_client *client)
 		if (!st->muxc)
 			return -ENOMEM;
 		st->muxc->priv = dev_get_drvdata(&client->dev);
-		result = i2c_mux_add_adapter(st->muxc, 0, 0, 0);
+		result = i2c_mux_add_adapter(st->muxc, 0, 0);
 		if (result)
 			return result;
 		result = inv_mpu_acpi_create_mux_client(client);
diff --git a/drivers/media/dvb-frontends/af9013.c b/drivers/media/dvb-frontends/af9013.c
index a829c89792a4..5afdbe244596 100644
--- a/drivers/media/dvb-frontends/af9013.c
+++ b/drivers/media/dvb-frontends/af9013.c
@@ -1480,7 +1480,7 @@ static int af9013_probe(struct i2c_client *client)
 		goto err_regmap_exit;
 	}
 	state->muxc->priv = state;
-	ret = i2c_mux_add_adapter(state->muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(state->muxc, 0, 0);
 	if (ret)
 		goto err_regmap_exit;
 
diff --git a/drivers/media/dvb-frontends/lgdt3306a.c b/drivers/media/dvb-frontends/lgdt3306a.c
index 263887592415..91c71b24cacb 100644
--- a/drivers/media/dvb-frontends/lgdt3306a.c
+++ b/drivers/media/dvb-frontends/lgdt3306a.c
@@ -2203,7 +2203,7 @@ static int lgdt3306a_probe(struct i2c_client *client)
 		goto err_kfree;
 	}
 	state->muxc->priv = client;
-	ret = i2c_mux_add_adapter(state->muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(state->muxc, 0, 0);
 	if (ret)
 		goto err_kfree;
 
diff --git a/drivers/media/dvb-frontends/m88ds3103.c b/drivers/media/dvb-frontends/m88ds3103.c
index e0272054fca5..6c69bcc7a1bc 100644
--- a/drivers/media/dvb-frontends/m88ds3103.c
+++ b/drivers/media/dvb-frontends/m88ds3103.c
@@ -1866,7 +1866,7 @@ static int m88ds3103_probe(struct i2c_client *client)
 		goto err_kfree;
 	}
 	dev->muxc->priv = dev;
-	ret = i2c_mux_add_adapter(dev->muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(dev->muxc, 0, 0);
 	if (ret)
 		goto err_kfree;
 
diff --git a/drivers/media/dvb-frontends/rtl2830.c b/drivers/media/dvb-frontends/rtl2830.c
index 35c969fd2cb5..30d10fe4b33e 100644
--- a/drivers/media/dvb-frontends/rtl2830.c
+++ b/drivers/media/dvb-frontends/rtl2830.c
@@ -838,7 +838,7 @@ static int rtl2830_probe(struct i2c_client *client)
 		goto err_regmap_exit;
 	}
 	dev->muxc->priv = client;
-	ret = i2c_mux_add_adapter(dev->muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(dev->muxc, 0, 0);
 	if (ret)
 		goto err_regmap_exit;
 
diff --git a/drivers/media/dvb-frontends/rtl2832.c b/drivers/media/dvb-frontends/rtl2832.c
index 601cf45c3935..5142820b1b3d 100644
--- a/drivers/media/dvb-frontends/rtl2832.c
+++ b/drivers/media/dvb-frontends/rtl2832.c
@@ -1082,7 +1082,7 @@ static int rtl2832_probe(struct i2c_client *client)
 		goto err_regmap_exit;
 	}
 	dev->muxc->priv = dev;
-	ret = i2c_mux_add_adapter(dev->muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(dev->muxc, 0, 0);
 	if (ret)
 		goto err_regmap_exit;
 
diff --git a/drivers/media/dvb-frontends/si2168.c b/drivers/media/dvb-frontends/si2168.c
index dae1f2153e8b..26828fd41e68 100644
--- a/drivers/media/dvb-frontends/si2168.c
+++ b/drivers/media/dvb-frontends/si2168.c
@@ -744,7 +744,7 @@ static int si2168_probe(struct i2c_client *client)
 		goto err_kfree;
 	}
 	dev->muxc->priv = client;
-	ret = i2c_mux_add_adapter(dev->muxc, 0, 0, 0);
+	ret = i2c_mux_add_adapter(dev->muxc, 0, 0);
 	if (ret)
 		goto err_kfree;
 
diff --git a/drivers/media/i2c/max9286.c b/drivers/media/i2c/max9286.c
index d685d445cf23..dfcb3fc03350 100644
--- a/drivers/media/i2c/max9286.c
+++ b/drivers/media/i2c/max9286.c
@@ -383,7 +383,7 @@ static int max9286_i2c_mux_init(struct max9286_priv *priv)
 	for_each_source(priv, source) {
 		unsigned int index = to_index(priv, source);
 
-		ret = i2c_mux_add_adapter(priv->mux, 0, index, 0);
+		ret = i2c_mux_add_adapter(priv->mux, 0, index);
 		if (ret < 0)
 			goto error;
 	}
diff --git a/drivers/media/usb/cx231xx/cx231xx-i2c.c b/drivers/media/usb/cx231xx/cx231xx-i2c.c
index c6659253c6fb..6da8e7943d94 100644
--- a/drivers/media/usb/cx231xx/cx231xx-i2c.c
+++ b/drivers/media/usb/cx231xx/cx231xx-i2c.c
@@ -567,10 +567,7 @@ int cx231xx_i2c_mux_create(struct cx231xx *dev)
 
 int cx231xx_i2c_mux_register(struct cx231xx *dev, int mux_no)
 {
-	return i2c_mux_add_adapter(dev->muxc,
-				   0,
-				   mux_no /* chan_id */,
-				   0 /* class */);
+	return i2c_mux_add_adapter(dev->muxc, 0, mux_no);
 }
 
 void cx231xx_i2c_mux_unregister(struct cx231xx *dev)
diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c
index 6b5c36b6a758..c8ee866d76d4 100644
--- a/drivers/of/unittest.c
+++ b/drivers/of/unittest.c
@@ -2815,7 +2815,7 @@ static int unittest_i2c_mux_probe(struct i2c_client *client)
 	if (!muxc)
 		return -ENOMEM;
 	for (i = 0; i < nchans; i++) {
-		if (i2c_mux_add_adapter(muxc, 0, i, 0)) {
+		if (i2c_mux_add_adapter(muxc, 0, i)) {
 			dev_err(dev, "Failed to register mux #%d\n", i);
 			i2c_mux_del_adapters(muxc);
 			return -ENODEV;
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
index 98ef73b7c8fd..1784ac7afb11 100644
--- a/include/linux/i2c-mux.h
+++ b/include/linux/i2c-mux.h
@@ -56,8 +56,7 @@ struct i2c_adapter *i2c_root_adapter(struct device *dev);
  * callback functions to perform hardware-specific mux control.
  */
 int i2c_mux_add_adapter(struct i2c_mux_core *muxc,
-			u32 force_nr, u32 chan_id,
-			unsigned int class);
+			u32 force_nr, u32 chan_id);
 
 void i2c_mux_del_adapters(struct i2c_mux_core *muxc);
 
-- 
cgit v1.2.3


From 959314c438caf1b62d787f02d54a193efda38880 Mon Sep 17 00:00:00 2001
From: Mohamed Ahmed <mohamedahmedegypt2001@gmail.com>
Date: Thu, 9 May 2024 23:43:52 +0300
Subject: drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations

Allow PTE kind and tile mode on BO create with VM_BIND, and add a
GETPARAM to indicate this change. This is needed to support modifiers in
NVK and ensure correctness when dealing with the nouveau GL driver.

The userspace modifiers implementation this is for can be found here:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24795

Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
Signed-off-by: Mohamed Ahmed <mohamedahmedegypt2001@gmail.com>
Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>
Signed-off-by: Danilo Krummrich <dakr@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240509204352.7597-1-mohamedahmedegypt2001@gmail.com
---
 drivers/gpu/drm/nouveau/nouveau_abi16.c |  3 +++
 drivers/gpu/drm/nouveau/nouveau_bo.c    | 44 ++++++++++++++-------------------
 include/uapi/drm/nouveau_drm.h          |  7 ++++++
 3 files changed, 29 insertions(+), 25 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c
index f465fe93b1f7..d56909071de6 100644
--- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
+++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
@@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
 		getparam->value = (u64)ttm_resource_manager_usage(vram_mgr);
 		break;
 	}
+	case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE:
+		getparam->value = 1;
+		break;
 	default:
 		NV_PRINTK(dbg, cli, "unknown parameter %lld\n", getparam->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 1e2d28fd10dc..70fb003a6666 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -241,28 +241,28 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size, int *align, u32 domain,
 	}
 
 	nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG);
-	if (!nouveau_cli_uvmm(cli) || internal) {
-		/* for BO noVM allocs, don't assign kinds */
-		if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
-			nvbo->kind = (tile_flags & 0x0000ff00) >> 8;
-			if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
-				kfree(nvbo);
-				return ERR_PTR(-EINVAL);
-			}
 
-			nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
-		} else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) {
-			nvbo->kind = (tile_flags & 0x00007f00) >> 8;
-			nvbo->comp = (tile_flags & 0x00030000) >> 16;
-			if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
-				kfree(nvbo);
-				return ERR_PTR(-EINVAL);
-			}
-		} else {
-			nvbo->zeta = (tile_flags & 0x00000007);
+	if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
+		nvbo->kind = (tile_flags & 0x0000ff00) >> 8;
+		if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
+			kfree(nvbo);
+			return ERR_PTR(-EINVAL);
+		}
+
+		nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
+	} else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) {
+		nvbo->kind = (tile_flags & 0x00007f00) >> 8;
+		nvbo->comp = (tile_flags & 0x00030000) >> 16;
+		if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
+			kfree(nvbo);
+			return ERR_PTR(-EINVAL);
 		}
-		nvbo->mode = tile_mode;
+	} else {
+		nvbo->zeta = (tile_flags & 0x00000007);
+	}
+	nvbo->mode = tile_mode;
 
+	if (!nouveau_cli_uvmm(cli) || internal) {
 		/* Determine the desirable target GPU page size for the buffer. */
 		for (i = 0; i < vmm->page_nr; i++) {
 			/* Because we cannot currently allow VMM maps to fail
@@ -304,12 +304,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size, int *align, u32 domain,
 		}
 		nvbo->page = vmm->page[pi].shift;
 	} else {
-		/* reject other tile flags when in VM mode. */
-		if (tile_mode)
-			return ERR_PTR(-EINVAL);
-		if (tile_flags & ~NOUVEAU_GEM_TILE_NONCONTIG)
-			return ERR_PTR(-EINVAL);
-
 		/* Determine the desirable target GPU page size for the buffer. */
 		for (i = 0; i < vmm->page_nr; i++) {
 			/* Because we cannot currently allow VMM maps to fail
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index 8ad8d1cd1566..dd87f8f30793 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -68,6 +68,13 @@ extern "C" {
  */
 #define NOUVEAU_GETPARAM_VRAM_USED 19
 
+/*
+ * NOUVEAU_GETPARAM_HAS_VMA_TILEMODE
+ *
+ * Query whether tile mode and PTE kind are accepted with VM allocs or not.
+ */
+#define NOUVEAU_GETPARAM_HAS_VMA_TILEMODE 20
+
 struct drm_nouveau_getparam {
 	__u64 param;
 	__u64 value;
-- 
cgit v1.2.3


From 186b1ea73ad8f30d1d7afdb1d07dfd5b5de8f2da Mon Sep 17 00:00:00 2001
From: Richard Gobert <richardbgobert@gmail.com>
Date: Thu, 9 May 2024 21:08:17 +0200
Subject: net: gro: use cb instead of skb->network_header

This patch converts references of skb->network_header to napi_gro_cb's
network_offset and inner_network_offset.

Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-2-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/gro.h        | 9 +++++++--
 net/ipv4/af_inet.c       | 4 ----
 net/ipv4/tcp_offload.c   | 3 ++-
 net/ipv6/ip6_offload.c   | 5 ++---
 net/ipv6/tcpv6_offload.c | 3 ++-
 5 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index 5df8bf318197..cbc1b0aaf295 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -181,12 +181,17 @@ static inline void *skb_gro_header(struct sk_buff *skb, unsigned int hlen,
 	return ptr;
 }
 
+static inline int skb_gro_receive_network_offset(const struct sk_buff *skb)
+{
+	return NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark];
+}
+
 static inline void *skb_gro_network_header(const struct sk_buff *skb)
 {
 	if (skb_gro_may_pull(skb, skb_gro_offset(skb)))
-		return skb_gro_header_fast(skb, skb_network_offset(skb));
+		return skb_gro_header_fast(skb, skb_gro_receive_network_offset(skb));
 
-	return skb_network_header(skb);
+	return skb->data + skb_gro_receive_network_offset(skb);
 }
 
 static inline __wsum inet_gro_compute_pseudo(const struct sk_buff *skb,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a7bad18bc8b5..428196e1541f 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1569,10 +1569,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 
 	NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
 	NAPI_GRO_CB(skb)->flush |= flush;
-	skb_set_network_header(skb, off);
-	/* The above will be needed by the transport layer if there is one
-	 * immediately following this IP hdr.
-	 */
 	NAPI_GRO_CB(skb)->inner_network_offset = off;
 
 	/* Note : No need to call skb_gro_postpull_rcsum() here,
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index c90704befd7b..2809667ac924 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -463,7 +463,8 @@ flush:
 
 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 {
-	const struct iphdr *iph = ip_hdr(skb);
+	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+	const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
 	struct tcphdr *th = tcp_hdr(skb);
 
 	if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index c8b909a9904f..288c7c6ea50f 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -67,7 +67,7 @@ static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
 		off += len;
 	}
 
-	skb_gro_pull(skb, off - skb_network_offset(skb));
+	skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb));
 	return proto;
 }
 
@@ -236,7 +236,6 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
 	if (unlikely(!iph))
 		goto out;
 
-	skb_set_network_header(skb, off);
 	NAPI_GRO_CB(skb)->inner_network_offset = off;
 
 	flush += ntohs(iph->payload_len) != skb->len - hlen;
@@ -260,7 +259,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
 	NAPI_GRO_CB(skb)->proto = proto;
 
 	flush--;
-	nlen = skb_network_header_len(skb);
+	nlen = skb_gro_offset(skb) - off;
 
 	list_for_each_entry(p, head, list) {
 		const struct ipv6hdr *iph2;
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index d59f58cbd306..23971903e66d 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -72,7 +72,8 @@ flush:
 
 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
 {
-	const struct ipv6hdr *iph = ipv6_hdr(skb);
+	const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+	const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
 	struct tcphdr *th = tcp_hdr(skb);
 
 	if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
-- 
cgit v1.2.3


From 4b0ebbca3e1679765c06d5c466ee7f3228d4b156 Mon Sep 17 00:00:00 2001
From: Richard Gobert <richardbgobert@gmail.com>
Date: Thu, 9 May 2024 21:08:18 +0200
Subject: net: gro: move L3 flush checks to tcp_gro_receive and
 udp_gro_receive_segment

{inet,ipv6}_gro_receive functions perform flush checks (ttl, flags,
iph->id, ...) against all packets in a loop. These flush checks are used in
all merging UDP and TCP flows.

These checks need to be done only once and only against the found p skb,
since they only affect flush and not same_flow.

This patch leverages correct network header offsets from the cb for both
outer and inner network headers - allowing these checks to be done only
once, in tcp_gro_receive and udp_gro_receive_segment. As a result,
NAPI_GRO_CB(p)->flush is not used at all. In addition, flush_id checks are
more declarative and contained in inet_gro_flush, thus removing the need
for flush_id in napi_gro_cb.

This results in less parsing code for non-loop flush tests for TCP and UDP
flows.

To make sure results are not within noise range - I've made netfilter drop
all TCP packets, and measured CPU performance in GRO (in this case GRO is
responsible for about 50% of the CPU utilization).

perf top while replaying 64 parallel IP/TCP streams merging in GRO:
(gro_receive_network_flush is compiled inline to tcp_gro_receive)
net-next:
        6.94% [kernel] [k] inet_gro_receive
        3.02% [kernel] [k] tcp_gro_receive

patch applied:
        4.27% [kernel] [k] tcp_gro_receive
        4.22% [kernel] [k] inet_gro_receive

perf top while replaying 64 parallel IP/IP/TCP streams merging in GRO (same
results for any encapsulation, in this case inet_gro_receive is top
offender in net-next)
net-next:
        10.09% [kernel] [k] inet_gro_receive
        2.08% [kernel] [k] tcp_gro_receive

patch applied:
        6.97% [kernel] [k] inet_gro_receive
        3.68% [kernel] [k] tcp_gro_receive

Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://lore.kernel.org/r/20240509190819.2985-3-richardbgobert@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/gro.h      | 76 ++++++++++++++++++++++++++++++++++++++++++++------
 net/core/gro.c         |  3 --
 net/ipv4/af_inet.c     | 41 +--------------------------
 net/ipv4/tcp_offload.c | 17 ++---------
 net/ipv4/udp_offload.c |  9 +-----
 net/ipv6/ip6_offload.c | 11 --------
 6 files changed, 73 insertions(+), 84 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index cbc1b0aaf295..f13634b1f4c1 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -36,15 +36,15 @@ struct napi_gro_cb {
 	/* This is non-zero if the packet cannot be merged with the new skb. */
 	u16	flush;
 
-	/* Save the IP ID here and check when we get to the transport layer */
-	u16	flush_id;
-
 	/* Number of segments aggregated. */
 	u16	count;
 
 	/* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */
 	u16	proto;
 
+	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
+	__wsum	csum;
+
 /* Used in napi_gro_cb::free */
 #define NAPI_GRO_FREE             1
 #define NAPI_GRO_FREE_STOLEN_HEAD 2
@@ -75,8 +75,8 @@ struct napi_gro_cb {
 		/* Used in GRE, set in fou/gue_gro_receive */
 		u8	is_fou:1;
 
-		/* Used to determine if flush_id can be ignored */
-		u8	is_atomic:1;
+		/* Used to determine if ipid_offset can be ignored */
+		u8	ip_fixedid:1;
 
 		/* Number of gro_receive callbacks this packet already went through */
 		u8 recursion_counter:4;
@@ -85,9 +85,6 @@ struct napi_gro_cb {
 		u8	is_flist:1;
 	);
 
-	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
-	__wsum	csum;
-
 	/* L3 offsets */
 	union {
 		struct {
@@ -442,6 +439,69 @@ static inline __wsum ip6_gro_compute_pseudo(const struct sk_buff *skb,
 					    skb_gro_len(skb), proto, 0));
 }
 
+static inline int inet_gro_flush(const struct iphdr *iph, const struct iphdr *iph2,
+				 struct sk_buff *p, bool outer)
+{
+	const u32 id = ntohl(*(__be32 *)&iph->id);
+	const u32 id2 = ntohl(*(__be32 *)&iph2->id);
+	const u16 ipid_offset = (id >> 16) - (id2 >> 16);
+	const u16 count = NAPI_GRO_CB(p)->count;
+	const u32 df = id & IP_DF;
+	int flush;
+
+	/* All fields must match except length and checksum. */
+	flush = (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | (df ^ (id2 & IP_DF));
+
+	if (flush | (outer && df))
+		return flush;
+
+	/* When we receive our second frame we can make a decision on if we
+	 * continue this flow as an atomic flow with a fixed ID or if we use
+	 * an incrementing ID.
+	 */
+	if (count == 1 && df && !ipid_offset)
+		NAPI_GRO_CB(p)->ip_fixedid = true;
+
+	return ipid_offset ^ (count * !NAPI_GRO_CB(p)->ip_fixedid);
+}
+
+static inline int ipv6_gro_flush(const struct ipv6hdr *iph, const struct ipv6hdr *iph2)
+{
+	/* <Version:4><Traffic_Class:8><Flow_Label:20> */
+	__be32 first_word = *(__be32 *)iph ^ *(__be32 *)iph2;
+
+	/* Flush if Traffic Class fields are different. */
+	return !!((first_word & htonl(0x0FF00000)) |
+		(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
+}
+
+static inline int __gro_receive_network_flush(const void *th, const void *th2,
+					      struct sk_buff *p, const u16 diff,
+					      bool outer)
+{
+	const void *nh = th - diff;
+	const void *nh2 = th2 - diff;
+
+	if (((struct iphdr *)nh)->version == 6)
+		return ipv6_gro_flush(nh, nh2);
+	else
+		return inet_gro_flush(nh, nh2, p, outer);
+}
+
+static inline int gro_receive_network_flush(const void *th, const void *th2,
+					    struct sk_buff *p)
+{
+	const bool encap_mark = NAPI_GRO_CB(p)->encap_mark;
+	int off = skb_transport_offset(p);
+	int flush;
+
+	flush = __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->network_offset, encap_mark);
+	if (encap_mark)
+		flush |= __gro_receive_network_flush(th, th2, p, off - NAPI_GRO_CB(p)->inner_network_offset, false);
+
+	return flush;
+}
+
 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb);
 int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb);
 
diff --git a/net/core/gro.c b/net/core/gro.c
index e2f84947fb74..b3b43de1a650 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -358,8 +358,6 @@ static void gro_list_prepare(const struct list_head *head,
 	list_for_each_entry(p, head, list) {
 		unsigned long diffs;
 
-		NAPI_GRO_CB(p)->flush = 0;
-
 		if (hash != skb_get_hash_raw(p)) {
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
@@ -499,7 +497,6 @@ found_ptype:
 					sizeof(u32))); /* Avoid slow unaligned acc */
 	*(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
 	NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
-	NAPI_GRO_CB(skb)->is_atomic = 1;
 	NAPI_GRO_CB(skb)->count = 1;
 	if (unlikely(skb_is_gso(skb))) {
 		NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 428196e1541f..44564d009e95 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1482,7 +1482,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 	struct sk_buff *p;
 	unsigned int hlen;
 	unsigned int off;
-	unsigned int id;
 	int flush = 1;
 	int proto;
 
@@ -1508,13 +1507,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 		goto out;
 
 	NAPI_GRO_CB(skb)->proto = proto;
-	id = ntohl(*(__be32 *)&iph->id);
-	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
-	id >>= 16;
+	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
 
 	list_for_each_entry(p, head, list) {
 		struct iphdr *iph2;
-		u16 flush_id;
 
 		if (!NAPI_GRO_CB(p)->same_flow)
 			continue;
@@ -1531,43 +1527,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 			NAPI_GRO_CB(p)->same_flow = 0;
 			continue;
 		}
-
-		/* All fields must match except length and checksum. */
-		NAPI_GRO_CB(p)->flush |=
-			(iph->ttl ^ iph2->ttl) |
-			(iph->tos ^ iph2->tos) |
-			((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
-
-		NAPI_GRO_CB(p)->flush |= flush;
-
-		/* We need to store of the IP ID check to be included later
-		 * when we can verify that this packet does in fact belong
-		 * to a given flow.
-		 */
-		flush_id = (u16)(id - ntohs(iph2->id));
-
-		/* This bit of code makes it much easier for us to identify
-		 * the cases where we are doing atomic vs non-atomic IP ID
-		 * checks.  Specifically an atomic check can return IP ID
-		 * values 0 - 0xFFFF, while a non-atomic check can only
-		 * return 0 or 0xFFFF.
-		 */
-		if (!NAPI_GRO_CB(p)->is_atomic ||
-		    !(iph->frag_off & htons(IP_DF))) {
-			flush_id ^= NAPI_GRO_CB(p)->count;
-			flush_id = flush_id ? 0xFFFF : 0;
-		}
-
-		/* If the previous IP ID value was based on an atomic
-		 * datagram we can overwrite the value and ignore it.
-		 */
-		if (NAPI_GRO_CB(skb)->is_atomic)
-			NAPI_GRO_CB(p)->flush_id = flush_id;
-		else
-			NAPI_GRO_CB(p)->flush_id |= flush_id;
 	}
 
-	NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
 	NAPI_GRO_CB(skb)->flush |= flush;
 	NAPI_GRO_CB(skb)->inner_network_offset = off;
 
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 2809667ac924..4b791e74529e 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -313,10 +313,8 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
 	if (!p)
 		goto out_check_final;
 
-	/* Include the IP ID check below from the inner most IP hdr */
 	th2 = tcp_hdr(p);
-	flush = NAPI_GRO_CB(p)->flush;
-	flush |= (__force int)(flags & TCP_FLAG_CWR);
+	flush = (__force int)(flags & TCP_FLAG_CWR);
 	flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
 		  ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
 	flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
@@ -324,16 +322,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
 		flush |= *(u32 *)((u8 *)th + i) ^
 			 *(u32 *)((u8 *)th2 + i);
 
-	/* When we receive our second frame we can made a decision on if we
-	 * continue this flow as an atomic flow with a fixed ID or if we use
-	 * an incrementing ID.
-	 */
-	if (NAPI_GRO_CB(p)->flush_id != 1 ||
-	    NAPI_GRO_CB(p)->count != 1 ||
-	    !NAPI_GRO_CB(p)->is_atomic)
-		flush |= NAPI_GRO_CB(p)->flush_id;
-	else
-		NAPI_GRO_CB(p)->is_atomic = false;
+	flush |= gro_receive_network_flush(th, th2, p);
 
 	mss = skb_shinfo(p)->gso_size;
 
@@ -480,7 +469,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
 				  iph->daddr, 0);
 
 	skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
-			(NAPI_GRO_CB(skb)->is_atomic * SKB_GSO_TCP_FIXEDID);
+			(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
 
 	tcp_gro_complete(skb);
 	return 0;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f6d3c442e260..59448a2dbf2c 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -478,14 +478,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
 			return p;
 		}
 
-		flush = NAPI_GRO_CB(p)->flush;
-
-		if (NAPI_GRO_CB(p)->flush_id != 1 ||
-		    NAPI_GRO_CB(p)->count != 1 ||
-		    !NAPI_GRO_CB(p)->is_atomic)
-			flush |= NAPI_GRO_CB(p)->flush_id;
-		else
-			NAPI_GRO_CB(p)->is_atomic = false;
+		flush = gro_receive_network_flush(uh, uh2, p);
 
 		/* Terminate the flow on len mismatch or if it grow "too much".
 		 * Under small packet flood GRO count could elsewhere grow a lot
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index 288c7c6ea50f..bd5aff97d8b1 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -290,19 +290,8 @@ not_same_flow:
 				   nlen - sizeof(struct ipv6hdr)))
 				goto not_same_flow;
 		}
-		/* flush if Traffic Class fields are different */
-		NAPI_GRO_CB(p)->flush |= !!((first_word & htonl(0x0FF00000)) |
-			(__force __be32)(iph->hop_limit ^ iph2->hop_limit));
-		NAPI_GRO_CB(p)->flush |= flush;
-
-		/* If the previous IP ID value was based on an atomic
-		 * datagram we can overwrite the value and ignore it.
-		 */
-		if (NAPI_GRO_CB(skb)->is_atomic)
-			NAPI_GRO_CB(p)->flush_id = 0;
 	}
 
-	NAPI_GRO_CB(skb)->is_atomic = true;
 	NAPI_GRO_CB(skb)->flush |= flush;
 
 	skb_gro_postpull_rcsum(skb, iph, nlen);
-- 
cgit v1.2.3


From c084ebd77a00b1a16d9daa57b6ecdfdf1f43c78a Mon Sep 17 00:00:00 2001
From: "Matthieu Baerts (NGI0)" <matttbe@kernel.org>
Date: Thu, 9 May 2024 20:10:10 +0200
Subject: tcp: socket option to check for MPTCP fallback to TCP

A way for an application to know if an MPTCP connection fell back to TCP
is to use getsockopt(MPTCP_INFO) and look for errors. The issue with
this technique is that the same errors -- EOPNOTSUPP (IPv4) and
ENOPROTOOPT (IPv6) -- are returned if there was a fallback, *or* if the
kernel doesn't support this socket option. The userspace then has to
look at the kernel version to understand what the errors mean.

It is not clean, and it doesn't take into account older kernels where
the socket option has been backported. A cleaner way would be to expose
this info to the TCP socket level. In case of MPTCP socket where no
fallback happened, the socket options for the TCP level will be handled
in MPTCP code, in mptcp_getsockopt_sol_tcp(). If not, that will be in
TCP code, in do_tcp_getsockopt(). So MPTCP simply has to set the value
1, while TCP has to set 0.

If the socket option is not supported, one of these two errors will be
reported:
- EOPNOTSUPP (95 - Operation not supported) for MPTCP sockets
- ENOPROTOOPT (92 - Protocol not available) for TCP sockets, e.g. on the
  socket received after an 'accept()', when the client didn't request to
  use MPTCP: this socket will be a TCP one, even if the listen socket
  was an MPTCP one.

With this new option, the kernel can return a clear answer to both "Is
this kernel new enough to tell me the fallback status?" and "If it is
new enough, is it currently a TCP or MPTCP socket?" questions, while not
breaking the previous method.

Acked-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://lore.kernel.org/r/20240509-upstream-net-next-20240509-mptcp-tcp_is_mptcp-v1-1-f846df999202@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/tcp.h | 2 ++
 net/ipv4/tcp.c           | 3 +++
 net/mptcp/sockopt.c      | 2 ++
 3 files changed, 7 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index c07e9f90c084..dbf896f3146c 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -135,6 +135,8 @@ enum {
 #define TCP_AO_GET_KEYS		41	/* List MKT(s) */
 #define TCP_AO_REPAIR		42	/* Get/Set SNEs and ISNs */
 
+#define TCP_IS_MPTCP		43	/* Is MPTCP being used? */
+
 #define TCP_REPAIR_ON		1
 #define TCP_REPAIR_OFF		0
 #define TCP_REPAIR_OFF_NO_WP	-1	/* Turn off without window probes */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 06aab937d60a..681b54e1f3a6 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4363,6 +4363,9 @@ zerocopy_rcv_out:
 
 		return err;
 	}
+	case TCP_IS_MPTCP:
+		val = 0;
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 1fea43f5b6f3..eaa3b79651a4 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1348,6 +1348,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
 		return mptcp_put_int_option(msk, optval, optlen, msk->nodelay);
 	case TCP_NOTSENT_LOWAT:
 		return mptcp_put_int_option(msk, optval, optlen, msk->notsent_lowat);
+	case TCP_IS_MPTCP:
+		return mptcp_put_int_option(msk, optval, optlen, 1);
 	}
 	return -EOPNOTSUPP;
 }
-- 
cgit v1.2.3


From 8ec9897ec2e93a412b9e3119e3137583a85969e0 Mon Sep 17 00:00:00 2001
From: Davide Caratti <dcaratti@redhat.com>
Date: Fri, 10 May 2024 19:19:12 +0200
Subject: netlabel: fix RCU annotation for IPv4 options on socket creation

Xiumei reports the following splat when netlabel and TCP socket are used:

 =============================
 WARNING: suspicious RCU usage
 6.9.0-rc2+ #637 Not tainted
 -----------------------------
 net/ipv4/cipso_ipv4.c:1880 suspicious rcu_dereference_protected() usage!

 other info that might help us debug this:

 rcu_scheduler_active = 2, debug_locks = 1
 1 lock held by ncat/23333:
  #0: ffffffff906030c0 (rcu_read_lock){....}-{1:2}, at: netlbl_sock_setattr+0x25/0x1b0

 stack backtrace:
 CPU: 11 PID: 23333 Comm: ncat Kdump: loaded Not tainted 6.9.0-rc2+ #637
 Hardware name: Supermicro SYS-6027R-72RF/X9DRH-7TF/7F/iTF/iF, BIOS 3.0  07/26/2013
 Call Trace:
  <TASK>
  dump_stack_lvl+0xa9/0xc0
  lockdep_rcu_suspicious+0x117/0x190
  cipso_v4_sock_setattr+0x1ab/0x1b0
  netlbl_sock_setattr+0x13e/0x1b0
  selinux_netlbl_socket_post_create+0x3f/0x80
  selinux_socket_post_create+0x1a0/0x460
  security_socket_post_create+0x42/0x60
  __sock_create+0x342/0x3a0
  __sys_socket_create.part.22+0x42/0x70
  __sys_socket+0x37/0xb0
  __x64_sys_socket+0x16/0x20
  do_syscall_64+0x96/0x180
  ? do_user_addr_fault+0x68d/0xa30
  ? exc_page_fault+0x171/0x280
  ? asm_exc_page_fault+0x22/0x30
  entry_SYSCALL_64_after_hwframe+0x71/0x79
 RIP: 0033:0x7fbc0ca3fc1b
 Code: 73 01 c3 48 8b 0d 05 f2 1b 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 29 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d d5 f1 1b 00 f7 d8 64 89 01 48
 RSP: 002b:00007fff18635208 EFLAGS: 00000246 ORIG_RAX: 0000000000000029
 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007fbc0ca3fc1b
 RDX: 0000000000000006 RSI: 0000000000000001 RDI: 0000000000000002
 RBP: 000055d24f80f8a0 R08: 0000000000000003 R09: 0000000000000001

R10: 0000000000020000 R11: 0000000000000246 R12: 000055d24f80f8a0
 R13: 0000000000000000 R14: 000055d24f80fb88 R15: 0000000000000000
  </TASK>

The current implementation of cipso_v4_sock_setattr() replaces IP options
under the assumption that the caller holds the socket lock; however, such
assumption is not true, nor needed, in selinux_socket_post_create() hook.

Let all callers of cipso_v4_sock_setattr() specify the "socket lock held"
condition, except selinux_socket_post_create() _ where such condition can
safely be set as true even without holding the socket lock.

Fixes: f6d8bd051c39 ("inet: add RCU protection to inet->opt")
Reported-by: Xiumei Mu <xmu@redhat.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Paul Moore <paul@paul-moore.com>
Link: https://lore.kernel.org/r/f4260d000a3a55b9e8b6a3b4e3fffc7da9f82d41.1715359817.git.dcaratti@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/cipso_ipv4.h     |  6 ++++--
 include/net/netlabel.h       | 12 ++++++++++--
 net/ipv4/cipso_ipv4.c        |  7 ++++---
 net/netlabel/netlabel_kapi.c | 31 ++++++++++++++++++++++++++++---
 security/selinux/netlabel.c  |  5 ++++-
 security/smack/smack_lsm.c   |  3 ++-
 6 files changed, 52 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/net/cipso_ipv4.h b/include/net/cipso_ipv4.h
index 53dd7d988a2d..c9111bb2f59b 100644
--- a/include/net/cipso_ipv4.h
+++ b/include/net/cipso_ipv4.h
@@ -183,7 +183,8 @@ int cipso_v4_getattr(const unsigned char *cipso,
 		     struct netlbl_lsm_secattr *secattr);
 int cipso_v4_sock_setattr(struct sock *sk,
 			  const struct cipso_v4_doi *doi_def,
-			  const struct netlbl_lsm_secattr *secattr);
+			  const struct netlbl_lsm_secattr *secattr,
+			  bool sk_locked);
 void cipso_v4_sock_delattr(struct sock *sk);
 int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr);
 int cipso_v4_req_setattr(struct request_sock *req,
@@ -214,7 +215,8 @@ static inline int cipso_v4_getattr(const unsigned char *cipso,
 
 static inline int cipso_v4_sock_setattr(struct sock *sk,
 				      const struct cipso_v4_doi *doi_def,
-				      const struct netlbl_lsm_secattr *secattr)
+				      const struct netlbl_lsm_secattr *secattr,
+				      bool sk_locked)
 {
 	return -ENOSYS;
 }
diff --git a/include/net/netlabel.h b/include/net/netlabel.h
index f3ab0b8a4b18..2133ad723fc1 100644
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -470,7 +470,8 @@ void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state);
 int netlbl_enabled(void);
 int netlbl_sock_setattr(struct sock *sk,
 			u16 family,
-			const struct netlbl_lsm_secattr *secattr);
+			const struct netlbl_lsm_secattr *secattr,
+			bool sk_locked);
 void netlbl_sock_delattr(struct sock *sk);
 int netlbl_sock_getattr(struct sock *sk,
 			struct netlbl_lsm_secattr *secattr);
@@ -487,6 +488,7 @@ int netlbl_skbuff_getattr(const struct sk_buff *skb,
 			  u16 family,
 			  struct netlbl_lsm_secattr *secattr);
 void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway);
+bool netlbl_sk_lock_check(struct sock *sk);
 
 /*
  * LSM label mapping cache operations
@@ -614,7 +616,8 @@ static inline int netlbl_enabled(void)
 }
 static inline int netlbl_sock_setattr(struct sock *sk,
 				      u16 family,
-				      const struct netlbl_lsm_secattr *secattr)
+				      const struct netlbl_lsm_secattr *secattr,
+				      bool sk_locked)
 {
 	return -ENOSYS;
 }
@@ -673,6 +676,11 @@ static inline struct audit_buffer *netlbl_audit_start(int type,
 {
 	return NULL;
 }
+
+static inline bool netlbl_sk_lock_check(struct sock *sk)
+{
+	return true;
+}
 #endif /* CONFIG_NETLABEL */
 
 const struct netlbl_calipso_ops *
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 8b17d83e5fde..dd6d46015058 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1815,6 +1815,7 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
  * @sk: the socket
  * @doi_def: the CIPSO DOI to use
  * @secattr: the specific security attributes of the socket
+ * @sk_locked: true if caller holds the socket lock
  *
  * Description:
  * Set the CIPSO option on the given socket using the DOI definition and
@@ -1826,7 +1827,8 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
  */
 int cipso_v4_sock_setattr(struct sock *sk,
 			  const struct cipso_v4_doi *doi_def,
-			  const struct netlbl_lsm_secattr *secattr)
+			  const struct netlbl_lsm_secattr *secattr,
+			  bool sk_locked)
 {
 	int ret_val = -EPERM;
 	unsigned char *buf = NULL;
@@ -1876,8 +1878,7 @@ int cipso_v4_sock_setattr(struct sock *sk,
 
 	sk_inet = inet_sk(sk);
 
-	old = rcu_dereference_protected(sk_inet->inet_opt,
-					lockdep_sock_is_held(sk));
+	old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked);
 	if (inet_test_bit(IS_ICSK, sk)) {
 		sk_conn = inet_csk(sk);
 		if (old)
diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c
index 1ba4f58e1d35..cd9160bbc919 100644
--- a/net/netlabel/netlabel_kapi.c
+++ b/net/netlabel/netlabel_kapi.c
@@ -965,6 +965,7 @@ int netlbl_enabled(void)
  * @sk: the socket to label
  * @family: protocol family
  * @secattr: the security attributes
+ * @sk_locked: true if caller holds the socket lock
  *
  * Description:
  * Attach the correct label to the given socket using the security attributes
@@ -977,7 +978,8 @@ int netlbl_enabled(void)
  */
 int netlbl_sock_setattr(struct sock *sk,
 			u16 family,
-			const struct netlbl_lsm_secattr *secattr)
+			const struct netlbl_lsm_secattr *secattr,
+			bool sk_locked)
 {
 	int ret_val;
 	struct netlbl_dom_map *dom_entry;
@@ -997,7 +999,7 @@ int netlbl_sock_setattr(struct sock *sk,
 		case NETLBL_NLTYPE_CIPSOV4:
 			ret_val = cipso_v4_sock_setattr(sk,
 							dom_entry->def.cipso,
-							secattr);
+							secattr, sk_locked);
 			break;
 		case NETLBL_NLTYPE_UNLABELED:
 			ret_val = 0;
@@ -1090,6 +1092,28 @@ int netlbl_sock_getattr(struct sock *sk,
 	return ret_val;
 }
 
+/**
+ * netlbl_sk_lock_check - Check if the socket lock has been acquired.
+ * @sk: the socket to be checked
+ *
+ * Return: true if socket @sk is locked or if lock debugging is disabled at
+ * runtime or compile-time; false otherwise
+ *
+ */
+#ifdef CONFIG_LOCKDEP
+bool netlbl_sk_lock_check(struct sock *sk)
+{
+	if (debug_locks)
+		return lockdep_sock_is_held(sk);
+	return true;
+}
+#else
+bool netlbl_sk_lock_check(struct sock *sk)
+{
+	return true;
+}
+#endif
+
 /**
  * netlbl_conn_setattr - Label a connected socket using the correct protocol
  * @sk: the socket to label
@@ -1126,7 +1150,8 @@ int netlbl_conn_setattr(struct sock *sk,
 		switch (entry->type) {
 		case NETLBL_NLTYPE_CIPSOV4:
 			ret_val = cipso_v4_sock_setattr(sk,
-							entry->cipso, secattr);
+							entry->cipso, secattr,
+							netlbl_sk_lock_check(sk));
 			break;
 		case NETLBL_NLTYPE_UNLABELED:
 			/* just delete the protocols we support for right now
diff --git a/security/selinux/netlabel.c b/security/selinux/netlabel.c
index 8f182800e412..55885634e880 100644
--- a/security/selinux/netlabel.c
+++ b/security/selinux/netlabel.c
@@ -402,7 +402,10 @@ int selinux_netlbl_socket_post_create(struct sock *sk, u16 family)
 	secattr = selinux_netlbl_sock_genattr(sk);
 	if (secattr == NULL)
 		return -ENOMEM;
-	rc = netlbl_sock_setattr(sk, family, secattr);
+	/* On socket creation, replacement of IP options is safe even if
+	 * the caller does not hold the socket lock.
+	 */
+	rc = netlbl_sock_setattr(sk, family, secattr, true);
 	switch (rc) {
 	case 0:
 		sksec->nlbl_state = NLBL_LABELED;
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 146667937811..efeac8365ad0 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2565,7 +2565,8 @@ static int smack_netlbl_add(struct sock *sk)
 	local_bh_disable();
 	bh_lock_sock_nested(sk);
 
-	rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel);
+	rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel,
+				 netlbl_sk_lock_check(sk));
 	switch (rc) {
 	case 0:
 		ssp->smk_state = SMK_NETLBL_LABELED;
-- 
cgit v1.2.3


From b56035101e1cdd9c4420ea5da17f09f87fb69285 Mon Sep 17 00:00:00 2001
From: Daniel Jurgens <danielj@nvidia.com>
Date: Fri, 10 May 2024 23:19:26 +0300
Subject: netdev: Add queue stats for TX stop and wake

TX queue stop and wake are counted by some drivers.
Support reporting these via netdev-genl queue stats.

Signed-off-by: Daniel Jurgens <danielj@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://lore.kernel.org/r/20240510201927.1821109-2-danielj@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/netlink/specs/netdev.yaml | 14 ++++++++++++++
 include/net/netdev_queues.h             |  3 +++
 include/uapi/linux/netdev.h             |  2 ++
 net/core/netdev-genl.c                  |  4 +++-
 tools/include/uapi/linux/netdev.h       |  2 ++
 5 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/Documentation/netlink/specs/netdev.yaml b/Documentation/netlink/specs/netdev.yaml
index 2be4b3714d17..11a32373365a 100644
--- a/Documentation/netlink/specs/netdev.yaml
+++ b/Documentation/netlink/specs/netdev.yaml
@@ -439,6 +439,20 @@ attribute-sets:
           Number of the packets dropped by the device due to the transmit
           packets bitrate exceeding the device rate limit.
         type: uint
+      -
+        name: tx-stop
+        doc: |
+          Number of times driver paused accepting new tx packets
+          from the stack to this queue, because the queue was full.
+          Note that if BQL is supported and enabled on the device
+          the networking stack will avoid queuing a lot of data at once.
+        type: uint
+      -
+        name: tx-wake
+        doc: |
+          Number of times driver re-started accepting send
+          requests to this queue from the stack.
+        type: uint
 
 operations:
   list:
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index e7b84f018cee..a8a7e48dfa6c 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -41,6 +41,9 @@ struct netdev_queue_stats_tx {
 	u64 hw_gso_wire_bytes;
 
 	u64 hw_drop_ratelimits;
+
+	u64 stop;
+	u64 wake;
 };
 
 /**
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index cf24f1d9adf8..a8188202413e 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -165,6 +165,8 @@ enum {
 	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS,
 	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES,
 	NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS,
+	NETDEV_A_QSTATS_TX_STOP,
+	NETDEV_A_QSTATS_TX_WAKE,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 4b5054087309..1f6ae6379e0f 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -517,7 +517,9 @@ netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
 	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
 	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
 	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
-	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits))
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) ||
+	    netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))
 		return -EMSGSIZE;
 	return 0;
 }
diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index cf24f1d9adf8..a8188202413e 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -165,6 +165,8 @@ enum {
 	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS,
 	NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES,
 	NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS,
+	NETDEV_A_QSTATS_TX_STOP,
+	NETDEV_A_QSTATS_TX_WAKE,
 
 	__NETDEV_A_QSTATS_MAX,
 	NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1)
-- 
cgit v1.2.3


From 14b1cd25346b1d615616a9c2dfdad9b4e6581e0d Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Mon, 13 May 2024 17:02:05 -0500
Subject: cifs: Fix locking in cifs_strict_readv()

Fix to take the i_rwsem (through the netfs locking wrappers) before taking
cinode->lock_sem.

Fixes: 3ee1a1fc3981 ("cifs: Cut over to using netfslib")
Reported-by: Enzo Matsumiya <ematsumiya@suse.de>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/netfs/direct_read.c   |  3 ++-
 fs/smb/client/cifsglob.h |  1 +
 fs/smb/client/file.c     | 34 +++++++++++++++++++++++++---------
 include/linux/netfs.h    |  1 +
 4 files changed, 29 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index ad4370b3935d..10a1e4da6bda 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -26,7 +26,7 @@
  *
  * The caller must hold any appropriate locks.
  */
-static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct netfs_io_request *rreq;
 	ssize_t ret;
@@ -98,6 +98,7 @@ out:
 		iov_iter_revert(iter, orig_count - iov_iter_count(iter));
 	return ret;
 }
+EXPORT_SYMBOL(netfs_unbuffered_read_iter_locked);
 
 /**
  * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 65574e69ba4f..73482734a8d8 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -1995,6 +1995,7 @@ require use of the stronger protocol */
  *				->chans_need_reconnect
  *				->chans_in_reconnect
  * cifs_tcon->tc_lock		(anything that is not protected by another lock and can change)
+ * inode->i_rwsem, taken by fs/netfs/locking.c e.g. should be taken before cifsInodeInfo locks
  * cifsInodeInfo->open_file_lock	cifsInodeInfo->openFileList	cifs_alloc_inode
  * cifsInodeInfo->writers_lock	cifsInodeInfo->writers		cifsInodeInfo_alloc
  * cifsInodeInfo->lock_sem	cifsInodeInfo->llist		cifs_init_once
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 4c981ce89f8a..9d38294a7e68 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -2916,16 +2916,32 @@ cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
 	 * We need to hold the sem to be sure nobody modifies lock list
 	 * with a brlock that prevents reading.
 	 */
-	down_read(&cinode->lock_sem);
-	if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
-				     tcon->ses->server->vals->shared_lock_type,
-				     0, NULL, CIFS_READ_OP)) {
-		if (iocb->ki_flags & IOCB_DIRECT)
-			rc = netfs_unbuffered_read_iter(iocb, to);
-		else
-			rc = netfs_buffered_read_iter(iocb, to);
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		rc = netfs_start_io_direct(inode);
+		if (rc < 0)
+			goto out;
+		down_read(&cinode->lock_sem);
+		if (!cifs_find_lock_conflict(
+			    cfile, iocb->ki_pos, iov_iter_count(to),
+			    tcon->ses->server->vals->shared_lock_type,
+			    0, NULL, CIFS_READ_OP))
+			rc = netfs_unbuffered_read_iter_locked(iocb, to);
+		up_read(&cinode->lock_sem);
+		netfs_end_io_direct(inode);
+	} else {
+		rc = netfs_start_io_read(inode);
+		if (rc < 0)
+			goto out;
+		down_read(&cinode->lock_sem);
+		if (!cifs_find_lock_conflict(
+			    cfile, iocb->ki_pos, iov_iter_count(to),
+			    tcon->ses->server->vals->shared_lock_type,
+			    0, NULL, CIFS_READ_OP))
+			rc = filemap_read(iocb, to, 0);
+		up_read(&cinode->lock_sem);
+		netfs_end_io_read(inode);
 	}
-	up_read(&cinode->lock_sem);
+out:
 	return rc;
 }
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index f45d06284f2f..ca56a4428043 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -389,6 +389,7 @@ struct netfs_cache_ops {
 };
 
 /* High-level read API. */
+ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter);
 ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter);
 ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
-- 
cgit v1.2.3


From 117c39200d9d760cbd5944bb89efb7b9c51965aa Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vdonnefort@google.com>
Date: Fri, 10 May 2024 15:04:31 +0100
Subject: ring-buffer: Introducing ring-buffer mapping functions

In preparation for allowing the user-space to map a ring-buffer, add
a set of mapping functions:

  ring_buffer_{map,unmap}()

And controls on the ring-buffer:

  ring_buffer_map_get_reader()  /* swap reader and head */

Mapping the ring-buffer also involves:

  A unique ID for each subbuf of the ring-buffer, currently they are
  only identified through their in-kernel VA.

  A meta-page, where are stored ring-buffer statistics and a
  description for the current reader

The linear mapping exposes the meta-page, and each subbuf of the
ring-buffer, ordered following their unique ID, assigned during the
first mapping.

Once mapped, no subbuf can get in or out of the ring-buffer: the buffer
size will remain unmodified and the splice enabling functions will in
reality simply memcpy the data instead of swapping subbufs.

Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-3-vdonnefort@google.com

CC: <linux-mm@kvack.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h     |   6 +
 include/uapi/linux/trace_mmap.h |  46 +++++
 kernel/trace/ring_buffer.c      | 414 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 463 insertions(+), 3 deletions(-)
 create mode 100644 include/uapi/linux/trace_mmap.h

(limited to 'include')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index dc5ae4e96aee..96d2140b471e 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -6,6 +6,8 @@
 #include <linux/seq_file.h>
 #include <linux/poll.h>
 
+#include <uapi/linux/trace_mmap.h>
+
 struct trace_buffer;
 struct ring_buffer_iter;
 
@@ -223,4 +225,8 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
 #define trace_rb_cpu_prepare	NULL
 #endif
 
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+		    struct vm_area_struct *vma);
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
 #endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
new file mode 100644
index 000000000000..b682e9925539
--- /dev/null
+++ b/include/uapi/linux/trace_mmap.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _TRACE_MMAP_H_
+#define _TRACE_MMAP_H_
+
+#include <linux/types.h>
+
+/**
+ * struct trace_buffer_meta - Ring-buffer Meta-page description
+ * @meta_page_size:	Size of this meta-page.
+ * @meta_struct_len:	Size of this structure.
+ * @subbuf_size:	Size of each sub-buffer.
+ * @nr_subbufs:		Number of subbfs in the ring-buffer, including the reader.
+ * @reader.lost_events:	Number of events lost at the time of the reader swap.
+ * @reader.id:		subbuf ID of the current reader. ID range [0 : @nr_subbufs - 1]
+ * @reader.read:	Number of bytes read on the reader subbuf.
+ * @flags:		Placeholder for now, 0 until new features are supported.
+ * @entries:		Number of entries in the ring-buffer.
+ * @overrun:		Number of entries lost in the ring-buffer.
+ * @read:		Number of entries that have been read.
+ * @Reserved1:		Internal use only.
+ * @Reserved2:		Internal use only.
+ */
+struct trace_buffer_meta {
+	__u32		meta_page_size;
+	__u32		meta_struct_len;
+
+	__u32		subbuf_size;
+	__u32		nr_subbufs;
+
+	struct {
+		__u64	lost_events;
+		__u32	id;
+		__u32	read;
+	} reader;
+
+	__u64	flags;
+
+	__u64	entries;
+	__u64	overrun;
+	__u64	read;
+
+	__u64	Reserved1;
+	__u64	Reserved2;
+};
+
+#endif /* _TRACE_MMAP_H_ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1e011733f6a..cb303052bff2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -9,6 +9,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/trace_clock.h>
 #include <linux/sched/clock.h>
+#include <linux/cacheflush.h>
 #include <linux/trace_seq.h>
 #include <linux/spinlock.h>
 #include <linux/irq_work.h>
@@ -26,6 +27,7 @@
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
+#include <linux/mm.h>
 
 #include <asm/local64.h>
 #include <asm/local.h>
@@ -338,6 +340,7 @@ struct buffer_page {
 	local_t		 entries;	/* entries on this page */
 	unsigned long	 real_end;	/* real end of data */
 	unsigned	 order;		/* order of the page */
+	u32		 id;		/* ID for external mapping */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -484,6 +487,12 @@ struct ring_buffer_per_cpu {
 	u64				read_stamp;
 	/* pages removed since last reset */
 	unsigned long			pages_removed;
+
+	unsigned int			mapped;
+	struct mutex			mapping_lock;
+	unsigned long			*subbuf_ids;	/* ID to subbuf VA */
+	struct trace_buffer_meta	*meta_page;
+
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
 	struct list_head		new_pages; /* new pages to add */
@@ -1599,6 +1608,7 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+	mutex_init(&cpu_buffer->mapping_lock);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -1789,8 +1799,6 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
 	return buffer->time_stamp_abs;
 }
 
-static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
-
 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
 {
 	return local_read(&bpage->entries) & RB_WRITE_MASK;
@@ -5211,6 +5219,22 @@ static void rb_clear_buffer_page(struct buffer_page *page)
 	page->read = 0;
 }
 
+static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+
+	meta->reader.read = cpu_buffer->reader_page->read;
+	meta->reader.id = cpu_buffer->reader_page->id;
+	meta->reader.lost_events = cpu_buffer->lost_events;
+
+	meta->entries = local_read(&cpu_buffer->entries);
+	meta->overrun = local_read(&cpu_buffer->overrun);
+	meta->read = cpu_buffer->read;
+
+	/* Some archs do not have data cache coherency between kernel and user-space */
+	flush_dcache_folio(virt_to_folio(cpu_buffer->meta_page));
+}
+
 static void
 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -5255,6 +5279,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->lost_events = 0;
 	cpu_buffer->last_overrun = 0;
 
+	if (cpu_buffer->mapped)
+		rb_update_meta_page(cpu_buffer);
+
 	rb_head_page_activate(cpu_buffer);
 	cpu_buffer->pages_removed = 0;
 }
@@ -5469,6 +5496,12 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
+	/* It's up to the callers to not try to swap mapped buffers */
+	if (WARN_ON_ONCE(cpu_buffer_a->mapped || cpu_buffer_b->mapped)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	/* At least make sure the two buffers are somewhat the same */
 	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
 		goto out;
@@ -5733,7 +5766,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
 	 * Otherwise, we can simply swap the page with the one passed in.
 	 */
 	if (read || (len < (commit - read)) ||
-	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
+	    cpu_buffer->reader_page == cpu_buffer->commit_page ||
+	    cpu_buffer->mapped) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
 		unsigned int rpos = read;
 		unsigned int pos = 0;
@@ -5956,6 +5990,11 @@ int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
 
 		cpu_buffer = buffer->buffers[cpu];
 
+		if (cpu_buffer->mapped) {
+			err = -EBUSY;
+			goto error;
+		}
+
 		/* Update the number of pages to match the new size */
 		nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
 		nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
@@ -6057,6 +6096,375 @@ error:
 }
 EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
 
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct page *page;
+
+	if (cpu_buffer->meta_page)
+		return 0;
+
+	page = alloc_page(GFP_USER | __GFP_ZERO);
+	if (!page)
+		return -ENOMEM;
+
+	cpu_buffer->meta_page = page_to_virt(page);
+
+	return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+
+	free_page(addr);
+	cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+				   unsigned long *subbuf_ids)
+{
+	struct trace_buffer_meta *meta = cpu_buffer->meta_page;
+	unsigned int nr_subbufs = cpu_buffer->nr_pages + 1;
+	struct buffer_page *first_subbuf, *subbuf;
+	int id = 0;
+
+	subbuf_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+	cpu_buffer->reader_page->id = id++;
+
+	first_subbuf = subbuf = rb_set_head_page(cpu_buffer);
+	do {
+		if (WARN_ON(id >= nr_subbufs))
+			break;
+
+		subbuf_ids[id] = (unsigned long)subbuf->page;
+		subbuf->id = id;
+
+		rb_inc_page(&subbuf);
+		id++;
+	} while (subbuf != first_subbuf);
+
+	/* install subbuf ID to kern VA translation */
+	cpu_buffer->subbuf_ids = subbuf_ids;
+
+	meta->meta_page_size = PAGE_SIZE;
+	meta->meta_struct_len = sizeof(*meta);
+	meta->nr_subbufs = nr_subbufs;
+	meta->subbuf_size = cpu_buffer->buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+
+	rb_update_meta_page(cpu_buffer);
+}
+
+static struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return ERR_PTR(-EINVAL);
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (!cpu_buffer->mapped) {
+		mutex_unlock(&cpu_buffer->mapping_lock);
+		return ERR_PTR(-ENODEV);
+	}
+
+	return cpu_buffer;
+}
+
+static void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+/*
+ * Fast-path for rb_buffer_(un)map(). Called whenever the meta-page doesn't need
+ * to be set-up or torn-down.
+ */
+static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
+			       bool inc)
+{
+	unsigned long flags;
+
+	lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+	if (inc && cpu_buffer->mapped == UINT_MAX)
+		return -EBUSY;
+
+	if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+		return -EINVAL;
+
+	mutex_lock(&cpu_buffer->buffer->mutex);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	if (inc)
+		cpu_buffer->mapped++;
+	else
+		cpu_buffer->mapped--;
+
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	mutex_unlock(&cpu_buffer->buffer->mutex);
+
+	return 0;
+}
+
+/*
+ *   +--------------+  pgoff == 0
+ *   |   meta page  |
+ *   +--------------+  pgoff == 1
+ *   | subbuffer 0  |
+ *   |              |
+ *   +--------------+  pgoff == (1 + (1 << subbuf_order))
+ *   | subbuffer 1  |
+ *   |              |
+ *         ...
+ */
+#ifdef CONFIG_MMU
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+			struct vm_area_struct *vma)
+{
+	unsigned long nr_subbufs, nr_pages, vma_pages, pgoff = vma->vm_pgoff;
+	unsigned int subbuf_pages, subbuf_order;
+	struct page **pages;
+	int p = 0, s = 0;
+	int err;
+
+	/* Refuse MP_PRIVATE or writable mappings */
+	if (vma->vm_flags & VM_WRITE || vma->vm_flags & VM_EXEC ||
+	    !(vma->vm_flags & VM_MAYSHARE))
+		return -EPERM;
+
+	/*
+	 * Make sure the mapping cannot become writable later. Also tell the VM
+	 * to not touch these pages (VM_DONTCOPY | VM_DONTEXPAND).
+	 */
+	vm_flags_mod(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP,
+		     VM_MAYWRITE);
+
+	lockdep_assert_held(&cpu_buffer->mapping_lock);
+
+	subbuf_order = cpu_buffer->buffer->subbuf_order;
+	subbuf_pages = 1 << subbuf_order;
+
+	nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
+	nr_pages = ((nr_subbufs) << subbuf_order) - pgoff + 1; /* + meta-page */
+
+	vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	if (!vma_pages || vma_pages > nr_pages)
+		return -EINVAL;
+
+	nr_pages = vma_pages;
+
+	pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	if (!pgoff) {
+		pages[p++] = virt_to_page(cpu_buffer->meta_page);
+
+		/*
+		 * TODO: Align sub-buffers on their size, once
+		 * vm_insert_pages() supports the zero-page.
+		 */
+	} else {
+		/* Skip the meta-page */
+		pgoff--;
+
+		if (pgoff % subbuf_pages) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		s += pgoff / subbuf_pages;
+	}
+
+	while (p < nr_pages) {
+		struct page *page = virt_to_page(cpu_buffer->subbuf_ids[s]);
+		int off = 0;
+
+		if (WARN_ON_ONCE(s >= nr_subbufs)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		for (; off < (1 << (subbuf_order)); off++, page++) {
+			if (p >= nr_pages)
+				break;
+
+			pages[p++] = page;
+		}
+		s++;
+	}
+
+	err = vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
+
+out:
+	kfree(pages);
+
+	return err;
+}
+#else
+static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
+			struct vm_area_struct *vma)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu,
+		    struct vm_area_struct *vma)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags, *subbuf_ids;
+	int err = 0;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return -EINVAL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (cpu_buffer->mapped) {
+		err = __rb_map_vma(cpu_buffer, vma);
+		if (!err)
+			err = __rb_inc_dec_mapped(cpu_buffer, true);
+		mutex_unlock(&cpu_buffer->mapping_lock);
+		return err;
+	}
+
+	/* prevent another thread from changing buffer/sub-buffer sizes */
+	mutex_lock(&buffer->mutex);
+
+	err = rb_alloc_meta_page(cpu_buffer);
+	if (err)
+		goto unlock;
+
+	/* subbuf_ids include the reader while nr_pages does not */
+	subbuf_ids = kcalloc(cpu_buffer->nr_pages + 1, sizeof(*subbuf_ids), GFP_KERNEL);
+	if (!subbuf_ids) {
+		rb_free_meta_page(cpu_buffer);
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	atomic_inc(&cpu_buffer->resize_disabled);
+
+	/*
+	 * Lock all readers to block any subbuf swap until the subbuf IDs are
+	 * assigned.
+	 */
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	err = __rb_map_vma(cpu_buffer, vma);
+	if (!err) {
+		raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+		cpu_buffer->mapped = 1;
+		raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	} else {
+		kfree(cpu_buffer->subbuf_ids);
+		cpu_buffer->subbuf_ids = NULL;
+		rb_free_meta_page(cpu_buffer);
+	}
+
+unlock:
+	mutex_unlock(&buffer->mutex);
+	mutex_unlock(&cpu_buffer->mapping_lock);
+
+	return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+	int err = 0;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return -EINVAL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (!cpu_buffer->mapped) {
+		err = -ENODEV;
+		goto out;
+	} else if (cpu_buffer->mapped > 1) {
+		__rb_inc_dec_mapped(cpu_buffer, false);
+		goto out;
+	}
+
+	mutex_lock(&buffer->mutex);
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	cpu_buffer->mapped = 0;
+
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	kfree(cpu_buffer->subbuf_ids);
+	cpu_buffer->subbuf_ids = NULL;
+	rb_free_meta_page(cpu_buffer);
+	atomic_dec(&cpu_buffer->resize_disabled);
+
+	mutex_unlock(&buffer->mutex);
+
+out:
+	mutex_unlock(&cpu_buffer->mapping_lock);
+
+	return err;
+}
+
+int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long reader_size;
+	unsigned long flags;
+
+	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+	if (IS_ERR(cpu_buffer))
+		return (int)PTR_ERR(cpu_buffer);
+
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+consume:
+	if (rb_per_cpu_empty(cpu_buffer))
+		goto out;
+
+	reader_size = rb_page_size(cpu_buffer->reader_page);
+
+	/*
+	 * There are data to be read on the current reader page, we can
+	 * return to the caller. But before that, we assume the latter will read
+	 * everything. Let's update the kernel reader accordingly.
+	 */
+	if (cpu_buffer->reader_page->read < reader_size) {
+		while (cpu_buffer->reader_page->read < reader_size)
+			rb_advance_reader(cpu_buffer);
+		goto out;
+	}
+
+	if (WARN_ON(!rb_get_reader_page(cpu_buffer)))
+		goto out;
+
+	goto consume;
+
+out:
+	/* Some archs do not have data cache coherency between kernel and user-space */
+	flush_dcache_folio(virt_to_folio(cpu_buffer->reader_page->page));
+
+	rb_update_meta_page(cpu_buffer);
+
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	rb_put_mapped_buffer(cpu_buffer);
+
+	return 0;
+}
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in
-- 
cgit v1.2.3


From cf9f0f7c4c5bb45e7bb270e48bab6f7837825a64 Mon Sep 17 00:00:00 2001
From: Vincent Donnefort <vdonnefort@google.com>
Date: Fri, 10 May 2024 15:04:32 +0100
Subject: tracing: Allow user-space mapping of the ring-buffer

Currently, user-space extracts data from the ring-buffer via splice,
which is handy for storage or network sharing. However, due to splice
limitations, it is imposible to do real-time analysis without a copy.

A solution for that problem is to let the user-space map the ring-buffer
directly.

The mapping is exposed via the per-CPU file trace_pipe_raw. The first
element of the mapping is the meta-page. It is followed by each
subbuffer constituting the ring-buffer, ordered by their unique page ID:

  * Meta-page -- include/uapi/linux/trace_mmap.h for a description
  * Subbuf ID 0
  * Subbuf ID 1
     ...

It is therefore easy to translate a subbuf ID into an offset in the
mapping:

  reader_id = meta->reader->id;
  reader_offset = meta->meta_page_size + reader_id * meta->subbuf_size;

When new data is available, the mapper must call a newly introduced ioctl:
TRACE_MMAP_IOCTL_GET_READER. This will update the Meta-page reader ID to
point to the next reader containing unread data.

Mapping will prevent snapshot and buffer size modifications.

Link: https://lore.kernel.org/linux-trace-kernel/20240510140435.3550353-4-vdonnefort@google.com

CC: <linux-mm@kvack.org>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/uapi/linux/trace_mmap.h |   2 +
 kernel/trace/trace.c            | 104 ++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.h            |   1 +
 3 files changed, 102 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
index b682e9925539..bd1066754220 100644
--- a/include/uapi/linux/trace_mmap.h
+++ b/include/uapi/linux/trace_mmap.h
@@ -43,4 +43,6 @@ struct trace_buffer_meta {
 	__u64	Reserved2;
 };
 
+#define TRACE_MMAP_IOCTL_GET_READER		_IO('T', 0x1)
+
 #endif /* _TRACE_MMAP_H_ */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 233d1af39fff..a35e7f598233 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1191,6 +1191,12 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
 		return;
 	}
 
+	if (tr->mapped) {
+		trace_array_puts(tr, "*** BUFFER MEMORY MAPPED ***\n");
+		trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
+		return;
+	}
+
 	local_irq_save(flags);
 	update_max_tr(tr, current, smp_processor_id(), cond_data);
 	local_irq_restore(flags);
@@ -1323,7 +1329,7 @@ static int tracing_arm_snapshot_locked(struct trace_array *tr)
 	lockdep_assert_held(&trace_types_lock);
 
 	spin_lock(&tr->snapshot_trigger_lock);
-	if (tr->snapshot == UINT_MAX) {
+	if (tr->snapshot == UINT_MAX || tr->mapped) {
 		spin_unlock(&tr->snapshot_trigger_lock);
 		return -EBUSY;
 	}
@@ -6068,7 +6074,7 @@ static void tracing_set_nop(struct trace_array *tr)
 {
 	if (tr->current_trace == &nop_trace)
 		return;
-	
+
 	tr->current_trace->enabled--;
 
 	if (tr->current_trace->reset)
@@ -8194,15 +8200,32 @@ out:
 	return ret;
 }
 
-/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
 static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct ftrace_buffer_info *info = file->private_data;
 	struct trace_iterator *iter = &info->iter;
+	int err;
+
+	if (cmd == TRACE_MMAP_IOCTL_GET_READER) {
+		if (!(file->f_flags & O_NONBLOCK)) {
+			err = ring_buffer_wait(iter->array_buffer->buffer,
+					       iter->cpu_file,
+					       iter->tr->buffer_percent,
+					       NULL, NULL);
+			if (err)
+				return err;
+		}
 
-	if (cmd)
-		return -ENOIOCTLCMD;
+		return ring_buffer_map_get_reader(iter->array_buffer->buffer,
+						  iter->cpu_file);
+	} else if (cmd) {
+		return -ENOTTY;
+	}
 
+	/*
+	 * An ioctl call with cmd 0 to the ring buffer file will wake up all
+	 * waiters
+	 */
 	mutex_lock(&trace_types_lock);
 
 	/* Make sure the waiters see the new wait_index */
@@ -8214,6 +8237,76 @@ static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned
 	return 0;
 }
 
+#ifdef CONFIG_TRACER_MAX_TRACE
+static int get_snapshot_map(struct trace_array *tr)
+{
+	int err = 0;
+
+	/*
+	 * Called with mmap_lock held. lockdep would be unhappy if we would now
+	 * take trace_types_lock. Instead use the specific
+	 * snapshot_trigger_lock.
+	 */
+	spin_lock(&tr->snapshot_trigger_lock);
+
+	if (tr->snapshot || tr->mapped == UINT_MAX)
+		err = -EBUSY;
+	else
+		tr->mapped++;
+
+	spin_unlock(&tr->snapshot_trigger_lock);
+
+	/* Wait for update_max_tr() to observe iter->tr->mapped */
+	if (tr->mapped == 1)
+		synchronize_rcu();
+
+	return err;
+
+}
+static void put_snapshot_map(struct trace_array *tr)
+{
+	spin_lock(&tr->snapshot_trigger_lock);
+	if (!WARN_ON(!tr->mapped))
+		tr->mapped--;
+	spin_unlock(&tr->snapshot_trigger_lock);
+}
+#else
+static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
+static inline void put_snapshot_map(struct trace_array *tr) { }
+#endif
+
+static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
+{
+	struct ftrace_buffer_info *info = vma->vm_file->private_data;
+	struct trace_iterator *iter = &info->iter;
+
+	WARN_ON(ring_buffer_unmap(iter->array_buffer->buffer, iter->cpu_file));
+	put_snapshot_map(iter->tr);
+}
+
+static const struct vm_operations_struct tracing_buffers_vmops = {
+	.close		= tracing_buffers_mmap_close,
+};
+
+static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ftrace_buffer_info *info = filp->private_data;
+	struct trace_iterator *iter = &info->iter;
+	int ret = 0;
+
+	ret = get_snapshot_map(iter->tr);
+	if (ret)
+		return ret;
+
+	ret = ring_buffer_map(iter->array_buffer->buffer, iter->cpu_file, vma);
+	if (ret)
+		put_snapshot_map(iter->tr);
+
+	vma->vm_ops = &tracing_buffers_vmops;
+
+	return ret;
+}
+
 static const struct file_operations tracing_buffers_fops = {
 	.open		= tracing_buffers_open,
 	.read		= tracing_buffers_read,
@@ -8223,6 +8316,7 @@ static const struct file_operations tracing_buffers_fops = {
 	.splice_read	= tracing_buffers_splice_read,
 	.unlocked_ioctl = tracing_buffers_ioctl,
 	.llseek		= no_llseek,
+	.mmap		= tracing_buffers_mmap,
 };
 
 static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 64450615ca0c..749a182dab48 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,6 +336,7 @@ struct trace_array {
 	bool			allocated_snapshot;
 	spinlock_t		snapshot_trigger_lock;
 	unsigned int		snapshot;
+	unsigned int		mapped;
 	unsigned long		max_latency;
 #ifdef CONFIG_FSNOTIFY
 	struct dentry		*d_max_latency;
-- 
cgit v1.2.3


From a7d6e36b9ad052926ba2ecba3a59d8bb67dabcb4 Mon Sep 17 00:00:00 2001
From: Duoming Zhou <duoming@zju.edu.cn>
Date: Thu, 9 May 2024 17:36:33 +0800
Subject: ax25: Use kernel universal linked list to implement ax25_dev_list

The origin ax25_dev_list implements its own single linked list,
which is complicated and error-prone. For example, when deleting
the node of ax25_dev_list in ax25_dev_device_down(), we have to
operate on the head node and other nodes separately.

This patch uses kernel universal linked list to replace original
ax25_dev_list, which make the operation of ax25_dev_list easier.

We should do "dev->ax25_ptr = ax25_dev;" and "dev->ax25_ptr = NULL;"
while holding the spinlock, otherwise the ax25_dev_device_up() and
ax25_dev_device_down() could race.

Suggested-by: Dan Carpenter <dan.carpenter@linaro.org>
Signed-off-by: Duoming Zhou <duoming@zju.edu.cn>
Reviewed-by: Dan Carpenter <dan.carpenter@linaro.org>
Link: https://lore.kernel.org/r/85bba3af651ca0e1a519da8d0d715b949891171c.1715247018.git.duoming@zju.edu.cn
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ax25.h  |  3 +--
 net/ax25/ax25_dev.c | 40 +++++++++++++++-------------------------
 2 files changed, 16 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/ax25.h b/include/net/ax25.h
index 0d939e5aee4e..c2a85fd3f5ea 100644
--- a/include/net/ax25.h
+++ b/include/net/ax25.h
@@ -216,7 +216,7 @@ typedef struct {
 struct ctl_table;
 
 typedef struct ax25_dev {
-	struct ax25_dev		*next;
+	struct list_head	list;
 
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
@@ -330,7 +330,6 @@ int ax25_addr_size(const ax25_digi *);
 void ax25_digi_invert(const ax25_digi *, ax25_digi *);
 
 /* ax25_dev.c */
-extern ax25_dev *ax25_dev_list;
 extern spinlock_t ax25_dev_lock;
 
 #if IS_ENABLED(CONFIG_AX25)
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 282ec581c072..f16ee5c09d07 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -22,11 +22,12 @@
 #include <net/sock.h>
 #include <linux/uaccess.h>
 #include <linux/fcntl.h>
+#include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
 
-ax25_dev *ax25_dev_list;
+static LIST_HEAD(ax25_dev_list);
 DEFINE_SPINLOCK(ax25_dev_lock);
 
 ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
@@ -34,7 +35,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
 	ax25_dev *ax25_dev, *res = NULL;
 
 	spin_lock_bh(&ax25_dev_lock);
-	for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
+	list_for_each_entry(ax25_dev, &ax25_dev_list, list)
 		if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) {
 			res = ax25_dev;
 			ax25_dev_hold(ax25_dev);
@@ -59,7 +60,6 @@ void ax25_dev_device_up(struct net_device *dev)
 	}
 
 	refcount_set(&ax25_dev->refcount, 1);
-	dev->ax25_ptr     = ax25_dev;
 	ax25_dev->dev     = dev;
 	netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL);
 	ax25_dev->forward = NULL;
@@ -85,8 +85,8 @@ void ax25_dev_device_up(struct net_device *dev)
 #endif
 
 	spin_lock_bh(&ax25_dev_lock);
-	ax25_dev->next = ax25_dev_list;
-	ax25_dev_list  = ax25_dev;
+	list_add(&ax25_dev->list, &ax25_dev_list);
+	dev->ax25_ptr     = ax25_dev;
 	spin_unlock_bh(&ax25_dev_lock);
 	ax25_dev_hold(ax25_dev);
 
@@ -111,32 +111,25 @@ void ax25_dev_device_down(struct net_device *dev)
 	/*
 	 *	Remove any packet forwarding that points to this device.
 	 */
-	for (s = ax25_dev_list; s != NULL; s = s->next)
+	list_for_each_entry(s, &ax25_dev_list, list)
 		if (s->forward == dev)
 			s->forward = NULL;
 
-	if ((s = ax25_dev_list) == ax25_dev) {
-		ax25_dev_list = s->next;
-		goto unlock_put;
-	}
-
-	while (s != NULL && s->next != NULL) {
-		if (s->next == ax25_dev) {
-			s->next = ax25_dev->next;
+	list_for_each_entry(s, &ax25_dev_list, list) {
+		if (s == ax25_dev) {
+			list_del(&s->list);
 			goto unlock_put;
 		}
-
-		s = s->next;
 	}
-	spin_unlock_bh(&ax25_dev_lock);
 	dev->ax25_ptr = NULL;
+	spin_unlock_bh(&ax25_dev_lock);
 	ax25_dev_put(ax25_dev);
 	return;
 
 unlock_put:
+	dev->ax25_ptr = NULL;
 	spin_unlock_bh(&ax25_dev_lock);
 	ax25_dev_put(ax25_dev);
-	dev->ax25_ptr = NULL;
 	netdev_put(dev, &ax25_dev->dev_tracker);
 	ax25_dev_put(ax25_dev);
 }
@@ -200,16 +193,13 @@ struct net_device *ax25_fwd_dev(struct net_device *dev)
  */
 void __exit ax25_dev_free(void)
 {
-	ax25_dev *s, *ax25_dev;
+	ax25_dev *s, *n;
 
 	spin_lock_bh(&ax25_dev_lock);
-	ax25_dev = ax25_dev_list;
-	while (ax25_dev != NULL) {
-		s        = ax25_dev;
-		netdev_put(ax25_dev->dev, &ax25_dev->dev_tracker);
-		ax25_dev = ax25_dev->next;
+	list_for_each_entry_safe(s, n, &ax25_dev_list, list) {
+		netdev_put(s->dev, &s->dev_tracker);
+		list_del(&s->list);
 		kfree(s);
 	}
-	ax25_dev_list = NULL;
 	spin_unlock_bh(&ax25_dev_lock);
 }
-- 
cgit v1.2.3


From e0e6adfe8c20f1b633017e4dafec1b06117da2df Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Sun, 12 May 2024 15:43:03 +0300
Subject: net/mlx5: Enable 8 ports LAG

This patch adds to mlx5 drivers support for 8 ports HCAs.
Starting with ConnectX-8 HCAs with 8 ports are possible.

As most driver parts aren't affected by such configuration most driver
code is unchanged.

Specially the only affected areas are:
- Lag
- Multiport E-Switch
- Single FDB E-Switch

All of the above are already factored in generic way, and LAG and VF LAG
are tested, so all that left is to change a #define and remove checks
which are no longer needed.
However, Multiport E-Switch is not tested yet, so it is left untouched.

This patch will allow to create hardware LAG/VF LAG when all 8 ports are
added to the same bond device.

for example, In order to activate the hardware lag a user can execute
the following:

ip link add bond0 type bond
ip link set bond0 type bond miimon 100 mode 2
ip link set eth2 master bond0
ip link set eth3 master bond0
ip link set eth4 master bond0
ip link set eth5 master bond0
ip link set eth6 master bond0
ip link set eth7 master bond0
ip link set eth8 master bond0
ip link set eth9 master bond0

Where eth2, eth3, eth4, eth5, eth6, eth7, eth8 and eth9 are the PFs of
the same HCA.

Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Link: https://lore.kernel.org/r/20240512124306.740898-2-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 3 ---
 include/linux/mlx5/driver.h                       | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 69d482f7c5a2..5e2171ff0a89 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -713,7 +713,6 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev)
 	return 0;
 }
 
-#define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 4
 bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
 {
 #ifdef CONFIG_MLX5_ESWITCH
@@ -739,8 +738,6 @@ bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
 		if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
 			return false;
 
-	if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports > MLX5_LAG_OFFLOADS_SUPPORTED_PORTS)
-		return false;
 #else
 	for (i = 0; i < ldev->ports; i++)
 		if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index bf9324a31ae9..8218588688b5 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -85,7 +85,7 @@ enum mlx5_sqp_t {
 };
 
 enum {
-	MLX5_MAX_PORTS	= 4,
+	MLX5_MAX_PORTS	= 8,
 };
 
 enum {
-- 
cgit v1.2.3


From db5944e16cd80efcbfe0bf1d067fdcc2f710d246 Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@nvidia.com>
Date: Sun, 12 May 2024 15:43:05 +0300
Subject: net/mlx5: Remove unused msix related exported APIs

MSIX irq allocation and free APIs are no longer
in use. Hence, remove the dead code.

Signed-off-by: Parav Pandit <parav@nvidia.com>
Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Link: https://lore.kernel.org/r/20240512124306.740898-4-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 52 -----------------------
 include/linux/mlx5/driver.h                       |  7 ---
 2 files changed, 59 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 6bac8ad70ba6..fb8787e30d3f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -507,58 +507,6 @@ struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
 	return irq;
 }
 
-/**
- * mlx5_msix_alloc - allocate msix interrupt
- * @dev: mlx5 device from which to request
- * @handler: interrupt handler
- * @affdesc: affinity descriptor
- * @name: interrupt name
- *
- * Returns: struct msi_map with result encoded.
- * Note: the caller must make sure to release the irq by calling
- *       mlx5_msix_free() if shutdown was initiated.
- */
-struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
-			       irqreturn_t (*handler)(int, void *),
-			       const struct irq_affinity_desc *affdesc,
-			       const char *name)
-{
-	struct msi_map map;
-	int err;
-
-	if (!dev->pdev) {
-		map.virq = 0;
-		map.index = -EINVAL;
-		return map;
-	}
-
-	map = pci_msix_alloc_irq_at(dev->pdev, MSI_ANY_INDEX, affdesc);
-	if (!map.virq)
-		return map;
-
-	err = request_irq(map.virq, handler, 0, name, NULL);
-	if (err) {
-		mlx5_core_warn(dev, "err %d\n", err);
-		pci_msix_free_irq(dev->pdev, map);
-		map.virq = 0;
-		map.index = -ENOMEM;
-	}
-	return map;
-}
-EXPORT_SYMBOL(mlx5_msix_alloc);
-
-/**
- * mlx5_msix_free - free a previously allocated msix interrupt
- * @dev: mlx5 device associated with interrupt
- * @map: map previously returned by mlx5_msix_alloc()
- */
-void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map)
-{
-	free_irq(map.virq, NULL);
-	pci_msix_free_irq(dev->pdev, map);
-}
-EXPORT_SYMBOL(mlx5_msix_free);
-
 /**
  * mlx5_irq_release_vector - release one IRQ back to the system.
  * @irq: the irq to release.
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 8218588688b5..0aa15cac0308 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1374,11 +1374,4 @@ static inline bool mlx5_is_macsec_roce_supported(struct mlx5_core_dev *mdev)
 enum {
 	MLX5_OCTWORD = 16,
 };
-
-struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev,
-			       irqreturn_t (*handler)(int, void *),
-			       const struct irq_affinity_desc *affdesc,
-			       const char *name);
-void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map);
-
 #endif /* MLX5_DRIVER_H */
-- 
cgit v1.2.3


From 92ef0fd55ac80dfc2e4654edfe5d1ddfa6e070fe Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 9 May 2024 09:20:08 -0600
Subject: net: change proto and proto_ops accept type

Rather than pass in flags, error pointer, and whether this is a kernel
invocation or not, add a struct proto_accept_arg struct as the argument.
This then holds all of these arguments, and prepares accept for being
able to pass back more information.

No functional changes in this patch.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 crypto/af_alg.c                    | 11 ++++++-----
 crypto/algif_hash.c                | 10 +++++-----
 drivers/xen/pvcalls-back.c         |  6 +++++-
 fs/ocfs2/cluster/tcp.c             |  5 ++++-
 include/crypto/if_alg.h            |  3 ++-
 include/linux/net.h                |  4 +++-
 include/net/inet_common.h          |  4 ++--
 include/net/inet_connection_sock.h |  2 +-
 include/net/sock.h                 | 12 +++++++++---
 net/atm/svc.c                      |  8 ++++----
 net/ax25/af_ax25.c                 |  6 +++---
 net/bluetooth/iso.c                |  4 ++--
 net/bluetooth/l2cap_sock.c         |  4 ++--
 net/bluetooth/rfcomm/sock.c        |  6 +++---
 net/bluetooth/sco.c                |  4 ++--
 net/core/sock.c                    |  4 ++--
 net/ipv4/af_inet.c                 | 10 +++++-----
 net/ipv4/inet_connection_sock.c    |  6 +++---
 net/iucv/af_iucv.c                 |  4 ++--
 net/llc/af_llc.c                   |  7 +++----
 net/mptcp/protocol.c               | 11 +++++------
 net/netrom/af_netrom.c             |  6 +++---
 net/nfc/llcp_sock.c                |  4 ++--
 net/phonet/pep.c                   | 12 ++++++------
 net/phonet/socket.c                |  7 +++----
 net/rds/tcp_listen.c               |  6 +++++-
 net/rose/af_rose.c                 |  6 +++---
 net/sctp/socket.c                  |  8 ++++----
 net/smc/af_smc.c                   |  6 +++---
 net/socket.c                       | 13 ++++++++++---
 net/tipc/socket.c                  | 13 +++++--------
 net/unix/af_unix.c                 | 21 ++++++++++-----------
 net/vmw_vsock/af_vsock.c           |  6 +++---
 net/x25/af_x25.c                   |  4 ++--
 34 files changed, 132 insertions(+), 111 deletions(-)

(limited to 'include')

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index 5bc6d0fa7498..18cfead0081d 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -407,7 +407,8 @@ unlock:
 	return err;
 }
 
-int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern)
+int af_alg_accept(struct sock *sk, struct socket *newsock,
+		  struct proto_accept_arg *arg)
 {
 	struct alg_sock *ask = alg_sk(sk);
 	const struct af_alg_type *type;
@@ -422,7 +423,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern)
 	if (!type)
 		goto unlock;
 
-	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, kern);
+	sk2 = sk_alloc(sock_net(sk), PF_ALG, GFP_KERNEL, &alg_proto, arg->kern);
 	err = -ENOMEM;
 	if (!sk2)
 		goto unlock;
@@ -468,10 +469,10 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(af_alg_accept);
 
-static int alg_accept(struct socket *sock, struct socket *newsock, int flags,
-		      bool kern)
+static int alg_accept(struct socket *sock, struct socket *newsock,
+		      struct proto_accept_arg *arg)
 {
-	return af_alg_accept(sock->sk, newsock, kern);
+	return af_alg_accept(sock->sk, newsock, arg);
 }
 
 static const struct proto_ops alg_proto_ops = {
diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c
index e24c829d7a01..7c7394d46a23 100644
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -223,8 +223,8 @@ unlock:
 	return err ?: len;
 }
 
-static int hash_accept(struct socket *sock, struct socket *newsock, int flags,
-		       bool kern)
+static int hash_accept(struct socket *sock, struct socket *newsock,
+		       struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
@@ -252,7 +252,7 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags,
 	if (err)
 		goto out_free_state;
 
-	err = af_alg_accept(ask->parent, newsock, kern);
+	err = af_alg_accept(ask->parent, newsock, arg);
 	if (err)
 		goto out_free_state;
 
@@ -355,7 +355,7 @@ static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg,
 }
 
 static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
-			     int flags, bool kern)
+			     struct proto_accept_arg *arg)
 {
 	int err;
 
@@ -363,7 +363,7 @@ static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
 	if (err)
 		return err;
 
-	return hash_accept(sock, newsock, flags, kern);
+	return hash_accept(sock, newsock, arg);
 }
 
 static struct proto_ops algif_hash_ops_nokey = {
diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
index d52593466a79..fd7ed65e0197 100644
--- a/drivers/xen/pvcalls-back.c
+++ b/drivers/xen/pvcalls-back.c
@@ -517,6 +517,10 @@ static void __pvcalls_back_accept(struct work_struct *work)
 {
 	struct sockpass_mapping *mappass = container_of(
 		work, struct sockpass_mapping, register_work);
+	struct proto_accept_arg arg = {
+		.flags = O_NONBLOCK,
+		.kern = true,
+	};
 	struct sock_mapping *map;
 	struct pvcalls_ioworker *iow;
 	struct pvcalls_fedata *fedata;
@@ -548,7 +552,7 @@ static void __pvcalls_back_accept(struct work_struct *work)
 	sock->type = mappass->sock->type;
 	sock->ops = mappass->sock->ops;
 
-	ret = inet_accept(mappass->sock, sock, O_NONBLOCK, true);
+	ret = inet_accept(mappass->sock, sock, &arg);
 	if (ret == -EAGAIN) {
 		sock_release(sock);
 		return;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 960080753d3b..2b8fa3e782fb 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1784,6 +1784,9 @@ static int o2net_accept_one(struct socket *sock, int *more)
 	struct o2nm_node *node = NULL;
 	struct o2nm_node *local_node = NULL;
 	struct o2net_sock_container *sc = NULL;
+	struct proto_accept_arg arg = {
+		.flags = O_NONBLOCK,
+	};
 	struct o2net_node *nn;
 	unsigned int nofs_flag;
 
@@ -1802,7 +1805,7 @@ static int o2net_accept_one(struct socket *sock, int *more)
 
 	new_sock->type = sock->type;
 	new_sock->ops = sock->ops;
-	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, false);
+	ret = sock->ops->accept(sock, new_sock, &arg);
 	if (ret < 0)
 		goto out;
 
diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h
index 78ecaf5db04c..f7b3b93f3a49 100644
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -166,7 +166,8 @@ int af_alg_unregister_type(const struct af_alg_type *type);
 
 int af_alg_release(struct socket *sock);
 void af_alg_release_parent(struct sock *sk);
-int af_alg_accept(struct sock *sk, struct socket *newsock, bool kern);
+int af_alg_accept(struct sock *sk, struct socket *newsock,
+		  struct proto_accept_arg *arg);
 
 void af_alg_free_sg(struct af_alg_sgl *sgl);
 
diff --git a/include/linux/net.h b/include/linux/net.h
index 15df6d5f27a7..688320b79fcc 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -153,6 +153,7 @@ struct sockaddr;
 struct msghdr;
 struct module;
 struct sk_buff;
+struct proto_accept_arg;
 typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
 			       unsigned int, size_t);
 typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *);
@@ -171,7 +172,8 @@ struct proto_ops {
 	int		(*socketpair)(struct socket *sock1,
 				      struct socket *sock2);
 	int		(*accept)    (struct socket *sock,
-				      struct socket *newsock, int flags, bool kern);
+				      struct socket *newsock,
+				      struct proto_accept_arg *arg);
 	int		(*getname)   (struct socket *sock,
 				      struct sockaddr *addr,
 				      int peer);
diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index f50a644d87a9..c17a6585d0b0 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -29,8 +29,8 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 			  int addr_len, int flags, int is_sendmsg);
 int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
 		       int addr_len, int flags);
-int inet_accept(struct socket *sock, struct socket *newsock, int flags,
-		bool kern);
+int inet_accept(struct socket *sock, struct socket *newsock,
+		struct proto_accept_arg *arg);
 void __inet_accept(struct socket *sock, struct socket *newsock,
 		   struct sock *newsk);
 int inet_send_prepare(struct sock *sk);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 20e7b0c0b3d1..7d6b1254c92d 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -250,7 +250,7 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
         return (unsigned long)min_t(u64, when, max_when);
 }
 
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern);
+struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg);
 
 int inet_csk_get_port(struct sock *sk, unsigned short snum);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 0450494a1766..217079b3e3e8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1194,6 +1194,12 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size)
 	       size - offsetof(struct sock, sk_node.pprev));
 }
 
+struct proto_accept_arg {
+	int flags;
+	int err;
+	bool kern;
+};
+
 /* Networking protocol blocks we attach to sockets.
  * socket layer -> transport layer interface
  */
@@ -1208,8 +1214,8 @@ struct proto {
 					int addr_len);
 	int			(*disconnect)(struct sock *sk, int flags);
 
-	struct sock *		(*accept)(struct sock *sk, int flags, int *err,
-					  bool kern);
+	struct sock *		(*accept)(struct sock *sk,
+					  struct proto_accept_arg *arg);
 
 	int			(*ioctl)(struct sock *sk, int cmd,
 					 int *karg);
@@ -1804,7 +1810,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 int sock_no_bind(struct socket *, struct sockaddr *, int);
 int sock_no_connect(struct socket *, struct sockaddr *, int, int);
 int sock_no_socketpair(struct socket *, struct socket *);
-int sock_no_accept(struct socket *, struct socket *, int, bool);
+int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
 int sock_no_getname(struct socket *, struct sockaddr *, int);
 int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
 int sock_no_listen(struct socket *, int);
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 36a814f1fbd1..f8137ae693b0 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -324,8 +324,8 @@ out:
 	return error;
 }
 
-static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
-		      bool kern)
+static int svc_accept(struct socket *sock, struct socket *newsock,
+		      struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
@@ -336,7 +336,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
 
 	lock_sock(sk);
 
-	error = svc_create(sock_net(sk), newsock, 0, kern);
+	error = svc_create(sock_net(sk), newsock, 0, arg->kern);
 	if (error)
 		goto out;
 
@@ -355,7 +355,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
 				error = -sk->sk_err;
 				break;
 			}
-			if (flags & O_NONBLOCK) {
+			if (arg->flags & O_NONBLOCK) {
 				error = -EAGAIN;
 				break;
 			}
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 9169efb2f43a..8077cf2ee448 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1373,8 +1373,8 @@ out_release:
 	return err;
 }
 
-static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
-		       bool kern)
+static int ax25_accept(struct socket *sock, struct socket *newsock,
+		       struct proto_accept_arg *arg)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
@@ -1409,7 +1409,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
 		if (skb)
 			break;
 
-		if (flags & O_NONBLOCK) {
+		if (arg->flags & O_NONBLOCK) {
 			err = -EWOULDBLOCK;
 			break;
 		}
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index ef0cc80b4c0c..2a075119d65d 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1186,7 +1186,7 @@ done:
 }
 
 static int iso_sock_accept(struct socket *sock, struct socket *newsock,
-			   int flags, bool kern)
+			   struct proto_accept_arg *arg)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *ch;
@@ -1195,7 +1195,7 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock,
 
 	lock_sock(sk);
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 5cc83f906c12..125dddc77452 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -327,7 +327,7 @@ done:
 }
 
 static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
-			     int flags, bool kern)
+			     struct proto_accept_arg *arg)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *nsk;
@@ -336,7 +336,7 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
 
 	lock_sock_nested(sk, L2CAP_NESTING_PARENT);
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 29aa07e9db9d..37d63d768afb 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -468,8 +468,8 @@ done:
 	return err;
 }
 
-static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags,
-			      bool kern)
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock,
+			      struct proto_accept_arg *arg)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *nsk;
@@ -483,7 +483,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f
 		goto done;
 	}
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index e0ad30862ee4..94c6f2b46279 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -647,7 +647,7 @@ done:
 }
 
 static int sco_sock_accept(struct socket *sock, struct socket *newsock,
-			   int flags, bool kern)
+			   struct proto_accept_arg *arg)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
 	struct sock *sk = sock->sk, *ch;
@@ -656,7 +656,7 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock,
 
 	lock_sock(sk);
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	BT_DBG("sk %p timeo %ld", sk, timeo);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 8d6e638b5426..8629f9aecf91 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3241,8 +3241,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
 }
 EXPORT_SYMBOL(sock_no_socketpair);
 
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
-		   bool kern)
+int sock_no_accept(struct socket *sock, struct socket *newsock,
+		   struct proto_accept_arg *arg)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index a7bad18bc8b5..de3449e16b89 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -771,16 +771,16 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
  *	Accept a pending connection. The TCP layer now gives BSD semantics.
  */
 
-int inet_accept(struct socket *sock, struct socket *newsock, int flags,
-		bool kern)
+int inet_accept(struct socket *sock, struct socket *newsock,
+		struct proto_accept_arg *arg)
 {
 	struct sock *sk1 = sock->sk, *sk2;
-	int err = -EINVAL;
 
 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
-	sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern);
+	arg->err = -EINVAL;
+	sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg);
 	if (!sk2)
-		return err;
+		return arg->err;
 
 	lock_sock(sk2);
 	__inet_accept(sock, newsock, sk2);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 3b38610958ee..7734d189c66b 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -661,7 +661,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
 /*
  * This will accept the next outstanding connection.
  */
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -680,7 +680,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
 
 	/* Find already established connection */
 	if (reqsk_queue_empty(queue)) {
-		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+		long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 		/* If this is a non blocking socket don't sleep */
 		error = -EAGAIN;
@@ -745,7 +745,7 @@ out:
 out_err:
 	newsk = NULL;
 	req = NULL;
-	*err = error;
+	arg->err = error;
 	goto out;
 }
 EXPORT_SYMBOL(inet_csk_accept);
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index c951bb9cc2e0..c3b0b610b0aa 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -795,7 +795,7 @@ done:
 
 /* Accept a pending connection */
 static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
-			    int flags, bool kern)
+			    struct proto_accept_arg *arg)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct sock *sk = sock->sk, *nsk;
@@ -809,7 +809,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
 		goto done;
 	}
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	/* Wait for an incoming connection */
 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index fde1140d899e..4eb52add7103 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -688,14 +688,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
  *	llc_ui_accept - accept a new incoming connection.
  *	@sock: Socket which connections arrive on.
  *	@newsock: Socket to move incoming connection to.
- *	@flags: User specified operational flags.
- *	@kern: If the socket is kernel internal
+ *	@arg: User specified arguments
  *
  *	Accept a new incoming connection.
  *	Returns 0 upon success, negative otherwise.
  */
-static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
-			 bool kern)
+static int llc_ui_accept(struct socket *sock, struct socket *newsock,
+			 struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk, *newsk;
 	struct llc_sock *llc, *newllc;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index bb8f96f2b86f..815ce439183c 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -3881,11 +3881,10 @@ unlock:
 }
 
 static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
-			       int flags, bool kern)
+			       struct proto_accept_arg *arg)
 {
 	struct mptcp_sock *msk = mptcp_sk(sock->sk);
 	struct sock *ssk, *newsk;
-	int err;
 
 	pr_debug("msk=%p", msk);
 
@@ -3897,9 +3896,9 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		return -EINVAL;
 
 	pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk));
-	newsk = inet_csk_accept(ssk, flags, &err, kern);
+	newsk = inet_csk_accept(ssk, arg);
 	if (!newsk)
-		return err;
+		return arg->err;
 
 	pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk));
 	if (sk_is_mptcp(newsk)) {
@@ -3920,7 +3919,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		newsk = new_mptcp_sock;
 		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
 
-		newsk->sk_kern_sock = kern;
+		newsk->sk_kern_sock = arg->kern;
 		lock_sock(newsk);
 		__inet_accept(sock, newsock, newsk);
 
@@ -3949,7 +3948,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
 		}
 	} else {
 tcpfallback:
-		newsk->sk_kern_sock = kern;
+		newsk->sk_kern_sock = arg->kern;
 		lock_sock(newsk);
 		__inet_accept(sock, newsock, newsk);
 		/* we are being invoked after accepting a non-mp-capable
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 104a80b75477..6ee148f0e6d0 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -772,8 +772,8 @@ out_release:
 	return err;
 }
 
-static int nr_accept(struct socket *sock, struct socket *newsock, int flags,
-		     bool kern)
+static int nr_accept(struct socket *sock, struct socket *newsock,
+		     struct proto_accept_arg *arg)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
@@ -805,7 +805,7 @@ static int nr_accept(struct socket *sock, struct socket *newsock, int flags,
 		if (skb)
 			break;
 
-		if (flags & O_NONBLOCK) {
+		if (arg->flags & O_NONBLOCK) {
 			err = -EWOULDBLOCK;
 			break;
 		}
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index d5344563e525..57a2f97004e1 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -447,7 +447,7 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *parent,
 }
 
 static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
-			    int flags, bool kern)
+			    struct proto_accept_arg *arg)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct sock *sk = sock->sk, *new_sk;
@@ -463,7 +463,7 @@ static int llcp_sock_accept(struct socket *sock, struct socket *newsock,
 		goto error;
 	}
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	/* Wait for an incoming connection. */
 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 3dd5f52bc1b5..53a858478e22 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -759,8 +759,8 @@ static void pep_sock_close(struct sock *sk, long timeout)
 	sock_put(sk);
 }
 
-static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
-				    bool kern)
+static struct sock *pep_sock_accept(struct sock *sk,
+				    struct proto_accept_arg *arg)
 {
 	struct pep_sock *pn = pep_sk(sk), *newpn;
 	struct sock *newsk = NULL;
@@ -772,8 +772,8 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
 	u8 pipe_handle, enabled, n_sb;
 	u8 aligned = 0;
 
-	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
-				errp);
+	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
+				&arg->err);
 	if (!skb)
 		return NULL;
 
@@ -836,7 +836,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
 
 	/* Create a new to-be-accepted sock */
 	newsk = sk_alloc(sock_net(sk), PF_PHONET, GFP_KERNEL, sk->sk_prot,
-			 kern);
+			 arg->kern);
 	if (!newsk) {
 		pep_reject_conn(sk, skb, PN_PIPE_ERR_OVERLOAD, GFP_KERNEL);
 		err = -ENOBUFS;
@@ -878,7 +878,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp,
 drop:
 	release_sock(sk);
 	kfree_skb(skb);
-	*errp = err;
+	arg->err = err;
 	return newsk;
 }
 
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index 1018340d89a7..5ce0b3ee5def 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -292,18 +292,17 @@ out:
 }
 
 static int pn_socket_accept(struct socket *sock, struct socket *newsock,
-			    int flags, bool kern)
+			    struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk;
 	struct sock *newsk;
-	int err;
 
 	if (unlikely(sk->sk_state != TCP_LISTEN))
 		return -EINVAL;
 
-	newsk = sk->sk_prot->accept(sk, flags, &err, kern);
+	newsk = sk->sk_prot->accept(sk, arg);
 	if (!newsk)
-		return err;
+		return arg->err;
 
 	lock_sock(newsk);
 	sock_graft(newsk, newsock);
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 05008ce5c421..d89bd8d0c354 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -105,6 +105,10 @@ int rds_tcp_accept_one(struct socket *sock)
 	int conn_state;
 	struct rds_conn_path *cp;
 	struct in6_addr *my_addr, *peer_addr;
+	struct proto_accept_arg arg = {
+		.flags = O_NONBLOCK,
+		.kern = true,
+	};
 #if !IS_ENABLED(CONFIG_IPV6)
 	struct in6_addr saddr, daddr;
 #endif
@@ -119,7 +123,7 @@ int rds_tcp_accept_one(struct socket *sock)
 	if (ret)
 		goto out;
 
-	ret = sock->ops->accept(sock, new_sock, O_NONBLOCK, true);
+	ret = sock->ops->accept(sock, new_sock, &arg);
 	if (ret < 0)
 		goto out;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index ef81d019b20f..59050caab65c 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -919,8 +919,8 @@ out_release:
 	return err;
 }
 
-static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
-		       bool kern)
+static int rose_accept(struct socket *sock, struct socket *newsock,
+		       struct proto_accept_arg *arg)
 {
 	struct sk_buff *skb;
 	struct sock *newsk;
@@ -953,7 +953,7 @@ static int rose_accept(struct socket *sock, struct socket *newsock, int flags,
 		if (skb)
 			break;
 
-		if (flags & O_NONBLOCK) {
+		if (arg->flags & O_NONBLOCK) {
 			err = -EWOULDBLOCK;
 			break;
 		}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 64196b1dce1d..c009383369b2 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4847,7 +4847,7 @@ static int sctp_disconnect(struct sock *sk, int flags)
  * descriptor will be returned from accept() to represent the newly
  * formed association.
  */
-static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
+static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg)
 {
 	struct sctp_sock *sp;
 	struct sctp_endpoint *ep;
@@ -4871,7 +4871,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
 		goto out;
 	}
 
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 
 	error = sctp_wait_for_accept(sk, timeo);
 	if (error)
@@ -4882,7 +4882,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
 	 */
 	asoc = list_entry(ep->asocs.next, struct sctp_association, asocs);
 
-	newsk = sp->pf->create_accept_sk(sk, asoc, kern);
+	newsk = sp->pf->create_accept_sk(sk, asoc, arg->kern);
 	if (!newsk) {
 		error = -ENOMEM;
 		goto out;
@@ -4899,7 +4899,7 @@ static struct sock *sctp_accept(struct sock *sk, int flags, int *err, bool kern)
 
 out:
 	release_sock(sk);
-	*err = error;
+	arg->err = error;
 	return newsk;
 }
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 9389f0cfa374..e50a286fd0fb 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -2689,7 +2689,7 @@ out:
 }
 
 static int smc_accept(struct socket *sock, struct socket *new_sock,
-		      int flags, bool kern)
+		      struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk, *nsk;
 	DECLARE_WAITQUEUE(wait, current);
@@ -2708,7 +2708,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 	}
 
 	/* Wait for an incoming connection */
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
 	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -2735,7 +2735,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock,
 	if (rc)
 		goto out;
 
-	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
+	if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) {
 		/* wait till data arrives on the socket */
 		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
 								MSEC_PER_SEC);
diff --git a/net/socket.c b/net/socket.c
index 01a71ae10c35..6ff5f21d9633 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1898,6 +1898,9 @@ struct file *do_accept(struct file *file, unsigned file_flags,
 	struct file *newfile;
 	int err, len;
 	struct sockaddr_storage address;
+	struct proto_accept_arg arg = {
+		.flags = file_flags,
+	};
 	const struct proto_ops *ops;
 
 	sock = sock_from_file(file);
@@ -1926,8 +1929,8 @@ struct file *do_accept(struct file *file, unsigned file_flags,
 	if (err)
 		goto out_fd;
 
-	err = ops->accept(sock, newsock, sock->file->f_flags | file_flags,
-					false);
+	arg.flags |= sock->file->f_flags;
+	err = ops->accept(sock, newsock, &arg);
 	if (err < 0)
 		goto out_fd;
 
@@ -3580,6 +3583,10 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 {
 	struct sock *sk = sock->sk;
 	const struct proto_ops *ops = READ_ONCE(sock->ops);
+	struct proto_accept_arg arg = {
+		.flags = flags,
+		.kern = true,
+	};
 	int err;
 
 	err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
@@ -3587,7 +3594,7 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags)
 	if (err < 0)
 		goto done;
 
-	err = ops->accept(sock, *newsock, flags, true);
+	err = ops->accept(sock, *newsock, &arg);
 	if (err < 0) {
 		sock_release(*newsock);
 		*newsock = NULL;
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 798397b6811e..2d58ecae4e21 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -146,8 +146,6 @@ static void tipc_data_ready(struct sock *sk);
 static void tipc_write_space(struct sock *sk);
 static void tipc_sock_destruct(struct sock *sk);
 static int tipc_release(struct socket *sock);
-static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
-		       bool kern);
 static void tipc_sk_timeout(struct timer_list *t);
 static int tipc_sk_publish(struct tipc_sock *tsk, struct tipc_uaddr *ua);
 static int tipc_sk_withdraw(struct tipc_sock *tsk, struct tipc_uaddr *ua);
@@ -2711,13 +2709,12 @@ static int tipc_wait_for_accept(struct socket *sock, long timeo)
  * tipc_accept - wait for connection request
  * @sock: listening socket
  * @new_sock: new socket that is to be connected
- * @flags: file-related flags associated with socket
- * @kern: caused by kernel or by userspace?
+ * @arg: arguments for accept
  *
  * Return: 0 on success, errno otherwise
  */
-static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
-		       bool kern)
+static int tipc_accept(struct socket *sock, struct socket *new_sock,
+		       struct proto_accept_arg *arg)
 {
 	struct sock *new_sk, *sk = sock->sk;
 	struct tipc_sock *new_tsock;
@@ -2733,14 +2730,14 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags,
 		res = -EINVAL;
 		goto exit;
 	}
-	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
 	res = tipc_wait_for_accept(sock, timeo);
 	if (res)
 		goto exit;
 
 	buf = skb_peek(&sk->sk_receive_queue);
 
-	res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, kern);
+	res = tipc_sk_create(sock_net(sock->sk), new_sock, 0, arg->kern);
 	if (res)
 		goto exit;
 	security_sk_clone(sock->sk, new_sock->sk);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dc1651541723..26e3b5f1ee46 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -755,7 +755,7 @@ static int unix_bind(struct socket *, struct sockaddr *, int);
 static int unix_stream_connect(struct socket *, struct sockaddr *,
 			       int addr_len, int flags);
 static int unix_socketpair(struct socket *, struct socket *);
-static int unix_accept(struct socket *, struct socket *, int, bool);
+static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg);
 static int unix_getname(struct socket *, struct sockaddr *, int);
 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
 static __poll_t unix_dgram_poll(struct file *, struct socket *,
@@ -1689,19 +1689,18 @@ static void unix_sock_inherit_flags(const struct socket *old,
 		set_bit(SOCK_PASSSEC, &new->flags);
 }
 
-static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
-		       bool kern)
+static int unix_accept(struct socket *sock, struct socket *newsock,
+		       struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
 	struct sock *tsk;
-	int err;
 
-	err = -EOPNOTSUPP;
+	arg->err = -EOPNOTSUPP;
 	if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 		goto out;
 
-	err = -EINVAL;
+	arg->err = -EINVAL;
 	if (sk->sk_state != TCP_LISTEN)
 		goto out;
 
@@ -1709,12 +1708,12 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	 * so that no locks are necessary.
 	 */
 
-	skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
-				&err);
+	skb = skb_recv_datagram(sk, (arg->flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
+				&arg->err);
 	if (!skb) {
 		/* This means receive shutdown. */
-		if (err == 0)
-			err = -EINVAL;
+		if (arg->err == 0)
+			arg->err = -EINVAL;
 		goto out;
 	}
 
@@ -1732,7 +1731,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
 	return 0;
 
 out:
-	return err;
+	return arg->err;
 }
 
 
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 54ba7316f808..4b040285aa78 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1500,8 +1500,8 @@ out:
 	return err;
 }
 
-static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
-			bool kern)
+static int vsock_accept(struct socket *sock, struct socket *newsock,
+			struct proto_accept_arg *arg)
 {
 	struct sock *listener;
 	int err;
@@ -1528,7 +1528,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags,
 	/* Wait for children sockets to appear; these are the new sockets
 	 * created upon connection establishment.
 	 */
-	timeout = sock_rcvtimeo(listener, flags & O_NONBLOCK);
+	timeout = sock_rcvtimeo(listener, arg->flags & O_NONBLOCK);
 	prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
 
 	while ((connected = vsock_dequeue_accept(listener)) == NULL &&
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index d18d51412cc0..8dda4178497c 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -871,8 +871,8 @@ static int x25_wait_for_data(struct sock *sk, long timeout)
 	return rc;
 }
 
-static int x25_accept(struct socket *sock, struct socket *newsock, int flags,
-		      bool kern)
+static int x25_accept(struct socket *sock, struct socket *newsock,
+		      struct proto_accept_arg *arg)
 {
 	struct sock *sk = sock->sk;
 	struct sock *newsk;
-- 
cgit v1.2.3


From 0645fbe760afcc5332c858d1cbf416bf77ef3c29 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 9 May 2024 09:31:05 -0600
Subject: net: have do_accept() take a struct proto_accept_arg argument

In preparation for passing in more information via this API, change
do_accept() to take a proto_accept_arg struct pointer rather than just
the file flags separately.

No functional changes in this patch.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/socket.h |  3 ++-
 io_uring/net.c         |  6 ++++--
 net/socket.c           | 12 +++++-------
 3 files changed, 11 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 139c330ccf2c..89d16b90370b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -16,6 +16,7 @@ struct cred;
 struct socket;
 struct sock;
 struct sk_buff;
+struct proto_accept_arg;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -433,7 +434,7 @@ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
 extern int __sys_sendto(int fd, void __user *buff, size_t len,
 			unsigned int flags, struct sockaddr __user *addr,
 			int addr_len);
-extern struct file *do_accept(struct file *file, unsigned file_flags,
+extern struct file *do_accept(struct file *file, struct proto_accept_arg *arg,
 			      struct sockaddr __user *upeer_sockaddr,
 			      int __user *upeer_addrlen, int flags);
 extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
diff --git a/io_uring/net.c b/io_uring/net.c
index 070dea9a4eda..d4d1fc93635c 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1528,8 +1528,10 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
-	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
 	bool fixed = !!accept->file_slot;
+	struct proto_accept_arg arg = {
+		.flags = force_nonblock ? O_NONBLOCK : 0,
+	};
 	struct file *file;
 	int ret, fd;
 
@@ -1543,7 +1545,7 @@ retry:
 		if (unlikely(fd < 0))
 			return fd;
 	}
-	file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
+	file = do_accept(req->file, &arg, accept->addr, accept->addr_len,
 			 accept->flags);
 	if (IS_ERR(file)) {
 		if (!fixed)
diff --git a/net/socket.c b/net/socket.c
index 6ff5f21d9633..e416920e9399 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1890,7 +1890,7 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
 	return __sys_listen(fd, backlog);
 }
 
-struct file *do_accept(struct file *file, unsigned file_flags,
+struct file *do_accept(struct file *file, struct proto_accept_arg *arg,
 		       struct sockaddr __user *upeer_sockaddr,
 		       int __user *upeer_addrlen, int flags)
 {
@@ -1898,9 +1898,6 @@ struct file *do_accept(struct file *file, unsigned file_flags,
 	struct file *newfile;
 	int err, len;
 	struct sockaddr_storage address;
-	struct proto_accept_arg arg = {
-		.flags = file_flags,
-	};
 	const struct proto_ops *ops;
 
 	sock = sock_from_file(file);
@@ -1929,8 +1926,8 @@ struct file *do_accept(struct file *file, unsigned file_flags,
 	if (err)
 		goto out_fd;
 
-	arg.flags |= sock->file->f_flags;
-	err = ops->accept(sock, newsock, &arg);
+	arg->flags |= sock->file->f_flags;
+	err = ops->accept(sock, newsock, arg);
 	if (err < 0)
 		goto out_fd;
 
@@ -1956,6 +1953,7 @@ out_fd:
 static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_sockaddr,
 			      int __user *upeer_addrlen, int flags)
 {
+	struct proto_accept_arg arg = { };
 	struct file *newfile;
 	int newfd;
 
@@ -1969,7 +1967,7 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s
 	if (unlikely(newfd < 0))
 		return newfd;
 
-	newfile = do_accept(file, 0, upeer_sockaddr, upeer_addrlen,
+	newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen,
 			    flags);
 	if (IS_ERR(newfile)) {
 		put_unused_fd(newfd);
-- 
cgit v1.2.3


From 7951e36ac620a9ba1bae0ac0ddd62d2e82285725 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 9 May 2024 09:35:01 -0600
Subject: net: pass back whether socket was empty post accept

This adds an 'is_empty' argument to struct proto_accept_arg, which can
be used to pass back information on whether or not the given socket has
more connections to accept post the one just accepted.

To utilize this information, the caller should initialize the 'is_empty'
field to, eg, -1 and then check for 0/1 after the accept. If the field
has been set, the caller knows whether there are more pending connections
or not. If the field remains -1 after the accept call, the protocol
doesn't support passing back this information.

This patch wires it up for ipv4/6 TCP.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/net/sock.h              | 1 +
 net/ipv4/inet_connection_sock.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 217079b3e3e8..5f4d0629348f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1197,6 +1197,7 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size)
 struct proto_accept_arg {
 	int flags;
 	int err;
+	int is_empty;
 	bool kern;
 };
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 7734d189c66b..d81f74ce0f02 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -692,6 +692,7 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
 			goto out_err;
 	}
 	req = reqsk_queue_remove(queue, sk);
+	arg->is_empty = reqsk_queue_empty(queue);
 	newsk = req->sk;
 
 	if (sk->sk_protocol == IPPROTO_TCP &&
-- 
cgit v1.2.3


From f0ef433fc264b70a415257dba126acbe3fda666b Mon Sep 17 00:00:00 2001
From: "Russell King (Oracle)" <rmk+kernel@armlinux.org.uk>
Date: Mon, 13 May 2024 09:25:15 +0200
Subject: net: stmmac: introduce pcs_init/pcs_exit stmmac operations

Introduce a mechanism whereby platforms can create their PCS instances
prior to the network device being published to userspace, but after
some of the core stmmac initialisation has been completed. This means
that the data structures that platforms need will be available.

Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Co-developed-by: Romain Gantois <romain.gantois@bootlin.com>
Signed-off-by: Romain Gantois <romain.gantois@bootlin.com>
Reviewed-by: Hariprasad Kelam <hkelam@marvell.com>
Link: https://lore.kernel.org/r/20240513-rzn1-gmac1-v7-4-6acf58b5440d@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 8 +++++++-
 include/linux/stmmac.h                            | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
index 54708440e27b..aa43117134d3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
@@ -505,7 +505,10 @@ int stmmac_pcs_setup(struct net_device *ndev)
 	priv = netdev_priv(ndev);
 	mode = priv->plat->phy_interface;
 
-	if (priv->plat->mdio_bus_data && priv->plat->mdio_bus_data->has_xpcs) {
+	if (priv->plat->pcs_init) {
+		ret = priv->plat->pcs_init(priv);
+	} else if (priv->plat->mdio_bus_data &&
+		   priv->plat->mdio_bus_data->has_xpcs) {
 		/* Try to probe the XPCS by scanning all addresses */
 		for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
 			xpcs = xpcs_create_mdiodev(priv->mii, addr, mode);
@@ -533,6 +536,9 @@ void stmmac_pcs_clean(struct net_device *ndev)
 {
 	struct stmmac_priv *priv = netdev_priv(ndev);
 
+	if (priv->plat->pcs_exit)
+		priv->plat->pcs_exit(priv);
+
 	if (!priv->hw->xpcs)
 		return;
 
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index dfa1828cd756..4a24a246c617 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -285,6 +285,8 @@ struct plat_stmmacenet_data {
 	int (*crosststamp)(ktime_t *device, struct system_counterval_t *system,
 			   void *ctx);
 	void (*dump_debug_regs)(void *priv);
+	int (*pcs_init)(struct stmmac_priv *priv);
+	void (*pcs_exit)(struct stmmac_priv *priv);
 	void *bsp_priv;
 	struct clk *stmmac_clk;
 	struct clk *pclk;
-- 
cgit v1.2.3


From 2b9669d63400ac6038c697960a3df37d680b6648 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Fri, 10 May 2024 20:24:58 +0800
Subject: tcp: rstreason: fully support in tcp_rcv_synsent_state_process()

In this function, only updating the map can finish the job for socket
reset reason because the corresponding drop reasons are ready.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Link: https://lore.kernel.org/r/20240510122502.27850-2-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rstreason.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index df3b6ac0c9b3..f87814a60205 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -8,6 +8,8 @@
 #define DEFINE_RST_REASON(FN, FNe)	\
 	FN(NOT_SPECIFIED)		\
 	FN(NO_SOCKET)			\
+	FN(TCP_INVALID_ACK_SEQUENCE)	\
+	FN(TCP_RFC7323_PAWS)		\
 	FN(MPTCP_RST_EUNSPEC)		\
 	FN(MPTCP_RST_EMPTCP)		\
 	FN(MPTCP_RST_ERESOURCE)		\
@@ -37,6 +39,17 @@ enum sk_rst_reason {
 	SK_RST_REASON_NOT_SPECIFIED,
 	/** @SK_RST_REASON_NO_SOCKET: no valid socket that can be used */
 	SK_RST_REASON_NO_SOCKET,
+	/**
+	 * @SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ
+	 * field because ack sequence is not in the window between snd_una
+	 * and snd_nxt
+	 */
+	SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE,
+	/**
+	 * @SK_RST_REASON_TCP_RFC7323_PAWS: PAWS check, corresponding to
+	 * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED
+	 */
+	SK_RST_REASON_TCP_RFC7323_PAWS,
 
 	/* Copy from include/uapi/linux/mptcp.h.
 	 * These reset fields will not be changed since they adhere to
@@ -113,6 +126,10 @@ sk_rst_convert_drop_reason(enum skb_drop_reason reason)
 		return SK_RST_REASON_NOT_SPECIFIED;
 	case SKB_DROP_REASON_NO_SOCKET:
 		return SK_RST_REASON_NO_SOCKET;
+	case SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE:
+		return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE;
+	case SKB_DROP_REASON_TCP_RFC7323_PAWS:
+		return SK_RST_REASON_TCP_RFC7323_PAWS;
 	default:
 		/* If we don't have our own corresponding reason */
 		return SK_RST_REASON_NOT_SPECIFIED;
-- 
cgit v1.2.3


From 459a2b37a41c8ef83caee4762fc50751470c28e4 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Fri, 10 May 2024 20:24:59 +0800
Subject: tcp: rstreason: fully support in tcp_ack()

Based on the existing skb drop reason, updating the rstreason map can
help us finish the rstreason job in this function.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Link: https://lore.kernel.org/r/20240510122502.27850-3-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rstreason.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index f87814a60205..69404c14f45d 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -10,6 +10,8 @@
 	FN(NO_SOCKET)			\
 	FN(TCP_INVALID_ACK_SEQUENCE)	\
 	FN(TCP_RFC7323_PAWS)		\
+	FN(TCP_TOO_OLD_ACK)		\
+	FN(TCP_ACK_UNSENT_DATA)		\
 	FN(MPTCP_RST_EUNSPEC)		\
 	FN(MPTCP_RST_EMPTCP)		\
 	FN(MPTCP_RST_ERESOURCE)		\
@@ -50,6 +52,13 @@ enum sk_rst_reason {
 	 * LINUX_MIB_PAWSESTABREJECTED, LINUX_MIB_PAWSACTIVEREJECTED
 	 */
 	SK_RST_REASON_TCP_RFC7323_PAWS,
+	/** @SK_RST_REASON_TCP_TOO_OLD_ACK: TCP ACK is too old */
+	SK_RST_REASON_TCP_TOO_OLD_ACK,
+	/**
+	 * @SK_RST_REASON_TCP_ACK_UNSENT_DATA: TCP ACK for data we haven't
+	 * sent yet
+	 */
+	SK_RST_REASON_TCP_ACK_UNSENT_DATA,
 
 	/* Copy from include/uapi/linux/mptcp.h.
 	 * These reset fields will not be changed since they adhere to
@@ -130,6 +139,10 @@ sk_rst_convert_drop_reason(enum skb_drop_reason reason)
 		return SK_RST_REASON_TCP_INVALID_ACK_SEQUENCE;
 	case SKB_DROP_REASON_TCP_RFC7323_PAWS:
 		return SK_RST_REASON_TCP_RFC7323_PAWS;
+	case SKB_DROP_REASON_TCP_TOO_OLD_ACK:
+		return SK_RST_REASON_TCP_TOO_OLD_ACK;
+	case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA:
+		return SK_RST_REASON_TCP_ACK_UNSENT_DATA;
 	default:
 		/* If we don't have our own corresponding reason */
 		return SK_RST_REASON_NOT_SPECIFIED;
-- 
cgit v1.2.3


From f6d5e2cc291fdf6f804c2754e345b41055d1aac4 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Fri, 10 May 2024 20:25:00 +0800
Subject: tcp: rstreason: fully support in tcp_rcv_state_process()

Like the previous patch does in this series, finish the conversion map is
enough to let rstreason mechanism work in this function.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Link: https://lore.kernel.org/r/20240510122502.27850-4-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rstreason.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index 69404c14f45d..fc1b99702771 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -12,6 +12,9 @@
 	FN(TCP_RFC7323_PAWS)		\
 	FN(TCP_TOO_OLD_ACK)		\
 	FN(TCP_ACK_UNSENT_DATA)		\
+	FN(TCP_FLAGS)			\
+	FN(TCP_OLD_ACK)			\
+	FN(TCP_ABORT_ON_DATA)		\
 	FN(MPTCP_RST_EUNSPEC)		\
 	FN(MPTCP_RST_EMPTCP)		\
 	FN(MPTCP_RST_ERESOURCE)		\
@@ -59,6 +62,15 @@ enum sk_rst_reason {
 	 * sent yet
 	 */
 	SK_RST_REASON_TCP_ACK_UNSENT_DATA,
+	/** @SK_RST_REASON_TCP_FLAGS: TCP flags invalid */
+	SK_RST_REASON_TCP_FLAGS,
+	/** @SK_RST_REASON_TCP_OLD_ACK: TCP ACK is old, but in window */
+	SK_RST_REASON_TCP_OLD_ACK,
+	/**
+	 * @SK_RST_REASON_TCP_ABORT_ON_DATA: abort on data
+	 * corresponding to LINUX_MIB_TCPABORTONDATA
+	 */
+	SK_RST_REASON_TCP_ABORT_ON_DATA,
 
 	/* Copy from include/uapi/linux/mptcp.h.
 	 * These reset fields will not be changed since they adhere to
@@ -143,6 +155,12 @@ sk_rst_convert_drop_reason(enum skb_drop_reason reason)
 		return SK_RST_REASON_TCP_TOO_OLD_ACK;
 	case SKB_DROP_REASON_TCP_ACK_UNSENT_DATA:
 		return SK_RST_REASON_TCP_ACK_UNSENT_DATA;
+	case SKB_DROP_REASON_TCP_FLAGS:
+		return SK_RST_REASON_TCP_FLAGS;
+	case SKB_DROP_REASON_TCP_OLD_ACK:
+		return SK_RST_REASON_TCP_OLD_ACK;
+	case SKB_DROP_REASON_TCP_ABORT_ON_DATA:
+		return SK_RST_REASON_TCP_ABORT_ON_DATA;
 	default:
 		/* If we don't have our own corresponding reason */
 		return SK_RST_REASON_NOT_SPECIFIED;
-- 
cgit v1.2.3


From 22a32557758a7100e46dfa8f383a401125e60b16 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Fri, 10 May 2024 20:25:01 +0800
Subject: tcp: rstreason: handle timewait cases in the receive path

There are two possible cases where TCP layer can send an RST. Since they
happen in the same place, I think using one independent reason is enough
to identify this special situation.

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Link: https://lore.kernel.org/r/20240510122502.27850-5-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rstreason.h | 5 +++++
 net/ipv4/tcp_ipv4.c     | 2 +-
 net/ipv6/tcp_ipv6.c     | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index fc1b99702771..7ae5bb55559b 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -15,6 +15,7 @@
 	FN(TCP_FLAGS)			\
 	FN(TCP_OLD_ACK)			\
 	FN(TCP_ABORT_ON_DATA)		\
+	FN(TCP_TIMEWAIT_SOCKET)		\
 	FN(MPTCP_RST_EUNSPEC)		\
 	FN(MPTCP_RST_EMPTCP)		\
 	FN(MPTCP_RST_ERESOURCE)		\
@@ -72,6 +73,10 @@ enum sk_rst_reason {
 	 */
 	SK_RST_REASON_TCP_ABORT_ON_DATA,
 
+	/* Here start with the independent reasons */
+	/** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */
+	SK_RST_REASON_TCP_TIMEWAIT_SOCKET,
+
 	/* Copy from include/uapi/linux/mptcp.h.
 	 * These reset fields will not be changed since they adhere to
 	 * RFC 8684. So do not touch them. I'm going to list each definition
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 108a438dc247..30ef0c8f5e92 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2427,7 +2427,7 @@ do_time_wait:
 		tcp_v4_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v4_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason));
+		tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
 	case TCP_TW_SUCCESS:;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7f6693e794bd..4c3605485b68 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1999,7 +1999,7 @@ do_time_wait:
 		tcp_v6_timewait_ack(sk, skb);
 		break;
 	case TCP_TW_RST:
-		tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(drop_reason));
+		tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
 		inet_twsk_deschedule_put(inet_twsk(sk));
 		goto discard_it;
 	case TCP_TW_SUCCESS:
-- 
cgit v1.2.3


From 11f46ea9814d2f0a3c7a5bc749d7619e47251f75 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Fri, 10 May 2024 20:25:02 +0800
Subject: tcp: rstreason: fully support in tcp_check_req()

We're going to send an RST due to invalid syn packet which is already
checked whether 1) it is in sequence, 2) it is a retransmitted skb.

As RFC 793 says, if the state of socket is not CLOSED/LISTEN/SYN-SENT,
then we should send an RST when receiving bad syn packet:
"fourth, check the SYN bit,...If the SYN is in the window it is an
error, send a reset"

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Link: https://lore.kernel.org/r/20240510122502.27850-6-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/rstreason.h  | 8 ++++++++
 net/ipv4/tcp_minisocks.c | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/rstreason.h b/include/net/rstreason.h
index 7ae5bb55559b..2575c85d7f7a 100644
--- a/include/net/rstreason.h
+++ b/include/net/rstreason.h
@@ -16,6 +16,7 @@
 	FN(TCP_OLD_ACK)			\
 	FN(TCP_ABORT_ON_DATA)		\
 	FN(TCP_TIMEWAIT_SOCKET)		\
+	FN(INVALID_SYN)			\
 	FN(MPTCP_RST_EUNSPEC)		\
 	FN(MPTCP_RST_EMPTCP)		\
 	FN(MPTCP_RST_ERESOURCE)		\
@@ -76,6 +77,13 @@ enum sk_rst_reason {
 	/* Here start with the independent reasons */
 	/** @SK_RST_REASON_TCP_TIMEWAIT_SOCKET: happen on the timewait socket */
 	SK_RST_REASON_TCP_TIMEWAIT_SOCKET,
+	/**
+	 * @SK_RST_REASON_INVALID_SYN: receive bad syn packet
+	 * RFC 793 says if the state is not CLOSED/LISTEN/SYN-SENT then
+	 * "fourth, check the SYN bit,...If the SYN is in the window it is
+	 * an error, send a reset"
+	 */
+	SK_RST_REASON_INVALID_SYN,
 
 	/* Copy from include/uapi/linux/mptcp.h.
 	 * These reset fields will not be changed since they adhere to
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 7d543569a180..b93619b2384b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -879,7 +879,7 @@ embryonic_reset:
 		 * avoid becoming vulnerable to outside attack aiming at
 		 * resetting legit local connections.
 		 */
-		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED);
+		req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
 	} else if (fastopen) { /* received a valid RST pkt */
 		reqsk_fastopen_remove(sk, req, true);
 		tcp_reset(sk, skb);
-- 
cgit v1.2.3


From 73c900aa3660dae2050af456d1031d098ab4cb1d Mon Sep 17 00:00:00 2001
From: Gregory Detal <gregory.detal@gmail.com>
Date: Mon, 13 May 2024 18:13:28 -0700
Subject: mptcp: add net.mptcp.available_schedulers

The sysctl lists the available schedulers that can be set using
net.mptcp.scheduler similarly to net.ipv4.tcp_available_congestion_control.

Signed-off-by: Gregory Detal <gregory.detal@gmail.com>
Reviewed-by: Mat Martineau <martineau@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Mat Martineau <martineau@kernel.org>
Link: https://lore.kernel.org/r/20240514011335.176158-5-martineau@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/mptcp.h  |  3 +++
 net/mptcp/ctrl.c     | 27 ++++++++++++++++++++++++++-
 net/mptcp/protocol.h |  1 +
 net/mptcp/sched.c    | 22 ++++++++++++++++++++++
 4 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index fb996124b3d5..0bc4ab03f487 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -97,6 +97,9 @@ struct mptcp_out_options {
 };
 
 #define MPTCP_SCHED_NAME_MAX	16
+#define MPTCP_SCHED_MAX		128
+#define MPTCP_SCHED_BUF_MAX	(MPTCP_SCHED_NAME_MAX * MPTCP_SCHED_MAX)
+
 #define MPTCP_SUBFLOWS_MAX	8
 
 struct mptcp_sched_data {
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index 49abf80b3fad..542555ba474c 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -133,6 +133,24 @@ static int proc_scheduler(struct ctl_table *ctl, int write,
 	return ret;
 }
 
+static int proc_available_schedulers(struct ctl_table *ctl,
+				     int write, void *buffer,
+				     size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = MPTCP_SCHED_BUF_MAX, };
+	int ret;
+
+	tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+	if (!tbl.data)
+		return -ENOMEM;
+
+	mptcp_get_available_schedulers(tbl.data, MPTCP_SCHED_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	kfree(tbl.data);
+
+	return ret;
+}
+
 static struct ctl_table mptcp_sysctl_table[] = {
 	{
 		.procname = "enabled",
@@ -187,6 +205,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
 		.mode = 0644,
 		.proc_handler = proc_scheduler,
 	},
+	{
+		.procname = "available_schedulers",
+		.maxlen	= MPTCP_SCHED_BUF_MAX,
+		.mode = 0644,
+		.proc_handler = proc_available_schedulers,
+	},
 	{
 		.procname = "close_timeout",
 		.maxlen = sizeof(unsigned int),
@@ -214,7 +238,8 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
 	table[4].data = &pernet->stale_loss_cnt;
 	table[5].data = &pernet->pm_type;
 	table[6].data = &pernet->scheduler;
-	table[7].data = &pernet->close_timeout;
+	/* table[7] is for available_schedulers which is read-only info */
+	table[8].data = &pernet->close_timeout;
 
 	hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
 				     ARRAY_SIZE(mptcp_sysctl_table));
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 4dcce3641d1d..39fc47e6b88a 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -686,6 +686,7 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net);
 unsigned int mptcp_close_timeout(const struct sock *sk);
 int mptcp_get_pm_type(const struct net *net);
 const char *mptcp_get_scheduler(const struct net *net);
+void mptcp_get_available_schedulers(char *buf, size_t maxlen);
 void __mptcp_subflow_fully_established(struct mptcp_sock *msk,
 				       struct mptcp_subflow_context *subflow,
 				       const struct mptcp_options_received *mp_opt);
diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c
index 4ab0693c069c..4a7fd0508ad2 100644
--- a/net/mptcp/sched.c
+++ b/net/mptcp/sched.c
@@ -51,6 +51,28 @@ struct mptcp_sched_ops *mptcp_sched_find(const char *name)
 	return ret;
 }
 
+/* Build string with list of available scheduler values.
+ * Similar to tcp_get_available_congestion_control()
+ */
+void mptcp_get_available_schedulers(char *buf, size_t maxlen)
+{
+	struct mptcp_sched_ops *sched;
+	size_t offs = 0;
+
+	rcu_read_lock();
+	spin_lock(&mptcp_sched_list_lock);
+	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
+		offs += snprintf(buf + offs, maxlen - offs,
+				 "%s%s",
+				 offs == 0 ? "" : " ", sched->name);
+
+		if (WARN_ON_ONCE(offs >= maxlen))
+			break;
+	}
+	spin_unlock(&mptcp_sched_list_lock);
+	rcu_read_unlock();
+}
+
 int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
 {
 	if (!sched->get_subflow)
-- 
cgit v1.2.3


From 36ac9e7f2e5786bd37c5cd91132e1f39c29b8197 Mon Sep 17 00:00:00 2001
From: Xiaolei Wang <xiaolei.wang@windriver.com>
Date: Mon, 13 May 2024 09:43:45 +0800
Subject: net: stmmac: move the EST lock to struct stmmac_priv

Reinitialize the whole EST structure would also reset the mutex
lock which is embedded in the EST structure, and then trigger
the following warning. To address this, move the lock to struct
stmmac_priv. We also need to reacquire the mutex lock when doing
this initialization.

DEBUG_LOCKS_WARN_ON(lock->magic != lock)
WARNING: CPU: 3 PID: 505 at kernel/locking/mutex.c:587 __mutex_lock+0xd84/0x1068
 Modules linked in:
 CPU: 3 PID: 505 Comm: tc Not tainted 6.9.0-rc6-00053-g0106679839f7-dirty #29
 Hardware name: NXP i.MX8MPlus EVK board (DT)
 pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
 pc : __mutex_lock+0xd84/0x1068
 lr : __mutex_lock+0xd84/0x1068
 sp : ffffffc0864e3570
 x29: ffffffc0864e3570 x28: ffffffc0817bdc78 x27: 0000000000000003
 x26: ffffff80c54f1808 x25: ffffff80c9164080 x24: ffffffc080d723ac
 x23: 0000000000000000 x22: 0000000000000002 x21: 0000000000000000
 x20: 0000000000000000 x19: ffffffc083bc3000 x18: ffffffffffffffff
 x17: ffffffc08117b080 x16: 0000000000000002 x15: ffffff80d2d40000
 x14: 00000000000002da x13: ffffff80d2d404b8 x12: ffffffc082b5a5c8
 x11: ffffffc082bca680 x10: ffffffc082bb2640 x9 : ffffffc082bb2698
 x8 : 0000000000017fe8 x7 : c0000000ffffefff x6 : 0000000000000001
 x5 : ffffff8178fe0d48 x4 : 0000000000000000 x3 : 0000000000000027
 x2 : ffffff8178fe0d50 x1 : 0000000000000000 x0 : 0000000000000000
 Call trace:
  __mutex_lock+0xd84/0x1068
  mutex_lock_nested+0x28/0x34
  tc_setup_taprio+0x118/0x68c
  stmmac_setup_tc+0x50/0xf0
  taprio_change+0x868/0xc9c

Fixes: b2aae654a479 ("net: stmmac: add mutex lock to protect est parameters")
Signed-off-by: Xiaolei Wang <xiaolei.wang@windriver.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Andrew Halaney <ahalaney@redhat.com>
Link: https://lore.kernel.org/r/20240513014346.1718740-2-xiaolei.wang@windriver.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h     |  2 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c |  8 ++++----
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c  | 18 ++++++++++--------
 include/linux/stmmac.h                           |  1 -
 4 files changed, 16 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index ed38099ca740..044e61a385e4 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -261,6 +261,8 @@ struct stmmac_priv {
 	struct stmmac_extra_stats xstats ____cacheline_aligned_in_smp;
 	struct stmmac_safety_stats sstats;
 	struct plat_stmmacenet_data *plat;
+	/* Protect est parameters */
+	struct mutex est_lock;
 	struct dma_features dma_cap;
 	struct stmmac_counters mmc;
 	int hw_cap_support;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index e04830a3a1fb..0c5aab6dd7a7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -70,11 +70,11 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	/* If EST is enabled, disabled it before adjust ptp time. */
 	if (priv->plat->est && priv->plat->est->enable) {
 		est_rst = true;
-		mutex_lock(&priv->plat->est->lock);
+		mutex_lock(&priv->est_lock);
 		priv->plat->est->enable = false;
 		stmmac_est_configure(priv, priv, priv->plat->est,
 				     priv->plat->clk_ptp_rate);
-		mutex_unlock(&priv->plat->est->lock);
+		mutex_unlock(&priv->est_lock);
 	}
 
 	write_lock_irqsave(&priv->ptp_lock, flags);
@@ -87,7 +87,7 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 		ktime_t current_time_ns, basetime;
 		u64 cycle_time;
 
-		mutex_lock(&priv->plat->est->lock);
+		mutex_lock(&priv->est_lock);
 		priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
 		current_time_ns = timespec64_to_ktime(current_time);
 		time.tv_nsec = priv->plat->est->btr_reserve[0];
@@ -104,7 +104,7 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 		priv->plat->est->enable = true;
 		ret = stmmac_est_configure(priv, priv, priv->plat->est,
 					   priv->plat->clk_ptp_rate);
-		mutex_unlock(&priv->plat->est->lock);
+		mutex_unlock(&priv->est_lock);
 		if (ret)
 			netdev_err(priv->dev, "failed to configure EST\n");
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index cce00719937d..620c16e9be3a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -1004,17 +1004,19 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 		if (!plat->est)
 			return -ENOMEM;
 
-		mutex_init(&priv->plat->est->lock);
+		mutex_init(&priv->est_lock);
 	} else {
+		mutex_lock(&priv->est_lock);
 		memset(plat->est, 0, sizeof(*plat->est));
+		mutex_unlock(&priv->est_lock);
 	}
 
 	size = qopt->num_entries;
 
-	mutex_lock(&priv->plat->est->lock);
+	mutex_lock(&priv->est_lock);
 	priv->plat->est->gcl_size = size;
 	priv->plat->est->enable = qopt->cmd == TAPRIO_CMD_REPLACE;
-	mutex_unlock(&priv->plat->est->lock);
+	mutex_unlock(&priv->est_lock);
 
 	for (i = 0; i < size; i++) {
 		s64 delta_ns = qopt->entries[i].interval;
@@ -1045,7 +1047,7 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 		priv->plat->est->gcl[i] = delta_ns | (gates << wid);
 	}
 
-	mutex_lock(&priv->plat->est->lock);
+	mutex_lock(&priv->est_lock);
 	/* Adjust for real system time */
 	priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
 	current_time_ns = timespec64_to_ktime(current_time);
@@ -1068,7 +1070,7 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 	tc_taprio_map_maxsdu_txq(priv, qopt);
 
 	if (fpe && !priv->dma_cap.fpesel) {
-		mutex_unlock(&priv->plat->est->lock);
+		mutex_unlock(&priv->est_lock);
 		return -EOPNOTSUPP;
 	}
 
@@ -1079,7 +1081,7 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 
 	ret = stmmac_est_configure(priv, priv, priv->plat->est,
 				   priv->plat->clk_ptp_rate);
-	mutex_unlock(&priv->plat->est->lock);
+	mutex_unlock(&priv->est_lock);
 	if (ret) {
 		netdev_err(priv->dev, "failed to configure EST\n");
 		goto disable;
@@ -1096,7 +1098,7 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 
 disable:
 	if (priv->plat->est) {
-		mutex_lock(&priv->plat->est->lock);
+		mutex_lock(&priv->est_lock);
 		priv->plat->est->enable = false;
 		stmmac_est_configure(priv, priv, priv->plat->est,
 				     priv->plat->clk_ptp_rate);
@@ -1105,7 +1107,7 @@ disable:
 			priv->xstats.max_sdu_txq_drop[i] = 0;
 			priv->xstats.mtl_est_txq_hlbf[i] = 0;
 		}
-		mutex_unlock(&priv->plat->est->lock);
+		mutex_unlock(&priv->est_lock);
 	}
 
 	priv->plat->fpe_cfg->enable = false;
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index 4a24a246c617..c4caed4eef52 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -117,7 +117,6 @@ struct stmmac_axi {
 
 #define EST_GCL		1024
 struct stmmac_est {
-	struct mutex lock;
 	int enable;
 	u32 btr_reserve[2];
 	u32 btr_offset[2];
-- 
cgit v1.2.3


From bd17382ac36ed97848f1712714488e84c93605ca Mon Sep 17 00:00:00 2001
From: Xiaolei Wang <xiaolei.wang@windriver.com>
Date: Mon, 13 May 2024 09:43:46 +0800
Subject: net: stmmac: move the EST structure to struct stmmac_priv

Move the EST structure to struct stmmac_priv, because the
EST configs don't look like platform config, but EST is
enabled in runtime with the settings retrieved for the TC
TAPRIO feature also in runtime. So it's better to have the
EST-data preserved in the driver private data instead of
the platform data storage.

Signed-off-by: Xiaolei Wang <xiaolei.wang@windriver.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
Reviewed-by: Andrew Halaney <ahalaney@redhat.com>
Link: https://lore.kernel.org/r/20240513014346.1718740-3-xiaolei.wang@windriver.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac.h      | 15 +++++++++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 18 +++++-----
 drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c  | 22 ++++++-------
 drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c   | 40 +++++++++++------------
 include/linux/stmmac.h                            | 15 ---------
 5 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
index 044e61a385e4..b23b920eedb1 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
@@ -221,6 +221,20 @@ struct stmmac_dma_conf {
 	unsigned int dma_tx_size;
 };
 
+#define EST_GCL         1024
+struct stmmac_est {
+	int enable;
+	u32 btr_reserve[2];
+	u32 btr_offset[2];
+	u32 btr[2];
+	u32 ctr[2];
+	u32 ter;
+	u32 gcl_unaligned[EST_GCL];
+	u32 gcl[EST_GCL];
+	u32 gcl_size;
+	u32 max_sdu[MTL_MAX_TX_QUEUES];
+};
+
 struct stmmac_priv {
 	/* Frequently used values are kept adjacent for cache effect */
 	u32 tx_coal_frames[MTL_MAX_TX_QUEUES];
@@ -263,6 +277,7 @@ struct stmmac_priv {
 	struct plat_stmmacenet_data *plat;
 	/* Protect est parameters */
 	struct mutex est_lock;
+	struct stmmac_est *est;
 	struct dma_features dma_cap;
 	struct stmmac_counters mmc;
 	int hw_cap_support;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ef285b3c56ab..2e9a2da605f6 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2498,9 +2498,9 @@ static bool stmmac_xdp_xmit_zc(struct stmmac_priv *priv, u32 queue, u32 budget)
 		if (!xsk_tx_peek_desc(pool, &xdp_desc))
 			break;
 
-		if (priv->plat->est && priv->plat->est->enable &&
-		    priv->plat->est->max_sdu[queue] &&
-		    xdp_desc.len > priv->plat->est->max_sdu[queue]) {
+		if (priv->est && priv->est->enable &&
+		    priv->est->max_sdu[queue] &&
+		    xdp_desc.len > priv->est->max_sdu[queue]) {
 			priv->xstats.max_sdu_txq_drop[queue]++;
 			continue;
 		}
@@ -4538,9 +4538,9 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			return stmmac_tso_xmit(skb, dev);
 	}
 
-	if (priv->plat->est && priv->plat->est->enable &&
-	    priv->plat->est->max_sdu[queue] &&
-	    skb->len > priv->plat->est->max_sdu[queue]){
+	if (priv->est && priv->est->enable &&
+	    priv->est->max_sdu[queue] &&
+	    skb->len > priv->est->max_sdu[queue]){
 		priv->xstats.max_sdu_txq_drop[queue]++;
 		goto max_sdu_err;
 	}
@@ -4919,9 +4919,9 @@ static int stmmac_xdp_xmit_xdpf(struct stmmac_priv *priv, int queue,
 	if (stmmac_tx_avail(priv, queue) < STMMAC_TX_THRESH(priv))
 		return STMMAC_XDP_CONSUMED;
 
-	if (priv->plat->est && priv->plat->est->enable &&
-	    priv->plat->est->max_sdu[queue] &&
-	    xdpf->len > priv->plat->est->max_sdu[queue]) {
+	if (priv->est && priv->est->enable &&
+	    priv->est->max_sdu[queue] &&
+	    xdpf->len > priv->est->max_sdu[queue]) {
 		priv->xstats.max_sdu_txq_drop[queue]++;
 		return STMMAC_XDP_CONSUMED;
 	}
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 0c5aab6dd7a7..a6b1de9a251d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -68,11 +68,11 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 	nsec = reminder;
 
 	/* If EST is enabled, disabled it before adjust ptp time. */
-	if (priv->plat->est && priv->plat->est->enable) {
+	if (priv->est && priv->est->enable) {
 		est_rst = true;
 		mutex_lock(&priv->est_lock);
-		priv->plat->est->enable = false;
-		stmmac_est_configure(priv, priv, priv->plat->est,
+		priv->est->enable = false;
+		stmmac_est_configure(priv, priv, priv->est,
 				     priv->plat->clk_ptp_rate);
 		mutex_unlock(&priv->est_lock);
 	}
@@ -90,19 +90,19 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta)
 		mutex_lock(&priv->est_lock);
 		priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, &current_time);
 		current_time_ns = timespec64_to_ktime(current_time);
-		time.tv_nsec = priv->plat->est->btr_reserve[0];
-		time.tv_sec = priv->plat->est->btr_reserve[1];
+		time.tv_nsec = priv->est->btr_reserve[0];
+		time.tv_sec = priv->est->btr_reserve[1];
 		basetime = timespec64_to_ktime(time);
-		cycle_time = (u64)priv->plat->est->ctr[1] * NSEC_PER_SEC +
-			     priv->plat->est->ctr[0];
+		cycle_time = (u64)priv->est->ctr[1] * NSEC_PER_SEC +
+			     priv->est->ctr[0];
 		time = stmmac_calc_tas_basetime(basetime,
 						current_time_ns,
 						cycle_time);
 
-		priv->plat->est->btr[0] = (u32)time.tv_nsec;
-		priv->plat->est->btr[1] = (u32)time.tv_sec;
-		priv->plat->est->enable = true;
-		ret = stmmac_est_configure(priv, priv, priv->plat->est,
+		priv->est->btr[0] = (u32)time.tv_nsec;
+		priv->est->btr[1] = (u32)time.tv_sec;
+		priv->est->enable = true;
+		ret = stmmac_est_configure(priv, priv, priv->est,
 					   priv->plat->clk_ptp_rate);
 		mutex_unlock(&priv->est_lock);
 		if (ret)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
index 620c16e9be3a..222540b55480 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c
@@ -918,7 +918,6 @@ struct timespec64 stmmac_calc_tas_basetime(ktime_t old_base_time,
 static void tc_taprio_map_maxsdu_txq(struct stmmac_priv *priv,
 				     struct tc_taprio_qopt_offload *qopt)
 {
-	struct plat_stmmacenet_data *plat = priv->plat;
 	u32 num_tc = qopt->mqprio.qopt.num_tc;
 	u32 offset, count, i, j;
 
@@ -933,7 +932,7 @@ static void tc_taprio_map_maxsdu_txq(struct stmmac_priv *priv,
 		count = qopt->mqprio.qopt.count[i];
 
 		for (j = offset; j < offset + count; j++)
-			plat->est->max_sdu[j] = qopt->max_sdu[i] + ETH_HLEN - ETH_TLEN;
+			priv->est->max_sdu[j] = qopt->max_sdu[i] + ETH_HLEN - ETH_TLEN;
 	}
 }
 
@@ -941,7 +940,6 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 			       struct tc_taprio_qopt_offload *qopt)
 {
 	u32 size, wid = priv->dma_cap.estwid, dep = priv->dma_cap.estdep;
-	struct plat_stmmacenet_data *plat = priv->plat;
 	struct timespec64 time, current_time, qopt_time;
 	ktime_t current_time_ns;
 	bool fpe = false;
@@ -998,24 +996,24 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 	if (qopt->cycle_time_extension >= BIT(wid + 7))
 		return -ERANGE;
 
-	if (!plat->est) {
-		plat->est = devm_kzalloc(priv->device, sizeof(*plat->est),
+	if (!priv->est) {
+		priv->est = devm_kzalloc(priv->device, sizeof(*priv->est),
 					 GFP_KERNEL);
-		if (!plat->est)
+		if (!priv->est)
 			return -ENOMEM;
 
 		mutex_init(&priv->est_lock);
 	} else {
 		mutex_lock(&priv->est_lock);
-		memset(plat->est, 0, sizeof(*plat->est));
+		memset(priv->est, 0, sizeof(*priv->est));
 		mutex_unlock(&priv->est_lock);
 	}
 
 	size = qopt->num_entries;
 
 	mutex_lock(&priv->est_lock);
-	priv->plat->est->gcl_size = size;
-	priv->plat->est->enable = qopt->cmd == TAPRIO_CMD_REPLACE;
+	priv->est->gcl_size = size;
+	priv->est->enable = qopt->cmd == TAPRIO_CMD_REPLACE;
 	mutex_unlock(&priv->est_lock);
 
 	for (i = 0; i < size; i++) {
@@ -1044,7 +1042,7 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 			return -EOPNOTSUPP;
 		}
 
-		priv->plat->est->gcl[i] = delta_ns | (gates << wid);
+		priv->est->gcl[i] = delta_ns | (gates << wid);
 	}
 
 	mutex_lock(&priv->est_lock);
@@ -1054,18 +1052,18 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 	time = stmmac_calc_tas_basetime(qopt->base_time, current_time_ns,
 					qopt->cycle_time);
 
-	priv->plat->est->btr[0] = (u32)time.tv_nsec;
-	priv->plat->est->btr[1] = (u32)time.tv_sec;
+	priv->est->btr[0] = (u32)time.tv_nsec;
+	priv->est->btr[1] = (u32)time.tv_sec;
 
 	qopt_time = ktime_to_timespec64(qopt->base_time);
-	priv->plat->est->btr_reserve[0] = (u32)qopt_time.tv_nsec;
-	priv->plat->est->btr_reserve[1] = (u32)qopt_time.tv_sec;
+	priv->est->btr_reserve[0] = (u32)qopt_time.tv_nsec;
+	priv->est->btr_reserve[1] = (u32)qopt_time.tv_sec;
 
 	ctr = qopt->cycle_time;
-	priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
-	priv->plat->est->ctr[1] = (u32)ctr;
+	priv->est->ctr[0] = do_div(ctr, NSEC_PER_SEC);
+	priv->est->ctr[1] = (u32)ctr;
 
-	priv->plat->est->ter = qopt->cycle_time_extension;
+	priv->est->ter = qopt->cycle_time_extension;
 
 	tc_taprio_map_maxsdu_txq(priv, qopt);
 
@@ -1079,7 +1077,7 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 	 */
 	priv->plat->fpe_cfg->enable = fpe;
 
-	ret = stmmac_est_configure(priv, priv, priv->plat->est,
+	ret = stmmac_est_configure(priv, priv, priv->est,
 				   priv->plat->clk_ptp_rate);
 	mutex_unlock(&priv->est_lock);
 	if (ret) {
@@ -1097,10 +1095,10 @@ static int tc_taprio_configure(struct stmmac_priv *priv,
 	return 0;
 
 disable:
-	if (priv->plat->est) {
+	if (priv->est) {
 		mutex_lock(&priv->est_lock);
-		priv->plat->est->enable = false;
-		stmmac_est_configure(priv, priv, priv->plat->est,
+		priv->est->enable = false;
+		stmmac_est_configure(priv, priv, priv->est,
 				     priv->plat->clk_ptp_rate);
 		/* Reset taprio status */
 		for (i = 0; i < priv->plat->tx_queues_to_use; i++) {
diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h
index c4caed4eef52..f92c195c76ed 100644
--- a/include/linux/stmmac.h
+++ b/include/linux/stmmac.h
@@ -115,20 +115,6 @@ struct stmmac_axi {
 	bool axi_rb;
 };
 
-#define EST_GCL		1024
-struct stmmac_est {
-	int enable;
-	u32 btr_reserve[2];
-	u32 btr_offset[2];
-	u32 btr[2];
-	u32 ctr[2];
-	u32 ter;
-	u32 gcl_unaligned[EST_GCL];
-	u32 gcl[EST_GCL];
-	u32 gcl_size;
-	u32 max_sdu[MTL_MAX_TX_QUEUES];
-};
-
 struct stmmac_rxq_cfg {
 	u8 mode_to_use;
 	u32 chan;
@@ -245,7 +231,6 @@ struct plat_stmmacenet_data {
 	struct fwnode_handle *port_node;
 	struct device_node *mdio_node;
 	struct stmmac_dma_cfg *dma_cfg;
-	struct stmmac_est *est;
 	struct stmmac_fpe_cfg *fpe_cfg;
 	struct stmmac_safety_feature_cfg *safety_feat_cfg;
 	int clk_csr;
-- 
cgit v1.2.3


From 5c1672705a1a2389f5ad78e0fea6f08ed32d6f18 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 13 May 2024 08:41:55 -0700
Subject: net: revert partially applied PHY topology series

The series is causing issues with PHY drivers built as modules.
Since it was only partially applied and the merge window has
opened let's revert and try again for v6.11.

Revert 6916e461e793 ("net: phy: Introduce ethernet link topology representation")
Revert 0ec5ed6c130e ("net: sfp: pass the phy_device when disconnecting an sfp module's PHY")
Revert e75e4e074c44 ("net: phy: add helpers to handle sfp phy connect/disconnect")
Revert fdd353965b52 ("net: sfp: Add helper to return the SFP bus name")
Revert 841942bc6212 ("net: ethtool: Allow passing a phy index for some commands")

Link: https://lore.kernel.org/all/171242462917.4000.9759453824684907063.git-patchwork-notify@kernel.org/
Link: https://lore.kernel.org/all/20240507102822.2023826-1-maxime.chevallier@bootlin.com/
Link: https://lore.kernel.org/r/20240513154156.104281-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 Documentation/networking/ethtool-netlink.rst |   7 --
 MAINTAINERS                                  |   2 -
 drivers/net/phy/Makefile                     |   2 +-
 drivers/net/phy/marvell-88x2222.c            |   2 -
 drivers/net/phy/marvell.c                    |   2 -
 drivers/net/phy/marvell10g.c                 |   2 -
 drivers/net/phy/phy_device.c                 |  55 --------------
 drivers/net/phy/phy_link_topology.c          | 105 ---------------------------
 drivers/net/phy/phylink.c                    |   3 +-
 drivers/net/phy/qcom/at803x.c                |   2 -
 drivers/net/phy/qcom/qca807x.c               |   2 -
 drivers/net/phy/sfp-bus.c                    |  15 +---
 include/linux/netdevice.h                    |   4 +-
 include/linux/phy.h                          |   6 --
 include/linux/phy_link_topology.h            |  72 ------------------
 include/linux/phy_link_topology_core.h       |  25 -------
 include/linux/sfp.h                          |   8 +-
 include/uapi/linux/ethtool.h                 |  16 ----
 include/uapi/linux/ethtool_netlink.h         |   1 -
 net/core/dev.c                               |   9 ---
 net/ethtool/netlink.c                        |  48 +-----------
 net/ethtool/netlink.h                        |   5 --
 22 files changed, 8 insertions(+), 385 deletions(-)
 delete mode 100644 drivers/net/phy/phy_link_topology.c
 delete mode 100644 include/linux/phy_link_topology.h
 delete mode 100644 include/linux/phy_link_topology_core.h

(limited to 'include')

diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst
index 8bc71f249448..160bfb0ae8ba 100644
--- a/Documentation/networking/ethtool-netlink.rst
+++ b/Documentation/networking/ethtool-netlink.rst
@@ -57,7 +57,6 @@ Structure of this header is
   ``ETHTOOL_A_HEADER_DEV_INDEX``  u32     device ifindex
   ``ETHTOOL_A_HEADER_DEV_NAME``   string  device name
   ``ETHTOOL_A_HEADER_FLAGS``      u32     flags common for all requests
-  ``ETHTOOL_A_HEADER_PHY_INDEX``  u32     phy device index
   ==============================  ======  =============================
 
 ``ETHTOOL_A_HEADER_DEV_INDEX`` and ``ETHTOOL_A_HEADER_DEV_NAME`` identify the
@@ -82,12 +81,6 @@ the behaviour is backward compatible, i.e. requests from old clients not aware
 of the flag should be interpreted the way the client expects. A client must
 not set flags it does not understand.
 
-``ETHTOOL_A_HEADER_PHY_INDEX`` identifies the Ethernet PHY the message relates to.
-As there are numerous commands that are related to PHY configuration, and because
-there may be more than one PHY on the link, the PHY index can be passed in the
-request for the commands that needs it. It is, however, not mandatory, and if it
-is not passed for commands that target a PHY, the net_device.phydev pointer
-is used.
 
 Bit sets
 ========
diff --git a/MAINTAINERS b/MAINTAINERS
index 40eb26e6c60c..b4ed6480e919 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8027,8 +8027,6 @@ F:	include/linux/mii.h
 F:	include/linux/of_net.h
 F:	include/linux/phy.h
 F:	include/linux/phy_fixed.h
-F:	include/linux/phy_link_topology.h
-F:	include/linux/phy_link_topology_core.h
 F:	include/linux/phylib_stubs.h
 F:	include/linux/platform_data/mdio-bcm-unimac.h
 F:	include/linux/platform_data/mdio-gpio.h
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index 1d8be374915f..202ed7f450da 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -2,7 +2,7 @@
 # Makefile for Linux PHY drivers
 
 libphy-y			:= phy.o phy-c45.o phy-core.o phy_device.o \
-				   linkmode.o phy_link_topology.o
+				   linkmode.o
 mdio-bus-y			+= mdio_bus.o mdio_device.o
 
 ifdef CONFIG_MDIO_DEVICE
diff --git a/drivers/net/phy/marvell-88x2222.c b/drivers/net/phy/marvell-88x2222.c
index 0b777cdd7078..b88398e6872b 100644
--- a/drivers/net/phy/marvell-88x2222.c
+++ b/drivers/net/phy/marvell-88x2222.c
@@ -553,8 +553,6 @@ static const struct sfp_upstream_ops sfp_phy_ops = {
 	.link_down = mv2222_sfp_link_down,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int mv2222_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c
index 9964bf3dea2f..b89fbffa6a93 100644
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -3613,8 +3613,6 @@ static const struct sfp_upstream_ops m88e1510_sfp_ops = {
 	.module_remove = m88e1510_sfp_remove,
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int m88e1510_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c
index 6642eb642d4b..ad43e280930c 100644
--- a/drivers/net/phy/marvell10g.c
+++ b/drivers/net/phy/marvell10g.c
@@ -503,8 +503,6 @@ static int mv3310_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
 static const struct sfp_upstream_ops mv3310_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 	.module_insert = mv3310_sfp_insert,
 };
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 616bd7ba46cb..6c6ec9475709 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -29,7 +29,6 @@
 #include <linux/phy.h>
 #include <linux/phylib_stubs.h>
 #include <linux/phy_led_triggers.h>
-#include <linux/phy_link_topology.h>
 #include <linux/pse-pd/pse.h>
 #include <linux/property.h>
 #include <linux/rtnetlink.h>
@@ -277,14 +276,6 @@ static void phy_mdio_device_remove(struct mdio_device *mdiodev)
 
 static struct phy_driver genphy_driver;
 
-static struct phy_link_topology *phy_get_link_topology(struct phy_device *phydev)
-{
-	if (phydev->attached_dev)
-		return phydev->attached_dev->link_topo;
-
-	return NULL;
-}
-
 static LIST_HEAD(phy_fixup_list);
 static DEFINE_MUTEX(phy_fixup_lock);
 
@@ -1378,46 +1369,6 @@ phy_standalone_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(phy_standalone);
 
-/**
- * phy_sfp_connect_phy - Connect the SFP module's PHY to the upstream PHY
- * @upstream: pointer to the upstream phy device
- * @phy: pointer to the SFP module's phy device
- *
- * This helper allows keeping track of PHY devices on the link. It adds the
- * SFP module's phy to the phy namespace of the upstream phy
- */
-int phy_sfp_connect_phy(void *upstream, struct phy_device *phy)
-{
-	struct phy_device *phydev = upstream;
-	struct phy_link_topology *topo = phy_get_link_topology(phydev);
-
-	if (topo)
-		return phy_link_topo_add_phy(topo, phy, PHY_UPSTREAM_PHY, phydev);
-
-	return 0;
-}
-EXPORT_SYMBOL(phy_sfp_connect_phy);
-
-/**
- * phy_sfp_disconnect_phy - Disconnect the SFP module's PHY from the upstream PHY
- * @upstream: pointer to the upstream phy device
- * @phy: pointer to the SFP module's phy device
- *
- * This helper allows keeping track of PHY devices on the link. It removes the
- * SFP module's phy to the phy namespace of the upstream phy. As the module phy
- * will be destroyed, re-inserting the same module will add a new phy with a
- * new index.
- */
-void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy)
-{
-	struct phy_device *phydev = upstream;
-	struct phy_link_topology *topo = phy_get_link_topology(phydev);
-
-	if (topo)
-		phy_link_topo_del_phy(topo, phy);
-}
-EXPORT_SYMBOL(phy_sfp_disconnect_phy);
-
 /**
  * phy_sfp_attach - attach the SFP bus to the PHY upstream network device
  * @upstream: pointer to the phy device
@@ -1560,11 +1511,6 @@ int phy_attach_direct(struct net_device *dev, struct phy_device *phydev,
 
 		if (phydev->sfp_bus_attached)
 			dev->sfp_bus = phydev->sfp_bus;
-
-		err = phy_link_topo_add_phy(dev->link_topo, phydev,
-					    PHY_UPSTREAM_MAC, dev);
-		if (err)
-			goto error;
 	}
 
 	/* Some Ethernet drivers try to connect to a PHY device before
@@ -1992,7 +1938,6 @@ void phy_detach(struct phy_device *phydev)
 	if (dev) {
 		phydev->attached_dev->phydev = NULL;
 		phydev->attached_dev = NULL;
-		phy_link_topo_del_phy(dev->link_topo, phydev);
 	}
 	phydev->phylink = NULL;
 
diff --git a/drivers/net/phy/phy_link_topology.c b/drivers/net/phy/phy_link_topology.c
deleted file mode 100644
index 985941c5c558..000000000000
--- a/drivers/net/phy/phy_link_topology.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0+
-/*
- * Infrastructure to handle all PHY devices connected to a given netdev,
- * either directly or indirectly attached.
- *
- * Copyright (c) 2023 Maxime Chevallier<maxime.chevallier@bootlin.com>
- */
-
-#include <linux/phy_link_topology.h>
-#include <linux/netdevice.h>
-#include <linux/phy.h>
-#include <linux/rtnetlink.h>
-#include <linux/xarray.h>
-
-struct phy_link_topology *phy_link_topo_create(struct net_device *dev)
-{
-	struct phy_link_topology *topo;
-
-	topo = kzalloc(sizeof(*topo), GFP_KERNEL);
-	if (!topo)
-		return ERR_PTR(-ENOMEM);
-
-	xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1);
-	topo->next_phy_index = 1;
-
-	return topo;
-}
-
-void phy_link_topo_destroy(struct phy_link_topology *topo)
-{
-	if (!topo)
-		return;
-
-	xa_destroy(&topo->phys);
-	kfree(topo);
-}
-
-int phy_link_topo_add_phy(struct phy_link_topology *topo,
-			  struct phy_device *phy,
-			  enum phy_upstream upt, void *upstream)
-{
-	struct phy_device_node *pdn;
-	int ret;
-
-	pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
-	if (!pdn)
-		return -ENOMEM;
-
-	pdn->phy = phy;
-	switch (upt) {
-	case PHY_UPSTREAM_MAC:
-		pdn->upstream.netdev = (struct net_device *)upstream;
-		if (phy_on_sfp(phy))
-			pdn->parent_sfp_bus = pdn->upstream.netdev->sfp_bus;
-		break;
-	case PHY_UPSTREAM_PHY:
-		pdn->upstream.phydev = (struct phy_device *)upstream;
-		if (phy_on_sfp(phy))
-			pdn->parent_sfp_bus = pdn->upstream.phydev->sfp_bus;
-		break;
-	default:
-		ret = -EINVAL;
-		goto err;
-	}
-	pdn->upstream_type = upt;
-
-	/* Attempt to re-use a previously allocated phy_index */
-	if (phy->phyindex) {
-		ret = xa_insert(&topo->phys, phy->phyindex, pdn, GFP_KERNEL);
-
-		/* Errors could be either -ENOMEM or -EBUSY. If the phy has an
-		 * index, and there's another entry at the same index, this is
-		 * unexpected and we still error-out
-		 */
-		if (ret)
-			goto err;
-		return 0;
-	}
-
-	ret = xa_alloc_cyclic(&topo->phys, &phy->phyindex, pdn, xa_limit_32b,
-			      &topo->next_phy_index, GFP_KERNEL);
-	if (ret)
-		goto err;
-
-	return 0;
-
-err:
-	kfree(pdn);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(phy_link_topo_add_phy);
-
-void phy_link_topo_del_phy(struct phy_link_topology *topo,
-			   struct phy_device *phy)
-{
-	struct phy_device_node *pdn = xa_erase(&topo->phys, phy->phyindex);
-
-	/* We delete the PHY from the topology, however we don't re-set the
-	 * phy->phyindex field. If the PHY isn't gone, we can re-assign it the
-	 * same index next time it's added back to the topology
-	 */
-
-	kfree(pdn);
-}
-EXPORT_SYMBOL_GPL(phy_link_topo_del_phy);
diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c
index b7e5c669dc8e..994471fad833 100644
--- a/drivers/net/phy/phylink.c
+++ b/drivers/net/phy/phylink.c
@@ -3411,8 +3411,7 @@ static int phylink_sfp_connect_phy(void *upstream, struct phy_device *phy)
 	return ret;
 }
 
-static void phylink_sfp_disconnect_phy(void *upstream,
-				       struct phy_device *phydev)
+static void phylink_sfp_disconnect_phy(void *upstream)
 {
 	phylink_disconnect_phy(upstream);
 }
diff --git a/drivers/net/phy/qcom/at803x.c b/drivers/net/phy/qcom/at803x.c
index 105602581a03..c8f83e5f78ab 100644
--- a/drivers/net/phy/qcom/at803x.c
+++ b/drivers/net/phy/qcom/at803x.c
@@ -770,8 +770,6 @@ static const struct sfp_upstream_ops at8031_sfp_ops = {
 	.attach = phy_sfp_attach,
 	.detach = phy_sfp_detach,
 	.module_insert = at8031_sfp_insert,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int at8031_parse_dt(struct phy_device *phydev)
diff --git a/drivers/net/phy/qcom/qca807x.c b/drivers/net/phy/qcom/qca807x.c
index 5eb0ab1cb70e..672c6929119a 100644
--- a/drivers/net/phy/qcom/qca807x.c
+++ b/drivers/net/phy/qcom/qca807x.c
@@ -699,8 +699,6 @@ static const struct sfp_upstream_ops qca807x_sfp_ops = {
 	.detach = phy_sfp_detach,
 	.module_insert = qca807x_sfp_insert,
 	.module_remove = qca807x_sfp_remove,
-	.connect_phy = phy_sfp_connect_phy,
-	.disconnect_phy = phy_sfp_disconnect_phy,
 };
 
 static int qca807x_probe(struct phy_device *phydev)
diff --git a/drivers/net/phy/sfp-bus.c b/drivers/net/phy/sfp-bus.c
index 37c85f1e6534..2f44fc51848f 100644
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -487,7 +487,7 @@ static void sfp_unregister_bus(struct sfp_bus *bus)
 			bus->socket_ops->stop(bus->sfp);
 		bus->socket_ops->detach(bus->sfp);
 		if (bus->phydev && ops && ops->disconnect_phy)
-			ops->disconnect_phy(bus->upstream, bus->phydev);
+			ops->disconnect_phy(bus->upstream);
 	}
 	bus->registered = false;
 }
@@ -743,7 +743,7 @@ void sfp_remove_phy(struct sfp_bus *bus)
 	const struct sfp_upstream_ops *ops = sfp_get_upstream_ops(bus);
 
 	if (ops && ops->disconnect_phy)
-		ops->disconnect_phy(bus->upstream, bus->phydev);
+		ops->disconnect_phy(bus->upstream);
 	bus->phydev = NULL;
 }
 EXPORT_SYMBOL_GPL(sfp_remove_phy);
@@ -860,14 +860,3 @@ void sfp_unregister_socket(struct sfp_bus *bus)
 	sfp_bus_put(bus);
 }
 EXPORT_SYMBOL_GPL(sfp_unregister_socket);
-
-const char *sfp_get_name(struct sfp_bus *bus)
-{
-	ASSERT_RTNL();
-
-	if (bus->sfp_dev)
-		return dev_name(bus->sfp_dev);
-
-	return NULL;
-}
-EXPORT_SYMBOL_GPL(sfp_get_name);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cf261fb89d73..d20c6c99eb88 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -40,6 +40,7 @@
 #include <net/dcbnl.h>
 #endif
 #include <net/netprio_cgroup.h>
+
 #include <linux/netdev_features.h>
 #include <linux/neighbour.h>
 #include <uapi/linux/netdevice.h>
@@ -51,7 +52,6 @@
 #include <net/net_trackers.h>
 #include <net/net_debug.h>
 #include <net/dropreason-core.h>
-#include <linux/phy_link_topology_core.h>
 
 struct netpoll_info;
 struct device;
@@ -1975,7 +1975,6 @@ enum netdev_reg_state {
  *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
  *
  *	@priomap:	XXX: need comments on this one
- *	@link_topo:	Physical link topology tracking attached PHYs
  *	@phydev:	Physical device may attach itself
  *			for hardware timestamping
  *	@sfp_bus:	attached &struct sfp_bus structure.
@@ -2368,7 +2367,6 @@ struct net_device {
 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
 	struct netprio_map __rcu *priomap;
 #endif
-	struct phy_link_topology	*link_topo;
 	struct phy_device	*phydev;
 	struct sfp_bus		*sfp_bus;
 	struct lock_class_key	*qdisc_tx_busylock;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 3ddfe7fe781a..e6e83304558e 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -550,9 +550,6 @@ struct macsec_ops;
  * @drv: Pointer to the driver for this PHY instance
  * @devlink: Create a link between phy dev and mac dev, if the external phy
  *           used by current mac interface is managed by another mac interface.
- * @phyindex: Unique id across the phy's parent tree of phys to address the PHY
- *	      from userspace, similar to ifindex. A zero index means the PHY
- *	      wasn't assigned an id yet.
  * @phy_id: UID for this device found during discovery
  * @c45_ids: 802.3-c45 Device Identifiers if is_c45.
  * @is_c45:  Set to true if this PHY uses clause 45 addressing.
@@ -653,7 +650,6 @@ struct phy_device {
 
 	struct device_link *devlink;
 
-	u32 phyindex;
 	u32 phy_id;
 
 	struct phy_c45_device_ids c45_ids;
@@ -1758,8 +1754,6 @@ int phy_suspend(struct phy_device *phydev);
 int phy_resume(struct phy_device *phydev);
 int __phy_resume(struct phy_device *phydev);
 int phy_loopback(struct phy_device *phydev, bool enable);
-int phy_sfp_connect_phy(void *upstream, struct phy_device *phy);
-void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy);
 void phy_sfp_attach(void *upstream, struct sfp_bus *bus);
 void phy_sfp_detach(void *upstream, struct sfp_bus *bus);
 int phy_sfp_probe(struct phy_device *phydev,
diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h
deleted file mode 100644
index 6b79feb607e7..000000000000
--- a/include/linux/phy_link_topology.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PHY device list allow maintaining a list of PHY devices that are
- * part of a netdevice's link topology. PHYs can for example be chained,
- * as is the case when using a PHY that exposes an SFP module, on which an
- * SFP transceiver that embeds a PHY is connected.
- *
- * This list can then be used by userspace to leverage individual PHY
- * capabilities.
- */
-#ifndef __PHY_LINK_TOPOLOGY_H
-#define __PHY_LINK_TOPOLOGY_H
-
-#include <linux/ethtool.h>
-#include <linux/phy_link_topology_core.h>
-
-struct xarray;
-struct phy_device;
-struct net_device;
-struct sfp_bus;
-
-struct phy_device_node {
-	enum phy_upstream upstream_type;
-
-	union {
-		struct net_device	*netdev;
-		struct phy_device	*phydev;
-	} upstream;
-
-	struct sfp_bus *parent_sfp_bus;
-
-	struct phy_device *phy;
-};
-
-struct phy_link_topology {
-	struct xarray phys;
-	u32 next_phy_index;
-};
-
-static inline struct phy_device *
-phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex)
-{
-	struct phy_device_node *pdn = xa_load(&topo->phys, phyindex);
-
-	if (pdn)
-		return pdn->phy;
-
-	return NULL;
-}
-
-#if IS_REACHABLE(CONFIG_PHYLIB)
-int phy_link_topo_add_phy(struct phy_link_topology *topo,
-			  struct phy_device *phy,
-			  enum phy_upstream upt, void *upstream);
-
-void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy);
-
-#else
-static inline int phy_link_topo_add_phy(struct phy_link_topology *topo,
-					struct phy_device *phy,
-					enum phy_upstream upt, void *upstream)
-{
-	return 0;
-}
-
-static inline void phy_link_topo_del_phy(struct phy_link_topology *topo,
-					 struct phy_device *phy)
-{
-}
-#endif
-
-#endif /* __PHY_LINK_TOPOLOGY_H */
diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h
deleted file mode 100644
index 0a6479055745..000000000000
--- a/include/linux/phy_link_topology_core.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PHY_LINK_TOPOLOGY_CORE_H
-#define __PHY_LINK_TOPOLOGY_CORE_H
-
-struct phy_link_topology;
-
-#if IS_REACHABLE(CONFIG_PHYLIB)
-
-struct phy_link_topology *phy_link_topo_create(struct net_device *dev);
-void phy_link_topo_destroy(struct phy_link_topology *topo);
-
-#else
-
-static inline struct phy_link_topology *phy_link_topo_create(struct net_device *dev)
-{
-	return NULL;
-}
-
-static inline void phy_link_topo_destroy(struct phy_link_topology *topo)
-{
-}
-
-#endif
-
-#endif /* __PHY_LINK_TOPOLOGY_CORE_H */
diff --git a/include/linux/sfp.h b/include/linux/sfp.h
index 5ebc57f78c95..a45da7eef9a2 100644
--- a/include/linux/sfp.h
+++ b/include/linux/sfp.h
@@ -544,7 +544,7 @@ struct sfp_upstream_ops {
 	void (*link_down)(void *priv);
 	void (*link_up)(void *priv);
 	int (*connect_phy)(void *priv, struct phy_device *);
-	void (*disconnect_phy)(void *priv, struct phy_device *);
+	void (*disconnect_phy)(void *priv);
 };
 
 #if IS_ENABLED(CONFIG_SFP)
@@ -570,7 +570,6 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode);
 int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 			 const struct sfp_upstream_ops *ops);
 void sfp_bus_del_upstream(struct sfp_bus *bus);
-const char *sfp_get_name(struct sfp_bus *bus);
 #else
 static inline int sfp_parse_port(struct sfp_bus *bus,
 				 const struct sfp_eeprom_id *id,
@@ -649,11 +648,6 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream,
 static inline void sfp_bus_del_upstream(struct sfp_bus *bus)
 {
 }
-
-static inline const char *sfp_get_name(struct sfp_bus *bus)
-{
-	return NULL;
-}
 #endif
 
 #endif
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 041e09c3515d..8733a3117902 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2323,20 +2323,4 @@ struct ethtool_link_settings {
 	 * __u32 map_lp_advertising[link_mode_masks_nwords];
 	 */
 };
-
-/**
- * enum phy_upstream - Represents the upstream component a given PHY device
- * is connected to, as in what is on the other end of the MII bus. Most PHYs
- * will be attached to an Ethernet MAC controller, but in some cases, there's
- * an intermediate PHY used as a media-converter, which will driver another
- * MII interface as its output.
- * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port,
- *		      or ethernet controller)
- * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter)
- */
-enum phy_upstream {
-	PHY_UPSTREAM_MAC,
-	PHY_UPSTREAM_PHY,
-};
-
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index f17dbe54bf5e..b49b804b9495 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -132,7 +132,6 @@ enum {
 	ETHTOOL_A_HEADER_DEV_INDEX,		/* u32 */
 	ETHTOOL_A_HEADER_DEV_NAME,		/* string */
 	ETHTOOL_A_HEADER_FLAGS,			/* u32 - ETHTOOL_FLAG_* */
-	ETHTOOL_A_HEADER_PHY_INDEX,		/* u32 */
 
 	/* add new constants above here */
 	__ETHTOOL_A_HEADER_CNT,
diff --git a/net/core/dev.c b/net/core/dev.c
index d2ce91a334c1..e1bb6d7856d9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -158,7 +158,6 @@
 #include <net/page_pool/types.h>
 #include <net/page_pool/helpers.h>
 #include <net/rps.h>
-#include <linux/phy_link_topology_core.h>
 
 #include "dev.h"
 #include "net-sysfs.h"
@@ -10998,12 +10997,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 #ifdef CONFIG_NET_SCHED
 	hash_init(dev->qdisc_hash);
 #endif
-	dev->link_topo = phy_link_topo_create(dev);
-	if (IS_ERR(dev->link_topo)) {
-		dev->link_topo = NULL;
-		goto free_all;
-	}
-
 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
 	setup(dev);
 
@@ -11092,8 +11085,6 @@ void free_netdev(struct net_device *dev)
 	free_percpu(dev->xdp_bulkq);
 	dev->xdp_bulkq = NULL;
 
-	phy_link_topo_destroy(dev->link_topo);
-
 	/*  Compatibility with error handling in drivers */
 	if (dev->reg_state == NETREG_UNINITIALIZED ||
 	    dev->reg_state == NETREG_DUMMY) {
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 563e94e0cbd8..bd04f28d5cf4 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -4,7 +4,6 @@
 #include <linux/ethtool_netlink.h>
 #include <linux/pm_runtime.h>
 #include "netlink.h"
-#include <linux/phy_link_topology.h>
 
 static struct genl_family ethtool_genl_family;
 
@@ -31,24 +30,6 @@ const struct nla_policy ethnl_header_policy_stats[] = {
 							  ETHTOOL_FLAGS_STATS),
 };
 
-const struct nla_policy ethnl_header_policy_phy[] = {
-	[ETHTOOL_A_HEADER_DEV_INDEX]	= { .type = NLA_U32 },
-	[ETHTOOL_A_HEADER_DEV_NAME]	= { .type = NLA_NUL_STRING,
-					    .len = ALTIFNAMSIZ - 1 },
-	[ETHTOOL_A_HEADER_FLAGS]	= NLA_POLICY_MASK(NLA_U32,
-							  ETHTOOL_FLAGS_BASIC),
-	[ETHTOOL_A_HEADER_PHY_INDEX]		= NLA_POLICY_MIN(NLA_U32, 1),
-};
-
-const struct nla_policy ethnl_header_policy_phy_stats[] = {
-	[ETHTOOL_A_HEADER_DEV_INDEX]	= { .type = NLA_U32 },
-	[ETHTOOL_A_HEADER_DEV_NAME]	= { .type = NLA_NUL_STRING,
-					    .len = ALTIFNAMSIZ - 1 },
-	[ETHTOOL_A_HEADER_FLAGS]	= NLA_POLICY_MASK(NLA_U32,
-							  ETHTOOL_FLAGS_STATS),
-	[ETHTOOL_A_HEADER_PHY_INDEX]		= NLA_POLICY_MIN(NLA_U32, 1),
-};
-
 int ethnl_ops_begin(struct net_device *dev)
 {
 	int ret;
@@ -108,9 +89,8 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 			       const struct nlattr *header, struct net *net,
 			       struct netlink_ext_ack *extack, bool require_dev)
 {
-	struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)];
+	struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy)];
 	const struct nlattr *devname_attr;
-	struct phy_device *phydev = NULL;
 	struct net_device *dev = NULL;
 	u32 flags = 0;
 	int ret;
@@ -124,7 +104,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 	/* No validation here, command policy should have a nested policy set
 	 * for the header, therefore validation should have already been done.
 	 */
-	ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header,
+	ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy) - 1, header,
 			       NULL, extack);
 	if (ret < 0)
 		return ret;
@@ -165,30 +145,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
 		return -EINVAL;
 	}
 
-	if (dev) {
-		if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
-			struct nlattr *phy_id;
-
-			phy_id = tb[ETHTOOL_A_HEADER_PHY_INDEX];
-			phydev = phy_link_topo_get_phy(dev->link_topo,
-						       nla_get_u32(phy_id));
-			if (!phydev) {
-				NL_SET_BAD_ATTR(extack, phy_id);
-				return -ENODEV;
-			}
-		} else {
-			/* If we need a PHY but no phy index is specified, fallback
-			 * to dev->phydev
-			 */
-			phydev = dev->phydev;
-		}
-	} else if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
-		NL_SET_ERR_MSG_ATTR(extack, header,
-				    "can't target a PHY without a netdev");
-		return -EINVAL;
-	}
-
-	req_info->phydev = phydev;
 	req_info->dev = dev;
 	req_info->flags = flags;
 	return 0;
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index d57a890b5d9e..9a333a8d04c1 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -250,7 +250,6 @@ static inline unsigned int ethnl_reply_header_size(void)
  * @dev:   network device the request is for (may be null)
  * @dev_tracker: refcount tracker for @dev reference
  * @flags: request flags common for all request types
- * @phydev: phy_device connected to @dev this request is for (may be null)
  *
  * This is a common base for request specific structures holding data from
  * parsed userspace request. These always embed struct ethnl_req_info at
@@ -260,7 +259,6 @@ struct ethnl_req_info {
 	struct net_device	*dev;
 	netdevice_tracker	dev_tracker;
 	u32			flags;
-	struct phy_device	*phydev;
 };
 
 static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info)
@@ -397,12 +395,9 @@ extern const struct ethnl_request_ops ethnl_rss_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
 extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
 extern const struct ethnl_request_ops ethnl_mm_request_ops;
-extern const struct ethnl_request_ops ethnl_phy_request_ops;
 
 extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
 extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
-extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1];
-extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1];
 extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
 extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
 extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
-- 
cgit v1.2.3


From c9d5b7b8264ba2ead8984a3a4a0c3233911342a4 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Sat, 4 May 2024 14:23:03 +0100
Subject: ftrace: Remove unused list 'ftrace_direct_funcs'

Commit 8788ca164eb4b ("ftrace: Remove the legacy _ftrace_direct API")
stopped using 'ftrace_direct_funcs' (and the associated
struct ftrace_direct_func).  Remove them.

Build tested only (on x86-64 with FTRACE and DYNAMIC_FTRACE
enabled)

Link: https://lore.kernel.org/linux-trace-kernel/20240504132303.67538-1-linux@treblig.org

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 1 -
 kernel/trace/ftrace.c  | 8 --------
 2 files changed, 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 54d53f345d14..b01cca36147f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -83,7 +83,6 @@ static inline void early_trace_init(void) { }
 
 struct module;
 struct ftrace_hash;
-struct ftrace_direct_func;
 
 #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \
 	defined(CONFIG_DYNAMIC_FTRACE)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9223513414a..4613bf67ef2c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5318,14 +5318,6 @@ ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 
-struct ftrace_direct_func {
-	struct list_head	next;
-	unsigned long		addr;
-	int			count;
-};
-
-static LIST_HEAD(ftrace_direct_funcs);
-
 static int register_ftrace_function_nolock(struct ftrace_ops *ops);
 
 /*
-- 
cgit v1.2.3


From d2cc859cc8885e98907cffd47b990491c5321a80 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert" <linux@treblig.org>
Date: Tue, 7 May 2024 00:33:05 +0100
Subject: ftrace: Remove unused global 'ftrace_direct_func_count'

Commit 8788ca164eb4b ("ftrace: Remove the legacy _ftrace_direct API")
stopped setting the 'ftrace_direct_func_count' variable, but left
it around.  Clean it up.

Link: https://lore.kernel.org/linux-trace-kernel/20240506233305.215735-1-linux@treblig.org

Signed-off-by: Dr. David Alan Gilbert <linux@treblig.org>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  2 --
 kernel/trace/fgraph.c  | 11 -----------
 kernel/trace/ftrace.c  |  1 -
 3 files changed, 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index b01cca36147f..e3a83ebd1b33 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -413,7 +413,6 @@ struct ftrace_func_entry {
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-extern int ftrace_direct_func_count;
 unsigned long ftrace_find_rec_direct(unsigned long ip);
 int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr);
 int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
@@ -425,7 +424,6 @@ void ftrace_stub_direct_tramp(void);
 
 #else
 struct ftrace_ops;
-# define ftrace_direct_func_count 0
 static inline unsigned long ftrace_find_rec_direct(unsigned long ip)
 {
 	return 0;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c83c005e654e..a130b2d898f7 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -125,17 +125,6 @@ int function_graph_enter(unsigned long ret, unsigned long func,
 {
 	struct ftrace_graph_ent trace;
 
-#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
-	/*
-	 * Skip graph tracing if the return location is served by direct trampoline,
-	 * since call sequence and return addresses are unpredictable anyway.
-	 * Ex: BPF trampoline may call original function and may skip frame
-	 * depending on type of BPF programs attached.
-	 */
-	if (ftrace_direct_func_count &&
-	    ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE))
-		return -EBUSY;
-#endif
 	trace.func = func;
 	trace.depth = ++current->curr_ret_depth;
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4613bf67ef2c..5a01d72f66db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2538,7 +2538,6 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
 /* Protected by rcu_tasks for reading, and direct_mutex for writing */
 static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
 static DEFINE_MUTEX(direct_mutex);
-int ftrace_direct_func_count;
 
 /*
  * Search the direct_functions hash to see if the given instruction pointer
-- 
cgit v1.2.3


From 12af2b83d0b17ec8b379b721dd4a8fbcd5d791f3 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Sun, 5 May 2024 19:06:18 +0300
Subject: mm: introduce execmem_alloc() and execmem_free()

module_alloc() is used everywhere as a mean to allocate memory for code.

Beside being semantically wrong, this unnecessarily ties all subsystems
that need to allocate code, such as ftrace, kprobes and BPF to modules and
puts the burden of code allocation to the modules code.

Several architectures override module_alloc() because of various
constraints where the executable memory can be located and this causes
additional obstacles for improvements of code allocation.

Start splitting code allocation from modules by introducing execmem_alloc()
and execmem_free() APIs.

Initially, execmem_alloc() is a wrapper for module_alloc() and
execmem_free() is a replacement of module_memfree() to allow updating all
call sites to use the new APIs.

Since architectures define different restrictions on placement,
permissions, alignment and other parameters for memory that can be used by
different subsystems that allocate executable memory, execmem_alloc() takes
a type argument, that will be used to identify the calling subsystem and to
allow architectures define parameters for ranges suitable for that
subsystem.

No functional changes.

Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 arch/powerpc/kernel/kprobes.c    |  6 ++---
 arch/s390/kernel/ftrace.c        |  4 +--
 arch/s390/kernel/kprobes.c       |  4 +--
 arch/s390/kernel/module.c        |  5 ++--
 arch/sparc/net/bpf_jit_comp_32.c |  8 +++---
 arch/x86/kernel/ftrace.c         |  6 ++---
 arch/x86/kernel/kprobes/core.c   |  4 +--
 include/linux/execmem.h          | 57 ++++++++++++++++++++++++++++++++++++++++
 include/linux/moduleloader.h     |  3 ---
 kernel/bpf/core.c                |  6 ++---
 kernel/kprobes.c                 |  8 +++---
 kernel/module/Kconfig            |  1 +
 kernel/module/main.c             | 25 ++++++------------
 mm/Kconfig                       |  3 +++
 mm/Makefile                      |  1 +
 mm/execmem.c                     | 32 ++++++++++++++++++++++
 16 files changed, 128 insertions(+), 45 deletions(-)
 create mode 100644 include/linux/execmem.h
 create mode 100644 mm/execmem.c

(limited to 'include')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index bbca90a5e2ec..9fcd01bb2ce6 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -19,8 +19,8 @@
 #include <linux/extable.h>
 #include <linux/kdebug.h>
 #include <linux/slab.h>
-#include <linux/moduleloader.h>
 #include <linux/set_memory.h>
+#include <linux/execmem.h>
 #include <asm/code-patching.h>
 #include <asm/cacheflush.h>
 #include <asm/sstep.h>
@@ -130,7 +130,7 @@ void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 	if (!page)
 		return NULL;
 
@@ -142,7 +142,7 @@ void *alloc_insn_page(void)
 	}
 	return page;
 error:
-	module_memfree(page);
+	execmem_free(page);
 	return NULL;
 }
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index c46381ea04ec..798249ef5646 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -7,13 +7,13 @@
  *   Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
  */
 
-#include <linux/moduleloader.h>
 #include <linux/hardirq.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/kprobes.h>
+#include <linux/execmem.h>
 #include <trace/syscall.h>
 #include <asm/asm-offsets.h>
 #include <asm/text-patching.h>
@@ -220,7 +220,7 @@ static int __init ftrace_plt_init(void)
 {
 	const char *start, *end;
 
-	ftrace_plt = module_alloc(PAGE_SIZE);
+	ftrace_plt = execmem_alloc(EXECMEM_FTRACE, PAGE_SIZE);
 	if (!ftrace_plt)
 		panic("cannot allocate ftrace plt\n");
 
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index f0cf20d4b3c5..3c1b1be744de 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -9,7 +9,6 @@
 
 #define pr_fmt(fmt) "kprobes: " fmt
 
-#include <linux/moduleloader.h>
 #include <linux/kprobes.h>
 #include <linux/ptrace.h>
 #include <linux/preempt.h>
@@ -21,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/hardirq.h>
 #include <linux/ftrace.h>
+#include <linux/execmem.h>
 #include <asm/set_memory.h>
 #include <asm/sections.h>
 #include <asm/dis.h>
@@ -38,7 +38,7 @@ void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 	if (!page)
 		return NULL;
 	set_memory_rox((unsigned long)page, 1);
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 42215f9404af..ac97a905e8cd 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -21,6 +21,7 @@
 #include <linux/moduleloader.h>
 #include <linux/bug.h>
 #include <linux/memory.h>
+#include <linux/execmem.h>
 #include <asm/alternative.h>
 #include <asm/nospec-branch.h>
 #include <asm/facility.h>
@@ -76,7 +77,7 @@ void *module_alloc(unsigned long size)
 #ifdef CONFIG_FUNCTION_TRACER
 void module_arch_cleanup(struct module *mod)
 {
-	module_memfree(mod->arch.trampolines_start);
+	execmem_free(mod->arch.trampolines_start);
 }
 #endif
 
@@ -510,7 +511,7 @@ static int module_alloc_ftrace_hotpatch_trampolines(struct module *me,
 
 	size = FTRACE_HOTPATCH_TRAMPOLINES_SIZE(s->sh_size);
 	numpages = DIV_ROUND_UP(size, PAGE_SIZE);
-	start = module_alloc(numpages * PAGE_SIZE);
+	start = execmem_alloc(EXECMEM_FTRACE, numpages * PAGE_SIZE);
 	if (!start)
 		return -ENOMEM;
 	set_memory_rox((unsigned long)start, numpages);
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
index da2df1e84ed4..bda2dbd3f4c5 100644
--- a/arch/sparc/net/bpf_jit_comp_32.c
+++ b/arch/sparc/net/bpf_jit_comp_32.c
@@ -1,10 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/moduleloader.h>
 #include <linux/workqueue.h>
 #include <linux/netdevice.h>
 #include <linux/filter.h>
 #include <linux/cache.h>
 #include <linux/if_vlan.h>
+#include <linux/execmem.h>
 
 #include <asm/cacheflush.h>
 #include <asm/ptrace.h>
@@ -713,7 +713,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 				if (unlikely(proglen + ilen > oldproglen)) {
 					pr_err("bpb_jit_compile fatal error\n");
 					kfree(addrs);
-					module_memfree(image);
+					execmem_free(image);
 					return;
 				}
 				memcpy(image + proglen, temp, ilen);
@@ -736,7 +736,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 			break;
 		}
 		if (proglen == oldproglen) {
-			image = module_alloc(proglen);
+			image = execmem_alloc(EXECMEM_BPF, proglen);
 			if (!image)
 				goto out;
 		}
@@ -758,7 +758,7 @@ out:
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		execmem_free(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 70139d9d2e01..c8ddb7abda7c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -25,6 +25,7 @@
 #include <linux/memory.h>
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
+#include <linux/execmem.h>
 
 #include <trace/syscall.h>
 
@@ -261,15 +262,14 @@ void arch_ftrace_update_code(int command)
 #ifdef CONFIG_X86_64
 
 #ifdef CONFIG_MODULES
-#include <linux/moduleloader.h>
 /* Module allocation simplifies allocating memory for code */
 static inline void *alloc_tramp(unsigned long size)
 {
-	return module_alloc(size);
+	return execmem_alloc(EXECMEM_FTRACE, size);
 }
 static inline void tramp_free(void *tramp)
 {
-	module_memfree(tramp);
+	execmem_free(tramp);
 }
 #else
 /* Trampolines can only be created if modules are supported */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index d0e49bd7c6f3..72e6a45e7ec2 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -40,12 +40,12 @@
 #include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/kasan.h>
-#include <linux/moduleloader.h>
 #include <linux/objtool.h>
 #include <linux/vmalloc.h>
 #include <linux/pgtable.h>
 #include <linux/set_memory.h>
 #include <linux/cfi.h>
+#include <linux/execmem.h>
 
 #include <asm/text-patching.h>
 #include <asm/cacheflush.h>
@@ -495,7 +495,7 @@ void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 	if (!page)
 		return NULL;
 
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
new file mode 100644
index 000000000000..8eebc8ef66e7
--- /dev/null
+++ b/include/linux/execmem.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_EXECMEM_ALLOC_H
+#define _LINUX_EXECMEM_ALLOC_H
+
+#include <linux/types.h>
+#include <linux/moduleloader.h>
+
+/**
+ * enum execmem_type - types of executable memory ranges
+ *
+ * There are several subsystems that allocate executable memory.
+ * Architectures define different restrictions on placement,
+ * permissions, alignment and other parameters for memory that can be used
+ * by these subsystems.
+ * Types in this enum identify subsystems that allocate executable memory
+ * and let architectures define parameters for ranges suitable for
+ * allocations by each subsystem.
+ *
+ * @EXECMEM_DEFAULT: default parameters that would be used for types that
+ * are not explicitly defined.
+ * @EXECMEM_MODULE_TEXT: parameters for module text sections
+ * @EXECMEM_KPROBES: parameters for kprobes
+ * @EXECMEM_FTRACE: parameters for ftrace
+ * @EXECMEM_BPF: parameters for BPF
+ * @EXECMEM_TYPE_MAX:
+ */
+enum execmem_type {
+	EXECMEM_DEFAULT,
+	EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
+	EXECMEM_KPROBES,
+	EXECMEM_FTRACE,
+	EXECMEM_BPF,
+	EXECMEM_TYPE_MAX,
+};
+
+/**
+ * execmem_alloc - allocate executable memory
+ * @type: type of the allocation
+ * @size: how many bytes of memory are required
+ *
+ * Allocates memory that will contain executable code, either generated or
+ * loaded from kernel modules.
+ *
+ * The memory will have protections defined by architecture for executable
+ * region of the @type.
+ *
+ * Return: a pointer to the allocated memory or %NULL
+ */
+void *execmem_alloc(enum execmem_type type, size_t size);
+
+/**
+ * execmem_free - free executable memory
+ * @ptr: pointer to the memory that should be freed
+ */
+void execmem_free(void *ptr);
+
+#endif /* _LINUX_EXECMEM_ALLOC_H */
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 89b1e0ed9811..a3b8caee9405 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -29,9 +29,6 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
    sections.  Returns NULL on failure. */
 void *module_alloc(unsigned long size);
 
-/* Free memory returned from module_alloc. */
-void module_memfree(void *module_region);
-
 /* Determines if the section name is an init section (that is only used during
  * module loading).
  */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1ea5ce5bb599..892e50afda59 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -22,7 +22,6 @@
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
-#include <linux/moduleloader.h>
 #include <linux/bpf.h>
 #include <linux/btf.h>
 #include <linux/objtool.h>
@@ -37,6 +36,7 @@
 #include <linux/nospec.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/memcontrol.h>
+#include <linux/execmem.h>
 
 #include <asm/barrier.h>
 #include <asm/unaligned.h>
@@ -1050,12 +1050,12 @@ void bpf_jit_uncharge_modmem(u32 size)
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
 {
-	return module_alloc(size);
+	return execmem_alloc(EXECMEM_BPF, size);
 }
 
 void __weak bpf_jit_free_exec(void *addr)
 {
-	module_memfree(addr);
+	execmem_free(addr);
 }
 
 struct bpf_binary_header *
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 65adc815fc6e..ddd7cdc16edf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -26,7 +26,6 @@
 #include <linux/slab.h>
 #include <linux/stddef.h>
 #include <linux/export.h>
-#include <linux/moduleloader.h>
 #include <linux/kallsyms.h>
 #include <linux/freezer.h>
 #include <linux/seq_file.h>
@@ -39,6 +38,7 @@
 #include <linux/jump_label.h>
 #include <linux/static_call.h>
 #include <linux/perf_event.h>
+#include <linux/execmem.h>
 
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -113,17 +113,17 @@ enum kprobe_slot_state {
 void __weak *alloc_insn_page(void)
 {
 	/*
-	 * Use module_alloc() so this page is within +/- 2GB of where the
+	 * Use execmem_alloc() so this page is within +/- 2GB of where the
 	 * kernel image and loaded module images reside. This is required
 	 * for most of the architectures.
 	 * (e.g. x86-64 needs this to handle the %rip-relative fixups.)
 	 */
-	return module_alloc(PAGE_SIZE);
+	return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
 }
 
 static void free_insn_page(void *page)
 {
-	module_memfree(page);
+	execmem_free(page);
 }
 
 struct kprobe_insn_cache kprobe_insn_slots = {
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index cb8377a18927..4047b6d48255 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -2,6 +2,7 @@
 menuconfig MODULES
 	bool "Enable loadable module support"
 	modules
+	select EXECMEM
 	help
 	  Kernel modules are small pieces of compiled code which can
 	  be inserted in the running kernel, rather than being
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5b82b069e0d3..d56b7df0cbb6 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -57,6 +57,7 @@
 #include <linux/audit.h>
 #include <linux/cfi.h>
 #include <linux/debugfs.h>
+#include <linux/execmem.h>
 #include <uapi/linux/module.h>
 #include "internal.h"
 
@@ -1179,16 +1180,6 @@ resolve_symbol_wait(struct module *mod,
 	return ksym;
 }
 
-void __weak module_memfree(void *module_region)
-{
-	/*
-	 * This memory may be RO, and freeing RO memory in an interrupt is not
-	 * supported by vmalloc.
-	 */
-	WARN_ON(in_interrupt());
-	vfree(module_region);
-}
-
 void __weak module_arch_cleanup(struct module *mod)
 {
 }
@@ -1213,7 +1204,7 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 	if (mod_mem_use_vmalloc(type))
 		ptr = vmalloc(size);
 	else
-		ptr = module_alloc(size);
+		ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size);
 
 	if (!ptr)
 		return -ENOMEM;
@@ -1244,7 +1235,7 @@ static void module_memory_free(struct module *mod, enum mod_mem_type type)
 	if (mod_mem_use_vmalloc(type))
 		vfree(ptr);
 	else
-		module_memfree(ptr);
+		execmem_free(ptr);
 }
 
 static void free_mod_mem(struct module *mod)
@@ -2496,9 +2487,9 @@ static void do_free_init(struct work_struct *w)
 
 	llist_for_each_safe(pos, n, list) {
 		initfree = container_of(pos, struct mod_initfree, node);
-		module_memfree(initfree->init_text);
-		module_memfree(initfree->init_data);
-		module_memfree(initfree->init_rodata);
+		execmem_free(initfree->init_text);
+		execmem_free(initfree->init_data);
+		execmem_free(initfree->init_rodata);
 		kfree(initfree);
 	}
 }
@@ -2608,10 +2599,10 @@ static noinline int do_init_module(struct module *mod)
 	 * We want to free module_init, but be aware that kallsyms may be
 	 * walking this with preempt disabled.  In all the failure paths, we
 	 * call synchronize_rcu(), but we don't want to slow down the success
-	 * path. module_memfree() cannot be called in an interrupt, so do the
+	 * path. execmem_free() cannot be called in an interrupt, so do the
 	 * work and call synchronize_rcu() in a work queue.
 	 *
-	 * Note that module_alloc() on most architectures creates W+X page
+	 * Note that execmem_alloc() on most architectures creates W+X page
 	 * mappings which won't be cleaned up until do_free_init() runs.  Any
 	 * code such as mark_rodata_ro() which depends on those mappings to
 	 * be cleaned up needs to sync with the queued work by invoking
diff --git a/mm/Kconfig b/mm/Kconfig
index f30a18a0e37d..8bb60fdaba3c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1240,6 +1240,9 @@ config LOCK_MM_AND_FIND_VMA
 config IOMMU_MM_DATA
 	bool
 
+config EXECMEM
+	bool
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 4abb40b911ec..001336c91864 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o
 obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
 obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
 obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
+obj-$(CONFIG_EXECMEM) += execmem.o
diff --git a/mm/execmem.c b/mm/execmem.c
new file mode 100644
index 000000000000..480adc69b20d
--- /dev/null
+++ b/mm/execmem.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2002 Richard Henderson
+ * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ * Copyright (C) 2024 Mike Rapoport IBM.
+ */
+
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/execmem.h>
+#include <linux/moduleloader.h>
+
+static void *__execmem_alloc(size_t size)
+{
+	return module_alloc(size);
+}
+
+void *execmem_alloc(enum execmem_type type, size_t size)
+{
+	return __execmem_alloc(size);
+}
+
+void execmem_free(void *ptr)
+{
+	/*
+	 * This memory may be RO, and freeing RO memory in an interrupt is not
+	 * supported by vmalloc.
+	 */
+	WARN_ON(in_interrupt());
+	vfree(ptr);
+}
-- 
cgit v1.2.3


From f6bec26c0a7364d3506a3e12dab7c228ef32bd65 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Sun, 5 May 2024 19:06:19 +0300
Subject: mm/execmem, arch: convert simple overrides of module_alloc to execmem

Several architectures override module_alloc() only to define address
range for code allocations different than VMALLOC address space.

Provide a generic implementation in execmem that uses the parameters for
address space ranges, required alignment and page protections provided
by architectures.

The architectures must fill execmem_info structure and implement
execmem_arch_setup() that returns a pointer to that structure. This way the
execmem initialization won't be called from every architecture, but rather
from a central place, namely a core_initcall() in execmem.

The execmem provides execmem_alloc() API that wraps __vmalloc_node_range()
with the parameters defined by the architectures.  If an architecture does
not implement execmem_arch_setup(), execmem_alloc() will fall back to
module_alloc().

Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 arch/loongarch/kernel/module.c | 19 ++++++++++--
 arch/mips/kernel/module.c      | 20 ++++++++++---
 arch/nios2/kernel/module.c     | 21 +++++++++----
 arch/parisc/kernel/module.c    | 24 ++++++++++-----
 arch/riscv/kernel/module.c     | 24 ++++++++++-----
 arch/sparc/kernel/module.c     | 20 ++++++++++---
 include/linux/execmem.h        | 47 +++++++++++++++++++++++++++++
 mm/execmem.c                   | 67 ++++++++++++++++++++++++++++++++++++++++--
 mm/mm_init.c                   |  2 ++
 9 files changed, 210 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/arch/loongarch/kernel/module.c b/arch/loongarch/kernel/module.c
index c7d0338d12c1..ca6dd7ea1610 100644
--- a/arch/loongarch/kernel/module.c
+++ b/arch/loongarch/kernel/module.c
@@ -18,6 +18,7 @@
 #include <linux/ftrace.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/execmem.h>
 #include <asm/alternative.h>
 #include <asm/inst.h>
 #include <asm/unwind.h>
@@ -490,10 +491,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 	return 0;
 }
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-			GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0));
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= MODULES_VADDR,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 static void module_init_ftrace_plt(const Elf_Ehdr *hdr,
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 9a6c96014904..59225a3cf918 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
 #include <linux/jump_label.h>
+#include <linux/execmem.h>
 #include <asm/jump_label.h>
 
 struct mips_hi16 {
@@ -32,11 +33,22 @@ static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
 #ifdef MODULES_VADDR
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= MODULES_VADDR,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 #endif
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 9c97b7513853..0d1ee86631fc 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -18,15 +18,26 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/execmem.h>
 
 #include <asm/cacheflush.h>
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL_EXEC,
-				    VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
-				    __builtin_return_address(0));
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= MODULES_VADDR,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL_EXEC,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index d214bbe3c2af..bdfa85e10c1b 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -49,6 +49,7 @@
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/execmem.h>
 
 #include <asm/unwind.h>
 #include <asm/sections.h>
@@ -173,15 +174,22 @@ static inline int reassemble_22(int as22)
 		((as22 & 0x0003ff) << 3));
 }
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	/* using RWX means less protection for modules, but it's
-	 * easier than trying to map the text, data, init_text and
-	 * init_data correctly */
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-				    GFP_KERNEL,
-				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= VMALLOC_START,
+				.end	= VMALLOC_END,
+				.pgprot	= PAGE_KERNEL_RWX,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 #ifndef CONFIG_64BIT
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 5e5a82644451..182904127ba0 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -14,6 +14,7 @@
 #include <linux/vmalloc.h>
 #include <linux/sizes.h>
 #include <linux/pgtable.h>
+#include <linux/execmem.h>
 #include <asm/alternative.h>
 #include <asm/sections.h>
 
@@ -906,13 +907,22 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 }
 
 #if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR,
-				    MODULES_END, GFP_KERNEL,
-				    PAGE_KERNEL, VM_FLUSH_RESET_PERMS,
-				    NUMA_NO_NODE,
-				    __builtin_return_address(0));
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= MODULES_VADDR,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 #endif
 
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index d37adb2a0b54..8b7ee45defc3 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/mm.h>
+#include <linux/execmem.h>
 
 #include <asm/processor.h>
 #include <asm/spitfire.h>
@@ -21,11 +22,22 @@
 
 #include "entry.h"
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= MODULES_VADDR,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 8eebc8ef66e7..96fc59258467 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -33,6 +33,47 @@ enum execmem_type {
 	EXECMEM_TYPE_MAX,
 };
 
+/**
+ * struct execmem_range - definition of an address space suitable for code and
+ *			  related data allocations
+ * @start:	address space start
+ * @end:	address space end (inclusive)
+ * @pgprot:	permissions for memory in this address space
+ * @alignment:	alignment required for text allocations
+ */
+struct execmem_range {
+	unsigned long   start;
+	unsigned long   end;
+	pgprot_t        pgprot;
+	unsigned int	alignment;
+};
+
+/**
+ * struct execmem_info - architecture parameters for code allocations
+ * @ranges: array of parameter sets defining architecture specific
+ * parameters for executable memory allocations. The ranges that are not
+ * explicitly initialized by an architecture use parameters defined for
+ * @EXECMEM_DEFAULT.
+ */
+struct execmem_info {
+	struct execmem_range	ranges[EXECMEM_TYPE_MAX];
+};
+
+/**
+ * execmem_arch_setup - define parameters for allocations of executable memory
+ *
+ * A hook for architectures to define parameters for allocations of
+ * executable memory. These parameters should be filled into the
+ * @execmem_info structure.
+ *
+ * For architectures that do not implement this method a default set of
+ * parameters will be used
+ *
+ * Return: a structure defining architecture parameters and restrictions
+ * for allocations of executable memory
+ */
+struct execmem_info *execmem_arch_setup(void);
+
 /**
  * execmem_alloc - allocate executable memory
  * @type: type of the allocation
@@ -54,4 +95,10 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
+#ifdef CONFIG_EXECMEM
+void execmem_init(void);
+#else
+static inline void execmem_init(void) {}
+#endif
+
 #endif /* _LINUX_EXECMEM_ALLOC_H */
diff --git a/mm/execmem.c b/mm/execmem.c
index 480adc69b20d..80e61c1e7319 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -11,14 +11,30 @@
 #include <linux/execmem.h>
 #include <linux/moduleloader.h>
 
-static void *__execmem_alloc(size_t size)
+static struct execmem_info *execmem_info __ro_after_init;
+
+static void *__execmem_alloc(struct execmem_range *range, size_t size)
 {
-	return module_alloc(size);
+	unsigned long start = range->start;
+	unsigned long end = range->end;
+	unsigned int align = range->alignment;
+	pgprot_t pgprot = range->pgprot;
+
+	return __vmalloc_node_range(size, align, start, end,
+				    GFP_KERNEL, pgprot, VM_FLUSH_RESET_PERMS,
+				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 void *execmem_alloc(enum execmem_type type, size_t size)
 {
-	return __execmem_alloc(size);
+	struct execmem_range *range;
+
+	if (!execmem_info)
+		return module_alloc(size);
+
+	range = &execmem_info->ranges[type];
+
+	return __execmem_alloc(range, size);
 }
 
 void execmem_free(void *ptr)
@@ -30,3 +46,48 @@ void execmem_free(void *ptr)
 	WARN_ON(in_interrupt());
 	vfree(ptr);
 }
+
+static bool execmem_validate(struct execmem_info *info)
+{
+	struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT];
+
+	if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) {
+		pr_crit("Invalid parameters for execmem allocator, module loading will fail");
+		return false;
+	}
+
+	return true;
+}
+
+static void execmem_init_missing(struct execmem_info *info)
+{
+	struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT];
+
+	for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) {
+		struct execmem_range *r = &info->ranges[i];
+
+		if (!r->start) {
+			r->pgprot = default_range->pgprot;
+			r->alignment = default_range->alignment;
+			r->start = default_range->start;
+			r->end = default_range->end;
+		}
+	}
+}
+
+struct execmem_info * __weak execmem_arch_setup(void)
+{
+	return NULL;
+}
+
+void __init execmem_init(void)
+{
+	struct execmem_info *info = execmem_arch_setup();
+
+	if (!info || !execmem_validate(info))
+		return;
+
+	execmem_init_missing(info);
+
+	execmem_info = info;
+}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 549e76af8f82..b6a1fcf6e13a 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -27,6 +27,7 @@
 #include <linux/swap.h>
 #include <linux/cma.h>
 #include <linux/crash_dump.h>
+#include <linux/execmem.h>
 #include "internal.h"
 #include "slab.h"
 #include "shuffle.h"
@@ -2793,4 +2794,5 @@ void __init mm_core_init(void)
 	pti_init();
 	kmsan_init_runtime();
 	mm_cache_init();
+	execmem_init();
 }
-- 
cgit v1.2.3


From 223b5e57d0d50b0c07b933350dbcde92018d3080 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Sun, 5 May 2024 19:06:20 +0300
Subject: mm/execmem, arch: convert remaining overrides of module_alloc to
 execmem

Extend execmem parameters to accommodate more complex overrides of
module_alloc() by architectures.

This includes specification of a fallback range required by arm, arm64
and powerpc, EXECMEM_MODULE_DATA type required by powerpc, support for
allocation of KASAN shadow required by s390 and x86 and support for
late initialization of execmem required by arm64.

The core implementation of execmem_alloc() takes care of suppressing
warnings when the initial allocation fails but there is a fallback range
defined.

Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Tested-by: Liviu Dudau <liviu@dudau.co.uk>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 arch/Kconfig                 |  8 +++++
 arch/arm/kernel/module.c     | 41 ++++++++++++++----------
 arch/arm64/Kconfig           |  1 +
 arch/arm64/kernel/module.c   | 55 ++++++++++++++++++--------------
 arch/powerpc/kernel/module.c | 60 ++++++++++++++++++++++-------------
 arch/s390/kernel/module.c    | 54 +++++++++++++-------------------
 arch/x86/kernel/module.c     | 70 ++++++++++++++---------------------------
 include/linux/execmem.h      | 30 +++++++++++++++++-
 include/linux/moduleloader.h | 12 -------
 kernel/module/main.c         | 26 ++++------------
 mm/execmem.c                 | 74 +++++++++++++++++++++++++++++++++++++-------
 11 files changed, 246 insertions(+), 185 deletions(-)

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index 93404c802d29..ee1ccbde50ef 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -977,6 +977,14 @@ config ARCH_WANTS_MODULES_DATA_IN_VMALLOC
 	  For architectures like powerpc/32 which have constraints on module
 	  allocation and need to allocate module data outside of module area.
 
+config ARCH_WANTS_EXECMEM_LATE
+	bool
+	help
+	  For architectures that do not allocate executable memory early on
+	  boot, but rather require its initialization late when there is
+	  enough entropy for module space randomization, for instance
+	  arm64.
+
 config HAVE_IRQ_EXIT_ON_IRQ_STACK
 	bool
 	help
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e74d84f58b77..a98fdf6ff26c 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -16,6 +16,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/gfp.h>
+#include <linux/execmem.h>
 
 #include <asm/sections.h>
 #include <asm/smp_plat.h>
@@ -34,23 +35,31 @@
 #endif
 
 #ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	/* Silence the initial allocation */
-	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
-		gfp_mask |= __GFP_NOWARN;
-
-	p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
-		return p;
-	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
+	unsigned long fallback_start = 0, fallback_end = 0;
+
+	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS)) {
+		fallback_start = VMALLOC_START;
+		fallback_end = VMALLOC_END;
+	}
+
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= MODULES_VADDR,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL_EXEC,
+				.alignment = 1,
+				.fallback_start	= fallback_start,
+				.fallback_end	= fallback_end,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 #endif
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b11c98b3e84..74b34a78b7ac 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -105,6 +105,7 @@ config ARM64
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANTS_EXECMEM_LATE if EXECMEM
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANTS_THP_SWAP if ARM64_4K_PAGES
 	select ARCH_HAS_UBSAN
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index e92da4da1b2a..b7a7a23f9f8f 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,6 +20,7 @@
 #include <linux/random.h>
 #include <linux/scs.h>
 #include <linux/vmalloc.h>
+#include <linux/execmem.h>
 
 #include <asm/alternative.h>
 #include <asm/insn.h>
@@ -108,41 +109,47 @@ static int __init module_init_limits(void)
 
 	return 0;
 }
-subsys_initcall(module_init_limits);
 
-void *module_alloc(unsigned long size)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	void *p = NULL;
+	unsigned long fallback_start = 0, fallback_end = 0;
+	unsigned long start = 0, end = 0;
+
+	module_init_limits();
 
 	/*
 	 * Where possible, prefer to allocate within direct branch range of the
 	 * kernel such that no PLTs are necessary.
 	 */
 	if (module_direct_base) {
-		p = __vmalloc_node_range(size, MODULE_ALIGN,
-					 module_direct_base,
-					 module_direct_base + SZ_128M,
-					 GFP_KERNEL | __GFP_NOWARN,
-					 PAGE_KERNEL, 0, NUMA_NO_NODE,
-					 __builtin_return_address(0));
-	}
+		start = module_direct_base;
+		end = module_direct_base + SZ_128M;
 
-	if (!p && module_plt_base) {
-		p = __vmalloc_node_range(size, MODULE_ALIGN,
-					 module_plt_base,
-					 module_plt_base + SZ_2G,
-					 GFP_KERNEL | __GFP_NOWARN,
-					 PAGE_KERNEL, 0, NUMA_NO_NODE,
-					 __builtin_return_address(0));
-	}
-
-	if (!p) {
-		pr_warn_ratelimited("%s: unable to allocate memory\n",
-				    __func__);
+		if (module_plt_base) {
+			fallback_start = module_plt_base;
+			fallback_end = module_plt_base + SZ_2G;
+		}
+	} else if (module_plt_base) {
+		start = module_plt_base;
+		end = module_plt_base + SZ_2G;
 	}
 
-	/* Memory is intended to be executable, reset the pointer tag. */
-	return kasan_reset_tag(p);
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= start,
+				.end	= end,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = 1,
+				.fallback_start	= fallback_start,
+				.fallback_end	= fallback_end,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 enum aarch64_reloc_op {
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index f6d6ae0a1692..ac80559015a3 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -10,6 +10,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <linux/bug.h>
+#include <linux/execmem.h>
 #include <asm/module.h>
 #include <linux/uaccess.h>
 #include <asm/firmware.h>
@@ -89,39 +90,56 @@ int module_finalize(const Elf_Ehdr *hdr,
 	return 0;
 }
 
-static __always_inline void *
-__module_alloc(unsigned long size, unsigned long start, unsigned long end, bool nowarn)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
 	pgprot_t prot = strict_module_rwx_enabled() ? PAGE_KERNEL : PAGE_KERNEL_EXEC;
-	gfp_t gfp = GFP_KERNEL | (nowarn ? __GFP_NOWARN : 0);
+	unsigned long fallback_start = 0, fallback_end = 0;
+	unsigned long start, end;
 
 	/*
-	 * Don't do huge page allocations for modules yet until more testing
-	 * is done. STRICT_MODULE_RWX may require extra work to support this
-	 * too.
+	 * BOOK3S_32 and 8xx define MODULES_VADDR for text allocations and
+	 * allow allocating data in the entire vmalloc space
 	 */
-	return __vmalloc_node_range(size, 1, start, end, gfp, prot,
-				    VM_FLUSH_RESET_PERMS,
-				    NUMA_NO_NODE, __builtin_return_address(0));
-}
-
-void *module_alloc(unsigned long size)
-{
 #ifdef MODULES_VADDR
 	unsigned long limit = (unsigned long)_etext - SZ_32M;
-	void *ptr = NULL;
 
 	BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
 
 	/* First try within 32M limit from _etext to avoid branch trampolines */
-	if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit)
-		ptr = __module_alloc(size, limit, MODULES_END, true);
-
-	if (!ptr)
-		ptr = __module_alloc(size, MODULES_VADDR, MODULES_END, false);
+	if (MODULES_VADDR < PAGE_OFFSET && MODULES_END > limit) {
+		start = limit;
+		fallback_start = MODULES_VADDR;
+		fallback_end = MODULES_END;
+	} else {
+		start = MODULES_VADDR;
+	}
 
-	return ptr;
+	end = MODULES_END;
 #else
-	return __module_alloc(size, VMALLOC_START, VMALLOC_END, false);
+	start = VMALLOC_START;
+	end = VMALLOC_END;
 #endif
+
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.start	= start,
+				.end	= end,
+				.pgprot	= prot,
+				.alignment = 1,
+				.fallback_start	= fallback_start,
+				.fallback_end	= fallback_end,
+			},
+			[EXECMEM_MODULE_DATA] = {
+				.start	= VMALLOC_START,
+				.end	= VMALLOC_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = 1,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index ac97a905e8cd..7fee64fdc1bb 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -37,41 +37,31 @@
 
 #define PLT_ENTRY_SIZE 22
 
-static unsigned long get_module_load_offset(void)
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	static DEFINE_MUTEX(module_kaslr_mutex);
-	static unsigned long module_load_offset;
-
-	if (!kaslr_enabled())
-		return 0;
-	/*
-	 * Calculate the module_load_offset the first time this code
-	 * is called. Once calculated it stays the same until reboot.
-	 */
-	mutex_lock(&module_kaslr_mutex);
-	if (!module_load_offset)
+	unsigned long module_load_offset = 0;
+	unsigned long start;
+
+	if (kaslr_enabled())
 		module_load_offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
-	mutex_unlock(&module_kaslr_mutex);
-	return module_load_offset;
-}
 
-void *module_alloc(unsigned long size)
-{
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				 MODULES_VADDR + get_module_load_offset(),
-				 MODULES_END, gfp_mask, PAGE_KERNEL,
-				 VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
-				 NUMA_NO_NODE, __builtin_return_address(0));
-	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-	return p;
+	start = MODULES_VADDR + module_load_offset;
+
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.flags	= EXECMEM_KASAN_SHADOW,
+				.start	= start,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = MODULE_ALIGN,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 #ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index e18914c0e38a..45b1a7c03379 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -19,6 +19,7 @@
 #include <linux/jump_label.h>
 #include <linux/random.h>
 #include <linux/memory.h>
+#include <linux/execmem.h>
 
 #include <asm/text-patching.h>
 #include <asm/page.h>
@@ -36,55 +37,30 @@ do {							\
 } while (0)
 #endif
 
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
+static struct execmem_info execmem_info __ro_after_init;
 
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
-	if (kaslr_enabled()) {
-		mutex_lock(&module_kaslr_mutex);
-		/*
-		 * Calculate the module_load_offset the first time this
-		 * code is called. Once calculated it stays the same until
-		 * reboot.
-		 */
-		if (module_load_offset == 0)
-			module_load_offset =
-				get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
-		mutex_unlock(&module_kaslr_mutex);
-	}
-	return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
+struct execmem_info __init *execmem_arch_setup(void)
 {
-	return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
-{
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-
-	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				 MODULES_VADDR + get_module_load_offset(),
-				 MODULES_END, gfp_mask, PAGE_KERNEL,
-				 VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK,
-				 NUMA_NO_NODE, __builtin_return_address(0));
-
-	if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-
-	return p;
+	unsigned long start, offset = 0;
+
+	if (kaslr_enabled())
+		offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
+
+	start = MODULES_VADDR + offset;
+
+	execmem_info = (struct execmem_info){
+		.ranges = {
+			[EXECMEM_DEFAULT] = {
+				.flags	= EXECMEM_KASAN_SHADOW,
+				.start	= start,
+				.end	= MODULES_END,
+				.pgprot	= PAGE_KERNEL,
+				.alignment = MODULE_ALIGN,
+			},
+		},
+	};
+
+	return &execmem_info;
 }
 
 #ifdef CONFIG_X86_32
diff --git a/include/linux/execmem.h b/include/linux/execmem.h
index 96fc59258467..32cef1144117 100644
--- a/include/linux/execmem.h
+++ b/include/linux/execmem.h
@@ -5,6 +5,14 @@
 #include <linux/types.h>
 #include <linux/moduleloader.h>
 
+#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
+		!defined(CONFIG_KASAN_VMALLOC)
+#include <linux/kasan.h>
+#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#define MODULE_ALIGN PAGE_SIZE
+#endif
+
 /**
  * enum execmem_type - types of executable memory ranges
  *
@@ -22,6 +30,7 @@
  * @EXECMEM_KPROBES: parameters for kprobes
  * @EXECMEM_FTRACE: parameters for ftrace
  * @EXECMEM_BPF: parameters for BPF
+ * @EXECMEM_MODULE_DATA: parameters for module data sections
  * @EXECMEM_TYPE_MAX:
  */
 enum execmem_type {
@@ -30,22 +39,38 @@ enum execmem_type {
 	EXECMEM_KPROBES,
 	EXECMEM_FTRACE,
 	EXECMEM_BPF,
+	EXECMEM_MODULE_DATA,
 	EXECMEM_TYPE_MAX,
 };
 
+/**
+ * enum execmem_range_flags - options for executable memory allocations
+ * @EXECMEM_KASAN_SHADOW:	allocate kasan shadow
+ */
+enum execmem_range_flags {
+	EXECMEM_KASAN_SHADOW	= (1 << 0),
+};
+
 /**
  * struct execmem_range - definition of an address space suitable for code and
  *			  related data allocations
  * @start:	address space start
  * @end:	address space end (inclusive)
+ * @fallback_start: start of the secondary address space range for fallback
+ *                  allocations on architectures that require it
+ * @fallback_end:   start of the secondary address space (inclusive)
  * @pgprot:	permissions for memory in this address space
  * @alignment:	alignment required for text allocations
+ * @flags:	options for memory allocations for this range
  */
 struct execmem_range {
 	unsigned long   start;
 	unsigned long   end;
+	unsigned long   fallback_start;
+	unsigned long   fallback_end;
 	pgprot_t        pgprot;
 	unsigned int	alignment;
+	enum execmem_range_flags flags;
 };
 
 /**
@@ -82,6 +107,9 @@ struct execmem_info *execmem_arch_setup(void);
  * Allocates memory that will contain executable code, either generated or
  * loaded from kernel modules.
  *
+ * Allocates memory that will contain data coupled with executable code,
+ * like data sections in kernel modules.
+ *
  * The memory will have protections defined by architecture for executable
  * region of the @type.
  *
@@ -95,7 +123,7 @@ void *execmem_alloc(enum execmem_type type, size_t size);
  */
 void execmem_free(void *ptr);
 
-#ifdef CONFIG_EXECMEM
+#if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE)
 void execmem_init(void);
 #else
 static inline void execmem_init(void) {}
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index a3b8caee9405..e395461d59e5 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -25,10 +25,6 @@ int module_frob_arch_sections(Elf_Ehdr *hdr,
 /* Additional bytes needed by arch in front of individual sections */
 unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
 
-/* Allocator used for allocating struct module, core sections and init
-   sections.  Returns NULL on failure. */
-void *module_alloc(unsigned long size);
-
 /* Determines if the section name is an init section (that is only used during
  * module loading).
  */
@@ -126,12 +122,4 @@ void module_arch_cleanup(struct module *mod);
 /* Any cleanup before freeing mod->module_init */
 void module_arch_freeing_init(struct module *mod);
 
-#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
-		!defined(CONFIG_KASAN_VMALLOC)
-#include <linux/kasan.h>
-#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
-#else
-#define MODULE_ALIGN PAGE_SIZE
-#endif
-
 #endif
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d56b7df0cbb6..91e185607d4b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1188,24 +1188,20 @@ void __weak module_arch_freeing_init(struct module *mod)
 {
 }
 
-static bool mod_mem_use_vmalloc(enum mod_mem_type type)
-{
-	return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) &&
-		mod_mem_type_is_core_data(type);
-}
-
 static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
 {
 	unsigned int size = PAGE_ALIGN(mod->mem[type].size);
+	enum execmem_type execmem_type;
 	void *ptr;
 
 	mod->mem[type].size = size;
 
-	if (mod_mem_use_vmalloc(type))
-		ptr = vmalloc(size);
+	if (mod_mem_type_is_data(type))
+		execmem_type = EXECMEM_MODULE_DATA;
 	else
-		ptr = execmem_alloc(EXECMEM_MODULE_TEXT, size);
+		execmem_type = EXECMEM_MODULE_TEXT;
 
+	ptr = execmem_alloc(execmem_type, size);
 	if (!ptr)
 		return -ENOMEM;
 
@@ -1232,10 +1228,7 @@ static void module_memory_free(struct module *mod, enum mod_mem_type type)
 {
 	void *ptr = mod->mem[type].base;
 
-	if (mod_mem_use_vmalloc(type))
-		vfree(ptr);
-	else
-		execmem_free(ptr);
+	execmem_free(ptr);
 }
 
 static void free_mod_mem(struct module *mod)
@@ -1630,13 +1623,6 @@ static void free_modinfo(struct module *mod)
 	}
 }
 
-void * __weak module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
-}
-
 bool __weak module_init_section(const char *name)
 {
 	return strstarts(name, ".init");
diff --git a/mm/execmem.c b/mm/execmem.c
index 80e61c1e7319..0c4b36bc6d10 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -12,27 +12,49 @@
 #include <linux/moduleloader.h>
 
 static struct execmem_info *execmem_info __ro_after_init;
+static struct execmem_info default_execmem_info __ro_after_init;
 
 static void *__execmem_alloc(struct execmem_range *range, size_t size)
 {
+	bool kasan = range->flags & EXECMEM_KASAN_SHADOW;
+	unsigned long vm_flags  = VM_FLUSH_RESET_PERMS;
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN;
 	unsigned long start = range->start;
 	unsigned long end = range->end;
 	unsigned int align = range->alignment;
 	pgprot_t pgprot = range->pgprot;
+	void *p;
+
+	if (kasan)
+		vm_flags |= VM_DEFER_KMEMLEAK;
+
+	p = __vmalloc_node_range(size, align, start, end, gfp_flags,
+				 pgprot, vm_flags, NUMA_NO_NODE,
+				 __builtin_return_address(0));
+	if (!p && range->fallback_start) {
+		start = range->fallback_start;
+		end = range->fallback_end;
+		p = __vmalloc_node_range(size, align, start, end, gfp_flags,
+					 pgprot, vm_flags, NUMA_NO_NODE,
+					 __builtin_return_address(0));
+	}
+
+	if (!p) {
+		pr_warn_ratelimited("execmem: unable to allocate memory\n");
+		return NULL;
+	}
+
+	if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) {
+		vfree(p);
+		return NULL;
+	}
 
-	return __vmalloc_node_range(size, align, start, end,
-				    GFP_KERNEL, pgprot, VM_FLUSH_RESET_PERMS,
-				    NUMA_NO_NODE, __builtin_return_address(0));
+	return kasan_reset_tag(p);
 }
 
 void *execmem_alloc(enum execmem_type type, size_t size)
 {
-	struct execmem_range *range;
-
-	if (!execmem_info)
-		return module_alloc(size);
-
-	range = &execmem_info->ranges[type];
+	struct execmem_range *range = &execmem_info->ranges[type];
 
 	return __execmem_alloc(range, size);
 }
@@ -67,10 +89,16 @@ static void execmem_init_missing(struct execmem_info *info)
 		struct execmem_range *r = &info->ranges[i];
 
 		if (!r->start) {
-			r->pgprot = default_range->pgprot;
+			if (i == EXECMEM_MODULE_DATA)
+				r->pgprot = PAGE_KERNEL;
+			else
+				r->pgprot = default_range->pgprot;
 			r->alignment = default_range->alignment;
 			r->start = default_range->start;
 			r->end = default_range->end;
+			r->flags = default_range->flags;
+			r->fallback_start = default_range->fallback_start;
+			r->fallback_end = default_range->fallback_end;
 		}
 	}
 }
@@ -80,14 +108,36 @@ struct execmem_info * __weak execmem_arch_setup(void)
 	return NULL;
 }
 
-void __init execmem_init(void)
+static void __init __execmem_init(void)
 {
 	struct execmem_info *info = execmem_arch_setup();
 
-	if (!info || !execmem_validate(info))
+	if (!info) {
+		info = execmem_info = &default_execmem_info;
+		info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START;
+		info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END;
+		info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC;
+		info->ranges[EXECMEM_DEFAULT].alignment = 1;
+	}
+
+	if (!execmem_validate(info))
 		return;
 
 	execmem_init_missing(info);
 
 	execmem_info = info;
 }
+
+#ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE
+static int __init execmem_late_init(void)
+{
+	__execmem_init();
+	return 0;
+}
+core_initcall(execmem_late_init);
+#else
+void __init execmem_init(void)
+{
+	__execmem_init();
+}
+#endif
-- 
cgit v1.2.3


From 7582b7be16d0ba90e3dbd9575a730cabd9eb852a Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (IBM)" <rppt@kernel.org>
Date: Sun, 5 May 2024 19:06:27 +0300
Subject: kprobes: remove dependency on CONFIG_MODULES

kprobes depended on CONFIG_MODULES because it has to allocate memory for
code.

Since code allocations are now implemented with execmem, kprobes can be
enabled in non-modular kernels.

Add #ifdef CONFIG_MODULE guards for the code dealing with kprobes inside
modules, make CONFIG_KPROBES select CONFIG_EXECMEM and drop the
dependency of CONFIG_KPROBES on CONFIG_MODULES.

Signed-off-by: Mike Rapoport (IBM) <rppt@kernel.org>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
[mcgrof: rebase in light of NEED_TASKS_RCU ]
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 arch/Kconfig                |  2 +-
 include/linux/module.h      |  9 ++++++++
 kernel/kprobes.c            | 55 ++++++++++++++++++++++++++++-----------------
 kernel/trace/trace_kprobe.c | 20 ++++++++++++++++-
 4 files changed, 63 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/arch/Kconfig b/arch/Kconfig
index ee1ccbde50ef..b34946d90e4b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -60,9 +60,9 @@ config GENERIC_ENTRY
 
 config KPROBES
 	bool "Kprobes"
-	depends on MODULES
 	depends on HAVE_KPROBES
 	select KALLSYMS
+	select EXECMEM
 	select NEED_TASKS_RCU
 	help
 	  Kprobes allows you to trap at almost any kernel address and
diff --git a/include/linux/module.h b/include/linux/module.h
index 1153b0d99a80..ffa1c603163c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -605,6 +605,11 @@ static inline bool module_is_live(struct module *mod)
 	return mod->state != MODULE_STATE_GOING;
 }
 
+static inline bool module_is_coming(struct module *mod)
+{
+        return mod->state == MODULE_STATE_COMING;
+}
+
 struct module *__module_text_address(unsigned long addr);
 struct module *__module_address(unsigned long addr);
 bool is_module_address(unsigned long addr);
@@ -857,6 +862,10 @@ void *dereference_module_function_descriptor(struct module *mod, void *ptr)
 	return ptr;
 }
 
+static inline bool module_is_coming(struct module *mod)
+{
+	return false;
+}
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_SYSFS
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ddd7cdc16edf..ca2c6cbd42d2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,7 +1588,7 @@ static int check_kprobe_address_safe(struct kprobe *p,
 	}
 
 	/* Get module refcount and reject __init functions for loaded modules. */
-	if (*probed_mod) {
+	if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) {
 		/*
 		 * We must hold a refcount of the probed module while updating
 		 * its code to prohibit unexpected unloading.
@@ -1603,12 +1603,13 @@ static int check_kprobe_address_safe(struct kprobe *p,
 		 * kprobes in there.
 		 */
 		if (within_module_init((unsigned long)p->addr, *probed_mod) &&
-		    (*probed_mod)->state != MODULE_STATE_COMING) {
+		    !module_is_coming(*probed_mod)) {
 			module_put(*probed_mod);
 			*probed_mod = NULL;
 			ret = -ENOENT;
 		}
 	}
+
 out:
 	preempt_enable();
 	jump_label_unlock();
@@ -2488,24 +2489,6 @@ int kprobe_add_area_blacklist(unsigned long start, unsigned long end)
 	return 0;
 }
 
-/* Remove all symbols in given area from kprobe blacklist */
-static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
-{
-	struct kprobe_blacklist_entry *ent, *n;
-
-	list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
-		if (ent->start_addr < start || ent->start_addr >= end)
-			continue;
-		list_del(&ent->list);
-		kfree(ent);
-	}
-}
-
-static void kprobe_remove_ksym_blacklist(unsigned long entry)
-{
-	kprobe_remove_area_blacklist(entry, entry + 1);
-}
-
 int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 				   char *type, char *sym)
 {
@@ -2570,6 +2553,25 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
 	return ret ? : arch_populate_kprobe_blacklist();
 }
 
+#ifdef CONFIG_MODULES
+/* Remove all symbols in given area from kprobe blacklist */
+static void kprobe_remove_area_blacklist(unsigned long start, unsigned long end)
+{
+	struct kprobe_blacklist_entry *ent, *n;
+
+	list_for_each_entry_safe(ent, n, &kprobe_blacklist, list) {
+		if (ent->start_addr < start || ent->start_addr >= end)
+			continue;
+		list_del(&ent->list);
+		kfree(ent);
+	}
+}
+
+static void kprobe_remove_ksym_blacklist(unsigned long entry)
+{
+	kprobe_remove_area_blacklist(entry, entry + 1);
+}
+
 static void add_module_kprobe_blacklist(struct module *mod)
 {
 	unsigned long start, end;
@@ -2672,6 +2674,17 @@ static struct notifier_block kprobe_module_nb = {
 	.priority = 0
 };
 
+static int kprobe_register_module_notifier(void)
+{
+	return register_module_notifier(&kprobe_module_nb);
+}
+#else
+static int kprobe_register_module_notifier(void)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
 void kprobe_free_init_mem(void)
 {
 	void *start = (void *)(&__init_begin);
@@ -2731,7 +2744,7 @@ static int __init init_kprobes(void)
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
 	if (!err)
-		err = register_module_notifier(&kprobe_module_nb);
+		err = kprobe_register_module_notifier();
 
 	kprobes_initialized = (err == 0);
 	kprobe_sysctls_init();
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 14099cc17fc9..2cb2a3951b4f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -111,6 +111,7 @@ static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
 	return strncmp(module_name(mod), name, len) == 0 && name[len] == ':';
 }
 
+#ifdef CONFIG_MODULES
 static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
 {
 	char *p;
@@ -129,6 +130,12 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
 
 	return ret;
 }
+#else
+static inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+{
+	return false;
+}
+#endif
 
 static bool trace_kprobe_is_busy(struct dyn_event *ev)
 {
@@ -670,6 +677,7 @@ end:
 	return ret;
 }
 
+#ifdef CONFIG_MODULES
 /* Module notifier call back, checking event on the module */
 static int trace_kprobe_module_callback(struct notifier_block *nb,
 				       unsigned long val, void *data)
@@ -704,6 +712,16 @@ static struct notifier_block trace_kprobe_module_nb = {
 	.notifier_call = trace_kprobe_module_callback,
 	.priority = 1	/* Invoked after kprobe module callback */
 };
+static int trace_kprobe_register_module_notifier(void)
+{
+	return register_module_notifier(&trace_kprobe_module_nb);
+}
+#else
+static int trace_kprobe_register_module_notifier(void)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
 
 static int count_symbols(void *data, unsigned long unused)
 {
@@ -1933,7 +1951,7 @@ static __init int init_kprobe_trace_early(void)
 	if (ret)
 		return ret;
 
-	if (register_module_notifier(&trace_kprobe_module_nb))
+	if (trace_kprobe_register_module_notifier())
 		return -EINVAL;
 
 	return 0;
-- 
cgit v1.2.3


From b444dfc84d6c19fa385be6d89d62b77184f81ea4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Fri, 3 May 2024 20:38:43 +0300
Subject: devm-helpers: Fix a misspelled cancellation in the comments

Fix a misspelled cancellation in the comments.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Link: https://lore.kernel.org/r/20240503173843.2922111-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
---
 include/linux/devm-helpers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/devm-helpers.h b/include/linux/devm-helpers.h
index 74891802200d..708ca9131402 100644
--- a/include/linux/devm-helpers.h
+++ b/include/linux/devm-helpers.h
@@ -41,7 +41,7 @@ static inline void devm_delayed_work_drop(void *res)
  * detached. A few drivers need delayed work which must be cancelled before
  * driver is detached to avoid accessing removed resources.
  * devm_delayed_work_autocancel() can be used to omit the explicit
- * cancelleation when driver is detached.
+ * cancellation when driver is detached.
  */
 static inline int devm_delayed_work_autocancel(struct device *dev,
 					       struct delayed_work *w,
@@ -66,7 +66,7 @@ static inline void devm_work_drop(void *res)
  * A few drivers need to queue work which must be cancelled before driver
  * is detached to avoid accessing removed resources.
  * devm_work_autocancel() can be used to omit the explicit
- * cancelleation when driver is detached.
+ * cancellation when driver is detached.
  */
 static inline int devm_work_autocancel(struct device *dev,
 				       struct work_struct *w,
-- 
cgit v1.2.3


From 08190cc4d8a62f2a07b4158751afd3a01638c4c5 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bvanassche@acm.org>
Date: Fri, 10 May 2024 13:23:09 -0700
Subject: nbd: Use NULL to represent a pointer

This patch fixes the following sparse warnings:

drivers/block/nbd.c: note: in included file (through include/trace/trace_events.h, include/trace/define_trace.h, include/trace/events/nbd.h):
./include/trace/events/nbd.h:61:1: warning: Using plain integer as NULL pointer
drivers/block/nbd.c: note: in included file (through include/trace/perf.h, include/trace/define_trace.h, include/trace/events/nbd.h):
./include/trace/events/nbd.h:61:1: warning: Using plain integer as NULL pointer

Cc: Christoph Hellwig <hch@lst.de>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Yu Kuai <yukuai3@huawei.com>
Cc: Markus Pargmann <mpa@pengutronix.de>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Link: https://lore.kernel.org/r/20240510202313.25209-2-bvanassche@acm.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/nbd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h
index 9849956f34d8..390d98a05c9d 100644
--- a/include/trace/events/nbd.h
+++ b/include/trace/events/nbd.h
@@ -72,7 +72,7 @@ DECLARE_EVENT_CLASS(nbd_send_request,
 	),
 
 	TP_fast_assign(
-		__entry->nbd_request = 0;
+		__entry->nbd_request = NULL;
 		__entry->dev_index = index;
 		__entry->request = rq;
 	),
-- 
cgit v1.2.3


From 7c2cc5b1db509fb6e4a20c182cfbaf94255e4a7c Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Thu, 28 Mar 2024 15:46:01 -0400
Subject: Bluetooth: Add proper definitions for scan interval and window

This adds proper definitions for scan interval and window and then make
use of them instead their values.

Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h | 18 ++++++++++++++++--
 net/bluetooth/hci_core.c         | 16 ++++++++--------
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index e8f581f3f3ce..9fc1eccad9bb 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -2235,8 +2235,22 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c);
 /* These LE scan and inquiry parameters were chosen according to LE General
  * Discovery Procedure specification.
  */
-#define DISCOV_LE_SCAN_WIN		0x12
-#define DISCOV_LE_SCAN_INT		0x12
+#define DISCOV_LE_SCAN_WIN		0x0012 /* 11.25 msec */
+#define DISCOV_LE_SCAN_INT		0x0012 /* 11.25 msec */
+#define DISCOV_LE_SCAN_INT_FAST		0x0060 /* 60 msec */
+#define DISCOV_LE_SCAN_WIN_FAST		0x0030 /* 30 msec */
+#define DISCOV_LE_SCAN_INT_CONN		0x0060 /* 60 msec */
+#define DISCOV_LE_SCAN_WIN_CONN		0x0060 /* 60 msec */
+#define DISCOV_LE_SCAN_INT_SLOW1	0x0800 /* 1.28 sec */
+#define DISCOV_LE_SCAN_WIN_SLOW1	0x0012 /* 11.25 msec */
+#define DISCOV_LE_SCAN_INT_SLOW2	0x1000 /* 2.56 sec */
+#define DISCOV_LE_SCAN_WIN_SLOW2	0x0024 /* 22.5 msec */
+#define DISCOV_CODED_SCAN_INT_FAST	0x0120 /* 180 msec */
+#define DISCOV_CODED_SCAN_WIN_FAST	0x0090 /* 90 msec */
+#define DISCOV_CODED_SCAN_INT_SLOW1	0x1800 /* 3.84 sec */
+#define DISCOV_CODED_SCAN_WIN_SLOW1	0x0036 /* 33.75 msec */
+#define DISCOV_CODED_SCAN_INT_SLOW2	0x3000 /* 7.68 sec */
+#define DISCOV_CODED_SCAN_WIN_SLOW2	0x006c /* 67.5 msec */
 #define DISCOV_LE_TIMEOUT		10240	/* msec */
 #define DISCOV_INTERLEAVED_TIMEOUT	5120	/* msec */
 #define DISCOV_INTERLEAVED_INQUIRY_LEN	0x04
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index bc5086423ab8..d35015219c26 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2523,16 +2523,16 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	hdev->le_adv_channel_map = 0x07;
 	hdev->le_adv_min_interval = 0x0800;
 	hdev->le_adv_max_interval = 0x0800;
-	hdev->le_scan_interval = 0x0060;
-	hdev->le_scan_window = 0x0030;
-	hdev->le_scan_int_suspend = 0x0400;
-	hdev->le_scan_window_suspend = 0x0012;
+	hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST;
+	hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST;
+	hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1;
+	hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1;
 	hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT;
 	hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN;
-	hdev->le_scan_int_adv_monitor = 0x0060;
-	hdev->le_scan_window_adv_monitor = 0x0030;
-	hdev->le_scan_int_connect = 0x0060;
-	hdev->le_scan_window_connect = 0x0060;
+	hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST;
+	hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST;
+	hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN;
+	hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN;
 	hdev->le_conn_min_interval = 0x0018;
 	hdev->le_conn_max_interval = 0x0028;
 	hdev->le_conn_latency = 0x0000;
-- 
cgit v1.2.3


From 311527e9dafdcae0c5a20d62f4f84ad01b33b5f4 Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Tue, 2 Apr 2024 14:39:30 +0300
Subject: Bluetooth: ISO: Make iso_get_sock_listen generic

This makes iso_get_sock_listen more generic, to return matching socket
in the state provided as argument.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/bluetooth.h |  2 +-
 net/bluetooth/iso.c               | 75 ++++++++++++++++++++++-----------------
 2 files changed, 43 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index eaec5d6caa29..b3228bd6cd6b 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -285,7 +285,7 @@ void bt_err_ratelimited(const char *fmt, ...);
 	bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__)
 
 /* Connection and socket states */
-enum {
+enum bt_sock_state {
 	BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */
 	BT_OPEN,
 	BT_BOUND,
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index ef0cc80b4c0c..89dee805c60d 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -85,8 +85,9 @@ static void iso_sock_disconn(struct sock *sk);
 
 typedef bool (*iso_sock_match_t)(struct sock *sk, void *data);
 
-static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst,
-					iso_sock_match_t match, void *data);
+static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst,
+				 enum bt_sock_state state,
+				 iso_sock_match_t match, void *data);
 
 /* ---- ISO timers ---- */
 #define ISO_CONN_TIMEOUT	(HZ * 40)
@@ -233,10 +234,11 @@ static void iso_conn_del(struct hci_conn *hcon, int err)
 		 * terminated are not processed anymore.
 		 */
 		if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
-			parent = iso_get_sock_listen(&hcon->src,
-						     &hcon->dst,
-						     iso_match_conn_sync_handle,
-						     hcon);
+			parent = iso_get_sock(&hcon->src,
+					      &hcon->dst,
+					      BT_LISTEN,
+					      iso_match_conn_sync_handle,
+					      hcon);
 
 			if (parent) {
 				set_bit(BT_SK_PA_SYNC_TERM,
@@ -581,22 +583,23 @@ static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc,
 	return NULL;
 }
 
-/* Find socket listening:
+/* Find socket in given state:
  * source bdaddr (Unicast)
  * destination bdaddr (Broadcast only)
  * match func - pass NULL to ignore
  * match func data - pass -1 to ignore
  * Returns closest match.
  */
-static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst,
-					iso_sock_match_t match, void *data)
+static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst,
+				 enum bt_sock_state state,
+				 iso_sock_match_t match, void *data)
 {
 	struct sock *sk = NULL, *sk1 = NULL;
 
 	read_lock(&iso_sk_list.lock);
 
 	sk_for_each(sk, &iso_sk_list.head) {
-		if (sk->sk_state != BT_LISTEN)
+		if (sk->sk_state != state)
 			continue;
 
 		/* Match Broadcast destination */
@@ -1777,32 +1780,37 @@ static void iso_conn_ready(struct iso_conn *conn)
 						 HCI_EVT_LE_BIG_SYNC_ESTABILISHED);
 
 			/* Get reference to PA sync parent socket, if it exists */
-			parent = iso_get_sock_listen(&hcon->src,
-						     &hcon->dst,
-						     iso_match_pa_sync_flag, NULL);
+			parent = iso_get_sock(&hcon->src, &hcon->dst,
+					      BT_LISTEN,
+					      iso_match_pa_sync_flag,
+					      NULL);
 			if (!parent && ev)
-				parent = iso_get_sock_listen(&hcon->src,
-							     &hcon->dst,
-							     iso_match_big, ev);
+				parent = iso_get_sock(&hcon->src,
+						      &hcon->dst,
+						      BT_LISTEN,
+						      iso_match_big, ev);
 		} else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
 			ev2 = hci_recv_event_data(hcon->hdev,
 						  HCI_EV_LE_PA_SYNC_ESTABLISHED);
 			if (ev2)
-				parent = iso_get_sock_listen(&hcon->src,
-							     &hcon->dst,
-							     iso_match_sid, ev2);
+				parent = iso_get_sock(&hcon->src,
+						      &hcon->dst,
+						      BT_LISTEN,
+						      iso_match_sid, ev2);
 		} else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
 			ev3 = hci_recv_event_data(hcon->hdev,
 						  HCI_EVT_LE_BIG_INFO_ADV_REPORT);
 			if (ev3)
-				parent = iso_get_sock_listen(&hcon->src,
-							     &hcon->dst,
-							     iso_match_sync_handle, ev3);
+				parent = iso_get_sock(&hcon->src,
+						      &hcon->dst,
+						      BT_LISTEN,
+						      iso_match_sync_handle,
+						      ev3);
 		}
 
 		if (!parent)
-			parent = iso_get_sock_listen(&hcon->src,
-							BDADDR_ANY, NULL, NULL);
+			parent = iso_get_sock(&hcon->src, BDADDR_ANY,
+					      BT_LISTEN, NULL, NULL);
 
 		if (!parent)
 			return;
@@ -1923,8 +1931,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 	 */
 	ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED);
 	if (ev1) {
-		sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sid,
-					 ev1);
+		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_sid, ev1);
 		if (sk && !ev1->status)
 			iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
 
@@ -1934,12 +1942,12 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 	ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT);
 	if (ev2) {
 		/* Try to get PA sync listening socket, if it exists */
-		sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr,
-						iso_match_pa_sync_flag, NULL);
+		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_pa_sync_flag, NULL);
 
 		if (!sk) {
-			sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr,
-						 iso_match_sync_handle, ev2);
+			sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
+					  iso_match_sync_handle, ev2);
 
 			/* If PA Sync is in process of terminating,
 			 * do not handle any more BIGInfo adv reports.
@@ -1979,8 +1987,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 		u8 *base;
 		struct hci_conn *hcon;
 
-		sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr,
-					 iso_match_sync_handle_pa_report, ev3);
+		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_sync_handle_pa_report, ev3);
 		if (!sk)
 			goto done;
 
@@ -2029,7 +2037,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 			hcon->le_per_adv_data_len = 0;
 		}
 	} else {
-		sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL);
+		sk = iso_get_sock(&hdev->bdaddr, BDADDR_ANY,
+				  BT_LISTEN, NULL, NULL);
 	}
 
 done:
-- 
cgit v1.2.3


From d356c924e7a3adbea1e3e4ff4e098bcd9b99a82d Mon Sep 17 00:00:00 2001
From: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Date: Tue, 2 Apr 2024 14:39:31 +0300
Subject: Bluetooth: ISO: Handle PA sync when no BIGInfo reports are generated

In case of a Broadcast Source that has PA enabled but no active BIG,
a Broadcast Sink needs to establish PA sync and parse BASE from PA
reports.

This commit moves the allocation of a PA sync hcon from the BIGInfo
advertising report event to the PA sync established event. After the
first complete PA report, the hcon is notified to the ISO layer. A
child socket is allocated and enqueued in the parent's accept queue.

BIGInfo reports also need to be processed, to extract the encryption
field and inform userspace. After the first BIGInfo report is received,
the PA sync hcon is notified again to the ISO layer. Since a socket will
be found this time, the socket state will transition to BT_CONNECTED and
the userspace will be woken up using sk_state_change.

Signed-off-by: Iulia Tanasescu <iulia.tanasescu@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  3 +-
 net/bluetooth/hci_event.c        | 58 ++++++++++++++-----------
 net/bluetooth/iso.c              | 94 +++++++++++++++++-----------------------
 3 files changed, 74 insertions(+), 81 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 9fc1eccad9bb..2283d12ee358 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -1373,8 +1373,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(c, &h->list, list) {
-		if (c->type != ISO_LINK ||
-			!test_bit(HCI_CONN_PA_SYNC, &c->flags))
+		if (c->type != ISO_LINK)
 			continue;
 
 		if (c->sync_handle == sync_handle) {
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 71fedb2d4b48..d228321605ae 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1,7 +1,7 @@
 /*
    BlueZ - Bluetooth protocol stack for Linux
    Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
-   Copyright 2023 NXP
+   Copyright 2023-2024 NXP
 
    Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
 
@@ -6492,14 +6492,16 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
 	if (!(flags & HCI_PROTO_DEFER))
 		goto unlock;
 
-	if (ev->status) {
-		/* Add connection to indicate the failed PA sync event */
-		pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
-					     HCI_ROLE_SLAVE);
+	/* Add connection to indicate PA sync event */
+	pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
+				     HCI_ROLE_SLAVE);
 
-		if (!pa_sync)
-			goto unlock;
+	if (!pa_sync)
+		goto unlock;
+
+	pa_sync->sync_handle = le16_to_cpu(ev->handle);
 
+	if (ev->status) {
 		set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
 
 		/* Notify iso layer */
@@ -6516,6 +6518,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data,
 	struct hci_ev_le_per_adv_report *ev = data;
 	int mask = hdev->link_mode;
 	__u8 flags = 0;
+	struct hci_conn *pa_sync;
 
 	bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
 
@@ -6523,8 +6526,28 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data,
 
 	mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags);
 	if (!(mask & HCI_LM_ACCEPT))
-		hci_le_pa_term_sync(hdev, ev->sync_handle);
+		goto unlock;
 
+	if (!(flags & HCI_PROTO_DEFER))
+		goto unlock;
+
+	pa_sync = hci_conn_hash_lookup_pa_sync_handle
+			(hdev,
+			le16_to_cpu(ev->sync_handle));
+
+	if (!pa_sync)
+		goto unlock;
+
+	if (ev->data_status == LE_PA_DATA_COMPLETE &&
+	    !test_and_set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags)) {
+		/* Notify iso layer */
+		hci_connect_cfm(pa_sync, 0);
+
+		/* Notify MGMT layer */
+		mgmt_device_connected(hdev, pa_sync, NULL, 0);
+	}
+
+unlock:
 	hci_dev_unlock(hdev);
 }
 
@@ -7059,10 +7082,8 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
 	hci_dev_lock(hdev);
 
 	mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags);
-	if (!(mask & HCI_LM_ACCEPT)) {
-		hci_le_pa_term_sync(hdev, ev->sync_handle);
+	if (!(mask & HCI_LM_ACCEPT))
 		goto unlock;
-	}
 
 	if (!(flags & HCI_PROTO_DEFER))
 		goto unlock;
@@ -7071,24 +7092,11 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
 			(hdev,
 			le16_to_cpu(ev->sync_handle));
 
-	if (pa_sync)
-		goto unlock;
-
-	/* Add connection to indicate the PA sync event */
-	pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
-				     HCI_ROLE_SLAVE);
-
 	if (!pa_sync)
 		goto unlock;
 
-	pa_sync->sync_handle = le16_to_cpu(ev->sync_handle);
-	set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags);
-
 	/* Notify iso layer */
-	hci_connect_cfm(pa_sync, 0x00);
-
-	/* Notify MGMT layer */
-	mgmt_device_connected(hdev, pa_sync, NULL, 0);
+	hci_connect_cfm(pa_sync, 0);
 
 unlock:
 	hci_dev_unlock(hdev);
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 89dee805c60d..d0ef7290e5b7 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -54,7 +54,6 @@ static void iso_sock_kill(struct sock *sk);
 enum {
 	BT_SK_BIG_SYNC,
 	BT_SK_PA_SYNC,
-	BT_SK_PA_SYNC_TERM,
 };
 
 struct iso_pinfo {
@@ -81,6 +80,7 @@ static bool check_ucast_qos(struct bt_iso_qos *qos);
 static bool check_bcast_qos(struct bt_iso_qos *qos);
 static bool iso_match_sid(struct sock *sk, void *data);
 static bool iso_match_sync_handle(struct sock *sk, void *data);
+static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data);
 static void iso_sock_disconn(struct sock *sk);
 
 typedef bool (*iso_sock_match_t)(struct sock *sk, void *data);
@@ -197,21 +197,10 @@ static void iso_chan_del(struct sock *sk, int err)
 	sock_set_flag(sk, SOCK_ZAPPED);
 }
 
-static bool iso_match_conn_sync_handle(struct sock *sk, void *data)
-{
-	struct hci_conn *hcon = data;
-
-	if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags))
-		return false;
-
-	return hcon->sync_handle == iso_pi(sk)->sync_handle;
-}
-
 static void iso_conn_del(struct hci_conn *hcon, int err)
 {
 	struct iso_conn *conn = hcon->iso_data;
 	struct sock *sk;
-	struct sock *parent;
 
 	if (!conn)
 		return;
@@ -227,26 +216,6 @@ static void iso_conn_del(struct hci_conn *hcon, int err)
 
 	if (sk) {
 		lock_sock(sk);
-
-		/* While a PA sync hcon is in the process of closing,
-		 * mark parent socket with a flag, so that any residual
-		 * BIGInfo adv reports that arrive before PA sync is
-		 * terminated are not processed anymore.
-		 */
-		if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
-			parent = iso_get_sock(&hcon->src,
-					      &hcon->dst,
-					      BT_LISTEN,
-					      iso_match_conn_sync_handle,
-					      hcon);
-
-			if (parent) {
-				set_bit(BT_SK_PA_SYNC_TERM,
-					&iso_pi(parent)->flags);
-				sock_put(parent);
-			}
-		}
-
 		iso_sock_clear_timer(sk);
 		iso_chan_del(sk, err);
 		release_sock(sk);
@@ -860,6 +829,7 @@ static struct sock *iso_sock_alloc(struct net *net, struct socket *sock,
 	iso_pi(sk)->src_type = BDADDR_LE_PUBLIC;
 
 	iso_pi(sk)->qos = default_qos;
+	iso_pi(sk)->sync_handle = -1;
 
 	bt_sock_link(&iso_sk_list, sk);
 	return sk;
@@ -907,7 +877,6 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr,
 		return -EINVAL;
 
 	iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type;
-	iso_pi(sk)->sync_handle = -1;
 
 	if (sa->iso_bc->bc_sid > 0x0f)
 		return -EINVAL;
@@ -984,7 +953,8 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr *addr,
 	/* Allow the user to bind a PA sync socket to a number
 	 * of BISes to sync to.
 	 */
-	if (sk->sk_state == BT_CONNECT2 &&
+	if ((sk->sk_state == BT_CONNECT2 ||
+	     sk->sk_state == BT_CONNECTED) &&
 	    test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
 		err = iso_sock_bind_pa_sk(sk, sa, addr_len);
 		goto done;
@@ -1396,6 +1366,16 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
 			}
 			release_sock(sk);
 			return 0;
+		case BT_CONNECTED:
+			if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+				iso_conn_big_sync(sk);
+				sk->sk_state = BT_LISTEN;
+				release_sock(sk);
+				return 0;
+			}
+
+			release_sock(sk);
+			break;
 		case BT_CONNECT:
 			release_sock(sk);
 			return iso_connect_cis(sk);
@@ -1541,7 +1521,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
 
 	case BT_ISO_QOS:
 		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
-		    sk->sk_state != BT_CONNECT2) {
+		    sk->sk_state != BT_CONNECT2 &&
+		    (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) ||
+		    sk->sk_state != BT_CONNECTED)) {
 			err = -EINVAL;
 			break;
 		}
@@ -1762,7 +1744,7 @@ static void iso_conn_ready(struct iso_conn *conn)
 	struct sock *sk = conn->sk;
 	struct hci_ev_le_big_sync_estabilished *ev = NULL;
 	struct hci_ev_le_pa_sync_established *ev2 = NULL;
-	struct hci_evt_le_big_info_adv_report *ev3 = NULL;
+	struct hci_ev_le_per_adv_report *ev3 = NULL;
 	struct hci_conn *hcon;
 
 	BT_DBG("conn %p", conn);
@@ -1799,12 +1781,12 @@ static void iso_conn_ready(struct iso_conn *conn)
 						      iso_match_sid, ev2);
 		} else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
 			ev3 = hci_recv_event_data(hcon->hdev,
-						  HCI_EVT_LE_BIG_INFO_ADV_REPORT);
+						  HCI_EV_LE_PER_ADV_REPORT);
 			if (ev3)
 				parent = iso_get_sock(&hcon->src,
 						      &hcon->dst,
 						      BT_LISTEN,
-						      iso_match_sync_handle,
+						      iso_match_sync_handle_pa_report,
 						      ev3);
 		}
 
@@ -1847,7 +1829,6 @@ static void iso_conn_ready(struct iso_conn *conn)
 
 		if (ev3) {
 			iso_pi(sk)->qos = iso_pi(parent)->qos;
-			iso_pi(sk)->qos.bcast.encryption = ev3->encryption;
 			hcon->iso_qos = iso_pi(sk)->qos;
 			iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis;
 			memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS);
@@ -1941,26 +1922,29 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 
 	ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT);
 	if (ev2) {
-		/* Try to get PA sync listening socket, if it exists */
-		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
-				  iso_match_pa_sync_flag, NULL);
-
-		if (!sk) {
-			sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN,
-					  iso_match_sync_handle, ev2);
-
-			/* If PA Sync is in process of terminating,
-			 * do not handle any more BIGInfo adv reports.
-			 */
-
-			if (sk && test_bit(BT_SK_PA_SYNC_TERM,
-					   &iso_pi(sk)->flags))
-				return 0;
+		/* Check if BIGInfo report has already been handled */
+		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECTED,
+				  iso_match_sync_handle, ev2);
+		if (sk) {
+			sock_put(sk);
+			sk = NULL;
+			goto done;
 		}
 
+		/* Try to get PA sync socket, if it exists */
+		sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECT2,
+				  iso_match_sync_handle, ev2);
+		if (!sk)
+			sk = iso_get_sock(&hdev->bdaddr, bdaddr,
+					  BT_LISTEN,
+					  iso_match_sync_handle,
+					  ev2);
+
 		if (sk) {
 			int err;
 
+			iso_pi(sk)->qos.bcast.encryption = ev2->encryption;
+
 			if (ev2->num_bis < iso_pi(sk)->bc_num_bis)
 				iso_pi(sk)->bc_num_bis = ev2->num_bis;
 
@@ -1979,6 +1963,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
 				}
 			}
 		}
+
+		goto done;
 	}
 
 	ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT);
-- 
cgit v1.2.3


From 1c08108f3014881ad5f4c35a2abaf9c65475035d Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 27 Mar 2024 10:23:51 -0600
Subject: Bluetooth: L2CAP: Avoid -Wflex-array-member-not-at-end warnings

-Wflex-array-member-not-at-end is coming in GCC-14, and we are getting
ready to enable it globally.

There are currently a couple of objects (`req` and `rsp`), in a couple
of structures, that contain flexible structures (`struct l2cap_ecred_conn_req`
and `struct l2cap_ecred_conn_rsp`), for example:

struct l2cap_ecred_rsp_data {
        struct {
                struct l2cap_ecred_conn_rsp rsp;
                __le16 scid[L2CAP_ECRED_MAX_CID];
        } __packed pdu;
        int count;
};

in the struct above, `struct l2cap_ecred_conn_rsp` is a flexible
structure:

struct l2cap_ecred_conn_rsp {
        __le16 mtu;
        __le16 mps;
        __le16 credits;
        __le16 result;
        __le16 dcid[];
};

So, in order to avoid ending up with a flexible-array member in the
middle of another structure, we use the `struct_group_tagged()` (and
`__struct_group()` when the flexible structure is `__packed`) helper
to separate the flexible array from the rest of the members in the
flexible structure:

struct l2cap_ecred_conn_rsp {
        struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr,

	... the rest of members

        );
        __le16 dcid[];
};

With the change described above, we now declare objects of the type of
the tagged struct, in this example `struct l2cap_ecred_conn_rsp_hdr`,
without embedding flexible arrays in the middle of other structures:

struct l2cap_ecred_rsp_data {
        struct {
                struct l2cap_ecred_conn_rsp_hdr rsp;
                __le16 scid[L2CAP_ECRED_MAX_CID];
        } __packed pdu;
        int count;
};

Also, when the flexible-array member needs to be accessed, we use
`container_of()` to retrieve a pointer to the flexible structure.

We also use the `DEFINE_RAW_FLEX()` helper for a couple of on-stack
definitions of a flexible structure where the size of the flexible-array
member is known at compile-time.

So, with these changes, fix the following warnings:
net/bluetooth/l2cap_core.c:1260:45: warning: structure containing a
flexible array member is not at the end of another structure
[-Wflex-array-member-not-at-end]
net/bluetooth/l2cap_core.c:3740:45: warning: structure containing a
flexible array member is not at the end of another structure
[-Wflex-array-member-not-at-end]
net/bluetooth/l2cap_core.c:4999:45: warning: structure containing a
flexible array member is not at the end of another structure
[-Wflex-array-member-not-at-end]
net/bluetooth/l2cap_core.c:7116:47: warning: structure containing a
flexible array member is not at the end of another structure
[-Wflex-array-member-not-at-end]

Link: https://github.com/KSPP/linux/issues/202
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/l2cap.h | 22 +++++++++++++--------
 net/bluetooth/l2cap_core.c    | 46 ++++++++++++++++++++-----------------------
 2 files changed, 35 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index a4278aa618ab..7d4a3e766e7b 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -463,18 +463,24 @@ struct l2cap_le_credits {
 #define L2CAP_ECRED_MAX_CID		5
 
 struct l2cap_ecred_conn_req {
-	__le16 psm;
-	__le16 mtu;
-	__le16 mps;
-	__le16 credits;
+	/* New members must be added within the struct_group() macro below. */
+	__struct_group(l2cap_ecred_conn_req_hdr, hdr, __packed,
+		__le16 psm;
+		__le16 mtu;
+		__le16 mps;
+		__le16 credits;
+	);
 	__le16 scid[];
 } __packed;
 
 struct l2cap_ecred_conn_rsp {
-	__le16 mtu;
-	__le16 mps;
-	__le16 credits;
-	__le16 result;
+	/* New members must be added within the struct_group() macro below. */
+	struct_group_tagged(l2cap_ecred_conn_rsp_hdr, hdr,
+		__le16 mtu;
+		__le16 mps;
+		__le16 credits;
+		__le16 result;
+	);
 	__le16 dcid[];
 };
 
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 9223b1a698e3..fa714f5bd0b3 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1260,7 +1260,7 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
 
 struct l2cap_ecred_conn_data {
 	struct {
-		struct l2cap_ecred_conn_req req;
+		struct l2cap_ecred_conn_req_hdr req;
 		__le16 scid[5];
 	} __packed pdu;
 	struct l2cap_chan *chan;
@@ -3740,7 +3740,7 @@ static void l2cap_ecred_list_defer(struct l2cap_chan *chan, void *data)
 
 struct l2cap_ecred_rsp_data {
 	struct {
-		struct l2cap_ecred_conn_rsp rsp;
+		struct l2cap_ecred_conn_rsp_hdr rsp;
 		__le16 scid[L2CAP_ECRED_MAX_CID];
 	} __packed pdu;
 	int count;
@@ -3749,6 +3749,8 @@ struct l2cap_ecred_rsp_data {
 static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data)
 {
 	struct l2cap_ecred_rsp_data *rsp = data;
+	struct l2cap_ecred_conn_rsp *rsp_flex =
+		container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr);
 
 	if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
 		return;
@@ -3758,7 +3760,7 @@ static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data)
 
 	/* Include all channels pending with the same ident */
 	if (!rsp->pdu.rsp.result)
-		rsp->pdu.rsp.dcid[rsp->count++] = cpu_to_le16(chan->scid);
+		rsp_flex->dcid[rsp->count++] = cpu_to_le16(chan->scid);
 	else
 		l2cap_chan_del(chan, ECONNRESET);
 }
@@ -4996,10 +4998,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
 				       u8 *data)
 {
 	struct l2cap_ecred_conn_req *req = (void *) data;
-	struct {
-		struct l2cap_ecred_conn_rsp rsp;
-		__le16 dcid[L2CAP_ECRED_MAX_CID];
-	} __packed pdu;
+	DEFINE_RAW_FLEX(struct l2cap_ecred_conn_rsp, pdu, dcid, L2CAP_ECRED_MAX_CID);
 	struct l2cap_chan *chan, *pchan;
 	u16 mtu, mps;
 	__le16 psm;
@@ -5018,7 +5017,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
 	cmd_len -= sizeof(*req);
 	num_scid = cmd_len / sizeof(u16);
 
-	if (num_scid > ARRAY_SIZE(pdu.dcid)) {
+	if (num_scid > L2CAP_ECRED_MAX_CID) {
 		result = L2CAP_CR_LE_INVALID_PARAMS;
 		goto response;
 	}
@@ -5047,7 +5046,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
 
 	BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps);
 
-	memset(&pdu, 0, sizeof(pdu));
+	memset(pdu, 0, sizeof(*pdu));
 
 	/* Check if we have socket listening on psm */
 	pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
@@ -5073,8 +5072,8 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
 
 		BT_DBG("scid[%d] 0x%4.4x", i, scid);
 
-		pdu.dcid[i] = 0x0000;
-		len += sizeof(*pdu.dcid);
+		pdu->dcid[i] = 0x0000;
+		len += sizeof(*pdu->dcid);
 
 		/* Check for valid dynamic CID range */
 		if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
@@ -5108,13 +5107,13 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
 		l2cap_ecred_init(chan, __le16_to_cpu(req->credits));
 
 		/* Init response */
-		if (!pdu.rsp.credits) {
-			pdu.rsp.mtu = cpu_to_le16(chan->imtu);
-			pdu.rsp.mps = cpu_to_le16(chan->mps);
-			pdu.rsp.credits = cpu_to_le16(chan->rx_credits);
+		if (!pdu->credits) {
+			pdu->mtu = cpu_to_le16(chan->imtu);
+			pdu->mps = cpu_to_le16(chan->mps);
+			pdu->credits = cpu_to_le16(chan->rx_credits);
 		}
 
-		pdu.dcid[i] = cpu_to_le16(chan->scid);
+		pdu->dcid[i] = cpu_to_le16(chan->scid);
 
 		__set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
 
@@ -5136,13 +5135,13 @@ unlock:
 	l2cap_chan_put(pchan);
 
 response:
-	pdu.rsp.result = cpu_to_le16(result);
+	pdu->result = cpu_to_le16(result);
 
 	if (defer)
 		return 0;
 
 	l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP,
-		       sizeof(pdu.rsp) + len, &pdu);
+		       sizeof(*pdu) + len, pdu);
 
 	return 0;
 }
@@ -7113,14 +7112,11 @@ EXPORT_SYMBOL_GPL(l2cap_chan_connect);
 static void l2cap_ecred_reconfigure(struct l2cap_chan *chan)
 {
 	struct l2cap_conn *conn = chan->conn;
-	struct {
-		struct l2cap_ecred_reconf_req req;
-		__le16 scid;
-	} pdu;
+	DEFINE_RAW_FLEX(struct l2cap_ecred_reconf_req, pdu, scid, 1);
 
-	pdu.req.mtu = cpu_to_le16(chan->imtu);
-	pdu.req.mps = cpu_to_le16(chan->mps);
-	pdu.scid    = cpu_to_le16(chan->scid);
+	pdu->mtu = cpu_to_le16(chan->imtu);
+	pdu->mps = cpu_to_le16(chan->mps);
+	pdu->scid[0] = cpu_to_le16(chan->scid);
 
 	chan->ident = l2cap_get_ident(conn);
 
-- 
cgit v1.2.3


From c4585edf708edb5277a3cc4b8581ccb833f3307d Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Fri, 26 Apr 2024 16:52:46 -0600
Subject: Bluetooth: hci_conn, hci_sync: Use __counted_by() to avoid -Wfamnae
 warnings

Prepare for the coming implementation by GCC and Clang of the
__counted_by attribute. Flexible array members annotated with
__counted_by can have their accesses bounds-checked at run-time
via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE
(for strcpy/memcpy-family functions).

Also, -Wflex-array-member-not-at-end is coming in GCC-14, and we are
getting ready to enable it globally.

So, use the `DEFINE_FLEX()` helper for multiple on-stack definitions
of a flexible structure where the size of the flexible-array member
is known at compile-time, and refactor the rest of the code,
accordingly.

Notice that, due to the use of `__counted_by()` in `struct
hci_cp_le_create_cis`, the for loop in function `hci_cs_le_create_cis()`
had to be modified. Once the index `i`, through which `cp->cis[i]` is
accessed, falls in the interval [0, cp->num_cis), `cp->num_cis` cannot
be decremented all the way down to zero while accessing `cp->cis[]`:

net/bluetooth/hci_event.c:4310:
4310    for (i = 0; cp->num_cis; cp->num_cis--, i++) {
                ...
4314            handle = __le16_to_cpu(cp->cis[i].cis_handle);

otherwise, only half (one iteration before `cp->num_cis == i`) or half
plus one (one iteration before `cp->num_cis < i`) of the items in the
array will be accessed before running into an out-of-bounds issue. So,
in order to avoid this, set `cp->num_cis` to zero just after the for
loop.

Also, make use of `aux_num_cis` variable to update `cmd->num_cis` after
a `list_for_each_entry_rcu()` loop.

With these changes, fix the following warnings:
net/bluetooth/hci_sync.c:1239:56: warning: structure containing a flexible
array member is not at the end of another structure
[-Wflex-array-member-not-at-end]
net/bluetooth/hci_sync.c:1415:51: warning: structure containing a flexible
array member is not at the end of another structure
[-Wflex-array-member-not-at-end]
net/bluetooth/hci_sync.c:1731:51: warning: structure containing a flexible
array member is not at the end of another structure
[-Wflex-array-member-not-at-end]
net/bluetooth/hci_sync.c:6497:45: warning: structure containing a flexible
array member is not at the end of another structure
[-Wflex-array-member-not-at-end]

Link: https://github.com/KSPP/linux/issues/202
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |  8 ++---
 net/bluetooth/hci_event.c   |  3 +-
 net/bluetooth/hci_sync.c    | 84 ++++++++++++++++++---------------------------
 3 files changed, 40 insertions(+), 55 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 5c12761cbc0e..0b0ac7f06570 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2026,7 +2026,7 @@ struct hci_cp_le_set_ext_adv_data {
 	__u8  operation;
 	__u8  frag_pref;
 	__u8  length;
-	__u8  data[];
+	__u8  data[] __counted_by(length);
 } __packed;
 
 #define HCI_OP_LE_SET_EXT_SCAN_RSP_DATA		0x2038
@@ -2035,7 +2035,7 @@ struct hci_cp_le_set_ext_scan_rsp_data {
 	__u8  operation;
 	__u8  frag_pref;
 	__u8  length;
-	__u8  data[];
+	__u8  data[] __counted_by(length);
 } __packed;
 
 #define HCI_OP_LE_SET_EXT_ADV_ENABLE		0x2039
@@ -2061,7 +2061,7 @@ struct hci_cp_le_set_per_adv_data {
 	__u8  handle;
 	__u8  operation;
 	__u8  length;
-	__u8  data[];
+	__u8  data[] __counted_by(length);
 } __packed;
 
 #define HCI_OP_LE_SET_PER_ADV_ENABLE		0x2040
@@ -2162,7 +2162,7 @@ struct hci_cis {
 
 struct hci_cp_le_create_cis {
 	__u8    num_cis;
-	struct hci_cis cis[];
+	struct hci_cis cis[] __counted_by(num_cis);
 } __packed;
 
 #define HCI_OP_LE_REMOVE_CIG			0x2065
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index d228321605ae..122a6a9092e0 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4307,7 +4307,7 @@ static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status)
 	hci_dev_lock(hdev);
 
 	/* Remove connection if command failed */
-	for (i = 0; cp->num_cis; cp->num_cis--, i++) {
+	for (i = 0; i < cp->num_cis; i++) {
 		struct hci_conn *conn;
 		u16 handle;
 
@@ -4323,6 +4323,7 @@ static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status)
 			hci_conn_del(conn);
 		}
 	}
+	cp->num_cis = 0;
 
 	if (pending)
 		hci_le_create_cis_pending(hdev);
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 4c707eb64e6f..406e36cacd79 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1235,31 +1235,27 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
 
 static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
 {
-	struct {
-		struct hci_cp_le_set_ext_scan_rsp_data cp;
-		u8 data[HCI_MAX_EXT_AD_LENGTH];
-	} pdu;
+	DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length,
+		    HCI_MAX_EXT_AD_LENGTH);
 	u8 len;
 	struct adv_info *adv = NULL;
 	int err;
 
-	memset(&pdu, 0, sizeof(pdu));
-
 	if (instance) {
 		adv = hci_find_adv_instance(hdev, instance);
 		if (!adv || !adv->scan_rsp_changed)
 			return 0;
 	}
 
-	len = eir_create_scan_rsp(hdev, instance, pdu.data);
+	len = eir_create_scan_rsp(hdev, instance, pdu->data);
 
-	pdu.cp.handle = instance;
-	pdu.cp.length = len;
-	pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
-	pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+	pdu->handle = instance;
+	pdu->length = len;
+	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+	pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
 
 	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA,
-				    sizeof(pdu.cp) + len, &pdu.cp,
+				    struct_size(pdu, data, len), pdu,
 				    HCI_CMD_TIMEOUT);
 	if (err)
 		return err;
@@ -1267,7 +1263,7 @@ static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
 	if (adv) {
 		adv->scan_rsp_changed = false;
 	} else {
-		memcpy(hdev->scan_rsp_data, pdu.data, len);
+		memcpy(hdev->scan_rsp_data, pdu->data, len);
 		hdev->scan_rsp_data_len = len;
 	}
 
@@ -1411,14 +1407,10 @@ static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance,
 
 static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
 {
-	struct {
-		struct hci_cp_le_set_per_adv_data cp;
-		u8 data[HCI_MAX_PER_AD_LENGTH];
-	} pdu;
+	DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length,
+		    HCI_MAX_PER_AD_LENGTH);
 	u8 len;
 
-	memset(&pdu, 0, sizeof(pdu));
-
 	if (instance) {
 		struct adv_info *adv = hci_find_adv_instance(hdev, instance);
 
@@ -1426,14 +1418,14 @@ static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
 			return 0;
 	}
 
-	len = eir_create_per_adv_data(hdev, instance, pdu.data);
+	len = eir_create_per_adv_data(hdev, instance, pdu->data);
 
-	pdu.cp.length = len;
-	pdu.cp.handle = instance;
-	pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
+	pdu->length = len;
+	pdu->handle = instance;
+	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA,
-				     sizeof(pdu.cp) + len, &pdu,
+				     struct_size(pdu, data, len), pdu,
 				     HCI_CMD_TIMEOUT);
 }
 
@@ -1727,31 +1719,27 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason)
 
 static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
 {
-	struct {
-		struct hci_cp_le_set_ext_adv_data cp;
-		u8 data[HCI_MAX_EXT_AD_LENGTH];
-	} pdu;
+	DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length,
+		    HCI_MAX_EXT_AD_LENGTH);
 	u8 len;
 	struct adv_info *adv = NULL;
 	int err;
 
-	memset(&pdu, 0, sizeof(pdu));
-
 	if (instance) {
 		adv = hci_find_adv_instance(hdev, instance);
 		if (!adv || !adv->adv_data_changed)
 			return 0;
 	}
 
-	len = eir_create_adv_data(hdev, instance, pdu.data);
+	len = eir_create_adv_data(hdev, instance, pdu->data);
 
-	pdu.cp.length = len;
-	pdu.cp.handle = instance;
-	pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
-	pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+	pdu->length = len;
+	pdu->handle = instance;
+	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+	pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
 
 	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA,
-				    sizeof(pdu.cp) + len, &pdu.cp,
+				    struct_size(pdu, data, len), pdu,
 				    HCI_CMD_TIMEOUT);
 	if (err)
 		return err;
@@ -1760,7 +1748,7 @@ static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
 	if (adv) {
 		adv->adv_data_changed = false;
 	} else {
-		memcpy(hdev->adv_data, pdu.data, len);
+		memcpy(hdev->adv_data, pdu->data, len);
 		hdev->adv_data_len = len;
 	}
 
@@ -6493,10 +6481,8 @@ done:
 
 int hci_le_create_cis_sync(struct hci_dev *hdev)
 {
-	struct {
-		struct hci_cp_le_create_cis cp;
-		struct hci_cis cis[0x1f];
-	} cmd;
+	DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f);
+	size_t aux_num_cis = 0;
 	struct hci_conn *conn;
 	u8 cig = BT_ISO_QOS_CIG_UNSET;
 
@@ -6523,8 +6509,6 @@ int hci_le_create_cis_sync(struct hci_dev *hdev)
 	 * remains pending.
 	 */
 
-	memset(&cmd, 0, sizeof(cmd));
-
 	hci_dev_lock(hdev);
 
 	rcu_read_lock();
@@ -6561,7 +6545,7 @@ int hci_le_create_cis_sync(struct hci_dev *hdev)
 		goto done;
 
 	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
-		struct hci_cis *cis = &cmd.cis[cmd.cp.num_cis];
+		struct hci_cis *cis = &cmd->cis[aux_num_cis];
 
 		if (hci_conn_check_create_cis(conn) ||
 		    conn->iso_qos.ucast.cig != cig)
@@ -6570,25 +6554,25 @@ int hci_le_create_cis_sync(struct hci_dev *hdev)
 		set_bit(HCI_CONN_CREATE_CIS, &conn->flags);
 		cis->acl_handle = cpu_to_le16(conn->parent->handle);
 		cis->cis_handle = cpu_to_le16(conn->handle);
-		cmd.cp.num_cis++;
+		aux_num_cis++;
 
-		if (cmd.cp.num_cis >= ARRAY_SIZE(cmd.cis))
+		if (aux_num_cis >= 0x1f)
 			break;
 	}
+	cmd->num_cis = aux_num_cis;
 
 done:
 	rcu_read_unlock();
 
 	hci_dev_unlock(hdev);
 
-	if (!cmd.cp.num_cis)
+	if (!aux_num_cis)
 		return 0;
 
 	/* Wait for HCI_LE_CIS_Established */
 	return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS,
-					sizeof(cmd.cp) + sizeof(cmd.cis[0]) *
-					cmd.cp.num_cis, &cmd,
-					HCI_EVT_LE_CIS_ESTABLISHED,
+					struct_size(cmd, cis, cmd->num_cis),
+					cmd, HCI_EVT_LE_CIS_ESTABLISHED,
 					conn->conn_timeout, NULL);
 }
 
-- 
cgit v1.2.3


From c90748b898cf9a0654aeadac1ba2254e3ab8e9fa Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Fri, 26 Apr 2024 10:45:17 -0600
Subject: Bluetooth: hci_conn: Use __counted_by() to avoid -Wfamnae warning

Prepare for the coming implementation by GCC and Clang of the
__counted_by attribute. Flexible array members annotated with
__counted_by can have their accesses bounds-checked at run-time
via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE
(for strcpy/memcpy-family functions).

Also, -Wflex-array-member-not-at-end is coming in GCC-14, and we are
getting ready to enable it globally.

So, use the `DEFINE_FLEX()` helper for an on-stack definition of
a flexible structure where the size of the flexible-array member
is known at compile-time, and refactor the rest of the code,
accordingly.

With these changes, fix the following warning:
net/bluetooth/hci_conn.c:2116:50: warning: structure containing a flexible
array member is not at the end of another structure
[-Wflex-array-member-not-at-end]

Link: https://github.com/KSPP/linux/issues/202
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |  2 +-
 net/bluetooth/hci_conn.c    | 26 +++++++++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 0b0ac7f06570..c4c6b8810701 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2216,7 +2216,7 @@ struct hci_cp_le_big_create_sync {
 	__u8    mse;
 	__le16  timeout;
 	__u8    num_bis;
-	__u8    bis[];
+	__u8    bis[] __counted_by(num_bis);
 } __packed;
 
 #define HCI_OP_LE_BIG_TERM_SYNC			0x206c
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index c1d6b4d713b4..8375f7bd58ee 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -2109,13 +2109,10 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
 			   struct bt_iso_qos *qos,
 			   __u16 sync_handle, __u8 num_bis, __u8 bis[])
 {
-	struct _packed {
-		struct hci_cp_le_big_create_sync cp;
-		__u8  bis[0x11];
-	} pdu;
+	DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11);
 	int err;
 
-	if (num_bis < 0x01 || num_bis > sizeof(pdu.bis))
+	if (num_bis < 0x01 || num_bis > pdu->num_bis)
 		return -EINVAL;
 
 	err = qos_set_big(hdev, qos);
@@ -2125,18 +2122,17 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
 	if (hcon)
 		hcon->iso_qos.bcast.big = qos->bcast.big;
 
-	memset(&pdu, 0, sizeof(pdu));
-	pdu.cp.handle = qos->bcast.big;
-	pdu.cp.sync_handle = cpu_to_le16(sync_handle);
-	pdu.cp.encryption = qos->bcast.encryption;
-	memcpy(pdu.cp.bcode, qos->bcast.bcode, sizeof(pdu.cp.bcode));
-	pdu.cp.mse = qos->bcast.mse;
-	pdu.cp.timeout = cpu_to_le16(qos->bcast.timeout);
-	pdu.cp.num_bis = num_bis;
-	memcpy(pdu.bis, bis, num_bis);
+	pdu->handle = qos->bcast.big;
+	pdu->sync_handle = cpu_to_le16(sync_handle);
+	pdu->encryption = qos->bcast.encryption;
+	memcpy(pdu->bcode, qos->bcast.bcode, sizeof(pdu->bcode));
+	pdu->mse = qos->bcast.mse;
+	pdu->timeout = cpu_to_le16(qos->bcast.timeout);
+	pdu->num_bis = num_bis;
+	memcpy(pdu->bis, bis, num_bis);
 
 	return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC,
-			    sizeof(pdu.cp) + num_bis, &pdu);
+			    sizeof(*pdu) + num_bis, pdu);
 }
 
 static void create_big_complete(struct hci_dev *hdev, void *data, int err)
-- 
cgit v1.2.3


From ce60b9231b66710b6ee24042ded26efee120ecfc Mon Sep 17 00:00:00 2001
From: Sebastian Urban <surban@surban.net>
Date: Wed, 1 May 2024 12:08:58 +0200
Subject: Bluetooth: compute LE flow credits based on recvbuf space

Previously LE flow credits were returned to the
sender even if the socket's receive buffer was
full. This meant that no back-pressure
was applied to the sender, thus it continued to
send data, resulting in data loss without any
error being reported. Furthermore, the amount
of credits was essentially fixed to a small
amount, leading to reduced performance.

This is fixed by computing the number of returned
LE flow credits based on the estimated available
space in the receive buffer of an L2CAP socket.
Consequently, if the receive buffer is full, no
credits are returned until the buffer is read and
thus cleared by user-space.

Since the computation of available receive buffer
space can only be performed approximately (due to
sk_buff overhead) and the receive buffer size may
be changed by user-space after flow credits have
been sent, superfluous received data is temporary
stored within l2cap_pinfo. This is necessary
because Bluetooth LE provides no retransmission
mechanism once the data has been acked by the
physical layer.

If receive buffer space estimation is not possible
at the moment, we fall back to providing credits
for one full packet as before. This is currently
the case during connection setup, when MPS is not
yet available.

Fixes: b1c325c23d75 ("Bluetooth: Implement returning of LE L2CAP credits")
Signed-off-by: Sebastian Urban <surban@surban.net>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/l2cap.h | 11 +++++-
 net/bluetooth/l2cap_core.c    | 56 ++++++++++++++++++++++----
 net/bluetooth/l2cap_sock.c    | 91 ++++++++++++++++++++++++++++++++++---------
 3 files changed, 132 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 7d4a3e766e7b..5cfdc813491a 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -554,6 +554,9 @@ struct l2cap_chan {
 	__u16		tx_credits;
 	__u16		rx_credits;
 
+	/* estimated available receive buffer space or -1 if unknown */
+	ssize_t		rx_avail;
+
 	__u8		tx_state;
 	__u8		rx_state;
 
@@ -688,10 +691,15 @@ struct l2cap_user {
 /* ----- L2CAP socket info ----- */
 #define l2cap_pi(sk) ((struct l2cap_pinfo *) sk)
 
+struct l2cap_rx_busy {
+	struct list_head	list;
+	struct sk_buff		*skb;
+};
+
 struct l2cap_pinfo {
 	struct bt_sock		bt;
 	struct l2cap_chan	*chan;
-	struct sk_buff		*rx_busy_skb;
+	struct list_head	rx_busy;
 };
 
 enum {
@@ -949,6 +957,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
 int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu);
 int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len);
 void l2cap_chan_busy(struct l2cap_chan *chan, int busy);
+void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail);
 int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator);
 void l2cap_chan_set_defaults(struct l2cap_chan *chan);
 int l2cap_ertm_init(struct l2cap_chan *chan);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index fa714f5bd0b3..443423538a4c 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -457,6 +457,9 @@ struct l2cap_chan *l2cap_chan_create(void)
 	/* Set default lock nesting level */
 	atomic_set(&chan->nesting, L2CAP_NESTING_NORMAL);
 
+	/* Available receive buffer space is initially unknown */
+	chan->rx_avail = -1;
+
 	write_lock(&chan_list_lock);
 	list_add(&chan->global_l, &chan_list);
 	write_unlock(&chan_list_lock);
@@ -538,6 +541,28 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan)
 }
 EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults);
 
+static __u16 l2cap_le_rx_credits(struct l2cap_chan *chan)
+{
+	size_t sdu_len = chan->sdu ? chan->sdu->len : 0;
+
+	if (chan->mps == 0)
+		return 0;
+
+	/* If we don't know the available space in the receiver buffer, give
+	 * enough credits for a full packet.
+	 */
+	if (chan->rx_avail == -1)
+		return (chan->imtu / chan->mps) + 1;
+
+	/* If we know how much space is available in the receive buffer, give
+	 * out as many credits as would fill the buffer.
+	 */
+	if (chan->rx_avail <= sdu_len)
+		return 0;
+
+	return DIV_ROUND_UP(chan->rx_avail - sdu_len, chan->mps);
+}
+
 static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits)
 {
 	chan->sdu = NULL;
@@ -546,8 +571,7 @@ static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits)
 	chan->tx_credits = tx_credits;
 	/* Derive MPS from connection MTU to stop HCI fragmentation */
 	chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
-	/* Give enough credits for a full packet */
-	chan->rx_credits = (chan->imtu / chan->mps) + 1;
+	chan->rx_credits = l2cap_le_rx_credits(chan);
 
 	skb_queue_head_init(&chan->tx_q);
 }
@@ -559,7 +583,7 @@ static void l2cap_ecred_init(struct l2cap_chan *chan, u16 tx_credits)
 	/* L2CAP implementations shall support a minimum MPS of 64 octets */
 	if (chan->mps < L2CAP_ECRED_MIN_MPS) {
 		chan->mps = L2CAP_ECRED_MIN_MPS;
-		chan->rx_credits = (chan->imtu / chan->mps) + 1;
+		chan->rx_credits = l2cap_le_rx_credits(chan);
 	}
 }
 
@@ -6512,9 +6536,7 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
 {
 	struct l2cap_conn *conn = chan->conn;
 	struct l2cap_le_credits pkt;
-	u16 return_credits;
-
-	return_credits = (chan->imtu / chan->mps) + 1;
+	u16 return_credits = l2cap_le_rx_credits(chan);
 
 	if (chan->rx_credits >= return_credits)
 		return;
@@ -6533,6 +6555,19 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
 	l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
 }
 
+void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail)
+{
+	if (chan->rx_avail == rx_avail)
+		return;
+
+	BT_DBG("chan %p has %zd bytes avail for rx", chan, rx_avail);
+
+	chan->rx_avail = rx_avail;
+
+	if (chan->state == BT_CONNECTED)
+		l2cap_chan_le_send_credits(chan);
+}
+
 static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb)
 {
 	int err;
@@ -6542,6 +6577,12 @@ static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb)
 	/* Wait recv to confirm reception before updating the credits */
 	err = chan->ops->recv(chan, skb);
 
+	if (err < 0 && chan->rx_avail != -1) {
+		BT_ERR("Queueing received LE L2CAP data failed");
+		l2cap_send_disconn_req(chan, ECONNRESET);
+		return err;
+	}
+
 	/* Update credits whenever an SDU is received */
 	l2cap_chan_le_send_credits(chan);
 
@@ -6564,7 +6605,8 @@ static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
 	}
 
 	chan->rx_credits--;
-	BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits);
+	BT_DBG("chan %p: rx_credits %u -> %u",
+	       chan, chan->rx_credits + 1, chan->rx_credits);
 
 	/* Update if remote had run out of credits, this should only happens
 	 * if the remote is not using the entire MPS.
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 5cc83f906c12..8645461d45e8 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1131,6 +1131,34 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg,
 	return err;
 }
 
+static void l2cap_publish_rx_avail(struct l2cap_chan *chan)
+{
+	struct sock *sk = chan->data;
+	ssize_t avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc);
+	int expected_skbs, skb_overhead;
+
+	if (avail <= 0) {
+		l2cap_chan_rx_avail(chan, 0);
+		return;
+	}
+
+	if (!chan->mps) {
+		l2cap_chan_rx_avail(chan, -1);
+		return;
+	}
+
+	/* Correct available memory by estimated sk_buff overhead.
+	 * This is significant due to small transfer sizes. However, accept
+	 * at least one full packet if receive space is non-zero.
+	 */
+	expected_skbs = DIV_ROUND_UP(avail, chan->mps);
+	skb_overhead = expected_skbs * sizeof(struct sk_buff);
+	if (skb_overhead < avail)
+		l2cap_chan_rx_avail(chan, avail - skb_overhead);
+	else
+		l2cap_chan_rx_avail(chan, -1);
+}
+
 static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
 			      size_t len, int flags)
 {
@@ -1167,28 +1195,33 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
 	else
 		err = bt_sock_recvmsg(sock, msg, len, flags);
 
-	if (pi->chan->mode != L2CAP_MODE_ERTM)
+	if (pi->chan->mode != L2CAP_MODE_ERTM &&
+	    pi->chan->mode != L2CAP_MODE_LE_FLOWCTL &&
+	    pi->chan->mode != L2CAP_MODE_EXT_FLOWCTL)
 		return err;
 
-	/* Attempt to put pending rx data in the socket buffer */
-
 	lock_sock(sk);
 
-	if (!test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state))
-		goto done;
+	l2cap_publish_rx_avail(pi->chan);
 
-	if (pi->rx_busy_skb) {
-		if (!__sock_queue_rcv_skb(sk, pi->rx_busy_skb))
-			pi->rx_busy_skb = NULL;
-		else
+	/* Attempt to put pending rx data in the socket buffer */
+	while (!list_empty(&pi->rx_busy)) {
+		struct l2cap_rx_busy *rx_busy =
+			list_first_entry(&pi->rx_busy,
+					 struct l2cap_rx_busy,
+					 list);
+		if (__sock_queue_rcv_skb(sk, rx_busy->skb) < 0)
 			goto done;
+		list_del(&rx_busy->list);
+		kfree(rx_busy);
 	}
 
 	/* Restore data flow when half of the receive buffer is
 	 * available.  This avoids resending large numbers of
 	 * frames.
 	 */
-	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
+	if (test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state) &&
+	    atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
 		l2cap_chan_busy(pi->chan, 0);
 
 done:
@@ -1449,17 +1482,20 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
 static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
 {
 	struct sock *sk = chan->data;
+	struct l2cap_pinfo *pi = l2cap_pi(sk);
 	int err;
 
 	lock_sock(sk);
 
-	if (l2cap_pi(sk)->rx_busy_skb) {
+	if (chan->mode == L2CAP_MODE_ERTM && !list_empty(&pi->rx_busy)) {
 		err = -ENOMEM;
 		goto done;
 	}
 
 	if (chan->mode != L2CAP_MODE_ERTM &&
-	    chan->mode != L2CAP_MODE_STREAMING) {
+	    chan->mode != L2CAP_MODE_STREAMING &&
+	    chan->mode != L2CAP_MODE_LE_FLOWCTL &&
+	    chan->mode != L2CAP_MODE_EXT_FLOWCTL) {
 		/* Even if no filter is attached, we could potentially
 		 * get errors from security modules, etc.
 		 */
@@ -1470,7 +1506,9 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
 
 	err = __sock_queue_rcv_skb(sk, skb);
 
-	/* For ERTM, handle one skb that doesn't fit into the recv
+	l2cap_publish_rx_avail(chan);
+
+	/* For ERTM and LE, handle a skb that doesn't fit into the recv
 	 * buffer.  This is important to do because the data frames
 	 * have already been acked, so the skb cannot be discarded.
 	 *
@@ -1479,8 +1517,18 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
 	 * acked and reassembled until there is buffer space
 	 * available.
 	 */
-	if (err < 0 && chan->mode == L2CAP_MODE_ERTM) {
-		l2cap_pi(sk)->rx_busy_skb = skb;
+	if (err < 0 &&
+	    (chan->mode == L2CAP_MODE_ERTM ||
+	     chan->mode == L2CAP_MODE_LE_FLOWCTL ||
+	     chan->mode == L2CAP_MODE_EXT_FLOWCTL)) {
+		struct l2cap_rx_busy *rx_busy =
+			kmalloc(sizeof(*rx_busy), GFP_KERNEL);
+		if (!rx_busy) {
+			err = -ENOMEM;
+			goto done;
+		}
+		rx_busy->skb = skb;
+		list_add_tail(&rx_busy->list, &pi->rx_busy);
 		l2cap_chan_busy(chan, 1);
 		err = 0;
 	}
@@ -1706,6 +1754,8 @@ static const struct l2cap_ops l2cap_chan_ops = {
 
 static void l2cap_sock_destruct(struct sock *sk)
 {
+	struct l2cap_rx_busy *rx_busy, *next;
+
 	BT_DBG("sk %p", sk);
 
 	if (l2cap_pi(sk)->chan) {
@@ -1713,9 +1763,10 @@ static void l2cap_sock_destruct(struct sock *sk)
 		l2cap_chan_put(l2cap_pi(sk)->chan);
 	}
 
-	if (l2cap_pi(sk)->rx_busy_skb) {
-		kfree_skb(l2cap_pi(sk)->rx_busy_skb);
-		l2cap_pi(sk)->rx_busy_skb = NULL;
+	list_for_each_entry_safe(rx_busy, next, &l2cap_pi(sk)->rx_busy, list) {
+		kfree_skb(rx_busy->skb);
+		list_del(&rx_busy->list);
+		kfree(rx_busy);
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
@@ -1799,6 +1850,8 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
 
 	chan->data = sk;
 	chan->ops = &l2cap_chan_ops;
+
+	l2cap_publish_rx_avail(chan);
 }
 
 static struct proto l2cap_proto = {
@@ -1820,6 +1873,8 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
 	sk->sk_destruct = l2cap_sock_destruct;
 	sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT;
 
+	INIT_LIST_HEAD(&l2cap_pi(sk)->rx_busy);
+
 	chan = l2cap_chan_create();
 	if (!chan) {
 		sk_free(sk);
-- 
cgit v1.2.3


From 21d74b6b4e82d27eead168d2b1ce3096937c0237 Mon Sep 17 00:00:00 2001
From: Mahesh Talewad <mahesh.talewad@nxp.com>
Date: Thu, 2 May 2024 12:01:23 +0300
Subject: LE Create Connection command timeout increased to 20 secs

On our DUT, we can see that the host issues create connection cancel
command after 4-sec if there is no connection complete event for
LE create connection cmd.
As per core spec v5.3 section 7.8.5, advertisement interval range is-

Advertising_Interval_Min
Default : 0x0800(1.28s)
Time Range: 20ms to 10.24s

Advertising_Interval_Max
Default : 0x0800(1.28s)
Time Range: 20ms to 10.24s

If the remote device is using adv interval of > 4 sec, it is
difficult to make a connection with the current timeout value.
Also, with the default interval of 1.28 sec, we will get only
3 chances to capture the adv packets with the 4 sec window.
Hence we want to increase this timeout to 20sec.

Signed-off-by: Mahesh Talewad <mahesh.talewad@nxp.com>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h | 1 -
 net/bluetooth/hci_core.c    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index c4c6b8810701..38eb7ec86a1a 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -456,7 +456,6 @@ enum {
 #define HCI_AUTO_OFF_TIMEOUT	msecs_to_jiffies(2000)	/* 2 seconds */
 #define HCI_ACL_CONN_TIMEOUT	msecs_to_jiffies(20000)	/* 20 seconds */
 #define HCI_LE_CONN_TIMEOUT	msecs_to_jiffies(20000)	/* 20 seconds */
-#define HCI_LE_AUTOCONN_TIMEOUT	msecs_to_jiffies(4000)	/* 4 seconds */
 
 /* HCI data types */
 #define HCI_COMMAND_PKT		0x01
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 34cd05856401..b52125512985 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2556,7 +2556,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
 	hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
 	hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES;
 	hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION;
-	hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT;
+	hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT;
 	hdev->min_le_tx_power = HCI_TX_POWER_INVALID;
 	hdev->max_le_tx_power = HCI_TX_POWER_INVALID;
 
-- 
cgit v1.2.3


From ea9e148c803b24ebbc7a74171f22f42c8fd8d644 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Thu, 2 May 2024 10:22:00 -0600
Subject: Bluetooth: hci_conn: Use __counted_by() and avoid -Wfamnae warning

Prepare for the coming implementation by GCC and Clang of the
__counted_by attribute. Flexible array members annotated with
__counted_by can have their accesses bounds-checked at run-time
via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE
(for strcpy/memcpy-family functions).

Also, -Wflex-array-member-not-at-end is coming in GCC-14, and we are
getting ready to enable it globally.

So, use the `DEFINE_FLEX()` helper for an on-stack definition of
a flexible structure where the size of the flexible-array member
is known at compile-time, and refactor the rest of the code,
accordingly.

With these changes, fix the following warning:
net/bluetooth/hci_conn.c:669:41: warning: structure containing a
flexible array member is not at the end of another structure
[-Wflex-array-member-not-at-end]

Link: https://github.com/KSPP/linux/issues/202
Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h |  2 +-
 net/bluetooth/hci_conn.c    | 38 ++++++++++++++++----------------------
 2 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 38eb7ec86a1a..21ebd70f3dcc 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -2143,7 +2143,7 @@ struct hci_cp_le_set_cig_params {
 	__le16  c_latency;
 	__le16  p_latency;
 	__u8    num_cis;
-	struct hci_cis_params cis[];
+	struct hci_cis_params cis[] __counted_by(num_cis);
 } __packed;
 
 struct hci_rp_le_set_cig_params {
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index fc1aeb513b08..b7f8e99001dd 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -664,11 +664,6 @@ static void le_conn_timeout(struct work_struct *work)
 	hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
 }
 
-struct iso_cig_params {
-	struct hci_cp_le_set_cig_params cp;
-	struct hci_cis_params cis[0x1f];
-};
-
 struct iso_list_data {
 	union {
 		u8  cig;
@@ -1722,34 +1717,33 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
 
 static int set_cig_params_sync(struct hci_dev *hdev, void *data)
 {
+	DEFINE_FLEX(struct hci_cp_le_set_cig_params, pdu, cis, num_cis, 0x1f);
 	u8 cig_id = PTR_UINT(data);
 	struct hci_conn *conn;
 	struct bt_iso_qos *qos;
-	struct iso_cig_params pdu;
+	u8 aux_num_cis = 0;
 	u8 cis_id;
 
 	conn = hci_conn_hash_lookup_cig(hdev, cig_id);
 	if (!conn)
 		return 0;
 
-	memset(&pdu, 0, sizeof(pdu));
-
 	qos = &conn->iso_qos;
-	pdu.cp.cig_id = cig_id;
-	hci_cpu_to_le24(qos->ucast.out.interval, pdu.cp.c_interval);
-	hci_cpu_to_le24(qos->ucast.in.interval, pdu.cp.p_interval);
-	pdu.cp.sca = qos->ucast.sca;
-	pdu.cp.packing = qos->ucast.packing;
-	pdu.cp.framing = qos->ucast.framing;
-	pdu.cp.c_latency = cpu_to_le16(qos->ucast.out.latency);
-	pdu.cp.p_latency = cpu_to_le16(qos->ucast.in.latency);
+	pdu->cig_id = cig_id;
+	hci_cpu_to_le24(qos->ucast.out.interval, pdu->c_interval);
+	hci_cpu_to_le24(qos->ucast.in.interval, pdu->p_interval);
+	pdu->sca = qos->ucast.sca;
+	pdu->packing = qos->ucast.packing;
+	pdu->framing = qos->ucast.framing;
+	pdu->c_latency = cpu_to_le16(qos->ucast.out.latency);
+	pdu->p_latency = cpu_to_le16(qos->ucast.in.latency);
 
 	/* Reprogram all CIS(s) with the same CIG, valid range are:
 	 * num_cis: 0x00 to 0x1F
 	 * cis_id: 0x00 to 0xEF
 	 */
 	for (cis_id = 0x00; cis_id < 0xf0 &&
-	     pdu.cp.num_cis < ARRAY_SIZE(pdu.cis); cis_id++) {
+	     aux_num_cis < pdu->num_cis; cis_id++) {
 		struct hci_cis_params *cis;
 
 		conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id);
@@ -1758,7 +1752,7 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data)
 
 		qos = &conn->iso_qos;
 
-		cis = &pdu.cis[pdu.cp.num_cis++];
+		cis = &pdu->cis[aux_num_cis++];
 		cis->cis_id = cis_id;
 		cis->c_sdu  = cpu_to_le16(conn->iso_qos.ucast.out.sdu);
 		cis->p_sdu  = cpu_to_le16(conn->iso_qos.ucast.in.sdu);
@@ -1769,14 +1763,14 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data)
 		cis->c_rtn  = qos->ucast.out.rtn;
 		cis->p_rtn  = qos->ucast.in.rtn;
 	}
+	pdu->num_cis = aux_num_cis;
 
-	if (!pdu.cp.num_cis)
+	if (!pdu->num_cis)
 		return 0;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS,
-				     sizeof(pdu.cp) +
-				     pdu.cp.num_cis * sizeof(pdu.cis[0]), &pdu,
-				     HCI_CMD_TIMEOUT);
+				     struct_size(pdu, cis, pdu->num_cis),
+				     pdu, HCI_CMD_TIMEOUT);
 }
 
 static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
-- 
cgit v1.2.3


From a5b862c6a221459d54e494e88965b48dcfa6cc44 Mon Sep 17 00:00:00 2001
From: Sungwoo Kim <iam@sung-woo.kim>
Date: Sat, 4 May 2024 15:23:29 -0400
Subject: Bluetooth: L2CAP: Fix div-by-zero in l2cap_le_flowctl_init()

l2cap_le_flowctl_init() can cause both div-by-zero and an integer
overflow since hdev->le_mtu may not fall in the valid range.

Move MTU from hci_dev to hci_conn to validate MTU and stop the connection
process earlier if MTU is invalid.
Also, add a missing validation in read_buffer_size() and make it return
an error value if the validation fails.
Now hci_conn_add() returns ERR_PTR() as it can fail due to the both a
kzalloc failure and invalid MTU value.

divide error: 0000 [#1] PREEMPT SMP KASAN NOPTI
CPU: 0 PID: 67 Comm: kworker/u5:0 Tainted: G        W          6.9.0-rc5+ #20
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
Workqueue: hci0 hci_rx_work
RIP: 0010:l2cap_le_flowctl_init+0x19e/0x3f0 net/bluetooth/l2cap_core.c:547
Code: e8 17 17 0c 00 66 41 89 9f 84 00 00 00 bf 01 00 00 00 41 b8 02 00 00 00 4c
89 fe 4c 89 e2 89 d9 e8 27 17 0c 00 44 89 f0 31 d2 <66> f7 f3 89 c3 ff c3 4d 8d
b7 88 00 00 00 4c 89 f0 48 c1 e8 03 42
RSP: 0018:ffff88810bc0f858 EFLAGS: 00010246
RAX: 00000000000002a0 RBX: 0000000000000000 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: ffff88810bc0f7c0 RDI: ffffc90002dcb66f
RBP: ffff88810bc0f880 R08: aa69db2dda70ff01 R09: 0000ffaaaaaaaaaa
R10: 0084000000ffaaaa R11: 0000000000000000 R12: ffff88810d65a084
R13: dffffc0000000000 R14: 00000000000002a0 R15: ffff88810d65a000
FS:  0000000000000000(0000) GS:ffff88811ac00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000100 CR3: 0000000103268003 CR4: 0000000000770ef0
PKRU: 55555554
Call Trace:
 <TASK>
 l2cap_le_connect_req net/bluetooth/l2cap_core.c:4902 [inline]
 l2cap_le_sig_cmd net/bluetooth/l2cap_core.c:5420 [inline]
 l2cap_le_sig_channel net/bluetooth/l2cap_core.c:5486 [inline]
 l2cap_recv_frame+0xe59d/0x11710 net/bluetooth/l2cap_core.c:6809
 l2cap_recv_acldata+0x544/0x10a0 net/bluetooth/l2cap_core.c:7506
 hci_acldata_packet net/bluetooth/hci_core.c:3939 [inline]
 hci_rx_work+0x5e5/0xb20 net/bluetooth/hci_core.c:4176
 process_one_work kernel/workqueue.c:3254 [inline]
 process_scheduled_works+0x90f/0x1530 kernel/workqueue.c:3335
 worker_thread+0x926/0xe70 kernel/workqueue.c:3416
 kthread+0x2e3/0x380 kernel/kthread.c:388
 ret_from_fork+0x5c/0x90 arch/x86/kernel/process.c:147
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
 </TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---

Fixes: 6ed58ec520ad ("Bluetooth: Use LE buffers for LE traffic")
Suggested-by: Luiz Augusto von Dentz <luiz.dentz@gmail.com>
Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci.h      |  9 +++++
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_conn.c         | 75 +++++++++++++++++++++++++++-------------
 net/bluetooth/hci_event.c        | 31 +++++++++++------
 net/bluetooth/iso.c              |  2 +-
 net/bluetooth/l2cap_core.c       | 17 ++-------
 net/bluetooth/sco.c              |  6 ++--
 7 files changed, 88 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index 21ebd70f3dcc..de530262d824 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -1665,6 +1665,15 @@ struct hci_cp_le_set_event_mask {
 	__u8     mask[8];
 } __packed;
 
+/* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E
+ * 7.8.2 LE Read Buffer Size command
+ * MAX_LE_MTU is 0xffff.
+ * 0 is also valid. It means that no dedicated LE Buffer exists.
+ * It should use the HCI_Read_Buffer_Size command and mtu is shared
+ * between BR/EDR and LE.
+ */
+#define HCI_MIN_LE_MTU 0x001b
+
 #define HCI_OP_LE_READ_BUFFER_SIZE	0x2002
 struct hci_rp_le_read_buffer_size {
 	__u8     status;
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 2283d12ee358..be012f393bcb 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -706,6 +706,7 @@ struct hci_conn {
 	__u16		handle;
 	__u16		sync_handle;
 	__u16		state;
+	__u16		mtu;
 	__u8		mode;
 	__u8		type;
 	__u8		role;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index b7f8e99001dd..f21e7c73021d 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -904,11 +904,37 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 {
 	struct hci_conn *conn;
 
+	switch (type) {
+	case ACL_LINK:
+		if (!hdev->acl_mtu)
+			return ERR_PTR(-ECONNREFUSED);
+		break;
+	case ISO_LINK:
+		if (hdev->iso_mtu)
+			/* Dedicated ISO Buffer exists */
+			break;
+		fallthrough;
+	case LE_LINK:
+		if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+			return ERR_PTR(-ECONNREFUSED);
+		if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU)
+			return ERR_PTR(-ECONNREFUSED);
+		break;
+	case SCO_LINK:
+	case ESCO_LINK:
+		if (!hdev->sco_pkts)
+			/* Controller does not support SCO or eSCO over HCI */
+			return ERR_PTR(-ECONNREFUSED);
+		break;
+	default:
+		return ERR_PTR(-ECONNREFUSED);
+	}
+
 	bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle);
 
 	conn = kzalloc(sizeof(*conn), GFP_KERNEL);
 	if (!conn)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	bacpy(&conn->dst, dst);
 	bacpy(&conn->src, &hdev->bdaddr);
@@ -939,10 +965,12 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	switch (type) {
 	case ACL_LINK:
 		conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
+		conn->mtu = hdev->acl_mtu;
 		break;
 	case LE_LINK:
 		/* conn->src should reflect the local identity address */
 		hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+		conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
 		break;
 	case ISO_LINK:
 		/* conn->src should reflect the local identity address */
@@ -954,6 +982,8 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 		else if (conn->role == HCI_ROLE_MASTER)
 			conn->cleanup = cis_cleanup;
 
+		conn->mtu = hdev->iso_mtu ? hdev->iso_mtu :
+			    hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
 		break;
 	case SCO_LINK:
 		if (lmp_esco_capable(hdev))
@@ -961,9 +991,12 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
 					(hdev->esco_type & EDR_ESCO_MASK);
 		else
 			conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
+
+		conn->mtu = hdev->sco_mtu;
 		break;
 	case ESCO_LINK:
 		conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
+		conn->mtu = hdev->sco_mtu;
 		break;
 	}
 
@@ -1006,7 +1039,7 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
 
 	handle = hci_conn_hash_alloc_unset(hdev);
 	if (unlikely(handle < 0))
-		return NULL;
+		return ERR_PTR(-ECONNREFUSED);
 
 	return hci_conn_add(hdev, type, dst, role, handle);
 }
@@ -1312,8 +1345,8 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
 		bacpy(&conn->dst, dst);
 	} else {
 		conn = hci_conn_add_unset(hdev, LE_LINK, dst, role);
-		if (!conn)
-			return ERR_PTR(-ENOMEM);
+		if (IS_ERR(conn))
+			return conn;
 		hci_conn_hold(conn);
 		conn->pending_sec_level = sec_level;
 	}
@@ -1489,8 +1522,8 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
 		return ERR_PTR(-EADDRINUSE);
 
 	conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
-	if (!conn)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(conn))
+		return conn;
 
 	conn->state = BT_CONNECT;
 
@@ -1533,8 +1566,8 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
 	BT_DBG("requesting refresh of dst_addr");
 
 	conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER);
-	if (!conn)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(conn))
+		return conn;
 
 	if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) {
 		hci_conn_del(conn);
@@ -1581,8 +1614,8 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
 	acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
 	if (!acl) {
 		acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER);
-		if (!acl)
-			return ERR_PTR(-ENOMEM);
+		if (IS_ERR(acl))
+			return acl;
 	}
 
 	hci_conn_hold(acl);
@@ -1650,9 +1683,9 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
 	sco = hci_conn_hash_lookup_ba(hdev, type, dst);
 	if (!sco) {
 		sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER);
-		if (!sco) {
+		if (IS_ERR(sco)) {
 			hci_conn_drop(acl);
-			return ERR_PTR(-ENOMEM);
+			return sco;
 		}
 	}
 
@@ -1841,8 +1874,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
 				       qos->ucast.cis);
 	if (!cis) {
 		cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER);
-		if (!cis)
-			return ERR_PTR(-ENOMEM);
+		if (IS_ERR(cis))
+			return cis;
 		cis->cleanup = cis_cleanup;
 		cis->dst_type = dst_type;
 		cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET;
@@ -1977,14 +2010,8 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
 			      struct bt_iso_io_qos *qos, __u8 phy)
 {
 	/* Only set MTU if PHY is enabled */
-	if (!qos->sdu && qos->phy) {
-		if (hdev->iso_mtu > 0)
-			qos->sdu = hdev->iso_mtu;
-		else if (hdev->le_mtu > 0)
-			qos->sdu = hdev->le_mtu;
-		else
-			qos->sdu = hdev->acl_mtu;
-	}
+	if (!qos->sdu && qos->phy)
+		qos->sdu = conn->mtu;
 
 	/* Use the same PHY as ACL if set to any */
 	if (qos->phy == BT_ISO_PHY_ANY)
@@ -2065,8 +2092,8 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
 		return ERR_PTR(-EBUSY);
 
 	conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE);
-	if (!conn)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(conn))
+		return conn;
 
 	conn->iso_qos = *qos;
 	conn->state = BT_LISTEN;
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 122a6a9092e0..97c6f34ba640 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -954,6 +954,9 @@ static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data,
 	BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name, hdev->acl_mtu,
 	       hdev->acl_pkts, hdev->sco_mtu, hdev->sco_pkts);
 
+	if (!hdev->acl_mtu || !hdev->acl_pkts)
+		return HCI_ERROR_INVALID_PARAMETERS;
+
 	return rp->status;
 }
 
@@ -1263,6 +1266,9 @@ static u8 hci_cc_le_read_buffer_size(struct hci_dev *hdev, void *data,
 
 	BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts);
 
+	if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+		return HCI_ERROR_INVALID_PARAMETERS;
+
 	return rp->status;
 }
 
@@ -2341,8 +2347,8 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
 		if (!conn) {
 			conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr,
 						  HCI_ROLE_MASTER);
-			if (!conn)
-				bt_dev_err(hdev, "no memory for new connection");
+			if (IS_ERR(conn))
+				bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 		}
 	}
 
@@ -3153,8 +3159,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
 						      BDADDR_BREDR)) {
 			conn = hci_conn_add_unset(hdev, ev->link_type,
 						  &ev->bdaddr, HCI_ROLE_SLAVE);
-			if (!conn) {
-				bt_dev_err(hdev, "no memory for new conn");
+			if (IS_ERR(conn)) {
+				bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 				goto unlock;
 			}
 		} else {
@@ -3342,8 +3348,8 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data,
 	if (!conn) {
 		conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr,
 					  HCI_ROLE_SLAVE);
-		if (!conn) {
-			bt_dev_err(hdev, "no memory for new connection");
+		if (IS_ERR(conn)) {
+			bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 			goto unlock;
 		}
 	}
@@ -3820,6 +3826,9 @@ static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data,
 	BT_DBG("%s acl mtu %d:%d iso mtu %d:%d", hdev->name, hdev->acl_mtu,
 	       hdev->acl_pkts, hdev->iso_mtu, hdev->iso_pkts);
 
+	if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+		return HCI_ERROR_INVALID_PARAMETERS;
+
 	return rp->status;
 }
 
@@ -5768,8 +5777,8 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
 			goto unlock;
 
 		conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, role);
-		if (!conn) {
-			bt_dev_err(hdev, "no memory for new connection");
+		if (IS_ERR(conn)) {
+			bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
 			goto unlock;
 		}
 
@@ -6497,7 +6506,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data,
 	pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY,
 				     HCI_ROLE_SLAVE);
 
-	if (!pa_sync)
+	if (IS_ERR(pa_sync))
 		goto unlock;
 
 	pa_sync->sync_handle = le16_to_cpu(ev->handle);
@@ -6921,7 +6930,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
 	if (!cis) {
 		cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE,
 				   cis_handle);
-		if (!cis) {
+		if (IS_ERR(cis)) {
 			hci_le_reject_cis(hdev, ev->cis_handle);
 			goto unlock;
 		}
@@ -7030,7 +7039,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
 		if (!bis) {
 			bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY,
 					   HCI_ROLE_SLAVE, handle);
-			if (!bis)
+			if (IS_ERR(bis))
 				continue;
 		}
 
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index d0ef7290e5b7..00c0d8413c63 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1258,7 +1258,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg,
 		return -ENOTCONN;
 	}
 
-	mtu = iso_pi(sk)->conn->hcon->hdev->iso_mtu;
+	mtu = iso_pi(sk)->conn->hcon->mtu;
 
 	release_sock(sk);
 
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 443423538a4c..adc8f990f13e 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -6264,7 +6264,7 @@ static int l2cap_finish_move(struct l2cap_chan *chan)
 	BT_DBG("chan %p", chan);
 
 	chan->rx_state = L2CAP_RX_STATE_RECV;
-	chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+	chan->conn->mtu = chan->conn->hcon->mtu;
 
 	return l2cap_resegment(chan);
 }
@@ -6331,7 +6331,7 @@ static int l2cap_rx_state_wait_f(struct l2cap_chan *chan,
 	 */
 	chan->next_tx_seq = control->reqseq;
 	chan->unacked_frames = 0;
-	chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+	chan->conn->mtu = chan->conn->hcon->mtu;
 
 	err = l2cap_resegment(chan);
 
@@ -6889,18 +6889,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
 
 	BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan);
 
-	switch (hcon->type) {
-	case LE_LINK:
-		if (hcon->hdev->le_mtu) {
-			conn->mtu = hcon->hdev->le_mtu;
-			break;
-		}
-		fallthrough;
-	default:
-		conn->mtu = hcon->hdev->acl_mtu;
-		break;
-	}
-
+	conn->mtu = hcon->mtu;
 	conn->feat_mask = 0;
 
 	conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index e0ad30862ee4..71d36582d4ef 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -126,7 +126,6 @@ static void sco_sock_clear_timer(struct sock *sk)
 /* ---- SCO connections ---- */
 static struct sco_conn *sco_conn_add(struct hci_conn *hcon)
 {
-	struct hci_dev *hdev = hcon->hdev;
 	struct sco_conn *conn = hcon->sco_data;
 
 	if (conn) {
@@ -144,9 +143,10 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon)
 
 	hcon->sco_data = conn;
 	conn->hcon = hcon;
+	conn->mtu = hcon->mtu;
 
-	if (hdev->sco_mtu > 0)
-		conn->mtu = hdev->sco_mtu;
+	if (hcon->mtu > 0)
+		conn->mtu = hcon->mtu;
 	else
 		conn->mtu = 60;
 
-- 
cgit v1.2.3


From 84a4bb6548a29326564f0e659fb8064503ecc1c7 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 6 May 2024 18:33:52 -0400
Subject: Bluetooth: HCI: Remove HCI_AMP support

Since BT_HS has been remove HCI_AMP controllers no longer has any use so
remove it along with the capability of creating AMP controllers.

Since we no longer need to differentiate between AMP and Primary
controllers, as only HCI_PRIMARY is left, this also remove
hdev->dev_type altogether.

Fixes: e7b02296fb40 ("Bluetooth: Remove BT_HS")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btmrvl_main.c  |   9 ---
 drivers/bluetooth/btrsi.c        |   1 -
 drivers/bluetooth/btsdio.c       |   8 ---
 drivers/bluetooth/btusb.c        |   5 --
 drivers/bluetooth/hci_bcm4377.c  |   1 -
 drivers/bluetooth/hci_ldisc.c    |   6 --
 drivers/bluetooth/hci_serdev.c   |   5 --
 drivers/bluetooth/hci_uart.h     |   1 -
 drivers/bluetooth/hci_vhci.c     |  10 +--
 drivers/bluetooth/virtio_bt.c    |   2 -
 include/net/bluetooth/hci.h      | 114 ------------------------------
 include/net/bluetooth/hci_core.h |  46 +-----------
 include/uapi/linux/virtio_bt.h   |   1 -
 net/bluetooth/hci_conn.c         |   3 +-
 net/bluetooth/hci_core.c         | 132 +++--------------------------------
 net/bluetooth/hci_event.c        | 147 ---------------------------------------
 net/bluetooth/hci_sock.c         |   5 +-
 net/bluetooth/hci_sync.c         | 112 ++---------------------------
 net/bluetooth/l2cap_core.c       |  21 ++----
 net/bluetooth/mgmt.c             |  84 +++++++---------------
 20 files changed, 49 insertions(+), 664 deletions(-)

(limited to 'include')

diff --git a/drivers/bluetooth/btmrvl_main.c b/drivers/bluetooth/btmrvl_main.c
index 9658b33c824a..18f34998a120 100644
--- a/drivers/bluetooth/btmrvl_main.c
+++ b/drivers/bluetooth/btmrvl_main.c
@@ -121,13 +121,6 @@ int btmrvl_process_event(struct btmrvl_private *priv, struct sk_buff *skb)
 				((event->data[2] == MODULE_BROUGHT_UP) ||
 				(event->data[2] == MODULE_ALREADY_UP)) ?
 				"Bring-up succeed" : "Bring-up failed");
-
-			if (event->length > 3 && event->data[3])
-				priv->btmrvl_dev.dev_type = HCI_AMP;
-			else
-				priv->btmrvl_dev.dev_type = HCI_PRIMARY;
-
-			BT_DBG("dev_type: %d", priv->btmrvl_dev.dev_type);
 		} else if (priv->btmrvl_dev.sendcmdflag &&
 				event->data[1] == MODULE_SHUTDOWN_REQ) {
 			BT_DBG("EVENT:%s", (event->data[2]) ?
@@ -686,8 +679,6 @@ int btmrvl_register_hdev(struct btmrvl_private *priv)
 	hdev->wakeup = btmrvl_wakeup;
 	SET_HCIDEV_DEV(hdev, &card->func->dev);
 
-	hdev->dev_type = priv->btmrvl_dev.dev_type;
-
 	ret = hci_register_dev(hdev);
 	if (ret < 0) {
 		BT_ERR("Can not register HCI device");
diff --git a/drivers/bluetooth/btrsi.c b/drivers/bluetooth/btrsi.c
index 634cf8f5ed2d..0c91d7635ac3 100644
--- a/drivers/bluetooth/btrsi.c
+++ b/drivers/bluetooth/btrsi.c
@@ -134,7 +134,6 @@ static int rsi_hci_attach(void *priv, struct rsi_proto_ops *ops)
 		hdev->bus = HCI_USB;
 
 	hci_set_drvdata(hdev, h_adapter);
-	hdev->dev_type = HCI_PRIMARY;
 	hdev->open = rsi_hci_open;
 	hdev->close = rsi_hci_close;
 	hdev->flush = rsi_hci_flush;
diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
index f19d31ee37ea..fdcfe9c50313 100644
--- a/drivers/bluetooth/btsdio.c
+++ b/drivers/bluetooth/btsdio.c
@@ -32,9 +32,6 @@ static const struct sdio_device_id btsdio_table[] = {
 	/* Generic Bluetooth Type-B SDIO device */
 	{ SDIO_DEVICE_CLASS(SDIO_CLASS_BT_B) },
 
-	/* Generic Bluetooth AMP controller */
-	{ SDIO_DEVICE_CLASS(SDIO_CLASS_BT_AMP) },
-
 	{ }	/* Terminating entry */
 };
 
@@ -319,11 +316,6 @@ static int btsdio_probe(struct sdio_func *func,
 	hdev->bus = HCI_SDIO;
 	hci_set_drvdata(hdev, data);
 
-	if (id->class == SDIO_CLASS_BT_AMP)
-		hdev->dev_type = HCI_AMP;
-	else
-		hdev->dev_type = HCI_PRIMARY;
-
 	data->hdev = hdev;
 
 	SET_HCIDEV_DEV(hdev, &func->dev);
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 90d1f1697357..e384ef6ff050 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -4359,11 +4359,6 @@ static int btusb_probe(struct usb_interface *intf,
 	hdev->bus = HCI_USB;
 	hci_set_drvdata(hdev, data);
 
-	if (id->driver_info & BTUSB_AMP)
-		hdev->dev_type = HCI_AMP;
-	else
-		hdev->dev_type = HCI_PRIMARY;
-
 	data->hdev = hdev;
 
 	SET_HCIDEV_DEV(hdev, &intf->dev);
diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c
index 9a7243d5db71..0c2f15235b4c 100644
--- a/drivers/bluetooth/hci_bcm4377.c
+++ b/drivers/bluetooth/hci_bcm4377.c
@@ -2361,7 +2361,6 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	bcm4377->hdev = hdev;
 
 	hdev->bus = HCI_PCI;
-	hdev->dev_type = HCI_PRIMARY;
 	hdev->open = bcm4377_hci_open;
 	hdev->close = bcm4377_hci_close;
 	hdev->send = bcm4377_hci_send_frame;
diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index a26367e9fb19..17a2f158a0df 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -667,11 +667,6 @@ static int hci_uart_register_dev(struct hci_uart *hu)
 	if (!test_bit(HCI_UART_RESET_ON_INIT, &hu->hdev_flags))
 		set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks);
 
-	if (test_bit(HCI_UART_CREATE_AMP, &hu->hdev_flags))
-		hdev->dev_type = HCI_AMP;
-	else
-		hdev->dev_type = HCI_PRIMARY;
-
 	/* Only call open() for the protocol after hdev is fully initialized as
 	 * open() (or a timer/workqueue it starts) may attempt to reference it.
 	 */
@@ -722,7 +717,6 @@ static int hci_uart_set_flags(struct hci_uart *hu, unsigned long flags)
 {
 	unsigned long valid_flags = BIT(HCI_UART_RAW_DEVICE) |
 				    BIT(HCI_UART_RESET_ON_INIT) |
-				    BIT(HCI_UART_CREATE_AMP) |
 				    BIT(HCI_UART_INIT_PENDING) |
 				    BIT(HCI_UART_EXT_CONFIG) |
 				    BIT(HCI_UART_VND_DETECT);
diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c
index 85c0d9b68f5f..89a22e9b3253 100644
--- a/drivers/bluetooth/hci_serdev.c
+++ b/drivers/bluetooth/hci_serdev.c
@@ -366,11 +366,6 @@ int hci_uart_register_device_priv(struct hci_uart *hu,
 	if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags))
 		set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks);
 
-	if (test_bit(HCI_UART_CREATE_AMP, &hu->hdev_flags))
-		hdev->dev_type = HCI_AMP;
-	else
-		hdev->dev_type = HCI_PRIMARY;
-
 	if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags))
 		return 0;
 
diff --git a/drivers/bluetooth/hci_uart.h b/drivers/bluetooth/hci_uart.h
index 68c8c7e95d64..00bf7ae82c5b 100644
--- a/drivers/bluetooth/hci_uart.h
+++ b/drivers/bluetooth/hci_uart.h
@@ -37,7 +37,6 @@
 
 #define HCI_UART_RAW_DEVICE	0
 #define HCI_UART_RESET_ON_INIT	1
-#define HCI_UART_CREATE_AMP	2
 #define HCI_UART_INIT_PENDING	3
 #define HCI_UART_EXT_CONFIG	4
 #define HCI_UART_VND_DETECT	5
diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c
index 572d68d52965..28750a40f0ed 100644
--- a/drivers/bluetooth/hci_vhci.c
+++ b/drivers/bluetooth/hci_vhci.c
@@ -384,17 +384,10 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode)
 {
 	struct hci_dev *hdev;
 	struct sk_buff *skb;
-	__u8 dev_type;
 
 	if (data->hdev)
 		return -EBADFD;
 
-	/* bits 0-1 are dev_type (Primary or AMP) */
-	dev_type = opcode & 0x03;
-
-	if (dev_type != HCI_PRIMARY && dev_type != HCI_AMP)
-		return -EINVAL;
-
 	/* bits 2-5 are reserved (must be zero) */
 	if (opcode & 0x3c)
 		return -EINVAL;
@@ -412,7 +405,6 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode)
 	data->hdev = hdev;
 
 	hdev->bus = HCI_VIRTUAL;
-	hdev->dev_type = dev_type;
 	hci_set_drvdata(hdev, data);
 
 	hdev->open  = vhci_open_dev;
@@ -634,7 +626,7 @@ static void vhci_open_timeout(struct work_struct *work)
 	struct vhci_data *data = container_of(work, struct vhci_data,
 					      open_timeout.work);
 
-	vhci_create_device(data, amp ? HCI_AMP : HCI_PRIMARY);
+	vhci_create_device(data, 0x00);
 }
 
 static int vhci_open(struct inode *inode, struct file *file)
diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c
index 2ac70b560c46..18208e152a36 100644
--- a/drivers/bluetooth/virtio_bt.c
+++ b/drivers/bluetooth/virtio_bt.c
@@ -274,7 +274,6 @@ static int virtbt_probe(struct virtio_device *vdev)
 
 	switch (type) {
 	case VIRTIO_BT_CONFIG_TYPE_PRIMARY:
-	case VIRTIO_BT_CONFIG_TYPE_AMP:
 		break;
 	default:
 		return -EINVAL;
@@ -303,7 +302,6 @@ static int virtbt_probe(struct virtio_device *vdev)
 	vbt->hdev = hdev;
 
 	hdev->bus = HCI_VIRTIO;
-	hdev->dev_type = type;
 	hci_set_drvdata(hdev, vbt);
 
 	hdev->open  = virtbt_open;
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index de530262d824..fe932ca3bc8c 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -33,9 +33,6 @@
 #define HCI_MAX_FRAME_SIZE	(HCI_MAX_ACL_SIZE + 4)
 
 #define HCI_LINK_KEY_SIZE	16
-#define HCI_AMP_LINK_KEY_SIZE	(2 * HCI_LINK_KEY_SIZE)
-
-#define HCI_MAX_AMP_ASSOC_SIZE	672
 
 #define HCI_MAX_CPB_DATA_SIZE	252
 
@@ -71,26 +68,6 @@
 #define HCI_SMD		9
 #define HCI_VIRTIO	10
 
-/* HCI controller types */
-#define HCI_PRIMARY	0x00
-#define HCI_AMP		0x01
-
-/* First BR/EDR Controller shall have ID = 0 */
-#define AMP_ID_BREDR	0x00
-
-/* AMP controller types */
-#define AMP_TYPE_BREDR	0x00
-#define AMP_TYPE_80211	0x01
-
-/* AMP controller status */
-#define AMP_STATUS_POWERED_DOWN			0x00
-#define AMP_STATUS_BLUETOOTH_ONLY		0x01
-#define AMP_STATUS_NO_CAPACITY			0x02
-#define AMP_STATUS_LOW_CAPACITY			0x03
-#define AMP_STATUS_MEDIUM_CAPACITY		0x04
-#define AMP_STATUS_HIGH_CAPACITY		0x05
-#define AMP_STATUS_FULL_CAPACITY		0x06
-
 /* HCI device quirks */
 enum {
 	/* When this quirk is set, the HCI Reset command is send when
@@ -527,7 +504,6 @@ enum {
 #define ESCO_LINK	0x02
 /* Low Energy links do not have defined link type. Use invented one */
 #define LE_LINK		0x80
-#define AMP_LINK	0x81
 #define ISO_LINK	0x82
 #define INVALID_LINK	0xff
 
@@ -943,56 +919,6 @@ struct hci_cp_io_capability_neg_reply {
 	__u8     reason;
 } __packed;
 
-#define HCI_OP_CREATE_PHY_LINK		0x0435
-struct hci_cp_create_phy_link {
-	__u8     phy_handle;
-	__u8     key_len;
-	__u8     key_type;
-	__u8     key[HCI_AMP_LINK_KEY_SIZE];
-} __packed;
-
-#define HCI_OP_ACCEPT_PHY_LINK		0x0436
-struct hci_cp_accept_phy_link {
-	__u8     phy_handle;
-	__u8     key_len;
-	__u8     key_type;
-	__u8     key[HCI_AMP_LINK_KEY_SIZE];
-} __packed;
-
-#define HCI_OP_DISCONN_PHY_LINK		0x0437
-struct hci_cp_disconn_phy_link {
-	__u8     phy_handle;
-	__u8     reason;
-} __packed;
-
-struct ext_flow_spec {
-	__u8       id;
-	__u8       stype;
-	__le16     msdu;
-	__le32     sdu_itime;
-	__le32     acc_lat;
-	__le32     flush_to;
-} __packed;
-
-#define HCI_OP_CREATE_LOGICAL_LINK	0x0438
-#define HCI_OP_ACCEPT_LOGICAL_LINK	0x0439
-struct hci_cp_create_accept_logical_link {
-	__u8                  phy_handle;
-	struct ext_flow_spec  tx_flow_spec;
-	struct ext_flow_spec  rx_flow_spec;
-} __packed;
-
-#define HCI_OP_DISCONN_LOGICAL_LINK	0x043a
-struct hci_cp_disconn_logical_link {
-	__le16   log_handle;
-} __packed;
-
-#define HCI_OP_LOGICAL_LINK_CANCEL	0x043b
-struct hci_cp_logical_link_cancel {
-	__u8     phy_handle;
-	__u8     flow_spec_id;
-} __packed;
-
 #define HCI_OP_ENHANCED_SETUP_SYNC_CONN		0x043d
 struct hci_coding_format {
 	__u8	id;
@@ -1614,46 +1540,6 @@ struct hci_rp_read_enc_key_size {
 	__u8     key_size;
 } __packed;
 
-#define HCI_OP_READ_LOCAL_AMP_INFO	0x1409
-struct hci_rp_read_local_amp_info {
-	__u8     status;
-	__u8     amp_status;
-	__le32   total_bw;
-	__le32   max_bw;
-	__le32   min_latency;
-	__le32   max_pdu;
-	__u8     amp_type;
-	__le16   pal_cap;
-	__le16   max_assoc_size;
-	__le32   max_flush_to;
-	__le32   be_flush_to;
-} __packed;
-
-#define HCI_OP_READ_LOCAL_AMP_ASSOC	0x140a
-struct hci_cp_read_local_amp_assoc {
-	__u8     phy_handle;
-	__le16   len_so_far;
-	__le16   max_len;
-} __packed;
-struct hci_rp_read_local_amp_assoc {
-	__u8     status;
-	__u8     phy_handle;
-	__le16   rem_len;
-	__u8     frag[];
-} __packed;
-
-#define HCI_OP_WRITE_REMOTE_AMP_ASSOC	0x140b
-struct hci_cp_write_remote_amp_assoc {
-	__u8     phy_handle;
-	__le16   len_so_far;
-	__le16   rem_len;
-	__u8     frag[];
-} __packed;
-struct hci_rp_write_remote_amp_assoc {
-	__u8     status;
-	__u8     phy_handle;
-} __packed;
-
 #define HCI_OP_GET_MWS_TRANSPORT_CONFIG	0x140c
 
 #define HCI_OP_ENABLE_DUT_MODE		0x1803
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index be012f393bcb..431d0c8a0e27 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -126,7 +126,6 @@ enum suspended_state {
 struct hci_conn_hash {
 	struct list_head list;
 	unsigned int     acl_num;
-	unsigned int     amp_num;
 	unsigned int     sco_num;
 	unsigned int     iso_num;
 	unsigned int     le_num;
@@ -341,14 +340,6 @@ struct adv_monitor {
 /* Default authenticated payload timeout 30s */
 #define DEFAULT_AUTH_PAYLOAD_TIMEOUT   0x0bb8
 
-struct amp_assoc {
-	__u16	len;
-	__u16	offset;
-	__u16	rem_len;
-	__u16	len_so_far;
-	__u8	data[HCI_MAX_AMP_ASSOC_SIZE];
-};
-
 #define HCI_MAX_PAGES	3
 
 struct hci_dev {
@@ -361,7 +352,6 @@ struct hci_dev {
 	unsigned long	flags;
 	__u16		id;
 	__u8		bus;
-	__u8		dev_type;
 	bdaddr_t	bdaddr;
 	bdaddr_t	setup_addr;
 	bdaddr_t	public_addr;
@@ -467,21 +457,6 @@ struct hci_dev {
 	__u16		sniff_min_interval;
 	__u16		sniff_max_interval;
 
-	__u8		amp_status;
-	__u32		amp_total_bw;
-	__u32		amp_max_bw;
-	__u32		amp_min_latency;
-	__u32		amp_max_pdu;
-	__u8		amp_type;
-	__u16		amp_pal_cap;
-	__u16		amp_assoc_size;
-	__u32		amp_max_flush_to;
-	__u32		amp_be_flush_to;
-
-	struct amp_assoc	loc_assoc;
-
-	__u8		flow_ctl_mode;
-
 	unsigned int	auto_accept_delay;
 
 	unsigned long	quirks;
@@ -501,11 +476,6 @@ struct hci_dev {
 	unsigned int	le_pkts;
 	unsigned int	iso_pkts;
 
-	__u16		block_len;
-	__u16		block_mtu;
-	__u16		num_blocks;
-	__u16		block_cnt;
-
 	unsigned long	acl_last_tx;
 	unsigned long	sco_last_tx;
 	unsigned long	le_last_tx;
@@ -778,7 +748,6 @@ struct hci_conn {
 	void		*l2cap_data;
 	void		*sco_data;
 	void		*iso_data;
-	struct amp_mgr	*amp_mgr;
 
 	struct list_head link_list;
 	struct hci_conn	*parent;
@@ -805,7 +774,6 @@ struct hci_chan {
 	struct sk_buff_head data_q;
 	unsigned int	sent;
 	__u8		state;
-	bool		amp;
 };
 
 struct hci_conn_params {
@@ -1014,9 +982,6 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c)
 	case ACL_LINK:
 		h->acl_num++;
 		break;
-	case AMP_LINK:
-		h->amp_num++;
-		break;
 	case LE_LINK:
 		h->le_num++;
 		if (c->role == HCI_ROLE_SLAVE)
@@ -1043,9 +1008,6 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c)
 	case ACL_LINK:
 		h->acl_num--;
 		break;
-	case AMP_LINK:
-		h->amp_num--;
-		break;
 	case LE_LINK:
 		h->le_num--;
 		if (c->role == HCI_ROLE_SLAVE)
@@ -1067,8 +1029,6 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type)
 	switch (type) {
 	case ACL_LINK:
 		return h->acl_num;
-	case AMP_LINK:
-		return h->amp_num;
 	case LE_LINK:
 		return h->le_num;
 	case SCO_LINK:
@@ -1085,7 +1045,7 @@ static inline unsigned int hci_conn_count(struct hci_dev *hdev)
 {
 	struct hci_conn_hash *c = &hdev->conn_hash;
 
-	return c->acl_num + c->amp_num + c->sco_num + c->le_num + c->iso_num;
+	return c->acl_num + c->sco_num + c->le_num + c->iso_num;
 }
 
 static inline bool hci_conn_valid(struct hci_dev *hdev, struct hci_conn *conn)
@@ -1610,10 +1570,6 @@ static inline void hci_conn_drop(struct hci_conn *conn)
 			}
 			break;
 
-		case AMP_LINK:
-			timeo = conn->disc_timeout;
-			break;
-
 		default:
 			timeo = 0;
 			break;
diff --git a/include/uapi/linux/virtio_bt.h b/include/uapi/linux/virtio_bt.h
index af798f4c9680..3cc7d633456b 100644
--- a/include/uapi/linux/virtio_bt.h
+++ b/include/uapi/linux/virtio_bt.h
@@ -13,7 +13,6 @@
 
 enum virtio_bt_config_type {
 	VIRTIO_BT_CONFIG_TYPE_PRIMARY	= 0,
-	VIRTIO_BT_CONFIG_TYPE_AMP	= 1,
 };
 
 enum virtio_bt_config_vendor {
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index f21e7c73021d..0c76dcde5361 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1168,8 +1168,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
 
 	list_for_each_entry(d, &hci_dev_list, list) {
 		if (!test_bit(HCI_UP, &d->flags) ||
-		    hci_dev_test_flag(d, HCI_USER_CHANNEL) ||
-		    d->dev_type != HCI_PRIMARY)
+		    hci_dev_test_flag(d, HCI_USER_CHANNEL))
 			continue;
 
 		/* Simple routing:
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index b52125512985..7bc431af7b04 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -402,11 +402,6 @@ int hci_inquiry(void __user *arg)
 		goto done;
 	}
 
-	if (hdev->dev_type != HCI_PRIMARY) {
-		err = -EOPNOTSUPP;
-		goto done;
-	}
-
 	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
 		err = -EOPNOTSUPP;
 		goto done;
@@ -759,11 +754,6 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
 		goto done;
 	}
 
-	if (hdev->dev_type != HCI_PRIMARY) {
-		err = -EOPNOTSUPP;
-		goto done;
-	}
-
 	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
 		err = -EOPNOTSUPP;
 		goto done;
@@ -917,7 +907,7 @@ int hci_get_dev_info(void __user *arg)
 
 	strscpy(di.name, hdev->name, sizeof(di.name));
 	di.bdaddr   = hdev->bdaddr;
-	di.type     = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4);
+	di.type     = (hdev->bus & 0x0f);
 	di.flags    = flags;
 	di.pkt_type = hdev->pkt_type;
 	if (lmp_bredr_capable(hdev)) {
@@ -1033,8 +1023,7 @@ static void hci_power_on(struct work_struct *work)
 	 */
 	if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
 	    hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
-	    (hdev->dev_type == HCI_PRIMARY &&
-	     !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+	    (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
 	     !bacmp(&hdev->static_addr, BDADDR_ANY))) {
 		hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
 		hci_dev_do_close(hdev);
@@ -2642,21 +2631,7 @@ int hci_register_dev(struct hci_dev *hdev)
 	if (!hdev->open || !hdev->close || !hdev->send)
 		return -EINVAL;
 
-	/* Do not allow HCI_AMP devices to register at index 0,
-	 * so the index can be used as the AMP controller ID.
-	 */
-	switch (hdev->dev_type) {
-	case HCI_PRIMARY:
-		id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL);
-		break;
-	case HCI_AMP:
-		id = ida_alloc_range(&hci_index_ida, 1, HCI_MAX_ID - 1,
-				     GFP_KERNEL);
-		break;
-	default:
-		return -EINVAL;
-	}
-
+	id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL);
 	if (id < 0)
 		return id;
 
@@ -2708,12 +2683,10 @@ int hci_register_dev(struct hci_dev *hdev)
 	hci_dev_set_flag(hdev, HCI_SETUP);
 	hci_dev_set_flag(hdev, HCI_AUTO_OFF);
 
-	if (hdev->dev_type == HCI_PRIMARY) {
-		/* Assume BR/EDR support until proven otherwise (such as
-		 * through reading supported features during init.
-		 */
-		hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
-	}
+	/* Assume BR/EDR support until proven otherwise (such as
+	 * through reading supported features during init.
+	 */
+	hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
 
 	write_lock(&hci_dev_list_lock);
 	list_add(&hdev->list, &hci_dev_list);
@@ -3249,17 +3222,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
 
 	hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
 
-	switch (hdev->dev_type) {
-	case HCI_PRIMARY:
-		hci_add_acl_hdr(skb, conn->handle, flags);
-		break;
-	case HCI_AMP:
-		hci_add_acl_hdr(skb, chan->handle, flags);
-		break;
-	default:
-		bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type);
-		return;
-	}
+	hci_add_acl_hdr(skb, conn->handle, flags);
 
 	list = skb_shinfo(skb)->frag_list;
 	if (!list) {
@@ -3419,9 +3382,6 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
 	case ACL_LINK:
 		cnt = hdev->acl_cnt;
 		break;
-	case AMP_LINK:
-		cnt = hdev->block_cnt;
-		break;
 	case SCO_LINK:
 	case ESCO_LINK:
 		cnt = hdev->sco_cnt;
@@ -3619,12 +3579,6 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type)
 
 }
 
-static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb)
-{
-	/* Calculate count of blocks used by this packet */
-	return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len);
-}
-
 static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type)
 {
 	unsigned long last_tx;
@@ -3738,81 +3692,15 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev)
 		hci_prio_recalculate(hdev, ACL_LINK);
 }
 
-static void hci_sched_acl_blk(struct hci_dev *hdev)
-{
-	unsigned int cnt = hdev->block_cnt;
-	struct hci_chan *chan;
-	struct sk_buff *skb;
-	int quote;
-	u8 type;
-
-	BT_DBG("%s", hdev->name);
-
-	if (hdev->dev_type == HCI_AMP)
-		type = AMP_LINK;
-	else
-		type = ACL_LINK;
-
-	__check_timeout(hdev, cnt, type);
-
-	while (hdev->block_cnt > 0 &&
-	       (chan = hci_chan_sent(hdev, type, &quote))) {
-		u32 priority = (skb_peek(&chan->data_q))->priority;
-		while (quote > 0 && (skb = skb_peek(&chan->data_q))) {
-			int blocks;
-
-			BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
-			       skb->len, skb->priority);
-
-			/* Stop if priority has changed */
-			if (skb->priority < priority)
-				break;
-
-			skb = skb_dequeue(&chan->data_q);
-
-			blocks = __get_blocks(hdev, skb);
-			if (blocks > hdev->block_cnt)
-				return;
-
-			hci_conn_enter_active_mode(chan->conn,
-						   bt_cb(skb)->force_active);
-
-			hci_send_frame(hdev, skb);
-			hdev->acl_last_tx = jiffies;
-
-			hdev->block_cnt -= blocks;
-			quote -= blocks;
-
-			chan->sent += blocks;
-			chan->conn->sent += blocks;
-		}
-	}
-
-	if (cnt != hdev->block_cnt)
-		hci_prio_recalculate(hdev, type);
-}
-
 static void hci_sched_acl(struct hci_dev *hdev)
 {
 	BT_DBG("%s", hdev->name);
 
 	/* No ACL link over BR/EDR controller */
-	if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY)
-		return;
-
-	/* No AMP link over AMP controller */
-	if (!hci_conn_num(hdev, AMP_LINK) && hdev->dev_type == HCI_AMP)
+	if (!hci_conn_num(hdev, ACL_LINK))
 		return;
 
-	switch (hdev->flow_ctl_mode) {
-	case HCI_FLOW_CTL_MODE_PACKET_BASED:
-		hci_sched_acl_pkt(hdev);
-		break;
-
-	case HCI_FLOW_CTL_MODE_BLOCK_BASED:
-		hci_sched_acl_blk(hdev);
-		break;
-	}
+	hci_sched_acl_pkt(hdev);
 }
 
 static void hci_sched_le(struct hci_dev *hdev)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 97c6f34ba640..a487f9df8145 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -913,21 +913,6 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data,
 	return rp->status;
 }
 
-static u8 hci_cc_read_flow_control_mode(struct hci_dev *hdev, void *data,
-					struct sk_buff *skb)
-{
-	struct hci_rp_read_flow_control_mode *rp = data;
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
-
-	if (rp->status)
-		return rp->status;
-
-	hdev->flow_ctl_mode = rp->mode;
-
-	return rp->status;
-}
-
 static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data,
 				  struct sk_buff *skb)
 {
@@ -1071,28 +1056,6 @@ static u8 hci_cc_write_page_scan_type(struct hci_dev *hdev, void *data,
 	return rp->status;
 }
 
-static u8 hci_cc_read_data_block_size(struct hci_dev *hdev, void *data,
-				      struct sk_buff *skb)
-{
-	struct hci_rp_read_data_block_size *rp = data;
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
-
-	if (rp->status)
-		return rp->status;
-
-	hdev->block_mtu = __le16_to_cpu(rp->max_acl_len);
-	hdev->block_len = __le16_to_cpu(rp->block_len);
-	hdev->num_blocks = __le16_to_cpu(rp->num_blocks);
-
-	hdev->block_cnt = hdev->num_blocks;
-
-	BT_DBG("%s blk mtu %d cnt %d len %d", hdev->name, hdev->block_mtu,
-	       hdev->block_cnt, hdev->block_len);
-
-	return rp->status;
-}
-
 static u8 hci_cc_read_clock(struct hci_dev *hdev, void *data,
 			    struct sk_buff *skb)
 {
@@ -1127,30 +1090,6 @@ unlock:
 	return rp->status;
 }
 
-static u8 hci_cc_read_local_amp_info(struct hci_dev *hdev, void *data,
-				     struct sk_buff *skb)
-{
-	struct hci_rp_read_local_amp_info *rp = data;
-
-	bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
-
-	if (rp->status)
-		return rp->status;
-
-	hdev->amp_status = rp->amp_status;
-	hdev->amp_total_bw = __le32_to_cpu(rp->total_bw);
-	hdev->amp_max_bw = __le32_to_cpu(rp->max_bw);
-	hdev->amp_min_latency = __le32_to_cpu(rp->min_latency);
-	hdev->amp_max_pdu = __le32_to_cpu(rp->max_pdu);
-	hdev->amp_type = rp->amp_type;
-	hdev->amp_pal_cap = __le16_to_cpu(rp->pal_cap);
-	hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size);
-	hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to);
-	hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to);
-
-	return rp->status;
-}
-
 static u8 hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, void *data,
 				       struct sk_buff *skb)
 {
@@ -4120,12 +4059,6 @@ static const struct hci_cc {
 	HCI_CC(HCI_OP_READ_PAGE_SCAN_TYPE, hci_cc_read_page_scan_type,
 	       sizeof(struct hci_rp_read_page_scan_type)),
 	HCI_CC_STATUS(HCI_OP_WRITE_PAGE_SCAN_TYPE, hci_cc_write_page_scan_type),
-	HCI_CC(HCI_OP_READ_DATA_BLOCK_SIZE, hci_cc_read_data_block_size,
-	       sizeof(struct hci_rp_read_data_block_size)),
-	HCI_CC(HCI_OP_READ_FLOW_CONTROL_MODE, hci_cc_read_flow_control_mode,
-	       sizeof(struct hci_rp_read_flow_control_mode)),
-	HCI_CC(HCI_OP_READ_LOCAL_AMP_INFO, hci_cc_read_local_amp_info,
-	       sizeof(struct hci_rp_read_local_amp_info)),
 	HCI_CC(HCI_OP_READ_CLOCK, hci_cc_read_clock,
 	       sizeof(struct hci_rp_read_clock)),
 	HCI_CC(HCI_OP_READ_ENC_KEY_SIZE, hci_cc_read_enc_key_size,
@@ -4461,11 +4394,6 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
 			     flex_array_size(ev, handles, ev->num)))
 		return;
 
-	if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_PACKET_BASED) {
-		bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode);
-		return;
-	}
-
 	bt_dev_dbg(hdev, "num %d", ev->num);
 
 	for (i = 0; i < ev->num; i++) {
@@ -4533,78 +4461,6 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
 	queue_work(hdev->workqueue, &hdev->tx_work);
 }
 
-static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev,
-						 __u16 handle)
-{
-	struct hci_chan *chan;
-
-	switch (hdev->dev_type) {
-	case HCI_PRIMARY:
-		return hci_conn_hash_lookup_handle(hdev, handle);
-	case HCI_AMP:
-		chan = hci_chan_lookup_handle(hdev, handle);
-		if (chan)
-			return chan->conn;
-		break;
-	default:
-		bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type);
-		break;
-	}
-
-	return NULL;
-}
-
-static void hci_num_comp_blocks_evt(struct hci_dev *hdev, void *data,
-				    struct sk_buff *skb)
-{
-	struct hci_ev_num_comp_blocks *ev = data;
-	int i;
-
-	if (!hci_ev_skb_pull(hdev, skb, HCI_EV_NUM_COMP_BLOCKS,
-			     flex_array_size(ev, handles, ev->num_hndl)))
-		return;
-
-	if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_BLOCK_BASED) {
-		bt_dev_err(hdev, "wrong event for mode %d",
-			   hdev->flow_ctl_mode);
-		return;
-	}
-
-	bt_dev_dbg(hdev, "num_blocks %d num_hndl %d", ev->num_blocks,
-		   ev->num_hndl);
-
-	for (i = 0; i < ev->num_hndl; i++) {
-		struct hci_comp_blocks_info *info = &ev->handles[i];
-		struct hci_conn *conn = NULL;
-		__u16  handle, block_count;
-
-		handle = __le16_to_cpu(info->handle);
-		block_count = __le16_to_cpu(info->blocks);
-
-		conn = __hci_conn_lookup_handle(hdev, handle);
-		if (!conn)
-			continue;
-
-		conn->sent -= block_count;
-
-		switch (conn->type) {
-		case ACL_LINK:
-		case AMP_LINK:
-			hdev->block_cnt += block_count;
-			if (hdev->block_cnt > hdev->num_blocks)
-				hdev->block_cnt = hdev->num_blocks;
-			break;
-
-		default:
-			bt_dev_err(hdev, "unknown type %d conn %p",
-				   conn->type, conn);
-			break;
-		}
-	}
-
-	queue_work(hdev->workqueue, &hdev->tx_work);
-}
-
 static void hci_mode_change_evt(struct hci_dev *hdev, void *data,
 				struct sk_buff *skb)
 {
@@ -7520,9 +7376,6 @@ static const struct hci_ev {
 	/* [0x3e = HCI_EV_LE_META] */
 	HCI_EV_REQ_VL(HCI_EV_LE_META, hci_le_meta_evt,
 		      sizeof(struct hci_ev_le_meta), HCI_MAX_EVENT_SIZE),
-	/* [0x48 = HCI_EV_NUM_COMP_BLOCKS] */
-	HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt,
-	       sizeof(struct hci_ev_num_comp_blocks)),
 	/* [0xff = HCI_EV_VENDOR] */
 	HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE),
 };
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 703b84bd48d5..69c2ba1e843e 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -485,7 +485,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
 			return NULL;
 
 		ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE);
-		ni->type = hdev->dev_type;
+		ni->type = 0x00; /* Old hdev->dev_type */
 		ni->bus = hdev->bus;
 		bacpy(&ni->bdaddr, &hdev->bdaddr);
 		memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name,
@@ -1007,9 +1007,6 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
 	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
 		return -EOPNOTSUPP;
 
-	if (hdev->dev_type != HCI_PRIMARY)
-		return -EOPNOTSUPP;
-
 	switch (cmd) {
 	case HCISETRAW:
 		if (!capable(CAP_NET_ADMIN))
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 813c11d61bca..891cae8a30da 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -3511,10 +3511,6 @@ static int hci_unconf_init_sync(struct hci_dev *hdev)
 /* Read Local Supported Features. */
 static int hci_read_local_features_sync(struct hci_dev *hdev)
 {
-	 /* Not all AMP controllers support this command */
-	if (hdev->dev_type == HCI_AMP && !(hdev->commands[14] & 0x20))
-		return 0;
-
 	return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES,
 				     0, NULL, HCI_CMD_TIMEOUT);
 }
@@ -3549,51 +3545,6 @@ static int hci_read_local_cmds_sync(struct hci_dev *hdev)
 	return 0;
 }
 
-/* Read Local AMP Info */
-static int hci_read_local_amp_info_sync(struct hci_dev *hdev)
-{
-	return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_AMP_INFO,
-				     0, NULL, HCI_CMD_TIMEOUT);
-}
-
-/* Read Data Blk size */
-static int hci_read_data_block_size_sync(struct hci_dev *hdev)
-{
-	return __hci_cmd_sync_status(hdev, HCI_OP_READ_DATA_BLOCK_SIZE,
-				     0, NULL, HCI_CMD_TIMEOUT);
-}
-
-/* Read Flow Control Mode */
-static int hci_read_flow_control_mode_sync(struct hci_dev *hdev)
-{
-	return __hci_cmd_sync_status(hdev, HCI_OP_READ_FLOW_CONTROL_MODE,
-				     0, NULL, HCI_CMD_TIMEOUT);
-}
-
-/* Read Location Data */
-static int hci_read_location_data_sync(struct hci_dev *hdev)
-{
-	return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCATION_DATA,
-				     0, NULL, HCI_CMD_TIMEOUT);
-}
-
-/* AMP Controller init stage 1 command sequence */
-static const struct hci_init_stage amp_init1[] = {
-	/* HCI_OP_READ_LOCAL_VERSION */
-	HCI_INIT(hci_read_local_version_sync),
-	/* HCI_OP_READ_LOCAL_COMMANDS */
-	HCI_INIT(hci_read_local_cmds_sync),
-	/* HCI_OP_READ_LOCAL_AMP_INFO */
-	HCI_INIT(hci_read_local_amp_info_sync),
-	/* HCI_OP_READ_DATA_BLOCK_SIZE */
-	HCI_INIT(hci_read_data_block_size_sync),
-	/* HCI_OP_READ_FLOW_CONTROL_MODE */
-	HCI_INIT(hci_read_flow_control_mode_sync),
-	/* HCI_OP_READ_LOCATION_DATA */
-	HCI_INIT(hci_read_location_data_sync),
-	{}
-};
-
 static int hci_init1_sync(struct hci_dev *hdev)
 {
 	int err;
@@ -3607,28 +3558,9 @@ static int hci_init1_sync(struct hci_dev *hdev)
 			return err;
 	}
 
-	switch (hdev->dev_type) {
-	case HCI_PRIMARY:
-		hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_PACKET_BASED;
-		return hci_init_stage_sync(hdev, br_init1);
-	case HCI_AMP:
-		hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED;
-		return hci_init_stage_sync(hdev, amp_init1);
-	default:
-		bt_dev_err(hdev, "Unknown device type %d", hdev->dev_type);
-		break;
-	}
-
-	return 0;
+	return hci_init_stage_sync(hdev, br_init1);
 }
 
-/* AMP Controller init stage 2 command sequence */
-static const struct hci_init_stage amp_init2[] = {
-	/* HCI_OP_READ_LOCAL_FEATURES */
-	HCI_INIT(hci_read_local_features_sync),
-	{}
-};
-
 /* Read Buffer Size (ACL mtu, max pkt, etc.) */
 static int hci_read_buffer_size_sync(struct hci_dev *hdev)
 {
@@ -3886,9 +3818,6 @@ static int hci_init2_sync(struct hci_dev *hdev)
 
 	bt_dev_dbg(hdev, "");
 
-	if (hdev->dev_type == HCI_AMP)
-		return hci_init_stage_sync(hdev, amp_init2);
-
 	err = hci_init_stage_sync(hdev, hci_init2);
 	if (err)
 		return err;
@@ -4716,13 +4645,6 @@ static int hci_init_sync(struct hci_dev *hdev)
 	if (err < 0)
 		return err;
 
-	/* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode
-	 * BR/EDR/LE type controllers. AMP controllers only need the
-	 * first two stages of init.
-	 */
-	if (hdev->dev_type != HCI_PRIMARY)
-		return 0;
-
 	err = hci_init3_sync(hdev);
 	if (err < 0)
 		return err;
@@ -4951,12 +4873,8 @@ int hci_dev_open_sync(struct hci_dev *hdev)
 		 * In case of user channel usage, it is not important
 		 * if a public address or static random address is
 		 * available.
-		 *
-		 * This check is only valid for BR/EDR controllers
-		 * since AMP controllers do not have an address.
 		 */
 		if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
-		    hdev->dev_type == HCI_PRIMARY &&
 		    !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
 		    !bacmp(&hdev->static_addr, BDADDR_ANY)) {
 			ret = -EADDRNOTAVAIL;
@@ -4991,8 +4909,7 @@ int hci_dev_open_sync(struct hci_dev *hdev)
 		    !hci_dev_test_flag(hdev, HCI_CONFIG) &&
 		    !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
 		    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
-		    hci_dev_test_flag(hdev, HCI_MGMT) &&
-		    hdev->dev_type == HCI_PRIMARY) {
+		    hci_dev_test_flag(hdev, HCI_MGMT)) {
 			ret = hci_powered_update_sync(hdev);
 			mgmt_power_on(hdev, ret);
 		}
@@ -5137,8 +5054,7 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 
 	auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
 
-	if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
-	    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+	if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
 	    hci_dev_test_flag(hdev, HCI_MGMT))
 		__mgmt_power_off(hdev);
 
@@ -5200,9 +5116,6 @@ int hci_dev_close_sync(struct hci_dev *hdev)
 	hdev->flags &= BIT(HCI_RAW);
 	hci_dev_clear_volatile_flags(hdev);
 
-	/* Controller radio is available but is currently powered down */
-	hdev->amp_status = AMP_STATUS_POWERED_DOWN;
-
 	memset(hdev->eir, 0, sizeof(hdev->eir));
 	memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
 	bacpy(&hdev->random_addr, BDADDR_ANY);
@@ -5239,8 +5152,7 @@ static int hci_power_on_sync(struct hci_dev *hdev)
 	 */
 	if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
 	    hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
-	    (hdev->dev_type == HCI_PRIMARY &&
-	     !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+	    (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
 	     !bacmp(&hdev->static_addr, BDADDR_ANY))) {
 		hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
 		hci_dev_close_sync(hdev);
@@ -5342,27 +5254,11 @@ int hci_stop_discovery_sync(struct hci_dev *hdev)
 	return 0;
 }
 
-static int hci_disconnect_phy_link_sync(struct hci_dev *hdev, u16 handle,
-					u8 reason)
-{
-	struct hci_cp_disconn_phy_link cp;
-
-	memset(&cp, 0, sizeof(cp));
-	cp.phy_handle = HCI_PHY_HANDLE(handle);
-	cp.reason = reason;
-
-	return __hci_cmd_sync_status(hdev, HCI_OP_DISCONN_PHY_LINK,
-				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
-}
-
 static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn,
 			       u8 reason)
 {
 	struct hci_cp_disconnect cp;
 
-	if (conn->type == AMP_LINK)
-		return hci_disconnect_phy_link_sync(hdev, conn->handle, reason);
-
 	if (test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) {
 		/* This is a BIS connection, hci_conn_del will
 		 * do the necessary cleanup.
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index adc8f990f13e..5b509b767557 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -3932,7 +3932,7 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn,
 }
 
 static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
-			  u8 *data, u8 rsp_code, u8 amp_id)
+			  u8 *data, u8 rsp_code)
 {
 	struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
 	struct l2cap_conn_rsp rsp;
@@ -4011,17 +4011,8 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
 				status = L2CAP_CS_AUTHOR_PEND;
 				chan->ops->defer(chan);
 			} else {
-				/* Force pending result for AMP controllers.
-				 * The connection will succeed after the
-				 * physical link is up.
-				 */
-				if (amp_id == AMP_ID_BREDR) {
-					l2cap_state_change(chan, BT_CONFIG);
-					result = L2CAP_CR_SUCCESS;
-				} else {
-					l2cap_state_change(chan, BT_CONNECT2);
-					result = L2CAP_CR_PEND;
-				}
+				l2cap_state_change(chan, BT_CONNECT2);
+				result = L2CAP_CR_PEND;
 				status = L2CAP_CS_NO_INFO;
 			}
 		} else {
@@ -4086,7 +4077,7 @@ static int l2cap_connect_req(struct l2cap_conn *conn,
 		mgmt_device_connected(hdev, hcon, NULL, 0);
 	hci_dev_unlock(hdev);
 
-	l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP, 0);
+	l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP);
 	return 0;
 }
 
@@ -7491,10 +7482,6 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
 	struct l2cap_conn *conn = hcon->l2cap_data;
 	int len;
 
-	/* For AMP controller do not create l2cap conn */
-	if (!conn && hcon->hdev->dev_type != HCI_PRIMARY)
-		goto drop;
-
 	if (!conn)
 		conn = l2cap_conn_add(hcon);
 
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 965f621ef865..80f220b7e19d 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -443,8 +443,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
 
 	count = 0;
 	list_for_each_entry(d, &hci_dev_list, list) {
-		if (d->dev_type == HCI_PRIMARY &&
-		    !hci_dev_test_flag(d, HCI_UNCONFIGURED))
+		if (!hci_dev_test_flag(d, HCI_UNCONFIGURED))
 			count++;
 	}
 
@@ -468,8 +467,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
 		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
 			continue;
 
-		if (d->dev_type == HCI_PRIMARY &&
-		    !hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+		if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
 			rp->index[count++] = cpu_to_le16(d->id);
 			bt_dev_dbg(hdev, "Added hci%u", d->id);
 		}
@@ -503,8 +501,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
 
 	count = 0;
 	list_for_each_entry(d, &hci_dev_list, list) {
-		if (d->dev_type == HCI_PRIMARY &&
-		    hci_dev_test_flag(d, HCI_UNCONFIGURED))
+		if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
 			count++;
 	}
 
@@ -528,8 +525,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
 		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
 			continue;
 
-		if (d->dev_type == HCI_PRIMARY &&
-		    hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+		if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
 			rp->index[count++] = cpu_to_le16(d->id);
 			bt_dev_dbg(hdev, "Added hci%u", d->id);
 		}
@@ -561,10 +557,8 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
 	read_lock(&hci_dev_list_lock);
 
 	count = 0;
-	list_for_each_entry(d, &hci_dev_list, list) {
-		if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP)
-			count++;
-	}
+	list_for_each_entry(d, &hci_dev_list, list)
+		count++;
 
 	rp = kmalloc(struct_size(rp, entry, count), GFP_ATOMIC);
 	if (!rp) {
@@ -585,16 +579,10 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
 		if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
 			continue;
 
-		if (d->dev_type == HCI_PRIMARY) {
-			if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
-				rp->entry[count].type = 0x01;
-			else
-				rp->entry[count].type = 0x00;
-		} else if (d->dev_type == HCI_AMP) {
-			rp->entry[count].type = 0x02;
-		} else {
-			continue;
-		}
+		if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
+			rp->entry[count].type = 0x01;
+		else
+			rp->entry[count].type = 0x00;
 
 		rp->entry[count].bus = d->bus;
 		rp->entry[count++].index = cpu_to_le16(d->id);
@@ -9331,23 +9319,14 @@ void mgmt_index_added(struct hci_dev *hdev)
 	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
 		return;
 
-	switch (hdev->dev_type) {
-	case HCI_PRIMARY:
-		if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
-			mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev,
-					 NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
-			ev.type = 0x01;
-		} else {
-			mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
-					 HCI_MGMT_INDEX_EVENTS);
-			ev.type = 0x00;
-		}
-		break;
-	case HCI_AMP:
-		ev.type = 0x02;
-		break;
-	default:
-		return;
+	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+		mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0,
+				 HCI_MGMT_UNCONF_INDEX_EVENTS);
+		ev.type = 0x01;
+	} else {
+		mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
+				 HCI_MGMT_INDEX_EVENTS);
+		ev.type = 0x00;
 	}
 
 	ev.bus = hdev->bus;
@@ -9364,25 +9343,16 @@ void mgmt_index_removed(struct hci_dev *hdev)
 	if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
 		return;
 
-	switch (hdev->dev_type) {
-	case HCI_PRIMARY:
-		mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
+	mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
 
-		if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
-			mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev,
-					 NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
-			ev.type = 0x01;
-		} else {
-			mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
-					 HCI_MGMT_INDEX_EVENTS);
-			ev.type = 0x00;
-		}
-		break;
-	case HCI_AMP:
-		ev.type = 0x02;
-		break;
-	default:
-		return;
+	if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+		mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0,
+				 HCI_MGMT_UNCONF_INDEX_EVENTS);
+		ev.type = 0x01;
+	} else {
+		mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
+				 HCI_MGMT_INDEX_EVENTS);
+		ev.type = 0x00;
 	}
 
 	ev.bus = hdev->bus;
-- 
cgit v1.2.3


From e77f43d531af41e9ce299eab10dcae8fa5dbc293 Mon Sep 17 00:00:00 2001
From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
Date: Mon, 13 May 2024 16:07:55 -0400
Subject: Bluetooth: hci_core: Fix not handling hdev->le_num_of_adv_sets=1

If hdev->le_num_of_adv_sets is set to 1 it means that only handle 0x00
can be used, but since the MGMT interface instances start from 1
(instance 0 means all instances in case of MGMT_OP_REMOVE_ADVERTISING)
the code needs to map the instance to handle otherwise users will not be
able to advertise as instance 1 would attempt to use handle 0x01.

Fixes: 1d0fac2c38ed ("Bluetooth: Use controller sets when available")
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 include/net/bluetooth/hci_core.h |  1 +
 net/bluetooth/hci_core.c         |  9 +++++++++
 net/bluetooth/hci_sync.c         | 17 ++++++++---------
 3 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 431d0c8a0e27..9231396fe96f 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -246,6 +246,7 @@ struct adv_info {
 	bool	periodic;
 	__u8	mesh;
 	__u8	instance;
+	__u8	handle;
 	__u32	flags;
 	__u16	timeout;
 	__u16	remaining_time;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 7bc431af7b04..dd3b0f501018 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1765,6 +1765,15 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
 
 		adv->pending = true;
 		adv->instance = instance;
+
+		/* If controller support only one set and the instance is set to
+		 * 1 then there is no option other than using handle 0x00.
+		 */
+		if (hdev->le_num_of_adv_sets == 1 && instance == 1)
+			adv->handle = 0x00;
+		else
+			adv->handle = instance;
+
 		list_add(&adv->list, &hdev->adv_instances);
 		hdev->adv_instance_cnt++;
 	}
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 891cae8a30da..16daa79b7981 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1043,11 +1043,10 @@ static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
 	struct hci_cp_ext_adv_set *set;
 	u8 data[sizeof(*cp) + sizeof(*set) * 1];
 	u8 size;
+	struct adv_info *adv = NULL;
 
 	/* If request specifies an instance that doesn't exist, fail */
 	if (instance > 0) {
-		struct adv_info *adv;
-
 		adv = hci_find_adv_instance(hdev, instance);
 		if (!adv)
 			return -EINVAL;
@@ -1066,7 +1065,7 @@ static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
 	cp->num_of_sets = !!instance;
 	cp->enable = 0x00;
 
-	set->handle = instance;
+	set->handle = adv ? adv->handle : instance;
 
 	size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets;
 
@@ -1249,7 +1248,7 @@ static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
 
 	len = eir_create_scan_rsp(hdev, instance, pdu->data);
 
-	pdu->handle = instance;
+	pdu->handle = adv ? adv->handle : instance;
 	pdu->length = len;
 	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
 	pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
@@ -1331,7 +1330,7 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance)
 
 	memset(set, 0, sizeof(*set));
 
-	set->handle = instance;
+	set->handle = adv ? adv->handle : instance;
 
 	/* Set duration per instance since controller is responsible for
 	 * scheduling it.
@@ -1410,10 +1409,10 @@ static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
 	DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length,
 		    HCI_MAX_PER_AD_LENGTH);
 	u8 len;
+	struct adv_info *adv = NULL;
 
 	if (instance) {
-		struct adv_info *adv = hci_find_adv_instance(hdev, instance);
-
+		adv = hci_find_adv_instance(hdev, instance);
 		if (!adv || !adv->periodic)
 			return 0;
 	}
@@ -1421,7 +1420,7 @@ static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
 	len = eir_create_per_adv_data(hdev, instance, pdu->data);
 
 	pdu->length = len;
-	pdu->handle = instance;
+	pdu->handle = adv ? adv->handle : instance;
 	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
 
 	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA,
@@ -1734,7 +1733,7 @@ static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
 	len = eir_create_adv_data(hdev, instance, pdu->data);
 
 	pdu->length = len;
-	pdu->handle = instance;
+	pdu->handle = adv ? adv->handle : instance;
 	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
 	pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
 
-- 
cgit v1.2.3


From 386f0cffae4654f6db1c0d8ad8273511f925f719 Mon Sep 17 00:00:00 2001
From: Richard Gobert <richardbgobert@gmail.com>
Date: Tue, 14 May 2024 19:06:15 +0200
Subject: net: gro: fix napi_gro_cb zeroed alignment

Add 2 byte padding to napi_gro_cb struct to ensure zeroed member is
aligned after flush_id member was removed in the original commit.

Fixes: 4b0ebbca3e16 ("net: gro: move L3 flush checks to tcp_gro_receive and udp_gro_receive_segment")
Suggested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
Link: https://lore.kernel.org/r/fca08735-c245-49e5-af72-82900634f144@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/gro.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/net/gro.h b/include/net/gro.h
index f13634b1f4c1..b9b58c1f8d19 100644
--- a/include/net/gro.h
+++ b/include/net/gro.h
@@ -42,8 +42,7 @@ struct napi_gro_cb {
 	/* Used in ipv6_gro_receive() and foo-over-udp and esp-in-udp */
 	u16	proto;
 
-	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
-	__wsum	csum;
+	u16	pad;
 
 /* Used in napi_gro_cb::free */
 #define NAPI_GRO_FREE             1
@@ -85,6 +84,9 @@ struct napi_gro_cb {
 		u8	is_flist:1;
 	);
 
+	/* used to support CHECKSUM_COMPLETE for tunneling protocols */
+	__wsum	csum;
+
 	/* L3 offsets */
 	union {
 		struct {
-- 
cgit v1.2.3


From 4452e8ef8c364113495f414d7e6846d74d7eff81 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 6 May 2024 21:44:49 +0200
Subject: s390/iucv: Provide iucv_alloc_device() / iucv_release_device()

Provide iucv_alloc_device() and iucv_release_device() helper functions,
which can be used to deduplicate more or less identical IUCV device
allocation and release code in four different drivers.

Suggested-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexandra Winter <wintera@linux.ibm.com>
Link: https://lore.kernel.org/r/20240506194454.1160315-2-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
 include/net/iucv/iucv.h |  6 ++++++
 net/iucv/iucv.c         | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

(limited to 'include')

diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h
index 5cd7871127c9..b3736e66fe1a 100644
--- a/include/net/iucv/iucv.h
+++ b/include/net/iucv/iucv.h
@@ -84,6 +84,12 @@ struct iucv_array {
 extern const struct bus_type iucv_bus;
 extern struct device *iucv_root;
 
+struct device_driver;
+
+struct device *iucv_alloc_device(const struct attribute_group **attrs,
+				 struct device_driver *driver, void *priv,
+				 const char *fmt, ...) __printf(4, 5);
+
 /*
  * struct iucv_path
  * pathid: 16 bit path identification
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index a4ab615ca3e3..9db7c2c0ae72 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -76,6 +76,41 @@ EXPORT_SYMBOL(iucv_bus);
 struct device *iucv_root;
 EXPORT_SYMBOL(iucv_root);
 
+static void iucv_release_device(struct device *device)
+{
+	kfree(device);
+}
+
+struct device *iucv_alloc_device(const struct attribute_group **attrs,
+				 struct device_driver *driver,
+				 void *priv, const char *fmt, ...)
+{
+	struct device *dev;
+	va_list vargs;
+	int rc;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		goto out_error;
+	va_start(vargs, fmt);
+	rc = dev_set_name(dev, fmt, vargs);
+	va_end(vargs);
+	if (rc)
+		goto out_error;
+	dev->bus = &iucv_bus;
+	dev->parent = iucv_root;
+	dev->driver = driver;
+	dev->groups = attrs;
+	dev->release = iucv_release_device;
+	dev_set_drvdata(dev, priv);
+	return dev;
+
+out_error:
+	kfree(dev);
+	return NULL;
+}
+EXPORT_SYMBOL(iucv_alloc_device);
+
 static int iucv_available;
 
 /* General IUCV interrupt structure */
-- 
cgit v1.2.3


From effb83572685eaa70d05a8dd6307ca574a11fcf3 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 6 May 2024 21:44:54 +0200
Subject: s390/iucv: Unexport iucv_root

There is no user of iucv_root outside of the core IUCV code left.
Therefore remove the EXPORT_SYMBOL.

Acked-by: Alexandra Winter <wintera@linux.ibm.com>
Link: https://lore.kernel.org/r/20240506194454.1160315-7-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
---
 include/net/iucv/iucv.h | 1 -
 net/iucv/iucv.c         | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h
index b3736e66fe1a..4d114e6d6d23 100644
--- a/include/net/iucv/iucv.h
+++ b/include/net/iucv/iucv.h
@@ -82,7 +82,6 @@ struct iucv_array {
 } __attribute__ ((aligned (8)));
 
 extern const struct bus_type iucv_bus;
-extern struct device *iucv_root;
 
 struct device_driver;
 
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index 9db7c2c0ae72..2e61f19621ee 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -73,8 +73,7 @@ const struct bus_type iucv_bus = {
 };
 EXPORT_SYMBOL(iucv_bus);
 
-struct device *iucv_root;
-EXPORT_SYMBOL(iucv_root);
+static struct device *iucv_root;
 
 static void iucv_release_device(struct device *device)
 {
-- 
cgit v1.2.3


From 21c38a3bd4ee3fb7337d013a638302fb5e5f9dc2 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <hawk@kernel.org>
Date: Wed, 1 May 2024 16:04:11 +0200
Subject: cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints

This closely resembles helpers added for the global cgroup_rstat_lock in
commit fc29e04ae1ad ("cgroup/rstat: add cgroup_rstat_lock helpers and
tracepoints"). This is for the per CPU lock cgroup_rstat_cpu_lock.

Based on production workloads, we observe the fast-path "update" function
cgroup_rstat_updated() is invoked around 3 million times per sec, while the
"flush" function cgroup_rstat_flush_locked(), walking each possible CPU,
can see periodic spikes of 700 invocations/sec.

For this reason, the tracepoints are split into normal and fastpath
versions for this per-CPU lock. Making it feasible for production to
continuously monitor the non-fastpath tracepoint to detect lock contention
issues. The reason for monitoring is that lock disables IRQs which can
disturb e.g. softirq processing on the local CPUs involved. When the
global cgroup_rstat_lock stops disabling IRQs (e.g converted to a mutex),
this per CPU lock becomes the next bottleneck that can introduce latency
variations.

A practical bpftrace script for monitoring contention latency:

 bpftrace -e '
   tracepoint:cgroup:cgroup_rstat_cpu_lock_contended {
     @start[tid]=nsecs; @cnt[probe]=count()}
   tracepoint:cgroup:cgroup_rstat_cpu_locked {
     if (args->contended) {
       @wait_ns=hist(nsecs-@start[tid]); delete(@start[tid]);}
     @cnt[probe]=count()}
   interval:s:1 {time("%H:%M:%S "); print(@wait_ns); print(@cnt); clear(@cnt);}'

Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/trace/events/cgroup.h | 56 ++++++++++++++++++++++++++++++----
 kernel/cgroup/rstat.c         | 70 +++++++++++++++++++++++++++++++++++--------
 2 files changed, 108 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index 13f375800135..0b95865a90f3 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -206,15 +206,15 @@ DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,
 
 DECLARE_EVENT_CLASS(cgroup_rstat,
 
-	TP_PROTO(struct cgroup *cgrp, int cpu_in_loop, bool contended),
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
 
-	TP_ARGS(cgrp, cpu_in_loop, contended),
+	TP_ARGS(cgrp, cpu, contended),
 
 	TP_STRUCT__entry(
 		__field(	int,		root			)
 		__field(	int,		level			)
 		__field(	u64,		id			)
-		__field(	int,		cpu_in_loop		)
+		__field(	int,		cpu			)
 		__field(	bool,		contended		)
 	),
 
@@ -222,15 +222,16 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
 		__entry->root = cgrp->root->hierarchy_id;
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
-		__entry->cpu_in_loop = cpu_in_loop;
+		__entry->cpu = cpu;
 		__entry->contended = contended;
 	),
 
-	TP_printk("root=%d id=%llu level=%d cpu_in_loop=%d lock contended:%d",
+	TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
 		  __entry->root, __entry->id, __entry->level,
-		  __entry->cpu_in_loop, __entry->contended)
+		  __entry->cpu, __entry->contended)
 );
 
+/* Related to global: cgroup_rstat_lock */
 DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
 
 	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
@@ -252,6 +253,49 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
 	TP_ARGS(cgrp, cpu, contended)
 );
 
+/* Related to per CPU: cgroup_rstat_cpu_lock */
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
+DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
+
+	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
+
+	TP_ARGS(cgrp, cpu, contended)
+);
+
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 52e3b0ed1cee..fb8b49437573 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -19,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
 	return per_cpu_ptr(cgrp->rstat_cpu, cpu);
 }
 
+/*
+ * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
+ *
+ * This makes it easier to diagnose locking issues and contention in
+ * production environments. The parameter @fast_path determine the
+ * tracepoints being added, allowing us to diagnose "flush" related
+ * operations without handling high-frequency fast-path "update" events.
+ */
+static __always_inline
+unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
+				     struct cgroup *cgrp, const bool fast_path)
+{
+	unsigned long flags;
+	bool contended;
+
+	/*
+	 * The _irqsave() is needed because cgroup_rstat_lock is
+	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+	 * this lock with the _irq() suffix only disables interrupts on
+	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+	 * interrupts on both configurations. The _irqsave() ensures
+	 * that interrupts are always disabled and later restored.
+	 */
+	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
+	if (contended) {
+		if (fast_path)
+			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
+		else
+			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
+
+		raw_spin_lock_irqsave(cpu_lock, flags);
+	}
+
+	if (fast_path)
+		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
+	else
+		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
+
+	return flags;
+}
+
+static __always_inline
+void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
+			      struct cgroup *cgrp, unsigned long flags,
+			      const bool fast_path)
+{
+	if (fast_path)
+		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
+	else
+		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
 /**
  * cgroup_rstat_updated - keep track of updated rstat_cpu
  * @cgrp: target cgroup
@@ -44,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 	if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
 		return;
 
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);
 
 	/* put @cgrp and all ancestors on the corresponding updated lists */
 	while (true) {
@@ -72,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
 		cgrp = parent;
 	}
 
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
 }
 
 /**
@@ -153,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	struct cgroup *head = NULL, *parent, *child;
 	unsigned long flags;
 
-	/*
-	 * The _irqsave() is needed because cgroup_rstat_lock is
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
-	 * this lock with the _irq() suffix only disables interrupts on
-	 * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
-	 * interrupts on both configurations. The _irqsave() ensures
-	 * that interrupts are always disabled and later restored.
-	 */
-	raw_spin_lock_irqsave(cpu_lock, flags);
+	flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
@@ -198,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
 	if (child != root)
 		head = cgroup_rstat_push_children(head, child, cpu);
 unlock_ret:
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	_cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
 	return head;
 }
 
-- 
cgit v1.2.3


From 372c669271bff736c5bc275c982d8d1b4f1f147c Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 10 Apr 2024 17:13:01 -0500
Subject: Revert "PCI/MSI: Provide stubs for IMS functions"

This reverts commit 41efa431244f6498833ff8ee8dde28c4924c5479.

IMS (Interrupt Message Store) support appeared in v6.2, but there are no
users yet.

Remove it for now.  We can add it back when a user comes along.  If this is
re-added later, this could be squashed with these commits:

  0194425af0c8 ("PCI/MSI: Provide IMS (Interrupt Message Store) support")
  c9e5bea27383 ("PCI/MSI: Provide pci_ims_alloc/free_irq()")

which added the non-stub implementations.

Link: https://lore.kernel.org/r/20240410221307.2162676-2-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/pci.h | 34 ++++++++--------------------------
 1 file changed, 8 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 16493426a04f..abbef75b2b92 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1658,8 +1658,6 @@ struct msix_entry {
 	u16	entry;	/* Driver uses to specify entry, OS writes */
 };
 
-struct msi_domain_template;
-
 #ifdef CONFIG_PCI_MSI
 int pci_msi_vec_count(struct pci_dev *dev);
 void pci_disable_msi(struct pci_dev *dev);
@@ -1692,11 +1690,6 @@ void pci_msix_free_irq(struct pci_dev *pdev, struct msi_map map);
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
-bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
-			   unsigned int hwsize, void *data);
-struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
-				 const struct irq_affinity_desc *affdesc);
-void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1760,25 +1753,6 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
 {
 	return cpu_possible_mask;
 }
-
-static inline bool pci_create_ims_domain(struct pci_dev *pdev,
-					 const struct msi_domain_template *template,
-					 unsigned int hwsize, void *data)
-{ return false; }
-
-static inline struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev,
-					       union msi_instance_cookie *icookie,
-					       const struct irq_affinity_desc *affdesc)
-{
-	struct msi_map map = { .index = -ENOSYS, };
-
-	return map;
-}
-
-static inline void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map)
-{
-}
-
 #endif
 
 /**
@@ -2690,6 +2664,14 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
 void pci_uevent_ers(struct pci_dev *pdev, enum  pci_ers_result err_type);
 #endif
 
+struct msi_domain_template;
+
+bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
+			   unsigned int hwsize, void *data);
+struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
+				 const struct irq_affinity_desc *affdesc);
+void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
+
 #include <linux/dma-mapping.h>
 
 #define pci_printk(level, pdev, fmt, arg...) \
-- 
cgit v1.2.3


From 1794808fb1b34c30bac9f1e5d41b9df628f54260 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 10 Apr 2024 17:13:02 -0500
Subject: Revert "PCI/MSI: Provide pci_ims_alloc/free_irq()"

This reverts commit c9e5bea273834a63b5e9ba90ad94b305ba50704e.

IMS (Interrupt Message Store) support appeared in v6.2, but there are no
users yet.

Remove it for now.  We can add it back when a user comes along.  If this is
re-added later, the relevant part of 41efa431244f ("PCI/MSI: Provide stubs
for IMS functions") should be squashed into it.

Link: https://lore.kernel.org/r/20240410221307.2162676-3-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi/api.c | 50 --------------------------------------------------
 include/linux/pci.h   |  3 ---
 2 files changed, 53 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/msi/api.c b/drivers/pci/msi/api.c
index be679aa5db64..3c1cd29c5cf2 100644
--- a/drivers/pci/msi/api.c
+++ b/drivers/pci/msi/api.c
@@ -365,56 +365,6 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 }
 EXPORT_SYMBOL(pci_irq_get_affinity);
 
-/**
- * pci_ims_alloc_irq - Allocate an interrupt on a PCI/IMS interrupt domain
- * @dev:	The PCI device to operate on
- * @icookie:	Pointer to an IMS implementation specific cookie for this
- *		IMS instance (PASID, queue ID, pointer...).
- *		The cookie content is copied into the MSI descriptor for the
- *		interrupt chip callbacks or domain specific setup functions.
- * @affdesc:	Optional pointer to an interrupt affinity descriptor
- *
- * There is no index for IMS allocations as IMS is an implementation
- * specific storage and does not have any direct associations between
- * index, which might be a pure software construct, and device
- * functionality. This association is established by the driver either via
- * the index - if there is a hardware table - or in case of purely software
- * managed IMS implementation the association happens via the
- * irq_write_msi_msg() callback of the implementation specific interrupt
- * chip, which utilizes the provided @icookie to store the MSI message in
- * the appropriate place.
- *
- * Return: A struct msi_map
- *
- *	On success msi_map::index contains the allocated index (>= 0) and
- *	msi_map::virq the allocated Linux interrupt number (> 0).
- *
- *	On fail msi_map::index contains the error code and msi_map::virq
- *	is set to 0.
- */
-struct msi_map pci_ims_alloc_irq(struct pci_dev *dev, union msi_instance_cookie *icookie,
-				 const struct irq_affinity_desc *affdesc)
-{
-	return msi_domain_alloc_irq_at(&dev->dev, MSI_SECONDARY_DOMAIN, MSI_ANY_INDEX,
-				       affdesc, icookie);
-}
-EXPORT_SYMBOL_GPL(pci_ims_alloc_irq);
-
-/**
- * pci_ims_free_irq - Allocate an interrupt on a PCI/IMS interrupt domain
- *		      which was allocated via pci_ims_alloc_irq()
- * @dev:	The PCI device to operate on
- * @map:	A struct msi_map describing the interrupt to free as
- *		returned from pci_ims_alloc_irq()
- */
-void pci_ims_free_irq(struct pci_dev *dev, struct msi_map map)
-{
-	if (WARN_ON_ONCE(map.index < 0 || map.virq <= 0))
-		return;
-	msi_domain_free_irqs_range(&dev->dev, MSI_SECONDARY_DOMAIN, map.index, map.index);
-}
-EXPORT_SYMBOL_GPL(pci_ims_free_irq);
-
 /**
  * pci_free_irq_vectors() - Free previously allocated IRQs for a device
  * @dev: the PCI device to operate on
diff --git a/include/linux/pci.h b/include/linux/pci.h
index abbef75b2b92..3deb3e42d990 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2668,9 +2668,6 @@ struct msi_domain_template;
 
 bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
 			   unsigned int hwsize, void *data);
-struct msi_map pci_ims_alloc_irq(struct pci_dev *pdev, union msi_instance_cookie *icookie,
-				 const struct irq_affinity_desc *affdesc);
-void pci_ims_free_irq(struct pci_dev *pdev, struct msi_map map);
 
 #include <linux/dma-mapping.h>
 
-- 
cgit v1.2.3


From b966b1102871ad3842708294f6ec018eede75463 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 10 Apr 2024 17:13:03 -0500
Subject: Revert "PCI/MSI: Provide IMS (Interrupt Message Store) support"

This reverts commit 0194425af0c87acaad457989a2c6d90dba58e776.

IMS (Interrupt Message Store) support appeared in v6.2, but there are no
users yet.

Remove it for now.  We can add it back when a user comes along.  If this is
re-added later, the relevant part of 41efa431244f ("PCI/MSI: Provide stubs
for IMS functions") should be squashed into it.

Link: https://lore.kernel.org/r/20240410221307.2162676-4-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/pci/msi/irqdomain.c | 59 ---------------------------------------------
 include/linux/pci.h         |  5 ----
 2 files changed, 64 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/msi/irqdomain.c b/drivers/pci/msi/irqdomain.c
index cfd84a899c82..03d2dd25790d 100644
--- a/drivers/pci/msi/irqdomain.c
+++ b/drivers/pci/msi/irqdomain.c
@@ -355,65 +355,6 @@ bool pci_msi_domain_supports(struct pci_dev *pdev, unsigned int feature_mask,
 	return (supported & feature_mask) == feature_mask;
 }
 
-/**
- * pci_create_ims_domain - Create a secondary IMS domain for a PCI device
- * @pdev:	The PCI device to operate on
- * @template:	The MSI info template which describes the domain
- * @hwsize:	The size of the hardware entry table or 0 if the domain
- *		is purely software managed
- * @data:	Optional pointer to domain specific data to be stored
- *		in msi_domain_info::data
- *
- * Return: True on success, false otherwise
- *
- * An IMS domain is expected to have the following constraints:
- *	- The index space is managed by the core code
- *
- *	- There is no requirement for consecutive index ranges
- *
- *	- The interrupt chip must provide the following callbacks:
- *		- irq_mask()
- *		- irq_unmask()
- *		- irq_write_msi_msg()
- *
- *	- The interrupt chip must provide the following optional callbacks
- *	  when the irq_mask(), irq_unmask() and irq_write_msi_msg() callbacks
- *	  cannot operate directly on hardware, e.g. in the case that the
- *	  interrupt message store is in queue memory:
- *		- irq_bus_lock()
- *		- irq_bus_unlock()
- *
- *	  These callbacks are invoked from preemptible task context and are
- *	  allowed to sleep. In this case the mandatory callbacks above just
- *	  store the information. The irq_bus_unlock() callback is supposed
- *	  to make the change effective before returning.
- *
- *	- Interrupt affinity setting is handled by the underlying parent
- *	  interrupt domain and communicated to the IMS domain via
- *	  irq_write_msi_msg().
- *
- * The domain is automatically destroyed when the PCI device is removed.
- */
-bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
-			   unsigned int hwsize, void *data)
-{
-	struct irq_domain *domain = dev_get_msi_domain(&pdev->dev);
-
-	if (!domain || !irq_domain_is_msi_parent(domain))
-		return false;
-
-	if (template->info.bus_token != DOMAIN_BUS_PCI_DEVICE_IMS ||
-	    !(template->info.flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS) ||
-	    !(template->info.flags & MSI_FLAG_FREE_MSI_DESCS) ||
-	    !template->chip.irq_mask || !template->chip.irq_unmask ||
-	    !template->chip.irq_write_msi_msg || template->chip.irq_set_affinity)
-		return false;
-
-	return msi_create_device_irq_domain(&pdev->dev, MSI_SECONDARY_DOMAIN, template,
-					    hwsize, data, NULL);
-}
-EXPORT_SYMBOL_GPL(pci_create_ims_domain);
-
 /*
  * Users of the generic MSI infrastructure expect a device to have a single ID,
  * so with DMA aliases we have to pick the least-worst compromise. Devices with
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3deb3e42d990..98fb20bcd054 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2664,11 +2664,6 @@ static inline bool pci_is_thunderbolt_attached(struct pci_dev *pdev)
 void pci_uevent_ers(struct pci_dev *pdev, enum  pci_ers_result err_type);
 #endif
 
-struct msi_domain_template;
-
-bool pci_create_ims_domain(struct pci_dev *pdev, const struct msi_domain_template *template,
-			   unsigned int hwsize, void *data);
-
 #include <linux/dma-mapping.h>
 
 #define pci_printk(level, pdev, fmt, arg...) \
-- 
cgit v1.2.3


From 8a1940bca94bbf060bba4fc4f69c37270d5828f8 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 10 Apr 2024 17:13:07 -0500
Subject: Revert "genirq/msi: Provide constants for PCI/IMS support"

This reverts commit e23d4192bf9b612bce5b24f22719fd3cc6edaa69.

IMS (Interrupt Message Store) support appeared in v6.2, but there are no
users yet.

Remove it for now.  We can add it back when a user comes along.

Link: https://lore.kernel.org/r/20240410221307.2162676-8-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain_defs.h | 1 -
 include/linux/msi.h            | 2 --
 include/linux/msi_api.h        | 1 -
 3 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/irqdomain_defs.h b/include/linux/irqdomain_defs.h
index 5c1fe6f1fcde..36653e2ee1c9 100644
--- a/include/linux/irqdomain_defs.h
+++ b/include/linux/irqdomain_defs.h
@@ -25,7 +25,6 @@ enum irq_domain_bus_token {
 	DOMAIN_BUS_PCI_DEVICE_MSIX,
 	DOMAIN_BUS_DMAR,
 	DOMAIN_BUS_AMDVI,
-	DOMAIN_BUS_PCI_DEVICE_IMS,
 	DOMAIN_BUS_DEVICE_MSI,
 	DOMAIN_BUS_WIRED_TO_MSI,
 };
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 26d07e23052e..84859a9aa091 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -573,8 +573,6 @@ enum {
 	MSI_FLAG_MSIX_CONTIGUOUS	= (1 << 19),
 	/* PCI/MSI-X vectors can be dynamically allocated/freed post MSI-X enable */
 	MSI_FLAG_PCI_MSIX_ALLOC_DYN	= (1 << 20),
-	/* Support for PCI/IMS */
-	MSI_FLAG_PCI_IMS		= (1 << 21),
 };
 
 /**
diff --git a/include/linux/msi_api.h b/include/linux/msi_api.h
index 391087ad99b1..5ae72d1912c4 100644
--- a/include/linux/msi_api.h
+++ b/include/linux/msi_api.h
@@ -15,7 +15,6 @@ struct device;
  */
 enum msi_domain_ids {
 	MSI_DEFAULT_DOMAIN,
-	MSI_SECONDARY_DOMAIN,
 	MSI_MAX_DEVICE_IRQDOMAINS,
 };
 
-- 
cgit v1.2.3


From 1a7d0890dd4a502a202aaec792a6c04e6e049547 Mon Sep 17 00:00:00 2001
From: Stephen Brennan <stephen.s.brennan@oracle.com>
Date: Wed, 1 May 2024 09:29:56 -0700
Subject: kprobe/ftrace: bail out if ftrace was killed

If an error happens in ftrace, ftrace_kill() will prevent disarming
kprobes. Eventually, the ftrace_ops associated with the kprobes will be
freed, yet the kprobes will still be active, and when triggered, they
will use the freed memory, likely resulting in a page fault and panic.

This behavior can be reproduced quite easily, by creating a kprobe and
then triggering a ftrace_kill(). For simplicity, we can simulate an
ftrace error with a kernel module like [1]:

[1]: https://github.com/brenns10/kernel_stuff/tree/master/ftrace_killer

  sudo perf probe --add commit_creds
  sudo perf trace -e probe:commit_creds
  # In another terminal
  make
  sudo insmod ftrace_killer.ko  # calls ftrace_kill(), simulating bug
  # Back to perf terminal
  # ctrl-c
  sudo perf probe --del commit_creds

After a short period, a page fault and panic would occur as the kprobe
continues to execute and uses the freed ftrace_ops. While ftrace_kill()
is supposed to be used only in extreme circumstances, it is invoked in
FTRACE_WARN_ON() and so there are many places where an unexpected bug
could be triggered, yet the system may continue operating, possibly
without the administrator noticing. If ftrace_kill() does not panic the
system, then we should do everything we can to continue operating,
rather than leave a ticking time bomb.

Link: https://lore.kernel.org/all/20240501162956.229427-1-stephen.s.brennan@oracle.com/

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Acked-by: Guo Ren <guoren@kernel.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
---
 arch/csky/kernel/probes/ftrace.c     | 3 +++
 arch/loongarch/kernel/ftrace_dyn.c   | 3 +++
 arch/parisc/kernel/ftrace.c          | 3 +++
 arch/powerpc/kernel/kprobes-ftrace.c | 3 +++
 arch/riscv/kernel/probes/ftrace.c    | 3 +++
 arch/s390/kernel/ftrace.c            | 3 +++
 arch/x86/kernel/kprobes/ftrace.c     | 3 +++
 include/linux/kprobes.h              | 7 +++++++
 kernel/kprobes.c                     | 6 ++++++
 kernel/trace/ftrace.c                | 1 +
 10 files changed, 35 insertions(+)

(limited to 'include')

diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c
index 834cffcfbce3..7ba4b98076de 100644
--- a/arch/csky/kernel/probes/ftrace.c
+++ b/arch/csky/kernel/probes/ftrace.c
@@ -12,6 +12,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe_ctlblk *kcb;
 	struct pt_regs *regs;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c
index 73858c9029cc..bff058317062 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -287,6 +287,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index 621a4b386ae4..c91f9c2e61ed 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -206,6 +206,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe *p;
 	int bit;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 072ebe7f290b..f8208c027148 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 	struct pt_regs *regs;
 	int bit;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(nip, parent_nip);
 	if (bit < 0)
 		return;
diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c
index 7142ec42e889..a69dfa610aa8 100644
--- a/arch/riscv/kernel/probes/ftrace.c
+++ b/arch/riscv/kernel/probes/ftrace.c
@@ -11,6 +11,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe_ctlblk *kcb;
 	int bit;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index c46381ea04ec..7f6f8c438c26 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -296,6 +296,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe *p;
 	int bit;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index dd2ec14adb77..15af7e98e161 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -21,6 +21,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 	struct kprobe_ctlblk *kcb;
 	int bit;
 
+	if (unlikely(kprobe_ftrace_disabled))
+		return;
+
 	bit = ftrace_test_recursion_trylock(ip, parent_ip);
 	if (bit < 0)
 		return;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0ff44d6633e3..5fcbc254d186 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -378,11 +378,15 @@ static inline void wait_for_kprobe_optimizer(void) { }
 extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
 				  struct ftrace_ops *ops, struct ftrace_regs *fregs);
 extern int arch_prepare_kprobe_ftrace(struct kprobe *p);
+/* Set when ftrace has been killed: kprobes on ftrace must be disabled for safety */
+extern bool kprobe_ftrace_disabled __read_mostly;
+extern void kprobe_ftrace_kill(void);
 #else
 static inline int arch_prepare_kprobe_ftrace(struct kprobe *p)
 {
 	return -EINVAL;
 }
+static inline void kprobe_ftrace_kill(void) {}
 #endif /* CONFIG_KPROBES_ON_FTRACE */
 
 /* Get the kprobe at this addr (if any) - called with preemption disabled */
@@ -495,6 +499,9 @@ static inline void kprobe_flush_task(struct task_struct *tk)
 static inline void kprobe_free_init_mem(void)
 {
 }
+static inline void kprobe_ftrace_kill(void)
+{
+}
 static inline int disable_kprobe(struct kprobe *kp)
 {
 	return -EOPNOTSUPP;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 65adc815fc6e..166ebf81dc45 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1068,6 +1068,7 @@ static struct ftrace_ops kprobe_ipmodify_ops __read_mostly = {
 
 static int kprobe_ipmodify_enabled;
 static int kprobe_ftrace_enabled;
+bool kprobe_ftrace_disabled;
 
 static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
 			       int *cnt)
@@ -1136,6 +1137,11 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
 		ipmodify ? &kprobe_ipmodify_ops : &kprobe_ftrace_ops,
 		ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
 }
+
+void kprobe_ftrace_kill()
+{
+	kprobe_ftrace_disabled = true;
+}
 #else	/* !CONFIG_KPROBES_ON_FTRACE */
 static inline int arm_kprobe_ftrace(struct kprobe *p)
 {
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da1710499698..96db99c347b3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7895,6 +7895,7 @@ void ftrace_kill(void)
 	ftrace_disabled = 1;
 	ftrace_enabled = 0;
 	ftrace_trace_function = ftrace_stub;
+	kprobe_ftrace_kill();
 }
 
 /**
-- 
cgit v1.2.3


From 0e1fdd222f0ac47b2b806fecfe05c1a6f797930f Mon Sep 17 00:00:00 2001
From: Damien Le Moal <dlemoal@kernel.org>
Date: Mon, 25 Mar 2024 16:09:39 +0900
Subject: PCI: Remove PCI_IRQ_LEGACY

Replace the last references to PCI_IRQ_LEGACY with PCI_IRQ_INTX in pci.h
header file. With this change, PCI_IRQ_LEGACY is unused and we can remove
its definition.

Link: https://lore.kernel.org/r/20240325070944.3600338-29-dlemoal@kernel.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci.h | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 16493426a04f..b19992a5dfaf 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1077,8 +1077,6 @@ enum {
 #define PCI_IRQ_MSIX		(1 << 2) /* Allow MSI-X interrupts */
 #define PCI_IRQ_AFFINITY	(1 << 3) /* Auto-assign affinity */
 
-#define PCI_IRQ_LEGACY		PCI_IRQ_INTX /* Deprecated! Use PCI_IRQ_INTX */
-
 /* These external functions are only available when PCI support is enabled */
 #ifdef CONFIG_PCI
 
@@ -1648,8 +1646,7 @@ int pci_set_vga_state(struct pci_dev *pdev, bool decode,
  */
 #define PCI_IRQ_VIRTUAL		(1 << 4)
 
-#define PCI_IRQ_ALL_TYPES \
-	(PCI_IRQ_LEGACY | PCI_IRQ_MSI | PCI_IRQ_MSIX)
+#define PCI_IRQ_ALL_TYPES	(PCI_IRQ_INTX | PCI_IRQ_MSI | PCI_IRQ_MSIX)
 
 #include <linux/dmapool.h>
 
@@ -1719,7 +1716,7 @@ pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 			       unsigned int max_vecs, unsigned int flags,
 			       struct irq_affinity *aff_desc)
 {
-	if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq)
+	if ((flags & PCI_IRQ_INTX) && min_vecs == 1 && dev->irq)
 		return 1;
 	return -ENOSPC;
 }
-- 
cgit v1.2.3


From d41abe063f9d206ded67dda9d205de7a669edd6f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Wed, 27 Mar 2024 13:02:34 -0500
Subject: PCI: Update pci_find_capability() stub return types

f646c2a0a668 ("PCI: Return u8 from pci_find_capability() and similar") and
ee8b1c478a9f ("PCI: Return u16 from pci_find_ext_capability() and similar")
updated the return type of the extern declarations, but neglected to update
the type of the stubs used when CONFIG_PCI is not enabled.

Update them to match the extern declarations.

Link: https://lore.kernel.org/r/20240327180234.1529164-1-helgaas@kernel.org
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@linux.intel.com>
---
 include/linux/pci.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index b19992a5dfaf..6a09bd9636d5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2011,10 +2011,9 @@ static inline int pci_register_driver(struct pci_driver *drv)
 static inline void pci_unregister_driver(struct pci_driver *drv) { }
 static inline u8 pci_find_capability(struct pci_dev *dev, int cap)
 { return 0; }
-static inline int pci_find_next_capability(struct pci_dev *dev, u8 post,
-					   int cap)
+static inline u8 pci_find_next_capability(struct pci_dev *dev, u8 post, int cap)
 { return 0; }
-static inline int pci_find_ext_capability(struct pci_dev *dev, int cap)
+static inline u16 pci_find_ext_capability(struct pci_dev *dev, int cap)
 { return 0; }
 
 static inline u64 pci_get_dsn(struct pci_dev *dev)
-- 
cgit v1.2.3


From 844177a80753fc173131f3e591124c8dcbc89812 Mon Sep 17 00:00:00 2001
From: Heiner Kallweit <hkallweit1@gmail.com>
Date: Sat, 23 Mar 2024 18:16:36 +0100
Subject: PCI: Remove unused pci_enable_device_io()

After the last user was removed, remove this PCI core function.  It's very
unlikely that we'll see a new device requiring io space access, even though
memory space access is supported.

Link: https://lore.kernel.org/r/213ebf62-53a3-42b7-8518-ecd5cd6d6b08@gmail.com
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
---
 drivers/pci/pci.c   | 14 --------------
 include/linux/pci.h |  1 -
 2 files changed, 15 deletions(-)

(limited to 'include')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 5a25facb3ce7..d3fabfca7e6f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -2110,20 +2110,6 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
 	return err;
 }
 
-/**
- * pci_enable_device_io - Initialize a device for use with IO space
- * @dev: PCI device to be initialized
- *
- * Initialize device before it's used by a driver. Ask low-level code
- * to enable I/O resources. Wake up the device if it was suspended.
- * Beware, this function can fail.
- */
-int pci_enable_device_io(struct pci_dev *dev)
-{
-	return pci_enable_device_flags(dev, IORESOURCE_IO);
-}
-EXPORT_SYMBOL(pci_enable_device_io);
-
 /**
  * pci_enable_device_mem - Initialize a device for use with Memory space
  * @dev: PCI device to be initialized
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 6a09bd9636d5..69b10f2fb606 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1313,7 +1313,6 @@ int pci_user_write_config_word(struct pci_dev *dev, int where, u16 val);
 int pci_user_write_config_dword(struct pci_dev *dev, int where, u32 val);
 
 int __must_check pci_enable_device(struct pci_dev *dev);
-int __must_check pci_enable_device_io(struct pci_dev *dev);
 int __must_check pci_enable_device_mem(struct pci_dev *dev);
 int __must_check pci_reenable_device(struct pci_dev *);
 int __must_check pcim_enable_device(struct pci_dev *pdev);
-- 
cgit v1.2.3


From 6613443ffc49d03e27f0404978f685c4eac43fba Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Date: Thu, 8 Feb 2024 15:23:21 +0200
Subject: PCI: Do not wait for disconnected devices when resuming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On runtime resume, pci_dev_wait() is called:

  pci_pm_runtime_resume()
    pci_pm_bridge_power_up_actions()
      pci_bridge_wait_for_secondary_bus()
        pci_dev_wait()

While a device is runtime suspended along with its PCI hierarchy, the
device could get disconnected. In such case, the link will not come up no
matter how long pci_dev_wait() waits for it.

Besides the above mentioned case, there could be other ways to get the
device disconnected while pci_dev_wait() is waiting for the link to come
up.

Make pci_dev_wait() exit if the device is already disconnected to avoid
unnecessary delay.

The use cases of pci_dev_wait() boil down to two:

  1. Waiting for the device after reset
  2. pci_bridge_wait_for_secondary_bus()

The callers in both cases seem to benefit from propagating the
disconnection as error even if device disconnection would be more
analoguous to the case where there is no device in the first place which
return 0 from pci_dev_wait(). In the case 2, it results in unnecessary
marking of the devices disconnected again but that is just harmless extra
work.

Also make sure compiler does not become too clever with dev->error_state
and use READ_ONCE() to force a fetch for the up-to-date value.

Link: https://lore.kernel.org/r/20240208132322.4811-1-ilpo.jarvinen@linux.intel.com
Reported-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   | 5 +++++
 include/linux/pci.h | 7 ++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index d3fabfca7e6f..a9848316c623 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1277,6 +1277,11 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
 	for (;;) {
 		u32 id;
 
+		if (pci_dev_is_disconnected(dev)) {
+			pci_dbg(dev, "disconnected; not waiting\n");
+			return -ENOTTY;
+		}
+
 		pci_read_config_dword(dev, PCI_COMMAND, &id);
 		if (!PCI_POSSIBLE_ERROR(id))
 			break;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 69b10f2fb606..2151c7f9be86 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2514,7 +2514,12 @@ static inline struct pci_dev *pcie_find_root_port(struct pci_dev *dev)
 
 static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
 {
-	return dev->error_state == pci_channel_io_perm_failure;
+	/*
+	 * error_state is set in pci_dev_set_io_state() using xchg/cmpxchg()
+	 * and read w/o common lock. READ_ONCE() ensures compiler cannot cache
+	 * the value (e.g. inside the loop in pci_dev_wait()).
+	 */
+	return READ_ONCE(dev->error_state) == pci_channel_io_perm_failure;
 }
 
 void pci_request_acs(void);
-- 
cgit v1.2.3


From 2a56c462fe5a2ee61d38e2d7b772bee56115a00c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 9 Apr 2024 15:53:58 +0530
Subject: OPP: Fix required_opp_tables for multiple genpds using same table

The required_opp_tables parsing is not perfect, as the OPP core does the
parsing solely based on the DT node pointers.

The core sets the required_opp_tables entry to the first OPP table in
the "opp_tables" list, that matches with the node pointer.

If the target DT OPP table is used by multiple devices and they all
create separate instances of 'struct opp_table' from it, then it is
possible that the required_opp_tables entry may be set to the incorrect
sibling device.

Unfortunately, there is no clear way to initialize the right values
during the initial parsing and we need to do this at a later point of
time.

Cross check the OPP table again while the genpds are attached and fix
them if required.

Also add a new API for the genpd core to fetch the device pointer for
the genpd.

Cc: Thorsten Leemhuis <regressions@leemhuis.info>
Reported-by: Vladimir Lypak <vladimir.lypak@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218682
Co-developed-by: Vladimir Lypak <vladimir.lypak@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
---
 drivers/opp/core.c        | 31 ++++++++++++++++++++++++++++++-
 drivers/pmdomain/core.c   | 10 ++++++++++
 include/linux/pm_domain.h |  6 ++++++
 3 files changed, 46 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e233734b7220..cb4611fe1b5b 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2394,7 +2394,8 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
 static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
 			const char * const *names, struct device ***virt_devs)
 {
-	struct device *virt_dev;
+	struct device *virt_dev, *gdev;
+	struct opp_table *genpd_table;
 	int index = 0, ret = -EINVAL;
 	const char * const *name = names;
 
@@ -2427,6 +2428,34 @@ static int _opp_attach_genpd(struct opp_table *opp_table, struct device *dev,
 			goto err;
 		}
 
+		/*
+		 * The required_opp_tables parsing is not perfect, as the OPP
+		 * core does the parsing solely based on the DT node pointers.
+		 * The core sets the required_opp_tables entry to the first OPP
+		 * table in the "opp_tables" list, that matches with the node
+		 * pointer.
+		 *
+		 * If the target DT OPP table is used by multiple devices and
+		 * they all create separate instances of 'struct opp_table' from
+		 * it, then it is possible that the required_opp_tables entry
+		 * may be set to the incorrect sibling device.
+		 *
+		 * Cross check it again and fix if required.
+		 */
+		gdev = dev_to_genpd_dev(virt_dev);
+		if (IS_ERR(gdev))
+			return PTR_ERR(gdev);
+
+		genpd_table = _find_opp_table(gdev);
+		if (!IS_ERR(genpd_table)) {
+			if (genpd_table != opp_table->required_opp_tables[index]) {
+				dev_pm_opp_put_opp_table(opp_table->required_opp_tables[index]);
+				opp_table->required_opp_tables[index] = genpd_table;
+			} else {
+				dev_pm_opp_put_opp_table(genpd_table);
+			}
+		}
+
 		/*
 		 * Add the virtual genpd device as a user of the OPP table, so
 		 * we can call dev_pm_opp_set_opp() on it directly.
diff --git a/drivers/pmdomain/core.c b/drivers/pmdomain/core.c
index 4215ffd9b11c..c40eda92a85a 100644
--- a/drivers/pmdomain/core.c
+++ b/drivers/pmdomain/core.c
@@ -184,6 +184,16 @@ static struct generic_pm_domain *dev_to_genpd(struct device *dev)
 	return pd_to_genpd(dev->pm_domain);
 }
 
+struct device *dev_to_genpd_dev(struct device *dev)
+{
+	struct generic_pm_domain *genpd = dev_to_genpd(dev);
+
+	if (IS_ERR(genpd))
+		return ERR_CAST(genpd);
+
+	return &genpd->dev;
+}
+
 static int genpd_stop_dev(const struct generic_pm_domain *genpd,
 			  struct device *dev)
 {
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 772d3280d35f..f24546a3d3db 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -260,6 +260,7 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 int pm_genpd_init(struct generic_pm_domain *genpd,
 		  struct dev_power_governor *gov, bool is_off);
 int pm_genpd_remove(struct generic_pm_domain *genpd);
+struct device *dev_to_genpd_dev(struct device *dev);
 int dev_pm_genpd_set_performance_state(struct device *dev, unsigned int state);
 int dev_pm_genpd_add_notifier(struct device *dev, struct notifier_block *nb);
 int dev_pm_genpd_remove_notifier(struct device *dev);
@@ -307,6 +308,11 @@ static inline int pm_genpd_remove(struct generic_pm_domain *genpd)
 	return -EOPNOTSUPP;
 }
 
+static inline struct device *dev_to_genpd_dev(struct device *dev)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline int dev_pm_genpd_set_performance_state(struct device *dev,
 						     unsigned int state)
 {
-- 
cgit v1.2.3


From c6ab5c915da460c0397960af3c308386c3f3247b Mon Sep 17 00:00:00 2001
From: Stefan Berger <stefanb@linux.ibm.com>
Date: Thu, 9 May 2024 21:59:21 -0400
Subject: crypto: ecc - Prevent ecc_digits_from_bytes from reading too many
 bytes

Prevent ecc_digits_from_bytes from reading too many bytes from the input
byte array in case an insufficient number of bytes is provided to fill the
output digit array of ndigits. Therefore, initialize the most significant
digits with 0 to avoid trying to read too many bytes later on. Convert the
function into a regular function since it is getting too big for an inline
function.

If too many bytes are provided on the input byte array the extra bytes
are ignored since the input variable 'ndigits' limits the number of digits
that will be filled.

Fixes: d67c96fb97b5 ("crypto: ecdsa - Convert byte arrays with key coordinates to digits")
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Stefan Berger <stefanb@linux.ibm.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/ecc.c                  | 22 ++++++++++++++++++++++
 include/crypto/internal/ecc.h | 15 ++-------------
 2 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/crypto/ecc.c b/crypto/ecc.c
index c1d2e884be1e..fe761256e335 100644
--- a/crypto/ecc.c
+++ b/crypto/ecc.c
@@ -68,6 +68,28 @@ const struct ecc_curve *ecc_get_curve(unsigned int curve_id)
 }
 EXPORT_SYMBOL(ecc_get_curve);
 
+void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes,
+			   u64 *out, unsigned int ndigits)
+{
+	int diff = ndigits - DIV_ROUND_UP(nbytes, sizeof(u64));
+	unsigned int o = nbytes & 7;
+	__be64 msd = 0;
+
+	/* diff > 0: not enough input bytes: set most significant digits to 0 */
+	if (diff > 0) {
+		ndigits -= diff;
+		memset(&out[ndigits - 1], 0, diff * sizeof(u64));
+	}
+
+	if (o) {
+		memcpy((u8 *)&msd + sizeof(msd) - o, in, o);
+		out[--ndigits] = be64_to_cpu(msd);
+		in += o;
+	}
+	ecc_swap_digits(in, out, ndigits);
+}
+EXPORT_SYMBOL(ecc_digits_from_bytes);
+
 static u64 *ecc_alloc_digits_space(unsigned int ndigits)
 {
 	size_t len = ndigits * sizeof(u64);
diff --git a/include/crypto/internal/ecc.h b/include/crypto/internal/ecc.h
index 7ca1f463d1ec..f7e75e1e71f3 100644
--- a/include/crypto/internal/ecc.h
+++ b/include/crypto/internal/ecc.h
@@ -64,19 +64,8 @@ static inline void ecc_swap_digits(const void *in, u64 *out, unsigned int ndigit
  * @out       Output digits array
  * @ndigits:  Number of digits to create from byte array
  */
-static inline void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes,
-					 u64 *out, unsigned int ndigits)
-{
-	unsigned int o = nbytes & 7;
-	__be64 msd = 0;
-
-	if (o) {
-		memcpy((u8 *)&msd + sizeof(msd) - o, in, o);
-		out[--ndigits] = be64_to_cpu(msd);
-		in += o;
-	}
-	ecc_swap_digits(in, out, ndigits);
-}
+void ecc_digits_from_bytes(const u8 *in, unsigned int nbytes,
+			   u64 *out, unsigned int ndigits);
 
 /**
  * ecc_is_key_valid() - Validate a given ECDH private key
-- 
cgit v1.2.3


From 6cbd1d6d36c5d8312de99d1dfa3bec40ac840ce0 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel.holland@sifive.com>
Date: Fri, 29 Mar 2024 00:18:16 -0700
Subject: arch: add ARCH_HAS_KERNEL_FPU_SUPPORT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Several architectures provide an API to enable the FPU and run
floating-point SIMD code in kernel space.  However, the function names,
header locations, and semantics are inconsistent across architectures, and
FPU support may be gated behind other Kconfig options.

provide a standard way for architectures to declare that kernel space
FPU support is available. Architectures selecting this option must
implement what is currently the most common API (kernel_fpu_begin() and
kernel_fpu_end(), plus a new function kernel_fpu_available()) and
provide the appropriate CFLAGS for compiling floating-point C code.

Link: https://lkml.kernel.org/r/20240329072441.591471-2-samuel.holland@sifive.com
Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Suggested-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nathan Chancellor <nathan@kernel.org>
Cc: Nicolas Schier <nicolas@fjasle.eu>
Cc: Palmer Dabbelt <palmer@rivosinc.com>
Cc: Russell King <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: WANG Xuerui <git@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/core-api/floating-point.rst | 78 +++++++++++++++++++++++++++++++
 Documentation/core-api/index.rst          |  1 +
 Makefile                                  |  5 ++
 arch/Kconfig                              |  6 +++
 include/linux/fpu.h                       | 12 +++++
 5 files changed, 102 insertions(+)
 create mode 100644 Documentation/core-api/floating-point.rst
 create mode 100644 include/linux/fpu.h

(limited to 'include')

diff --git a/Documentation/core-api/floating-point.rst b/Documentation/core-api/floating-point.rst
new file mode 100644
index 000000000000..a8d0d4b05052
--- /dev/null
+++ b/Documentation/core-api/floating-point.rst
@@ -0,0 +1,78 @@
+.. SPDX-License-Identifier: GPL-2.0+
+
+Floating-point API
+==================
+
+Kernel code is normally prohibited from using floating-point (FP) registers or
+instructions, including the C float and double data types. This rule reduces
+system call overhead, because the kernel does not need to save and restore the
+userspace floating-point register state.
+
+However, occasionally drivers or library functions may need to include FP code.
+This is supported by isolating the functions containing FP code to a separate
+translation unit (a separate source file), and saving/restoring the FP register
+state around calls to those functions. This creates "critical sections" of
+floating-point usage.
+
+The reason for this isolation is to prevent the compiler from generating code
+touching the FP registers outside these critical sections. Compilers sometimes
+use FP registers to optimize inlined ``memcpy`` or variable assignment, as
+floating-point registers may be wider than general-purpose registers.
+
+Usability of floating-point code within the kernel is architecture-specific.
+Additionally, because a single kernel may be configured to support platforms
+both with and without a floating-point unit, FPU availability must be checked
+both at build time and at run time.
+
+Several architectures implement the generic kernel floating-point API from
+``linux/fpu.h``, as described below. Some other architectures implement their
+own unique APIs, which are documented separately.
+
+Build-time API
+--------------
+
+Floating-point code may be built if the option ``ARCH_HAS_KERNEL_FPU_SUPPORT``
+is enabled. For C code, such code must be placed in a separate file, and that
+file must have its compilation flags adjusted using the following pattern::
+
+    CFLAGS_foo.o += $(CC_FLAGS_FPU)
+    CFLAGS_REMOVE_foo.o += $(CC_FLAGS_NO_FPU)
+
+Architectures are expected to define one or both of these variables in their
+top-level Makefile as needed. For example::
+
+    CC_FLAGS_FPU := -mhard-float
+
+or::
+
+    CC_FLAGS_NO_FPU := -msoft-float
+
+Normal kernel code is assumed to use the equivalent of ``CC_FLAGS_NO_FPU``.
+
+Runtime API
+-----------
+
+The runtime API is provided in ``linux/fpu.h``. This header cannot be included
+from files implementing FP code (those with their compilation flags adjusted as
+above). Instead, it must be included when defining the FP critical sections.
+
+.. c:function:: bool kernel_fpu_available( void )
+
+        This function reports if floating-point code can be used on this CPU or
+        platform. The value returned by this function is not expected to change
+        at runtime, so it only needs to be called once, not before every
+        critical section.
+
+.. c:function:: void kernel_fpu_begin( void )
+                void kernel_fpu_end( void )
+
+        These functions create a floating-point critical section. It is only
+        valid to call ``kernel_fpu_begin()`` after a previous call to
+        ``kernel_fpu_available()`` returned ``true``. These functions are only
+        guaranteed to be callable from (preemptible or non-preemptible) process
+        context.
+
+        Preemption may be disabled inside critical sections, so their size
+        should be minimized. They are *not* required to be reentrant. If the
+        caller expects to nest critical sections, it must implement its own
+        reference counting.
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 7a3a08d81f11..974beccd671f 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -48,6 +48,7 @@ Library functionality that is used throughout the kernel.
    errseq
    wrappers/atomic_t
    wrappers/atomic_bitops
+   floating-point
 
 Low level entry and exit
 ========================
diff --git a/Makefile b/Makefile
index 77b283d16e73..452913ce42c8 100644
--- a/Makefile
+++ b/Makefile
@@ -970,6 +970,11 @@ KBUILD_CFLAGS	+= $(CC_FLAGS_CFI)
 export CC_FLAGS_CFI
 endif
 
+# Architectures can define flags to add/remove for floating-point support
+CC_FLAGS_FPU	+= -D_LINUX_FPU_COMPILATION_UNIT
+export CC_FLAGS_FPU
+export CC_FLAGS_NO_FPU
+
 ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0)
 # Set the minimal function alignment. Use the newer GCC option
 # -fmin-function-alignment if it is available, or fall back to -falign-funtions.
diff --git a/arch/Kconfig b/arch/Kconfig
index b34946d90e4b..975dd22a2dbd 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1594,6 +1594,12 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG
 	  address translations. Page table walkers that clear the accessed bit
 	  may use this capability to reduce their search space.
 
+config ARCH_HAS_KERNEL_FPU_SUPPORT
+	bool
+	help
+	  Architectures that select this option can run floating-point code in
+	  the kernel, as described in Documentation/core-api/floating-point.rst.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/include/linux/fpu.h b/include/linux/fpu.h
new file mode 100644
index 000000000000..2fb63e22913b
--- /dev/null
+++ b/include/linux/fpu.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_FPU_H
+#define _LINUX_FPU_H
+
+#ifdef _LINUX_FPU_COMPILATION_UNIT
+#error FP code must be compiled separately. See Documentation/core-api/floating-point.rst.
+#endif
+
+#include <asm/fpu.h>
+
+#endif
-- 
cgit v1.2.3


From 1c00f9368628dde7337defd3699025e3611a816f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 30 Apr 2024 15:28:23 +1000
Subject: mm: lift gfp_kmemleak_mask() to gfp.h

Patch series "mm: fix nested allocation context filtering".

This patchset is the followup to the comment I made earlier today:

https://lore.kernel.org/linux-xfs/ZjAyIWUzDipofHFJ@dread.disaster.area/

Tl;dr: Memory allocations that are done inside the public memory
allocation API need to obey the reclaim recursion constraints placed on
the allocation by the original caller, including the "don't track
recursion for this allocation" case defined by __GFP_NOLOCKDEP.

These nested allocations are generally in debug code that is tracking
something about the allocation (kmemleak, KASAN, etc) and so are
allocating private kernel objects that only that debug system will use.

Neither the page-owner code nor the stack depot code get this right.  They
also also clear GFP_ZONEMASK as a separate operation, which is completely
redundant because the constraint filter applied immediately after
guarantees that GFP_ZONEMASK bits are cleared.

kmemleak gets this filtering right.  It preserves the allocation
constraints for deadlock prevention and clears all other context flags
whilst also ensuring that the nested allocation will fail quickly,
silently and without depleting emergency kernel reserves if there is no
memory available.

This can be made much more robust, immune to whack-a-mole games and the
code greatly simplified by lifting gfp_kmemleak_mask() to
include/linux/gfp.h and using that everywhere.  Also document it so that
there is no excuse for not knowing about it when writing new debug code
that nests allocations.

Tested with lockdep, KASAN + page_owner=on and kmemleak=on over multiple
fstests runs with XFS.


This patch (of 3):

Any "internal" nested allocation done from within an allocation context
needs to obey the high level allocation gfp_mask constraints.  This is
necessary for debug code like KASAN, kmemleak, lockdep, etc that allocate
memory for saving stack traces and other information during memory
allocation.  If they don't obey things like __GFP_NOLOCKDEP or
__GFP_NOWARN, they produce false positive failure detections.

kmemleak gets this right by using gfp_kmemleak_mask() to pass through the
relevant context flags to the nested allocation to ensure that the
allocation follows the constraints of the caller context.

KASAN recently was foudn to be missing __GFP_NOLOCKDEP due to stack depot
allocations, and even more recently the page owner tracking code was also
found to be missing __GFP_NOLOCKDEP support.

We also don't wan't want KASAN or lockdep to drive the system into OOM
kill territory by exhausting emergency reserves.  This is something that
kmemleak also gets right by adding (__GFP_NORETRY | __GFP_NOMEMALLOC |
__GFP_NOWARN) to the allocation mask.

Hence it is clear that we need to define a common nested allocation filter
mask for these sorts of third party nested allocations used in debug code.
So to start this process, lift gfp_kmemleak_mask() to gfp.h and rename it
to gfp_nested_mask(), and convert the kmemleak callers to use it.

Link: https://lkml.kernel.org/r/20240430054604.4169568-1-david@fromorbit.com
Link: https://lkml.kernel.org/r/20240430054604.4169568-2-david@fromorbit.com
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Marco Elver <elver@google.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/gfp.h | 25 +++++++++++++++++++++++++
 mm/kmemleak.c       | 12 ++++--------
 2 files changed, 29 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 450c2cbcf04b..7f9691d375f0 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -156,6 +156,31 @@ static inline int gfp_zonelist(gfp_t flags)
 	return ZONELIST_FALLBACK;
 }
 
+/*
+ * gfp flag masking for nested internal allocations.
+ *
+ * For code that needs to do allocations inside the public allocation API (e.g.
+ * memory allocation tracking code) the allocations need to obey the caller
+ * allocation context constrains to prevent allocation context mismatches (e.g.
+ * GFP_KERNEL allocations in GFP_NOFS contexts) from potential deadlock
+ * situations.
+ *
+ * It is also assumed that these nested allocations are for internal kernel
+ * object storage purposes only and are not going to be used for DMA, etc. Hence
+ * we strip out all the zone information and leave just the context information
+ * intact.
+ *
+ * Further, internal allocations must fail before the higher level allocation
+ * can fail, so we must make them fail faster and fail silently. We also don't
+ * want them to deplete emergency reserves.  Hence nested allocations must be
+ * prepared for these allocations to fail.
+ */
+static inline gfp_t gfp_nested_mask(gfp_t flags)
+{
+	return ((flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP)) |
+		(__GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN));
+}
+
 /*
  * We get the zone list from the current node and the gfp_mask.
  * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fdcf01f62202..d5b6fba44fc9 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -114,12 +114,6 @@
 
 #define BYTES_PER_POINTER	sizeof(void *)
 
-/* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp)	(((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
-					   __GFP_NOLOCKDEP)) | \
-				 __GFP_NORETRY | __GFP_NOMEMALLOC | \
-				 __GFP_NOWARN)
-
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
 	struct hlist_node node;
@@ -463,7 +457,8 @@ static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
 
 	/* try the slab allocator first */
 	if (object_cache) {
-		object = kmem_cache_alloc_noprof(object_cache, gfp_kmemleak_mask(gfp));
+		object = kmem_cache_alloc_noprof(object_cache,
+						 gfp_nested_mask(gfp));
 		if (object)
 			return object;
 	}
@@ -947,7 +942,8 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
 	untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
 
 	if (scan_area_cache)
-		area = kmem_cache_alloc_noprof(scan_area_cache, gfp_kmemleak_mask(gfp));
+		area = kmem_cache_alloc_noprof(scan_area_cache,
+					       gfp_nested_mask(gfp));
 
 	raw_spin_lock_irqsave(&object->lock, flags);
 	if (!area) {
-- 
cgit v1.2.3


From 9f2c2d6ba13da08643c65b948ce5e3d616864c47 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 7 May 2024 23:01:31 +0300
Subject: bitops: Move aligned_byte_mask() to wordpart.h

The bitops.h is for bit related operations. The aligned_byte_mask()
is about byte (or part of the machine word) operations, for which
we have a separate header, move the mentioned macro to wordpart.h
to consolidate similar operations.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Yury Norov <yury.norov@gmail.com>
---
 include/linux/bitops.h   | 7 -------
 include/linux/wordpart.h | 7 +++++++
 lib/usercopy.c           | 1 +
 3 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 57ecef354f47..3313d2c04e6d 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -8,13 +8,6 @@
 
 #include <uapi/linux/kernel.h>
 
-/* Set bits in the first 'n' bytes when loaded from memory */
-#ifdef __LITTLE_ENDIAN
-#  define aligned_byte_mask(n) ((1UL << 8*(n))-1)
-#else
-#  define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
-#endif
-
 #define BITS_PER_TYPE(type)	(sizeof(type) * BITS_PER_BYTE)
 #define BITS_TO_LONGS(nr)	__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
 #define BITS_TO_U64(nr)		__KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
diff --git a/include/linux/wordpart.h b/include/linux/wordpart.h
index f6f8f83b15b0..4ca1ba66d2f0 100644
--- a/include/linux/wordpart.h
+++ b/include/linux/wordpart.h
@@ -39,4 +39,11 @@
  */
 #define REPEAT_BYTE(x)	((~0ul / 0xff) * (x))
 
+/* Set bits in the first 'n' bytes when loaded from memory */
+#ifdef __LITTLE_ENDIAN
+#  define aligned_byte_mask(n) ((1UL << 8*(n))-1)
+#else
+#  define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n)))
+#endif
+
 #endif // _LINUX_WORDPART_H
diff --git a/lib/usercopy.c b/lib/usercopy.c
index d29fe29c6849..4b62e6299cc8 100644
--- a/lib/usercopy.c
+++ b/lib/usercopy.c
@@ -3,6 +3,7 @@
 #include <linux/fault-inject-usercopy.h>
 #include <linux/instrumented.h>
 #include <linux/uaccess.h>
+#include <linux/wordpart.h>
 #include <linux/nospec.h>
 
 /* out-of-line parts */
-- 
cgit v1.2.3


From 182ebe56742c3ffbdc724142a599fc5cf91542c7 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Wed, 10 Apr 2024 08:59:30 -0500
Subject: mailbox: omap: Remove unused omap_mbox_{enable,disable}_irq()
 functions

These function are not used, remove these here.

While here, remove the leading _ from the driver internal functions that
do the same thing as the functions removed.

Signed-off-by: Andrew Davis <afd@ti.com>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 drivers/mailbox/omap-mailbox.c | 42 ++++++++++--------------------------------
 include/linux/omap-mailbox.h   |  3 ---
 2 files changed, 10 insertions(+), 35 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index c961706fe61d..624a7ccc2728 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -197,7 +197,7 @@ static int is_mbox_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 	return (int)(enable & status & bit);
 }
 
-static void _omap_mbox_enable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
+static void omap_mbox_enable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 {
 	u32 l;
 	struct omap_mbox_fifo *fifo = (irq == IRQ_TX) ?
@@ -210,7 +210,7 @@ static void _omap_mbox_enable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 	mbox_write_reg(mbox->parent, l, irqenable);
 }
 
-static void _omap_mbox_disable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
+static void omap_mbox_disable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 {
 	struct omap_mbox_fifo *fifo = (irq == IRQ_TX) ?
 				&mbox->tx_fifo : &mbox->rx_fifo;
@@ -227,28 +227,6 @@ static void _omap_mbox_disable_irq(struct omap_mbox *mbox, omap_mbox_irq_t irq)
 	mbox_write_reg(mbox->parent, bit, irqdisable);
 }
 
-void omap_mbox_enable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq)
-{
-	struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan);
-
-	if (WARN_ON(!mbox))
-		return;
-
-	_omap_mbox_enable_irq(mbox, irq);
-}
-EXPORT_SYMBOL(omap_mbox_enable_irq);
-
-void omap_mbox_disable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq)
-{
-	struct omap_mbox *mbox = mbox_chan_to_omap_mbox(chan);
-
-	if (WARN_ON(!mbox))
-		return;
-
-	_omap_mbox_disable_irq(mbox, irq);
-}
-EXPORT_SYMBOL(omap_mbox_disable_irq);
-
 /*
  * Message receiver(workqueue)
  */
@@ -269,7 +247,7 @@ static void mbox_rx_work(struct work_struct *work)
 		spin_lock_irq(&mq->lock);
 		if (mq->full) {
 			mq->full = false;
-			_omap_mbox_enable_irq(mq->mbox, IRQ_RX);
+			omap_mbox_enable_irq(mq->mbox, IRQ_RX);
 		}
 		spin_unlock_irq(&mq->lock);
 	}
@@ -280,7 +258,7 @@ static void mbox_rx_work(struct work_struct *work)
  */
 static void __mbox_tx_interrupt(struct omap_mbox *mbox)
 {
-	_omap_mbox_disable_irq(mbox, IRQ_TX);
+	omap_mbox_disable_irq(mbox, IRQ_TX);
 	ack_mbox_irq(mbox, IRQ_TX);
 	mbox_chan_txdone(mbox->chan, 0);
 }
@@ -293,7 +271,7 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 
 	while (!mbox_fifo_empty(mbox)) {
 		if (unlikely(kfifo_avail(&mq->fifo) < sizeof(msg))) {
-			_omap_mbox_disable_irq(mbox, IRQ_RX);
+			omap_mbox_disable_irq(mbox, IRQ_RX);
 			mq->full = true;
 			goto nomem;
 		}
@@ -375,7 +353,7 @@ static int omap_mbox_startup(struct omap_mbox *mbox)
 	if (mbox->send_no_irq)
 		mbox->chan->txdone_method = TXDONE_BY_ACK;
 
-	_omap_mbox_enable_irq(mbox, IRQ_RX);
+	omap_mbox_enable_irq(mbox, IRQ_RX);
 
 	return 0;
 
@@ -386,7 +364,7 @@ fail_request_irq:
 
 static void omap_mbox_fini(struct omap_mbox *mbox)
 {
-	_omap_mbox_disable_irq(mbox, IRQ_RX);
+	omap_mbox_disable_irq(mbox, IRQ_RX);
 	free_irq(mbox->irq, mbox);
 	flush_work(&mbox->rxq->work);
 	mbox_queue_free(mbox->rxq);
@@ -533,10 +511,10 @@ static int omap_mbox_chan_send_noirq(struct omap_mbox *mbox, u32 msg)
 	int ret = -EBUSY;
 
 	if (!mbox_fifo_full(mbox)) {
-		_omap_mbox_enable_irq(mbox, IRQ_RX);
+		omap_mbox_enable_irq(mbox, IRQ_RX);
 		mbox_fifo_write(mbox, msg);
 		ret = 0;
-		_omap_mbox_disable_irq(mbox, IRQ_RX);
+		omap_mbox_disable_irq(mbox, IRQ_RX);
 
 		/* we must read and ack the interrupt directly from here */
 		mbox_fifo_read(mbox);
@@ -556,7 +534,7 @@ static int omap_mbox_chan_send(struct omap_mbox *mbox, u32 msg)
 	}
 
 	/* always enable the interrupt */
-	_omap_mbox_enable_irq(mbox, IRQ_TX);
+	omap_mbox_enable_irq(mbox, IRQ_TX);
 	return ret;
 }
 
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index 8aa984ec1f38..426a80fb32b5 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -20,7 +20,4 @@ struct mbox_client;
 struct mbox_chan *omap_mbox_request_channel(struct mbox_client *cl,
 					    const char *chan_name);
 
-void omap_mbox_enable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq);
-void omap_mbox_disable_irq(struct mbox_chan *chan, omap_mbox_irq_t irq);
-
 #endif /* OMAP_MAILBOX_H */
-- 
cgit v1.2.3


From 6faf89a89f45f649afd0d081fa99add399582595 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Wed, 10 Apr 2024 08:59:31 -0500
Subject: mailbox: omap: Remove unused omap_mbox_request_channel() function

This function is not used, remove this function.

Signed-off-by: Andrew Davis <afd@ti.com>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 drivers/mailbox/omap-mailbox.c | 36 ------------------------------------
 include/linux/omap-mailbox.h   |  6 ------
 2 files changed, 42 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index 624a7ccc2728..8151722eef38 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -389,42 +389,6 @@ static struct omap_mbox *omap_mbox_device_find(struct omap_mbox_device *mdev,
 	return mbox;
 }
 
-struct mbox_chan *omap_mbox_request_channel(struct mbox_client *cl,
-					    const char *chan_name)
-{
-	struct device *dev = cl->dev;
-	struct omap_mbox *mbox = NULL;
-	struct omap_mbox_device *mdev;
-	int ret;
-
-	if (!dev)
-		return ERR_PTR(-ENODEV);
-
-	if (dev->of_node) {
-		pr_err("%s: please use mbox_request_channel(), this API is supported only for OMAP non-DT usage\n",
-		       __func__);
-		return ERR_PTR(-ENODEV);
-	}
-
-	mutex_lock(&omap_mbox_devices_lock);
-	list_for_each_entry(mdev, &omap_mbox_devices, elem) {
-		mbox = omap_mbox_device_find(mdev, chan_name);
-		if (mbox)
-			break;
-	}
-	mutex_unlock(&omap_mbox_devices_lock);
-
-	if (!mbox || !mbox->chan)
-		return ERR_PTR(-ENOENT);
-
-	ret = mbox_bind_client(mbox->chan, cl);
-	if (ret)
-		return ERR_PTR(ret);
-
-	return mbox->chan;
-}
-EXPORT_SYMBOL(omap_mbox_request_channel);
-
 static struct class omap_mbox_class = { .name = "mbox", };
 
 static int omap_mbox_register(struct omap_mbox_device *mdev)
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index 426a80fb32b5..f8ddf8e81416 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -14,10 +14,4 @@ typedef int __bitwise omap_mbox_irq_t;
 #define IRQ_TX ((__force omap_mbox_irq_t) 1)
 #define IRQ_RX ((__force omap_mbox_irq_t) 2)
 
-struct mbox_chan;
-struct mbox_client;
-
-struct mbox_chan *omap_mbox_request_channel(struct mbox_client *cl,
-					    const char *chan_name);
-
 #endif /* OMAP_MAILBOX_H */
-- 
cgit v1.2.3


From 6979e8be50af143a373bf5905a176434a5880ca4 Mon Sep 17 00:00:00 2001
From: Andrew Davis <afd@ti.com>
Date: Wed, 10 Apr 2024 08:59:32 -0500
Subject: mailbox: omap: Move omap_mbox_irq_t into driver

This is only used internal to the driver, move it out of the
public header and into the driver file. While we are here,
this is not used as a bitwise, so drop that and make it a
simple enum type.

Signed-off-by: Andrew Davis <afd@ti.com>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 drivers/mailbox/omap-mailbox.c | 5 +++++
 include/linux/omap-mailbox.h   | 4 ----
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/mailbox/omap-mailbox.c b/drivers/mailbox/omap-mailbox.c
index 8151722eef38..c083734b6954 100644
--- a/drivers/mailbox/omap-mailbox.c
+++ b/drivers/mailbox/omap-mailbox.c
@@ -51,6 +51,11 @@
 #define MBOX_INTR_CFG_TYPE1		0
 #define MBOX_INTR_CFG_TYPE2		1
 
+typedef enum {
+	IRQ_TX = 1,
+	IRQ_RX = 2,
+} omap_mbox_irq_t;
+
 struct omap_mbox_fifo {
 	unsigned long msg;
 	unsigned long fifo_stat;
diff --git a/include/linux/omap-mailbox.h b/include/linux/omap-mailbox.h
index f8ddf8e81416..3cc5c4ed7f5a 100644
--- a/include/linux/omap-mailbox.h
+++ b/include/linux/omap-mailbox.h
@@ -10,8 +10,4 @@ typedef uintptr_t mbox_msg_t;
 
 #define omap_mbox_message(data) (u32)(mbox_msg_t)(data)
 
-typedef int __bitwise omap_mbox_irq_t;
-#define IRQ_TX ((__force omap_mbox_irq_t) 1)
-#define IRQ_RX ((__force omap_mbox_irq_t) 2)
-
 #endif /* OMAP_MAILBOX_H */
-- 
cgit v1.2.3


From cd251970b19edc8821792ab35070a0c0dbc8a47a Mon Sep 17 00:00:00 2001
From: Cristian Marussi <cristian.marussi@arm.com>
Date: Thu, 18 Apr 2024 11:52:09 +0100
Subject: dt-bindings: mailbox: arm,mhuv3: Add bindings

Add bindings for the ARM MHUv3 Mailbox controller.

Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Cristian Marussi <cristian.marussi@arm.com>
Signed-off-by: Jassi Brar <jassisinghbrar@gmail.com>
---
 .../devicetree/bindings/mailbox/arm,mhuv3.yaml     | 224 +++++++++++++++++++++
 include/dt-bindings/arm/mhuv3-dt.h                 |  13 ++
 2 files changed, 237 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mailbox/arm,mhuv3.yaml
 create mode 100644 include/dt-bindings/arm/mhuv3-dt.h

(limited to 'include')

diff --git a/Documentation/devicetree/bindings/mailbox/arm,mhuv3.yaml b/Documentation/devicetree/bindings/mailbox/arm,mhuv3.yaml
new file mode 100644
index 000000000000..449b55afeb7d
--- /dev/null
+++ b/Documentation/devicetree/bindings/mailbox/arm,mhuv3.yaml
@@ -0,0 +1,224 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mailbox/arm,mhuv3.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: ARM MHUv3 Mailbox Controller
+
+maintainers:
+  - Sudeep Holla <sudeep.holla@arm.com>
+  - Cristian Marussi <cristian.marussi@arm.com>
+
+description: |
+  The Arm Message Handling Unit (MHU) Version 3 is a mailbox controller that
+  enables unidirectional communications with remote processors through various
+  possible transport protocols.
+  The controller can optionally support a varying number of extensions that, in
+  turn, enable different kinds of transport to be used for communication.
+  Number, type and characteristics of each supported extension can be discovered
+  dynamically at runtime.
+
+  Given the unidirectional nature of the controller, an MHUv3 mailbox controller
+  is composed of a MHU Sender (MHUS) containing a PostBox (PBX) block and a MHU
+  Receiver (MHUR) containing a MailBox (MBX) block, where
+
+   PBX is used to
+      - Configure the MHU
+      - Send Transfers to the Receiver
+      - Optionally receive acknowledgment of a Transfer from the Receiver
+
+   MBX is used to
+      - Configure the MHU
+      - Receive Transfers from the Sender
+      - Optionally acknowledge Transfers sent by the Sender
+
+  Both PBX and MBX need to be present and defined in the DT description if you
+  need to establish a bidirectional communication, since you will have to
+  acquire two distinct unidirectional channels, one for each block.
+
+  As a consequence both blocks needs to be represented separately and specified
+  as distinct DT nodes in order to properly describe their resources.
+
+  Note that, though, thanks to the runtime discoverability, there is no need to
+  identify the type of blocks with distinct compatibles.
+
+  Following are the MHUv3 possible extensions.
+
+  - Doorbell Extension (DBE): DBE defines a type of channel called a Doorbell
+    Channel (DBCH). DBCH enables a single bit Transfer to be sent from the
+    Sender to Receiver. The Transfer indicates that an event has occurred.
+    When DBE is implemented, the number of DBCHs that an implementation of the
+    MHU can support is between 1 and 128, numbered starting from 0 in ascending
+    order and discoverable at run-time.
+    Each DBCH contains 32 individual fields, referred to as flags, each of which
+    can be used independently. It is possible for the Sender to send multiple
+    Transfers at once using a single DBCH, so long as each Transfer uses
+    a different flag in the DBCH.
+    Optionally, data may be transmitted through an out-of-band shared memory
+    region, wherein the MHU Doorbell is used strictly as an interrupt generation
+    mechanism, but this is out of the scope of these bindings.
+
+  - FastChannel Extension (FCE): FCE defines a type of channel called a Fast
+    Channel (FCH). FCH is intended for lower overhead communication between
+    Sender and Receiver at the expense of determinism. An FCH allows the Sender
+    to update the channel value at any time, regardless of whether the previous
+    value has been seen by the Receiver. When the Receiver reads the channel's
+    content it gets the last value written to the channel.
+    FCH is considered lossy in nature, and means that the Sender has no way of
+    knowing if, or when, the Receiver will act on the Transfer.
+    FCHs are expected to behave as RAM which generates interrupts when writes
+    occur to the locations within the RAM.
+    When FCE is implemented, the number of FCHs that an implementation of the
+    MHU can support is between 1-1024, if the FastChannel word-size is 32-bits,
+    or between 1-512, when the FastChannel word-size is 64-bits.
+    FCHs are numbered from 0 in ascending order.
+    Note that the number of FCHs and the word-size are implementation defined,
+    not configurable but discoverable at run-time.
+    Optionally, data may be transmitted through an out-of-band shared memory
+    region, wherein the MHU FastChannel is used as an interrupt generation
+    mechanism which carries also a pointer to such out-of-band data, but this
+    is out of the scope of these bindings.
+
+  - FIFO Extension (FE): FE defines a Channel type called a FIFO Channel (FFCH).
+    FFCH allows a Sender to send
+       - Multiple Transfers to the Receiver without having to wait for the
+         previous Transfer to be acknowledged by the Receiver, as long as the
+         FIFO has room for the Transfer.
+       - Transfers which require the Receiver to provide acknowledgment.
+       - Transfers which have in-band payload.
+    In all cases, the data is guaranteed to be observed by the Receiver in the
+    same order which the Sender sent it.
+    When FE is implemented, the number of FFCHs that an implementation of the
+    MHU can support is between 1 and 64, numbered starting from 0 in ascending
+    order. The number of FFCHs, their depth (same for all implemented FFCHs) and
+    the access-granularity are implementation defined, not configurable but
+    discoverable at run-time.
+    Optionally, additional data may be transmitted through an out-of-band shared
+    memory region, wherein the MHU FIFO is used to transmit, in order, a small
+    part of the payload (like a header) and a reference to the shared memory
+    area holding the remaining, bigger, chunk of the payload, but this is out of
+    the scope of these bindings.
+
+properties:
+  compatible:
+    const: arm,mhuv3
+
+  reg:
+    maxItems: 1
+
+  interrupts:
+    minItems: 1
+    maxItems: 74
+
+  interrupt-names:
+    description: |
+      The MHUv3 controller generates a number of events some of which are used
+      to generate interrupts; as a consequence it can expose a varying number of
+      optional PBX/MBX interrupts, representing the events generated during the
+      operation of the various transport protocols associated with different
+      extensions. All interrupts of the MHU are level-sensitive.
+      Some of these optional interrupts are defined per-channel, where the
+      number of channels effectively available is implementation defined and
+      run-time discoverable.
+      In the following names are enumerated using patterns, with per-channel
+      interrupts implicitly capped at the maximum channels allowed by the
+      specification for each extension type.
+      For the sake of simplicity maxItems is anyway capped to a most plausible
+      number, assuming way less channels would be implemented than actually
+      possible.
+
+      The only mandatory interrupts on the MHU are:
+        - combined
+        - mbx-fch-xfer-<N> but only if mbx-fcgrp-xfer-<N> is not implemented.
+
+    minItems: 1
+    maxItems: 74
+    items:
+      oneOf:
+        - const: combined
+          description: PBX/MBX Combined interrupt
+        - const: combined-ffch
+          description: PBX/MBX FIFO Combined interrupt
+        - pattern: '^ffch-low-tide-[0-9]+$'
+          description: PBX/MBX FIFO Channel <N> Low Tide interrupt
+        - pattern: '^ffch-high-tide-[0-9]+$'
+          description: PBX/MBX FIFO Channel <N> High Tide interrupt
+        - pattern: '^ffch-flush-[0-9]+$'
+          description: PBX/MBX FIFO Channel <N> Flush interrupt
+        - pattern: '^mbx-dbch-xfer-[0-9]+$'
+          description: MBX Doorbell Channel <N> Transfer interrupt
+        - pattern: '^mbx-fch-xfer-[0-9]+$'
+          description: MBX FastChannel <N> Transfer interrupt
+        - pattern: '^mbx-fchgrp-xfer-[0-9]+$'
+          description: MBX FastChannel <N> Group Transfer interrupt
+        - pattern: '^mbx-ffch-xfer-[0-9]+$'
+          description: MBX FIFO Channel <N> Transfer interrupt
+        - pattern: '^pbx-dbch-xfer-ack-[0-9]+$'
+          description: PBX Doorbell Channel <N> Transfer Ack interrupt
+        - pattern: '^pbx-ffch-xfer-ack-[0-9]+$'
+          description: PBX FIFO Channel <N> Transfer Ack interrupt
+
+  '#mbox-cells':
+    description: |
+      The first argument in the consumers 'mboxes' property represents the
+      extension type, the second is for the channel number while the third
+      depends on extension type.
+
+      Extension types constants are defined in <dt-bindings/arm/mhuv3-dt.h>.
+
+      Extension type for DBE is DBE_EXT and the third parameter represents the
+      doorbell flag number to use.
+      Extension type for FCE is FCE_EXT, third parameter unused.
+      Extension type for FE is FE_EXT, third parameter unused.
+
+      mboxes = <&mhu DBE_EXT 0 5>; // DBE, Doorbell Channel Window 0, doorbell 5.
+      mboxes = <&mhu DBE_EXT 7>; // DBE, Doorbell Channel Window 1, doorbell 7.
+      mboxes = <&mhu FCE_EXT 0 0>; // FCE, FastChannel Window 0.
+      mboxes = <&mhu FCE_EXT 3 0>; // FCE, FastChannel Window 3.
+      mboxes = <&mhu FE_EXT 1 0>; // FE, FIFO Channel Window 1.
+      mboxes = <&mhu FE_EXT 7 0>; // FE, FIFO Channel Window 7.
+    const: 3
+
+  clocks:
+    maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-names
+  - '#mbox-cells'
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+    soc {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        mailbox@2aaa0000 {
+            compatible = "arm,mhuv3";
+            #mbox-cells = <3>;
+            reg = <0 0x2aaa0000 0 0x10000>;
+            clocks = <&clock 0>;
+            interrupt-names = "combined", "pbx-dbch-xfer-ack-1",
+                               "ffch-high-tide-0";
+            interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>;
+        };
+
+        mailbox@2ab00000 {
+            compatible = "arm,mhuv3";
+            #mbox-cells = <3>;
+            reg = <0 0x2aab0000 0 0x10000>;
+            clocks = <&clock 0>;
+            interrupt-names = "combined", "mbx-dbch-xfer-1", "ffch-low-tide-0";
+            interrupts = <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>,
+                         <GIC_SPI 39 IRQ_TYPE_LEVEL_HIGH>;
+        };
+    };
diff --git a/include/dt-bindings/arm/mhuv3-dt.h b/include/dt-bindings/arm/mhuv3-dt.h
new file mode 100644
index 000000000000..4575406919dd
--- /dev/null
+++ b/include/dt-bindings/arm/mhuv3-dt.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/*
+ * This header provides constants for the defined MHUv3 types.
+ */
+
+#ifndef _DT_BINDINGS_ARM_MHUV3_DT_H
+#define _DT_BINDINGS_ARM_MHUV3_DT_H
+
+#define DBE_EXT		0
+#define FCE_EXT		1
+#define FE_EXT		2
+
+#endif /* _DT_BINDINGS_ARM_MHUV3_DT_H */
-- 
cgit v1.2.3


From b195acf5266d2dee4067f89345c3e6b88d925311 Mon Sep 17 00:00:00 2001
From: Shenghao Ding <shenghao-ding@ti.com>
Date: Sat, 18 May 2024 22:15:46 +0800
Subject: ASoC: tas2781: Fix wrong loading calibrated data sequence

Calibrated data will be set to default after loading DSP config params,
which will cause speaker protection work abnormally. Reload calibrated
data after loading DSP config params. Remove declaration of unused API
which load calibrated data in wrong sequence, changed the copyright year
and correct file name in license
header.

Fixes: ef3bcde75d06 ("ASoC: tas2781: Add tas2781 driver")
Signed-off-by: Shenghao Ding <shenghao-ding@ti.com>
Link: https://msgid.link/r/20240518141546.1742-1-shenghao-ding@ti.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/tas2781-dsp.h       |   7 ++-
 sound/soc/codecs/tas2781-fmwlib.c | 103 ++++++++++----------------------------
 sound/soc/codecs/tas2781-i2c.c    |   4 +-
 3 files changed, 32 insertions(+), 82 deletions(-)

(limited to 'include')

diff --git a/include/sound/tas2781-dsp.h b/include/sound/tas2781-dsp.h
index ea9af2726a53..7fba7ea26a4b 100644
--- a/include/sound/tas2781-dsp.h
+++ b/include/sound/tas2781-dsp.h
@@ -2,7 +2,7 @@
 //
 // ALSA SoC Texas Instruments TAS2781 Audio Smart Amplifier
 //
-// Copyright (C) 2022 - 2023 Texas Instruments Incorporated
+// Copyright (C) 2022 - 2024 Texas Instruments Incorporated
 // https://www.ti.com
 //
 // The TAS2781 driver implements a flexible and configurable
@@ -13,8 +13,8 @@
 // Author: Kevin Lu <kevin-lu@ti.com>
 //
 
-#ifndef __TASDEVICE_DSP_H__
-#define __TASDEVICE_DSP_H__
+#ifndef __TAS2781_DSP_H__
+#define __TAS2781_DSP_H__
 
 #define MAIN_ALL_DEVICES			0x0d
 #define MAIN_DEVICE_A				0x01
@@ -180,7 +180,6 @@ void tasdevice_calbin_remove(void *context);
 int tasdevice_select_tuningprm_cfg(void *context, int prm,
 	int cfg_no, int rca_conf_no);
 int tasdevice_prmg_load(void *context, int prm_no);
-int tasdevice_prmg_calibdata_load(void *context, int prm_no);
 void tasdevice_tuning_switch(void *context, int state);
 int tas2781_load_calibration(void *context, char *file_name,
 	unsigned short i);
diff --git a/sound/soc/codecs/tas2781-fmwlib.c b/sound/soc/codecs/tas2781-fmwlib.c
index a6be81adcb83..265a8ca25cbb 100644
--- a/sound/soc/codecs/tas2781-fmwlib.c
+++ b/sound/soc/codecs/tas2781-fmwlib.c
@@ -2151,6 +2151,24 @@ static int tasdevice_load_data(struct tasdevice_priv *tas_priv,
 	return ret;
 }
 
+static void tasdev_load_calibrated_data(struct tasdevice_priv *priv, int i)
+{
+	struct tasdevice_calibration *cal;
+	struct tasdevice_fw *cal_fmw;
+
+	cal_fmw = priv->tasdevice[i].cali_data_fmw;
+
+	/* No calibrated data for current devices, playback will go ahead. */
+	if (!cal_fmw)
+		return;
+
+	cal = cal_fmw->calibrations;
+	if (cal)
+		return;
+
+	load_calib_data(priv, &cal->dev_data);
+}
+
 int tasdevice_select_tuningprm_cfg(void *context, int prm_no,
 	int cfg_no, int rca_conf_no)
 {
@@ -2210,21 +2228,9 @@ int tasdevice_select_tuningprm_cfg(void *context, int prm_no,
 		for (i = 0; i < tas_priv->ndev; i++) {
 			if (tas_priv->tasdevice[i].is_loaderr == true)
 				continue;
-			else if (tas_priv->tasdevice[i].is_loaderr == false
-				&& tas_priv->tasdevice[i].is_loading == true) {
-				struct tasdevice_fw *cal_fmw =
-					tas_priv->tasdevice[i].cali_data_fmw;
-
-				if (cal_fmw) {
-					struct tasdevice_calibration
-						*cal = cal_fmw->calibrations;
-
-					if (cal)
-						load_calib_data(tas_priv,
-							&(cal->dev_data));
-				}
+			if (tas_priv->tasdevice[i].is_loaderr == false &&
+				tas_priv->tasdevice[i].is_loading == true)
 				tas_priv->tasdevice[i].cur_prog = prm_no;
-			}
 		}
 	}
 
@@ -2245,11 +2251,15 @@ int tasdevice_select_tuningprm_cfg(void *context, int prm_no,
 		tasdevice_load_data(tas_priv, &(conf->dev_data));
 		for (i = 0; i < tas_priv->ndev; i++) {
 			if (tas_priv->tasdevice[i].is_loaderr == true) {
-				status |= 1 << (i + 4);
+				status |= BIT(i + 4);
 				continue;
-			} else if (tas_priv->tasdevice[i].is_loaderr == false
-				&& tas_priv->tasdevice[i].is_loading == true)
+			}
+
+			if (tas_priv->tasdevice[i].is_loaderr == false &&
+				tas_priv->tasdevice[i].is_loading == true) {
+				tasdev_load_calibrated_data(tas_priv, i);
 				tas_priv->tasdevice[i].cur_conf = cfg_no;
+			}
 		}
 	} else
 		dev_dbg(tas_priv->dev, "%s: Unneeded loading dsp conf %d\n",
@@ -2308,65 +2318,6 @@ out:
 }
 EXPORT_SYMBOL_NS_GPL(tasdevice_prmg_load, SND_SOC_TAS2781_FMWLIB);
 
-int tasdevice_prmg_calibdata_load(void *context, int prm_no)
-{
-	struct tasdevice_priv *tas_priv = (struct tasdevice_priv *) context;
-	struct tasdevice_fw *tas_fmw = tas_priv->fmw;
-	struct tasdevice_prog *program;
-	int prog_status = 0;
-	int i;
-
-	if (!tas_fmw) {
-		dev_err(tas_priv->dev, "%s: Firmware is NULL\n", __func__);
-		goto out;
-	}
-
-	if (prm_no >= tas_fmw->nr_programs) {
-		dev_err(tas_priv->dev,
-			"%s: prm(%d) is not in range of Programs %u\n",
-			__func__, prm_no, tas_fmw->nr_programs);
-		goto out;
-	}
-
-	for (i = 0, prog_status = 0; i < tas_priv->ndev; i++) {
-		if (prm_no >= 0 && tas_priv->tasdevice[i].cur_prog != prm_no) {
-			tas_priv->tasdevice[i].cur_conf = -1;
-			tas_priv->tasdevice[i].is_loading = true;
-			prog_status++;
-		}
-		tas_priv->tasdevice[i].is_loaderr = false;
-	}
-
-	if (prog_status) {
-		program = &(tas_fmw->programs[prm_no]);
-		tasdevice_load_data(tas_priv, &(program->dev_data));
-		for (i = 0; i < tas_priv->ndev; i++) {
-			if (tas_priv->tasdevice[i].is_loaderr == true)
-				continue;
-			else if (tas_priv->tasdevice[i].is_loaderr == false
-				&& tas_priv->tasdevice[i].is_loading == true) {
-				struct tasdevice_fw *cal_fmw =
-					tas_priv->tasdevice[i].cali_data_fmw;
-
-				if (cal_fmw) {
-					struct tasdevice_calibration *cal =
-						cal_fmw->calibrations;
-
-					if (cal)
-						load_calib_data(tas_priv,
-							&(cal->dev_data));
-				}
-				tas_priv->tasdevice[i].cur_prog = prm_no;
-			}
-		}
-	}
-
-out:
-	return prog_status;
-}
-EXPORT_SYMBOL_NS_GPL(tasdevice_prmg_calibdata_load,
-	SND_SOC_TAS2781_FMWLIB);
-
 void tasdevice_tuning_switch(void *context, int state)
 {
 	struct tasdevice_priv *tas_priv = (struct tasdevice_priv *) context;
diff --git a/sound/soc/codecs/tas2781-i2c.c b/sound/soc/codecs/tas2781-i2c.c
index b5abff230e43..9350972dfefe 100644
--- a/sound/soc/codecs/tas2781-i2c.c
+++ b/sound/soc/codecs/tas2781-i2c.c
@@ -2,7 +2,7 @@
 //
 // ALSA SoC Texas Instruments TAS2563/TAS2781 Audio Smart Amplifier
 //
-// Copyright (C) 2022 - 2023 Texas Instruments Incorporated
+// Copyright (C) 2022 - 2024 Texas Instruments Incorporated
 // https://www.ti.com
 //
 // The TAS2563/TAS2781 driver implements a flexible and configurable
@@ -414,7 +414,7 @@ static void tasdevice_fw_ready(const struct firmware *fmw,
 				__func__, tas_priv->cal_binaryname[i]);
 	}
 
-	tasdevice_prmg_calibdata_load(tas_priv, 0);
+	tasdevice_prmg_load(tas_priv, 0);
 	tas_priv->cur_prog = 0;
 out:
 	if (tas_priv->fw_state == TASDEVICE_DSP_FW_FAIL) {
-- 
cgit v1.2.3


From 7c6c5249f061b64fc6b5b90bc147169a048691bf Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 25 Mar 2024 16:36:05 +1100
Subject: NFS: add atomic_open for NFSv3 to handle O_TRUNC correctly.

With two clients, each with NFSv3 mounts of the same directory, the sequence:

   client1            client2
  ls -l afile
                      echo hello there > afile
  echo HELLO > afile
  cat afile

will show
   HELLO
   there

because the O_TRUNC requested in the final 'echo' doesn't take effect.
This is because the "Negative dentry, just create a file" section in
lookup_open() assumes that the file *does* get created since the dentry
was negative, so it sets FMODE_CREATED, and this causes do_open() to
clear O_TRUNC and so the file doesn't get truncated.

Even mounting with -o lookupcache=none does not help as
nfs_neg_need_reval() always returns false if LOOKUP_CREATE is set.

This patch fixes the problem by providing an atomic_open inode operation
for NFSv3 (and v2).  The code is largely the code from the branch in
lookup_open() when atomic_open is not provided.  The significant change
is that the O_TRUNC flag is passed a new nfs_do_create() which add
'trunc' handling to nfs_create().

With this change we also optimise away an unnecessary LOOKUP before the
file is created.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
---
 fs/nfs/dir.c           | 54 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/nfs/nfs3proc.c      |  1 +
 fs/nfs/proc.c          |  1 +
 include/linux/nfs_fs.h |  3 +++
 4 files changed, 56 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ac505671efbd..342930996226 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -56,6 +56,8 @@ static int nfs_readdir(struct file *, struct dir_context *);
 static int nfs_fsync_dir(struct file *, loff_t, loff_t, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 static void nfs_readdir_clear_array(struct folio *);
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, int open_flags);
 
 const struct file_operations nfs_dir_operations = {
 	.llseek		= nfs_llseek_dir,
@@ -2243,6 +2245,41 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 
 #endif /* CONFIG_NFSV4 */
 
+int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
+			struct file *file, unsigned int open_flags,
+			umode_t mode)
+{
+
+	/* Same as look+open from lookup_open(), but with different O_TRUNC
+	 * handling.
+	 */
+	int error = 0;
+
+	if (open_flags & O_CREAT) {
+		file->f_mode |= FMODE_CREATED;
+		error = nfs_do_create(dir, dentry, mode, open_flags);
+		if (error)
+			return error;
+		return finish_open(file, dentry, NULL);
+	} else if (d_in_lookup(dentry)) {
+		/* The only flags nfs_lookup considers are
+		 * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and
+		 * we want those to be zero so the lookup isn't skipped.
+		 */
+		struct dentry *res = nfs_lookup(dir, dentry, 0);
+
+		d_lookup_done(dentry);
+		if (unlikely(res)) {
+			if (IS_ERR(res))
+				return PTR_ERR(res);
+			return finish_no_open(file, res);
+		}
+	}
+	return finish_no_open(file, NULL);
+
+}
+EXPORT_SYMBOL_GPL(nfs_atomic_open_v23);
+
 struct dentry *
 nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle,
 				struct nfs_fattr *fattr)
@@ -2303,18 +2340,23 @@ EXPORT_SYMBOL_GPL(nfs_instantiate);
  * that the operation succeeded on the server, but an error in the
  * reply path made it appear to have failed.
  */
-int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
-	       struct dentry *dentry, umode_t mode, bool excl)
+static int nfs_do_create(struct inode *dir, struct dentry *dentry,
+			 umode_t mode, int open_flags)
 {
 	struct iattr attr;
-	int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
 	int error;
 
+	open_flags |= O_CREAT;
+
 	dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
 			dir->i_sb->s_id, dir->i_ino, dentry);
 
 	attr.ia_mode = mode;
 	attr.ia_valid = ATTR_MODE;
+	if (open_flags & O_TRUNC) {
+		attr.ia_size = 0;
+		attr.ia_valid |= ATTR_SIZE;
+	}
 
 	trace_nfs_create_enter(dir, dentry, open_flags);
 	error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags);
@@ -2326,6 +2368,12 @@ out_err:
 	d_drop(dentry);
 	return error;
 }
+
+int nfs_create(struct mnt_idmap *idmap, struct inode *dir,
+	       struct dentry *dentry, umode_t mode, bool excl)
+{
+	return nfs_do_create(dir, dentry, mode, excl ? O_EXCL : 0);
+}
 EXPORT_SYMBOL_GPL(nfs_create);
 
 /*
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index cbbe3f0193b8..74bda639a7cf 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -986,6 +986,7 @@ static int nfs3_have_delegation(struct inode *inode, fmode_t flags)
 
 static const struct inode_operations nfs3_dir_inode_operations = {
 	.create		= nfs_create,
+	.atomic_open	= nfs_atomic_open_v23,
 	.lookup		= nfs_lookup,
 	.link		= nfs_link,
 	.unlink		= nfs_unlink,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index ad3a321ae997..d105e5b2659d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -695,6 +695,7 @@ static int nfs_have_delegation(struct inode *inode, fmode_t flags)
 static const struct inode_operations nfs_dir_inode_operations = {
 	.create		= nfs_create,
 	.lookup		= nfs_lookup,
+	.atomic_open	= nfs_atomic_open_v23,
 	.link		= nfs_link,
 	.unlink		= nfs_unlink,
 	.symlink	= nfs_symlink,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d59116ac8209..039898d70954 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -561,6 +561,9 @@ extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openfl
 extern void nfs_access_zap_cache(struct inode *inode);
 extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred,
 				 u32 *mask, bool may_block);
+extern int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry,
+			       struct file *file, unsigned int open_flags,
+			       umode_t mode);
 
 /*
  * linux/fs/nfs/symlink.c
-- 
cgit v1.2.3


From f4f4276f985a5aac7b310a4ed040b47e275e7591 Mon Sep 17 00:00:00 2001
From: Matti Vaittinen <mazziesaccount@gmail.com>
Date: Mon, 20 May 2024 15:31:33 +0300
Subject: regulator: pickable ranges: don't always cache vsel

Some PMICs treat the vsel_reg same as apply-bit. Eg, when voltage range
is changed, the new voltage setting is not taking effect until the vsel
register is written.

Add a flag 'range_applied_by_vsel' to the regulator desc to indicate this
behaviour and to force the vsel value to be written to hardware if range
was changed, even if the old selector was same as the new one.

Signed-off-by: Matti Vaittinen <mazziesaccount@gmail.com>
Link: https://msgid.link/r/ZktCpcGZdgHWuN_L@fedora
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/regulator/helpers.c      | 43 ++++++++++++++++++++++++++++------------
 include/linux/regulator/driver.h |  3 +++
 2 files changed, 33 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/regulator/helpers.c b/drivers/regulator/helpers.c
index d49268336553..6e1ace660b8c 100644
--- a/drivers/regulator/helpers.c
+++ b/drivers/regulator/helpers.c
@@ -161,6 +161,32 @@ int regulator_get_voltage_sel_pickable_regmap(struct regulator_dev *rdev)
 }
 EXPORT_SYMBOL_GPL(regulator_get_voltage_sel_pickable_regmap);
 
+static int write_separate_vsel_and_range(struct regulator_dev *rdev,
+					 unsigned int sel, unsigned int range)
+{
+	bool range_updated;
+	int ret;
+
+	ret = regmap_update_bits_base(rdev->regmap, rdev->desc->vsel_range_reg,
+				      rdev->desc->vsel_range_mask,
+				      range, &range_updated, false, false);
+	if (ret)
+		return ret;
+
+	/*
+	 * Some PMICs treat the vsel_reg same as apply-bit. Force it to be
+	 * written if the range changed, even if the old selector was same as
+	 * the new one
+	 */
+	if (rdev->desc->range_applied_by_vsel && range_updated)
+		return regmap_write_bits(rdev->regmap,
+					rdev->desc->vsel_reg,
+					rdev->desc->vsel_mask, sel);
+
+	return regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
+				  rdev->desc->vsel_mask, sel);
+}
+
 /**
  * regulator_set_voltage_sel_pickable_regmap - pickable range set_voltage_sel
  *
@@ -199,21 +225,12 @@ int regulator_set_voltage_sel_pickable_regmap(struct regulator_dev *rdev,
 	range = rdev->desc->linear_range_selectors_bitfield[i];
 	range <<= ffs(rdev->desc->vsel_range_mask) - 1;
 
-	if (rdev->desc->vsel_reg == rdev->desc->vsel_range_reg) {
-		ret = regmap_update_bits(rdev->regmap,
-					 rdev->desc->vsel_reg,
+	if (rdev->desc->vsel_reg == rdev->desc->vsel_range_reg)
+		ret = regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
 					 rdev->desc->vsel_range_mask |
 					 rdev->desc->vsel_mask, sel | range);
-	} else {
-		ret = regmap_update_bits(rdev->regmap,
-					 rdev->desc->vsel_range_reg,
-					 rdev->desc->vsel_range_mask, range);
-		if (ret)
-			return ret;
-
-		ret = regmap_update_bits(rdev->regmap, rdev->desc->vsel_reg,
-				  rdev->desc->vsel_mask, sel);
-	}
+	else
+		ret = write_separate_vsel_and_range(rdev, sel, range);
 
 	if (ret)
 		return ret;
diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h
index 22a07c0900a4..f230a472ccd3 100644
--- a/include/linux/regulator/driver.h
+++ b/include/linux/regulator/driver.h
@@ -299,6 +299,8 @@ enum regulator_type {
  * @vsel_range_reg: Register for range selector when using pickable ranges
  *		    and ``regulator_map_*_voltage_*_pickable`` functions.
  * @vsel_range_mask: Mask for register bitfield used for range selector
+ * @range_applied_by_vsel: A flag to indicate that changes to vsel_range_reg
+ *			   are only effective after vsel_reg is written
  * @vsel_reg: Register for selector when using ``regulator_map_*_voltage_*``
  * @vsel_mask: Mask for register bitfield used for selector
  * @vsel_step: Specify the resolution of selector stepping when setting
@@ -389,6 +391,7 @@ struct regulator_desc {
 
 	unsigned int vsel_range_reg;
 	unsigned int vsel_range_mask;
+	bool range_applied_by_vsel;
 	unsigned int vsel_reg;
 	unsigned int vsel_mask;
 	unsigned int vsel_step;
-- 
cgit v1.2.3


From 16e00683dc74cf1fcdf00046b90852bee05eb94a Mon Sep 17 00:00:00 2001
From: Steve French <stfrench@microsoft.com>
Date: Wed, 15 May 2024 18:06:03 -0500
Subject: smb3: reenable swapfiles over SMB3 mounts

With the changes to folios/netfs it is now easier to reenable
swapfile support over SMB3 which fixes various xfstests

Reviewed-by: David Howells <dhowells@redhat.com>
Suggested-by: David Howells <dhowells@redhat.com>
Fixes: e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space")
Signed-off-by: Steve French <stfrench@microsoft.com>
---
 fs/netfs/direct_write.c |  3 ++-
 fs/smb/client/file.c    | 23 +++++++++++++++++++++++
 include/linux/netfs.h   |  2 ++
 3 files changed, 27 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 608ba6416919..f516460e994e 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -27,7 +27,7 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file.  This can also be used for direct I/O writes.
  */
-static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
 						  struct netfs_group *netfs_group)
 {
 	struct netfs_io_request *wreq;
@@ -117,6 +117,7 @@ out:
 	netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
 	return ret;
 }
+EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
 
 /**
  * netfs_unbuffered_write_iter - Unbuffered write to a file
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 9d38294a7e68..9d5c2440abfc 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -3189,6 +3189,28 @@ static void cifs_swap_deactivate(struct file *file)
 	/* do we need to unpin (or unlock) the file */
 }
 
+/**
+ * cifs_swap_rw - SMB3 address space operation for swap I/O
+ * @iocb: target I/O control block
+ * @iter: I/O buffer
+ *
+ * Perform IO to the swap-file.  This is much like direct IO.
+ */
+static int cifs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
+{
+	ssize_t ret;
+
+	WARN_ON_ONCE(iov_iter_count(iter) != PAGE_SIZE);
+
+	if (iov_iter_rw(iter) == READ)
+		ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+	else
+		ret = netfs_unbuffered_write_iter_locked(iocb, iter, NULL);
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
 const struct address_space_operations cifs_addr_ops = {
 	.read_folio	= netfs_read_folio,
 	.readahead	= netfs_readahead,
@@ -3204,6 +3226,7 @@ const struct address_space_operations cifs_addr_ops = {
 	 */
 	.swap_activate	= cifs_swap_activate,
 	.swap_deactivate = cifs_swap_deactivate,
+	.swap_rw = cifs_swap_rw,
 };
 
 /*
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index ca56a4428043..d2d291a9cdad 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -400,6 +400,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
 ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
 					 struct netfs_group *netfs_group);
 ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from);
+ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+					   struct netfs_group *netfs_group);
 ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from);
 
 /* Address operations API */
-- 
cgit v1.2.3


From 5a507b7d2be15fddb95bf8dee01110b723e2bcd9 Mon Sep 17 00:00:00 2001
From: Wayne Lin <Wayne.Lin@amd.com>
Date: Thu, 7 Mar 2024 14:29:57 +0800
Subject: drm/mst: Fix NULL pointer dereference at drm_dp_add_payload_part2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[Why]
Commit:
- commit 5aa1dfcdf0a4 ("drm/mst: Refactor the flow for payload allocation/removement")
accidently overwrite the commit
- commit 54d217406afe ("drm: use mgr->dev in drm_dbg_kms in drm_dp_add_payload_part2")
which cause regression.

[How]
Recover the original NULL fix and remove the unnecessary input parameter 'state' for
drm_dp_add_payload_part2().

Fixes: 5aa1dfcdf0a4 ("drm/mst: Refactor the flow for payload allocation/removement")
Reported-by: Leon Weiß <leon.weiss@ruhr-uni-bochum.de>
Link: https://lore.kernel.org/r/38c253ea42072cc825dc969ac4e6b9b600371cc8.camel@ruhr-uni-bochum.de/
Cc: lyude@redhat.com
Cc: imre.deak@intel.com
Cc: stable@vger.kernel.org
Cc: regressions@lists.linux.dev
Reviewed-by: Harry Wentland <harry.wentland@amd.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Signed-off-by: Wayne Lin <Wayne.Lin@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240307062957.2323620-1-Wayne.Lin@amd.com
(cherry picked from commit 4545614c1d8da603e57b60dd66224d81b6ffc305)
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 2 +-
 drivers/gpu/drm/display/drm_dp_mst_topology.c             | 4 +---
 drivers/gpu/drm/i915/display/intel_dp_mst.c               | 2 +-
 drivers/gpu/drm/nouveau/dispnv50/disp.c                   | 2 +-
 include/drm/display/drm_dp_mst_helper.h                   | 1 -
 5 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
index c27063305a13..2c36f3d00ca2 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c
@@ -363,7 +363,7 @@ void dm_helpers_dp_mst_send_payload_allocation(
 	mst_state = to_drm_dp_mst_topology_state(mst_mgr->base.state);
 	new_payload = drm_atomic_get_mst_payload_state(mst_state, aconnector->mst_output_port);
 
-	ret = drm_dp_add_payload_part2(mst_mgr, mst_state->base.state, new_payload);
+	ret = drm_dp_add_payload_part2(mst_mgr, new_payload);
 
 	if (ret) {
 		amdgpu_dm_set_mst_status(&aconnector->mst_status,
diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c b/drivers/gpu/drm/display/drm_dp_mst_topology.c
index 3577786b5db2..7f8e1cfbe19d 100644
--- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
@@ -3421,7 +3421,6 @@ EXPORT_SYMBOL(drm_dp_remove_payload_part2);
 /**
  * drm_dp_add_payload_part2() - Execute payload update part 2
  * @mgr: Manager to use.
- * @state: The global atomic state
  * @payload: The payload to update
  *
  * If @payload was successfully assigned a starting time slot by drm_dp_add_payload_part1(), this
@@ -3430,14 +3429,13 @@ EXPORT_SYMBOL(drm_dp_remove_payload_part2);
  * Returns: 0 on success, negative error code on failure.
  */
 int drm_dp_add_payload_part2(struct drm_dp_mst_topology_mgr *mgr,
-			     struct drm_atomic_state *state,
 			     struct drm_dp_mst_atomic_payload *payload)
 {
 	int ret = 0;
 
 	/* Skip failed payloads */
 	if (payload->payload_allocation_status != DRM_DP_MST_PAYLOAD_ALLOCATION_DFP) {
-		drm_dbg_kms(state->dev, "Part 1 of payload creation for %s failed, skipping part 2\n",
+		drm_dbg_kms(mgr->dev, "Part 1 of payload creation for %s failed, skipping part 2\n",
 			    payload->port->connector->name);
 		return -EIO;
 	}
diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c
index c772ba19c547..715d2f59f565 100644
--- a/drivers/gpu/drm/i915/display/intel_dp_mst.c
+++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c
@@ -1241,7 +1241,7 @@ static void intel_mst_enable_dp(struct intel_atomic_state *state,
 	if (first_mst_stream)
 		intel_ddi_wait_for_fec_status(encoder, pipe_config, true);
 
-	drm_dp_add_payload_part2(&intel_dp->mst_mgr, &state->base,
+	drm_dp_add_payload_part2(&intel_dp->mst_mgr,
 				 drm_atomic_get_mst_payload_state(mst_state, connector->port));
 
 	if (DISPLAY_VER(dev_priv) >= 12)
diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c
index 0c3d88ad0b0e..88728a0b2c25 100644
--- a/drivers/gpu/drm/nouveau/dispnv50/disp.c
+++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c
@@ -915,7 +915,7 @@ nv50_msto_cleanup(struct drm_atomic_state *state,
 		msto->disabled = false;
 		drm_dp_remove_payload_part2(mgr, new_mst_state, old_payload, new_payload);
 	} else if (msto->enabled) {
-		drm_dp_add_payload_part2(mgr, state, new_payload);
+		drm_dp_add_payload_part2(mgr, new_payload);
 		msto->enabled = false;
 	}
 }
diff --git a/include/drm/display/drm_dp_mst_helper.h b/include/drm/display/drm_dp_mst_helper.h
index 3546b58a121b..cfe096389d94 100644
--- a/include/drm/display/drm_dp_mst_helper.h
+++ b/include/drm/display/drm_dp_mst_helper.h
@@ -871,7 +871,6 @@ int drm_dp_add_payload_part1(struct drm_dp_mst_topology_mgr *mgr,
 			     struct drm_dp_mst_topology_state *mst_state,
 			     struct drm_dp_mst_atomic_payload *payload);
 int drm_dp_add_payload_part2(struct drm_dp_mst_topology_mgr *mgr,
-			     struct drm_atomic_state *state,
 			     struct drm_dp_mst_atomic_payload *payload);
 void drm_dp_remove_payload_part1(struct drm_dp_mst_topology_mgr *mgr,
 				 struct drm_dp_mst_topology_state *mst_state,
-- 
cgit v1.2.3


From 93022482b2948a9a7e9b5a2bb685f2e1cb4c3348 Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Mon, 20 May 2024 15:45:33 -0700
Subject: x86/cpu: Fix x86_match_cpu() to match just X86_VENDOR_INTEL

Code in v6.9 arch/x86/kernel/smpboot.c was changed by commit

  4db64279bc2b ("x86/cpu: Switch to new Intel CPU model defines") from:

  static const struct x86_cpu_id intel_cod_cpu[] = {
          X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),       /* COD */
          X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),     /* COD */
          X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),             /* SNC */	<--- 443
          {}
  };

  static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
  {
          const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);

to:

  static const struct x86_cpu_id intel_cod_cpu[] = {
           X86_MATCH_VFM(INTEL_HASWELL_X,   0),    /* COD */
           X86_MATCH_VFM(INTEL_BROADWELL_X, 0),    /* COD */
           X86_MATCH_VFM(INTEL_ANY,         1),    /* SNC */
           {}
   };

  static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
  {
          const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);

On an Intel CPU with SNC enabled this code previously matched the rule on line
443 to avoid printing messages about insane cache configuration.  The new code
did not match any rules.

Expanding the macros for the intel_cod_cpu[] array shows that the old is
equivalent to:

  static const struct x86_cpu_id intel_cod_cpu[] = {
  [0] = { .vendor = 0, .family = 6, .model = 0x3F, .steppings = 0, .feature = 0, .driver_data = 0 },
  [1] = { .vendor = 0, .family = 6, .model = 0x4F, .steppings = 0, .feature = 0, .driver_data = 0 },
  [2] = { .vendor = 0, .family = 6, .model = 0x00, .steppings = 0, .feature = 0, .driver_data = 1 },
  [3] = { .vendor = 0, .family = 0, .model = 0x00, .steppings = 0, .feature = 0, .driver_data = 0 }
  }

while the new code expands to:

  static const struct x86_cpu_id intel_cod_cpu[] = {
  [0] = { .vendor = 0, .family = 6, .model = 0x3F, .steppings = 0, .feature = 0, .driver_data = 0 },
  [1] = { .vendor = 0, .family = 6, .model = 0x4F, .steppings = 0, .feature = 0, .driver_data = 0 },
  [2] = { .vendor = 0, .family = 0, .model = 0x00, .steppings = 0, .feature = 0, .driver_data = 1 },
  [3] = { .vendor = 0, .family = 0, .model = 0x00, .steppings = 0, .feature = 0, .driver_data = 0 }
  }

Looking at the code for x86_match_cpu():

  const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
  {
           const struct x86_cpu_id *m;
           struct cpuinfo_x86 *c = &boot_cpu_data;

           for (m = match;
                m->vendor | m->family | m->model | m->steppings | m->feature;
                m++) {
       		...
           }
           return NULL;

it is clear that there was no match because the ANY entry in the table (array
index 2) is now the loop termination condition (all of vendor, family, model,
steppings, and feature are zero).

So this code was working before because the "ANY" check was looking for any
Intel CPU in family 6. But fails now because the family is a wild card. So the
root cause is that x86_match_cpu() has never been able to match on a rule with
just X86_VENDOR_INTEL and all other fields set to wildcards.

Add a new flags field to struct x86_cpu_id that has a bit set to indicate that
this entry in the array is valid. Update X86_MATCH*() macros to set that bit.
Change the end-marker check in x86_match_cpu() to just check the flags field
for this bit.

Backporter notes: The commit in Fixes is really the one that is broken:
you can't have m->vendor as part of the loop termination conditional in
x86_match_cpu() because it can happen - as it has happened above
- that that whole conditional is 0 albeit vendor == 0 is a valid case
- X86_VENDOR_INTEL is 0.

However, the only case where the above happens is the SNC check added by
4db64279bc2b1 so you only need this fix if you have backported that
other commit

  4db64279bc2b ("x86/cpu: Switch to new Intel CPU model defines")

Fixes: 644e9cbbe3fc ("Add driver auto probing for x86 features v4")
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: <stable+noautosel@kernel.org> # see above
Link: https://lore.kernel.org/r/20240517144312.GBZkdtAOuJZCvxhFbJ@fat_crate.local
---
 arch/x86/include/asm/cpu_device_id.h | 5 +++++
 arch/x86/kernel/cpu/match.c          | 4 +---
 include/linux/mod_devicetable.h      | 2 ++
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h
index 970a232009c3..b6325ee30871 100644
--- a/arch/x86/include/asm/cpu_device_id.h
+++ b/arch/x86/include/asm/cpu_device_id.h
@@ -53,6 +53,9 @@
 #define X86_CENTAUR_FAM6_C7_D		0xd
 #define X86_CENTAUR_FAM6_NANO		0xf
 
+/* x86_cpu_id::flags */
+#define X86_CPU_ID_FLAG_ENTRY_VALID	BIT(0)
+
 #define X86_STEPPINGS(mins, maxs)    GENMASK(maxs, mins)
 /**
  * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching
@@ -79,6 +82,7 @@
 	.model		= _model,					\
 	.steppings	= _steppings,					\
 	.feature	= _feature,					\
+	.flags		= X86_CPU_ID_FLAG_ENTRY_VALID,			\
 	.driver_data	= (unsigned long) _data				\
 }
 
@@ -89,6 +93,7 @@
 	.model		= _model,					\
 	.steppings	= _steppings,					\
 	.feature	= _feature,					\
+	.flags		= X86_CPU_ID_FLAG_ENTRY_VALID,			\
 	.driver_data	= (unsigned long) _data				\
 }
 
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 8651643bddae..8e7de733320a 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -38,9 +38,7 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
 	const struct x86_cpu_id *m;
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
-	for (m = match;
-	     m->vendor | m->family | m->model | m->steppings | m->feature;
-	     m++) {
+	for (m = match; m->flags & X86_CPU_ID_FLAG_ENTRY_VALID; m++) {
 		if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
 			continue;
 		if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 7a9a07ea451b..4338b1b4ac44 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -690,6 +690,8 @@ struct x86_cpu_id {
 	__u16 model;
 	__u16 steppings;
 	__u16 feature;	/* bit index */
+	/* Solely for kernel-internal use: DO NOT EXPORT to userspace! */
+	__u16 flags;
 	kernel_ulong_t driver_data;
 };
 
-- 
cgit v1.2.3


From db5247d9bf5c6ade9fd70b4e4897441e0269b233 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 15 Mar 2024 19:47:06 -0500
Subject: vhost_task: Handle SIGKILL by flushing work and exiting

Instead of lingering until the device is closed, this has us handle
SIGKILL by:

1. marking the worker as killed so we no longer try to use it with
   new virtqueues and new flush operations.
2. setting the virtqueue to worker mapping so no new works are queued.
3. running all the exiting works.

Suggested-by: Edward Adam Davis <eadavis@qq.com>
Reported-and-tested-by: syzbot+98edc2df894917b3431f@syzkaller.appspotmail.com
Message-Id: <tencent_546DA49414E876EEBECF2C78D26D242EE50A@qq.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Message-Id: <20240316004707.45557-9-michael.christie@oracle.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/vhost.c            | 54 +++++++++++++++++++++++++++++++++++++---
 drivers/vhost/vhost.h            |  2 ++
 include/linux/sched/vhost_task.h |  3 ++-
 kernel/vhost_task.c              | 53 +++++++++++++++++++++++++--------------
 4 files changed, 88 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c6448ff37768..b60955682474 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -273,7 +273,7 @@ static void __vhost_worker_flush(struct vhost_worker *worker)
 {
 	struct vhost_flush_struct flush;
 
-	if (!worker->attachment_cnt)
+	if (!worker->attachment_cnt || worker->killed)
 		return;
 
 	init_completion(&flush.wait_event);
@@ -388,7 +388,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	__vhost_vq_meta_reset(vq);
 }
 
-static bool vhost_worker(void *data)
+static bool vhost_run_work_list(void *data)
 {
 	struct vhost_worker *worker = data;
 	struct vhost_work *work, *work_next;
@@ -413,6 +413,40 @@ static bool vhost_worker(void *data)
 	return !!node;
 }
 
+static void vhost_worker_killed(void *data)
+{
+	struct vhost_worker *worker = data;
+	struct vhost_dev *dev = worker->dev;
+	struct vhost_virtqueue *vq;
+	int i, attach_cnt = 0;
+
+	mutex_lock(&worker->mutex);
+	worker->killed = true;
+
+	for (i = 0; i < dev->nvqs; i++) {
+		vq = dev->vqs[i];
+
+		mutex_lock(&vq->mutex);
+		if (worker ==
+		    rcu_dereference_check(vq->worker,
+					  lockdep_is_held(&vq->mutex))) {
+			rcu_assign_pointer(vq->worker, NULL);
+			attach_cnt++;
+		}
+		mutex_unlock(&vq->mutex);
+	}
+
+	worker->attachment_cnt -= attach_cnt;
+	if (attach_cnt)
+		synchronize_rcu();
+	/*
+	 * Finish vhost_worker_flush calls and any other works that snuck in
+	 * before the synchronize_rcu.
+	 */
+	vhost_run_work_list(worker);
+	mutex_unlock(&worker->mutex);
+}
+
 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
 {
 	kfree(vq->indirect);
@@ -627,9 +661,11 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
 	if (!worker)
 		return NULL;
 
+	worker->dev = dev;
 	snprintf(name, sizeof(name), "vhost-%d", current->pid);
 
-	vtsk = vhost_task_create(vhost_worker, worker, name);
+	vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed,
+				 worker, name);
 	if (!vtsk)
 		goto free_worker;
 
@@ -661,6 +697,11 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
 	struct vhost_worker *old_worker;
 
 	mutex_lock(&worker->mutex);
+	if (worker->killed) {
+		mutex_unlock(&worker->mutex);
+		return;
+	}
+
 	mutex_lock(&vq->mutex);
 
 	old_worker = rcu_dereference_check(vq->worker,
@@ -681,6 +722,11 @@ static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
 	 * device wide flushes which doesn't use RCU for execution.
 	 */
 	mutex_lock(&old_worker->mutex);
+	if (old_worker->killed) {
+		mutex_unlock(&old_worker->mutex);
+		return;
+	}
+
 	/*
 	 * We don't want to call synchronize_rcu for every vq during setup
 	 * because it will slow down VM startup. If we haven't done
@@ -758,7 +804,7 @@ static int vhost_free_worker(struct vhost_dev *dev,
 		return -ENODEV;
 
 	mutex_lock(&worker->mutex);
-	if (worker->attachment_cnt) {
+	if (worker->attachment_cnt || worker->killed) {
 		mutex_unlock(&worker->mutex);
 		return -EBUSY;
 	}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 91ade037f08e..bb75a292d50c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -28,12 +28,14 @@ struct vhost_work {
 
 struct vhost_worker {
 	struct vhost_task	*vtsk;
+	struct vhost_dev	*dev;
 	/* Used to serialize device wide flushing with worker swapping. */
 	struct mutex		mutex;
 	struct llist_head	work_list;
 	u64			kcov_handle;
 	u32			id;
 	int			attachment_cnt;
+	bool			killed;
 };
 
 /* Poll a file (eventfd or socket) */
diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h
index bc60243d43b3..25446c5d3508 100644
--- a/include/linux/sched/vhost_task.h
+++ b/include/linux/sched/vhost_task.h
@@ -4,7 +4,8 @@
 
 struct vhost_task;
 
-struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
+struct vhost_task *vhost_task_create(bool (*fn)(void *),
+				     void (*handle_kill)(void *), void *arg,
 				     const char *name);
 void vhost_task_start(struct vhost_task *vtsk);
 void vhost_task_stop(struct vhost_task *vtsk);
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index da35e5b7f047..8800f5acc007 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -10,38 +10,32 @@
 
 enum vhost_task_flags {
 	VHOST_TASK_FLAGS_STOP,
+	VHOST_TASK_FLAGS_KILLED,
 };
 
 struct vhost_task {
 	bool (*fn)(void *data);
+	void (*handle_sigkill)(void *data);
 	void *data;
 	struct completion exited;
 	unsigned long flags;
 	struct task_struct *task;
+	/* serialize SIGKILL and vhost_task_stop calls */
+	struct mutex exit_mutex;
 };
 
 static int vhost_task_fn(void *data)
 {
 	struct vhost_task *vtsk = data;
-	bool dead = false;
 
 	for (;;) {
 		bool did_work;
 
-		if (!dead && signal_pending(current)) {
+		if (signal_pending(current)) {
 			struct ksignal ksig;
-			/*
-			 * Calling get_signal will block in SIGSTOP,
-			 * or clear fatal_signal_pending, but remember
-			 * what was set.
-			 *
-			 * This thread won't actually exit until all
-			 * of the file descriptors are closed, and
-			 * the release function is called.
-			 */
-			dead = get_signal(&ksig);
-			if (dead)
-				clear_thread_flag(TIF_SIGPENDING);
+
+			if (get_signal(&ksig))
+				break;
 		}
 
 		/* mb paired w/ vhost_task_stop */
@@ -57,7 +51,19 @@ static int vhost_task_fn(void *data)
 			schedule();
 	}
 
+	mutex_lock(&vtsk->exit_mutex);
+	/*
+	 * If a vhost_task_stop and SIGKILL race, we can ignore the SIGKILL.
+	 * When the vhost layer has called vhost_task_stop it's already stopped
+	 * new work and flushed.
+	 */
+	if (!test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags)) {
+		set_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags);
+		vtsk->handle_sigkill(vtsk->data);
+	}
+	mutex_unlock(&vtsk->exit_mutex);
 	complete(&vtsk->exited);
+
 	do_exit(0);
 }
 
@@ -78,12 +84,17 @@ EXPORT_SYMBOL_GPL(vhost_task_wake);
  * @vtsk: vhost_task to stop
  *
  * vhost_task_fn ensures the worker thread exits after
- * VHOST_TASK_FLAGS_SOP becomes true.
+ * VHOST_TASK_FLAGS_STOP becomes true.
  */
 void vhost_task_stop(struct vhost_task *vtsk)
 {
-	set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
-	vhost_task_wake(vtsk);
+	mutex_lock(&vtsk->exit_mutex);
+	if (!test_bit(VHOST_TASK_FLAGS_KILLED, &vtsk->flags)) {
+		set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+		vhost_task_wake(vtsk);
+	}
+	mutex_unlock(&vtsk->exit_mutex);
+
 	/*
 	 * Make sure vhost_task_fn is no longer accessing the vhost_task before
 	 * freeing it below.
@@ -96,14 +107,16 @@ EXPORT_SYMBOL_GPL(vhost_task_stop);
 /**
  * vhost_task_create - create a copy of a task to be used by the kernel
  * @fn: vhost worker function
- * @arg: data to be passed to fn
+ * @handle_sigkill: vhost function to handle when we are killed
+ * @arg: data to be passed to fn and handled_kill
  * @name: the thread's name
  *
  * This returns a specialized task for use by the vhost layer or NULL on
  * failure. The returned task is inactive, and the caller must fire it up
  * through vhost_task_start().
  */
-struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
+struct vhost_task *vhost_task_create(bool (*fn)(void *),
+				     void (*handle_sigkill)(void *), void *arg,
 				     const char *name)
 {
 	struct kernel_clone_args args = {
@@ -122,8 +135,10 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), void *arg,
 	if (!vtsk)
 		return NULL;
 	init_completion(&vtsk->exited);
+	mutex_init(&vtsk->exit_mutex);
 	vtsk->data = arg;
 	vtsk->fn = fn;
+	vtsk->handle_sigkill = handle_sigkill;
 
 	args.fn_arg = vtsk;
 
-- 
cgit v1.2.3


From e4544c550eb1857845bb7114bac53edd04dc078f Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 18 Mar 2024 13:06:45 +0100
Subject: virtio-mem: support suspend+resume

With virtio-mem, primarily hibernation is problematic: as the machine shuts
down, the virtio-mem device loses its state. Powering the machine back up
is like losing a bunch of DIMMs. While there would be ways to add limited
support, suspend+resume is more commonly used for VMs and "easier" to
support cleanly.

s2idle can be supported without any device dependencies. Similarly, one
would expect suspend-to-ram (i.e., S3) to work out of the box. However,
QEMU currently unplugs all device memory when resuming the VM, using a
cold reset on the "wakeup" path. In order to support S3, we need a feature
flag for the device to tell us if memory remains plugged when waking up. In
the future, QEMU will implement this feature.

So let's always support s2idle and support S3 with plugged memory only if
the device indicates support. Block hibernation early using the PM
notifier.

Trying to hibernate now fails early:
	# echo disk > /sys/power/state
	[   26.455369] PM: hibernation: hibernation entry
	[   26.458271] virtio_mem virtio0: hibernation is not supported.
	[   26.462498] PM: hibernation: hibernation exit
	-bash: echo: write error: Operation not permitted

s2idle works even without the new feature bit:
	# echo s2idle > /sys/power/mem_sleep
	# echo mem > /sys/power/state
	[   52.083725] PM: suspend entry (s2idle)
	[   52.095950] Filesystems sync: 0.010 seconds
	[   52.101493] Freezing user space processes
	[   52.104213] Freezing user space processes completed (elapsed 0.001 seconds)
	[   52.106520] OOM killer disabled.
	[   52.107655] Freezing remaining freezable tasks
	[   52.110880] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
	[   52.113296] printk: Suspending console(s) (use no_console_suspend to debug)

S3 does not work without the feature bit when memory is plugged:
	# echo deep > /sys/power/mem_sleep
	# echo mem > /sys/power/state
	[   32.788281] PM: suspend entry (deep)
	[   32.816630] Filesystems sync: 0.027 seconds
	[   32.820029] Freezing user space processes
	[   32.823870] Freezing user space processes completed (elapsed 0.001 seconds)
	[   32.827756] OOM killer disabled.
	[   32.829608] Freezing remaining freezable tasks
	[   32.833842] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
	[   32.837953] printk: Suspending console(s) (use no_console_suspend to debug)
	[   32.916172] virtio_mem virtio0: suspend+resume with plugged memory is not supported
	[   32.916181] virtio-pci 0000:00:02.0: PM: pci_pm_suspend(): virtio_pci_freeze+0x0/0x50 returns -1
	[   32.916197] virtio-pci 0000:00:02.0: PM: dpm_run_callback(): pci_pm_suspend+0x0/0x170 returns -1
	[   32.916210] virtio-pci 0000:00:02.0: PM: failed to suspend async: error -1

But S3 works with the new feature bit when memory is plugged (patched
QEMU):
	# echo deep > /sys/power/mem_sleep
	# echo mem > /sys/power/state
	[   33.983694] PM: suspend entry (deep)
	[   34.009828] Filesystems sync: 0.024 seconds
	[   34.013589] Freezing user space processes
	[   34.016722] Freezing user space processes completed (elapsed 0.001 seconds)
	[   34.019092] OOM killer disabled.
	[   34.020291] Freezing remaining freezable tasks
	[   34.023549] Freezing remaining freezable tasks completed (elapsed 0.001 seconds)
	[   34.026090] printk: Suspending console(s) (use no_console_suspend to debug)

Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20240318120645.105664-1-david@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_mem.c     | 68 +++++++++++++++++++++++++++++++++++++----
 include/uapi/linux/virtio_mem.h |  2 ++
 2 files changed, 64 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 8e3223294442..51088d02de32 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -21,6 +21,7 @@
 #include <linux/bitmap.h>
 #include <linux/lockdep.h>
 #include <linux/log2.h>
+#include <linux/suspend.h>
 
 #include <acpi/acpi_numa.h>
 
@@ -252,6 +253,9 @@ struct virtio_mem {
 	/* Memory notifier (online/offline events). */
 	struct notifier_block memory_notifier;
 
+	/* Notifier to block hibernation image storing/reloading. */
+	struct notifier_block pm_notifier;
+
 #ifdef CONFIG_PROC_VMCORE
 	/* vmcore callback for /proc/vmcore handling in kdump mode */
 	struct vmcore_cb vmcore_cb;
@@ -1111,6 +1115,25 @@ static int virtio_mem_memory_notifier_cb(struct notifier_block *nb,
 	return rc;
 }
 
+static int virtio_mem_pm_notifier_cb(struct notifier_block *nb,
+				     unsigned long action, void *arg)
+{
+	struct virtio_mem *vm = container_of(nb, struct virtio_mem,
+					     pm_notifier);
+	switch (action) {
+	case PM_HIBERNATION_PREPARE:
+	case PM_RESTORE_PREPARE:
+		/*
+		 * When restarting the VM, all memory is unplugged. Don't
+		 * allow to hibernate and restore from an image.
+		 */
+		dev_err(&vm->vdev->dev, "hibernation is not supported.\n");
+		return NOTIFY_BAD;
+	default:
+		return NOTIFY_OK;
+	}
+}
+
 /*
  * Set a range of pages PG_offline. Remember pages that were never onlined
  * (via generic_online_page()) using PageDirty().
@@ -2615,11 +2638,19 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
 	rc = register_memory_notifier(&vm->memory_notifier);
 	if (rc)
 		goto out_unreg_group;
-	rc = register_virtio_mem_device(vm);
+	/* Block hibernation as early as possible. */
+	vm->pm_notifier.priority = INT_MAX;
+	vm->pm_notifier.notifier_call = virtio_mem_pm_notifier_cb;
+	rc = register_pm_notifier(&vm->pm_notifier);
 	if (rc)
 		goto out_unreg_mem;
+	rc = register_virtio_mem_device(vm);
+	if (rc)
+		goto out_unreg_pm;
 
 	return 0;
+out_unreg_pm:
+	unregister_pm_notifier(&vm->pm_notifier);
 out_unreg_mem:
 	unregister_memory_notifier(&vm->memory_notifier);
 out_unreg_group:
@@ -2897,6 +2928,7 @@ static void virtio_mem_deinit_hotplug(struct virtio_mem *vm)
 
 	/* unregister callbacks */
 	unregister_virtio_mem_device(vm);
+	unregister_pm_notifier(&vm->pm_notifier);
 	unregister_memory_notifier(&vm->memory_notifier);
 
 	/*
@@ -2960,17 +2992,40 @@ static void virtio_mem_config_changed(struct virtio_device *vdev)
 #ifdef CONFIG_PM_SLEEP
 static int virtio_mem_freeze(struct virtio_device *vdev)
 {
+	struct virtio_mem *vm = vdev->priv;
+
 	/*
-	 * When restarting the VM, all memory is usually unplugged. Don't
-	 * allow to suspend/hibernate.
+	 * We block hibernation using the PM notifier completely. The workqueue
+	 * is already frozen by the PM core at this point, so we simply
+	 * reset the device and cleanup the queues.
 	 */
-	dev_err(&vdev->dev, "save/restore not supported.\n");
-	return -EPERM;
+	if (pm_suspend_target_state != PM_SUSPEND_TO_IDLE &&
+	    vm->plugged_size &&
+	    !virtio_has_feature(vm->vdev, VIRTIO_MEM_F_PERSISTENT_SUSPEND)) {
+		dev_err(&vm->vdev->dev,
+			"suspending with plugged memory is not supported\n");
+		return -EPERM;
+	}
+
+	virtio_reset_device(vdev);
+	vdev->config->del_vqs(vdev);
+	vm->vq = NULL;
+	return 0;
 }
 
 static int virtio_mem_restore(struct virtio_device *vdev)
 {
-	return -EPERM;
+	struct virtio_mem *vm = vdev->priv;
+	int ret;
+
+	ret = virtio_mem_init_vq(vm);
+	if (ret)
+		return ret;
+	virtio_device_ready(vdev);
+
+	/* Let's check if anything changed. */
+	virtio_mem_config_changed(vdev);
+	return 0;
 }
 #endif
 
@@ -2979,6 +3034,7 @@ static unsigned int virtio_mem_features[] = {
 	VIRTIO_MEM_F_ACPI_PXM,
 #endif
 	VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE,
+	VIRTIO_MEM_F_PERSISTENT_SUSPEND,
 };
 
 static const struct virtio_device_id virtio_mem_id_table[] = {
diff --git a/include/uapi/linux/virtio_mem.h b/include/uapi/linux/virtio_mem.h
index e9122f1d0e0c..6e4b2cf6b7f1 100644
--- a/include/uapi/linux/virtio_mem.h
+++ b/include/uapi/linux/virtio_mem.h
@@ -90,6 +90,8 @@
 #define VIRTIO_MEM_F_ACPI_PXM		0
 /* unplugged memory must not be accessed */
 #define VIRTIO_MEM_F_UNPLUGGED_INACCESSIBLE	1
+/* plugged memory will remain plugged when suspending+resuming */
+#define VIRTIO_MEM_F_PERSISTENT_SUSPEND		2
 
 
 /* --- virtio-mem: guest -> host requests --- */
-- 
cgit v1.2.3


From dbaaabd60e1662d2659eaeab0a4fc521667737ed Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 8 Apr 2024 11:38:30 -0700
Subject: clang: work around asm input constraint problems

Work around clang problems with asm constraints that have multiple
possibilities, particularly "g" and "rm".

Clang seems to turn inputs like that into the most generic form, which
is the memory input - but to make matters worse, clang won't even use a
possible original memory location, but will spill the value to stack,
and use the stack for the asm input.

See

  https://github.com/llvm/llvm-project/issues/20571#issuecomment-980933442

for some explanation of why clang has this strange behavior, but the end
result is that "g" and "rm" really end up generating horrid code.

Link: https://github.com/llvm/llvm-project/issues/20571
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-clang.h | 10 ++++++++++
 include/linux/compiler_types.h |  9 +++++++++
 2 files changed, 19 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index 49feac0162a5..4c1a39dcb624 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -118,3 +118,13 @@
 
 #define __diag_ignore_all(option, comment) \
 	__diag_clang(13, ignore, option)
+
+/*
+ * clang has horrible behavior with "g" or "rm" constraints for asm
+ * inputs, turning them into something worse than "m". Avoid using
+ * constraints with multiple possible uses (but "ir" seems to be ok):
+ *
+ *	https://github.com/llvm/llvm-project/issues/20571
+ */
+#define ASM_INPUT_G "ir"
+#define ASM_INPUT_RM "r"
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index 8f8236317d5b..f82bdac63961 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -391,6 +391,15 @@ struct ftrace_likely_data {
 #define asm_goto_output(x...) asm volatile goto(x)
 #endif
 
+/*
+ * Clang has trouble with constraints with multiple
+ * alternative behaviors (mainly "g" and "rm").
+ */
+#ifndef ASM_INPUT_G
+  #define ASM_INPUT_G "g"
+  #define ASM_INPUT_RM "rm"
+#endif
+
 #ifdef CONFIG_CC_HAS_ASM_INLINE
 #define asm_inline asm __inline
 #else
-- 
cgit v1.2.3


From 7caa9765465f60b6d88e22264892cee12d971888 Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay@kernel.org>
Date: Fri, 5 Apr 2024 14:24:53 +0000
Subject: ftrace: riscv: move from REGS to ARGS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This commit replaces riscv's support for FTRACE_WITH_REGS with support
for FTRACE_WITH_ARGS. This is required for the ongoing effort to stop
relying on stop_machine() for RISCV's implementation of ftrace.

The main relevant benefit that this change will bring for the above
use-case is that now we don't have separate ftrace_caller and
ftrace_regs_caller trampolines. This will allow the callsite to call
ftrace_caller by modifying a single instruction. Now the callsite can
do something similar to:

When not tracing:            |             When tracing:

func:                                      func:
  auipc t0, ftrace_caller_top                auipc t0, ftrace_caller_top
  nop  <=========<Enable/Disable>=========>  jalr  t0, ftrace_caller_bottom
  [...]                                      [...]

The above assumes that we are dropping the support of calling a direct
trampoline from the callsite. We need to drop this as the callsite can't
change the target address to call, it can only enable/disable a call to
a preset target (ftrace_caller in the above diagram). We can later optimize
this by calling an intermediate dispatcher trampoline before ftrace_caller.

Currently, ftrace_regs_caller saves all CPU registers in the format of
struct pt_regs and allows the tracer to modify them. We don't need to
save all of the CPU registers because at function entry only a subset of
pt_regs is live:

|----------+----------+---------------------------------------------|
| Register | ABI Name | Description                                 |
|----------+----------+---------------------------------------------|
| x1       | ra       | Return address for traced function          |
| x2       | sp       | Stack pointer                               |
| x5       | t0       | Return address for ftrace_caller trampoline |
| x8       | s0/fp    | Frame pointer                               |
| x10-11   | a0-1     | Function arguments/return values            |
| x12-17   | a2-7     | Function arguments                          |
|----------+----------+---------------------------------------------|

See RISCV calling convention[1] for the above table.

Saving just the live registers decreases the amount of stack space
required from 288 Bytes to 112 Bytes.

Basic testing was done with this on the VisionFive 2 development board.

Note:
  - Moving from REGS to ARGS will mean that RISCV will stop supporting
    KPROBES_ON_FTRACE as it requires full pt_regs to be saved.
  - KPROBES_ON_FTRACE will be supplanted by FPROBES see [2].

[1] https://riscv.org/wp-content/uploads/2015/01/riscv-calling.pdf
[2] https://lore.kernel.org/all/170887410337.564249.6360118840946697039.stgit@devnote2/

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20240405142453.4187-1-puranjay@kernel.org
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig              |   2 +-
 arch/riscv/include/asm/ftrace.h |  76 ++++++++++++++++--
 arch/riscv/kernel/asm-offsets.c |  18 +++++
 arch/riscv/kernel/ftrace.c      |  17 ++--
 arch/riscv/kernel/mcount-dyn.S  | 171 ++++++++++++----------------------------
 include/linux/ftrace.h          |   3 +-
 6 files changed, 145 insertions(+), 142 deletions(-)

(limited to 'include')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b219570d4ad7..9320a9d01596 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -128,7 +128,7 @@ config RISCV
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
 	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-	select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE
+	select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 1276d7d9ca8b..9eb31a7ea0aa 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -124,20 +124,82 @@ struct dyn_ftrace;
 int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
 #define ftrace_init_nop ftrace_init_nop
 
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
+#define arch_ftrace_get_regs(regs) NULL
 struct ftrace_ops;
-struct ftrace_regs;
+struct ftrace_regs {
+	unsigned long epc;
+	unsigned long ra;
+	unsigned long sp;
+	unsigned long s0;
+	unsigned long t1;
+	union {
+		unsigned long args[8];
+		struct {
+			unsigned long a0;
+			unsigned long a1;
+			unsigned long a2;
+			unsigned long a3;
+			unsigned long a4;
+			unsigned long a5;
+			unsigned long a6;
+			unsigned long a7;
+		};
+	};
+};
+
+static __always_inline unsigned long ftrace_regs_get_instruction_pointer(const struct ftrace_regs
+									 *fregs)
+{
+	return fregs->epc;
+}
+
+static __always_inline void ftrace_regs_set_instruction_pointer(struct ftrace_regs *fregs,
+								unsigned long pc)
+{
+	fregs->epc = pc;
+}
+
+static __always_inline unsigned long ftrace_regs_get_stack_pointer(const struct ftrace_regs *fregs)
+{
+	return fregs->sp;
+}
+
+static __always_inline unsigned long ftrace_regs_get_argument(struct ftrace_regs *fregs,
+							      unsigned int n)
+{
+	if (n < 8)
+		return fregs->args[n];
+	return 0;
+}
+
+static __always_inline unsigned long ftrace_regs_get_return_value(const struct ftrace_regs *fregs)
+{
+	return fregs->a0;
+}
+
+static __always_inline void ftrace_regs_set_return_value(struct ftrace_regs *fregs,
+							 unsigned long ret)
+{
+	fregs->a0 = ret;
+}
+
+static __always_inline void ftrace_override_function_with_return(struct ftrace_regs *fregs)
+{
+	fregs->epc = fregs->ra;
+}
+
+int ftrace_regs_query_register_offset(const char *name);
+
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
 #define ftrace_graph_func ftrace_graph_func
 
-static inline void __arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned long addr)
+static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr)
 {
-		regs->t1 = addr;
+	fregs->t1 = addr;
 }
-#define arch_ftrace_set_direct_caller(fregs, addr) \
-	__arch_ftrace_set_direct_caller(&(fregs)->regs, addr)
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index a03129f40c46..b09ca5f944f7 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -9,6 +9,7 @@
 #include <linux/kbuild.h>
 #include <linux/mm.h>
 #include <linux/sched.h>
+#include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <asm/kvm_host.h>
 #include <asm/thread_info.h>
@@ -488,4 +489,21 @@ void asm_offsets(void)
 	DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
 	OFFSET(STACKFRAME_FP, stackframe, fp);
 	OFFSET(STACKFRAME_RA, stackframe, ra);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
+	DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct ftrace_regs), STACK_ALIGN));
+	DEFINE(FREGS_EPC,	    offsetof(struct ftrace_regs, epc));
+	DEFINE(FREGS_RA,	    offsetof(struct ftrace_regs, ra));
+	DEFINE(FREGS_SP,	    offsetof(struct ftrace_regs, sp));
+	DEFINE(FREGS_S0,	    offsetof(struct ftrace_regs, s0));
+	DEFINE(FREGS_T1,	    offsetof(struct ftrace_regs, t1));
+	DEFINE(FREGS_A0,	    offsetof(struct ftrace_regs, a0));
+	DEFINE(FREGS_A1,	    offsetof(struct ftrace_regs, a1));
+	DEFINE(FREGS_A2,	    offsetof(struct ftrace_regs, a2));
+	DEFINE(FREGS_A3,	    offsetof(struct ftrace_regs, a3));
+	DEFINE(FREGS_A4,	    offsetof(struct ftrace_regs, a4));
+	DEFINE(FREGS_A5,	    offsetof(struct ftrace_regs, a5));
+	DEFINE(FREGS_A6,	    offsetof(struct ftrace_regs, a6));
+	DEFINE(FREGS_A7,	    offsetof(struct ftrace_regs, a7));
+#endif
 }
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4f4987a6d83d..4d14eadcd765 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -127,10 +127,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 {
 	int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
 				       (unsigned long)func, true, true);
-	if (!ret) {
-		ret = __ftrace_modify_call((unsigned long)&ftrace_regs_call,
-					   (unsigned long)func, true, true);
-	}
 
 	return ret;
 }
@@ -172,7 +168,7 @@ void arch_ftrace_update_code(int command)
 }
 #endif
 
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 		       unsigned long addr)
 {
@@ -214,16 +210,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 }
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-	struct pt_regs *regs = arch_ftrace_get_regs(fregs);
-	unsigned long *parent = (unsigned long *)&regs->ra;
-
-	prepare_ftrace_return(parent, ip, frame_pointer(regs));
+	prepare_ftrace_return(&fregs->ra, ip, fregs->s0);
 }
-#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 extern void ftrace_graph_call(void);
 int ftrace_enable_ftrace_graph_caller(void)
 {
@@ -236,6 +229,6 @@ int ftrace_disable_ftrace_graph_caller(void)
 	return __ftrace_modify_call((unsigned long)&ftrace_graph_call,
 				    (unsigned long)&prepare_ftrace_return, false, true);
 }
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index b7561288e8da..745dd4c4a69c 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -56,138 +56,77 @@
 	addi	sp, sp, ABI_SIZE_ON_STACK
 	.endm
 
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 
 /**
-* SAVE_ABI_REGS - save regs against the pt_regs struct
-*
-* @all: tell if saving all the regs
-*
-* If all is set, all the regs will be saved, otherwise only ABI
-* related regs (a0-a7,epc,ra and optional s0) will be saved.
+* SAVE_ABI_REGS - save regs against the ftrace_regs struct
 *
 * After the stack is established,
 *
 * 0(sp) stores the PC of the traced function which can be accessed
-* by &(fregs)->regs->epc in tracing function. Note that the real
+* by &(fregs)->epc in tracing function. Note that the real
 * function entry address should be computed with -FENTRY_RA_OFFSET.
 *
 * 8(sp) stores the function return address (i.e. parent IP) that
-* can be accessed by &(fregs)->regs->ra in tracing function.
+* can be accessed by &(fregs)->ra in tracing function.
 *
 * The other regs are saved at the respective localtion and accessed
-* by the respective pt_regs member.
+* by the respective ftrace_regs member.
 *
 * Here is the layout of stack for your reference.
 *
 * PT_SIZE_ON_STACK  ->  +++++++++
 *                       + ..... +
-*                       + t3-t6 +
-*                       + s2-s11+
 *                       + a0-a7 + --++++-> ftrace_caller saved
-*                       + s1    +   +
-*                       + s0    + --+
-*                       + t0-t2 +   +
-*                       + tp    +   +
-*                       + gp    +   +
+*                       + t1    + --++++-> direct tramp address
+*                       + s0    + --+ // frame pointer
 *                       + sp    +   +
 *                       + ra    + --+ // parent IP
 *               sp  ->  + epc   + --+ // PC
 *                       +++++++++
 **/
-	.macro SAVE_ABI_REGS, all=0
-	addi	sp, sp, -PT_SIZE_ON_STACK
-
-	REG_S	t0,  PT_EPC(sp)
-	REG_S	x1,  PT_RA(sp)
-
-	// save the ABI regs
-
-	REG_S	x10, PT_A0(sp)
-	REG_S	x11, PT_A1(sp)
-	REG_S	x12, PT_A2(sp)
-	REG_S	x13, PT_A3(sp)
-	REG_S	x14, PT_A4(sp)
-	REG_S	x15, PT_A5(sp)
-	REG_S	x16, PT_A6(sp)
-	REG_S	x17, PT_A7(sp)
-
-	// save the leftover regs
+	.macro SAVE_ABI_REGS
+	mv	t4, sp			// Save original SP in T4
+	addi	sp, sp, -FREGS_SIZE_ON_STACK
 
-	.if \all == 1
-	REG_S	x2, PT_SP(sp)
-	REG_S	x3, PT_GP(sp)
-	REG_S	x4, PT_TP(sp)
-	REG_S	x5, PT_T0(sp)
-	REG_S	x6, PT_T1(sp)
-	REG_S	x7, PT_T2(sp)
-	REG_S	x8, PT_S0(sp)
-	REG_S	x9, PT_S1(sp)
-	REG_S	x18, PT_S2(sp)
-	REG_S	x19, PT_S3(sp)
-	REG_S	x20, PT_S4(sp)
-	REG_S	x21, PT_S5(sp)
-	REG_S	x22, PT_S6(sp)
-	REG_S	x23, PT_S7(sp)
-	REG_S	x24, PT_S8(sp)
-	REG_S	x25, PT_S9(sp)
-	REG_S	x26, PT_S10(sp)
-	REG_S	x27, PT_S11(sp)
-	REG_S	x28, PT_T3(sp)
-	REG_S	x29, PT_T4(sp)
-	REG_S	x30, PT_T5(sp)
-	REG_S	x31, PT_T6(sp)
-
-	// save s0 if FP_TEST defined
-
-	.else
+	REG_S	t0,  FREGS_EPC(sp)
+	REG_S	x1,  FREGS_RA(sp)
+	REG_S	t4,  FREGS_SP(sp)	// Put original SP on stack
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-	REG_S	x8, PT_S0(sp)
+	REG_S	x8,  FREGS_S0(sp)
 #endif
-	.endif
+	REG_S	x6,  FREGS_T1(sp)
+
+	// save the arguments
+	REG_S	x10, FREGS_A0(sp)
+	REG_S	x11, FREGS_A1(sp)
+	REG_S	x12, FREGS_A2(sp)
+	REG_S	x13, FREGS_A3(sp)
+	REG_S	x14, FREGS_A4(sp)
+	REG_S	x15, FREGS_A5(sp)
+	REG_S	x16, FREGS_A6(sp)
+	REG_S	x17, FREGS_A7(sp)
 	.endm
 
 	.macro RESTORE_ABI_REGS, all=0
-	REG_L	t0, PT_EPC(sp)
-	REG_L	x1, PT_RA(sp)
-	REG_L	x10, PT_A0(sp)
-	REG_L	x11, PT_A1(sp)
-	REG_L	x12, PT_A2(sp)
-	REG_L	x13, PT_A3(sp)
-	REG_L	x14, PT_A4(sp)
-	REG_L	x15, PT_A5(sp)
-	REG_L	x16, PT_A6(sp)
-	REG_L	x17, PT_A7(sp)
-
-	.if \all == 1
-	REG_L	x2, PT_SP(sp)
-	REG_L	x3, PT_GP(sp)
-	REG_L	x4, PT_TP(sp)
-	REG_L	x6, PT_T1(sp)
-	REG_L	x7, PT_T2(sp)
-	REG_L	x8, PT_S0(sp)
-	REG_L	x9, PT_S1(sp)
-	REG_L	x18, PT_S2(sp)
-	REG_L	x19, PT_S3(sp)
-	REG_L	x20, PT_S4(sp)
-	REG_L	x21, PT_S5(sp)
-	REG_L	x22, PT_S6(sp)
-	REG_L	x23, PT_S7(sp)
-	REG_L	x24, PT_S8(sp)
-	REG_L	x25, PT_S9(sp)
-	REG_L	x26, PT_S10(sp)
-	REG_L	x27, PT_S11(sp)
-	REG_L	x28, PT_T3(sp)
-	REG_L	x29, PT_T4(sp)
-	REG_L	x30, PT_T5(sp)
-	REG_L	x31, PT_T6(sp)
-
-	.else
+	REG_L	t0, FREGS_EPC(sp)
+	REG_L	x1, FREGS_RA(sp)
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-	REG_L	x8, PT_S0(sp)
+	REG_L	x8, FREGS_S0(sp)
 #endif
-	.endif
-	addi	sp, sp, PT_SIZE_ON_STACK
+	REG_L	x6,  FREGS_T1(sp)
+
+	// restore the arguments
+	REG_L	x10, FREGS_A0(sp)
+	REG_L	x11, FREGS_A1(sp)
+	REG_L	x12, FREGS_A2(sp)
+	REG_L	x13, FREGS_A3(sp)
+	REG_L	x14, FREGS_A4(sp)
+	REG_L	x15, FREGS_A5(sp)
+	REG_L	x16, FREGS_A6(sp)
+	REG_L	x17, FREGS_A7(sp)
+
+	addi	sp, sp, FREGS_SIZE_ON_STACK
 	.endm
 
 	.macro PREPARE_ARGS
@@ -198,9 +137,9 @@
 	mv	a3, sp
 	.endm
 
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 
-#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 SYM_FUNC_START(ftrace_caller)
 	SAVE_ABI
 
@@ -227,33 +166,23 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
 	jr	t0
 SYM_FUNC_END(ftrace_caller)
 
-#else /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
-SYM_FUNC_START(ftrace_regs_caller)
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
+SYM_FUNC_START(ftrace_caller)
 	mv	t1, zero
-	SAVE_ABI_REGS 1
+	SAVE_ABI_REGS
 	PREPARE_ARGS
 
-SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
+SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
 	call	ftrace_stub
 
-	RESTORE_ABI_REGS 1
+	RESTORE_ABI_REGS
 	bnez	t1, .Ldirect
 	jr	t0
 .Ldirect:
 	jr	t1
-SYM_FUNC_END(ftrace_regs_caller)
-
-SYM_FUNC_START(ftrace_caller)
-	SAVE_ABI_REGS 0
-	PREPARE_ARGS
-
-SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
-	call	ftrace_stub
-
-	RESTORE_ABI_REGS 0
-	jr	t0
 SYM_FUNC_END(ftrace_caller)
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 SYM_CODE_START(ftrace_stub_direct_tramp)
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e3a83ebd1b33..800995c425e0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -819,7 +819,8 @@ static inline int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
 
 #if defined(CONFIG_DYNAMIC_FTRACE_WITH_REGS) || \
-	defined(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)
+	defined(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS) || \
+	defined(CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS)
 /**
  * ftrace_modify_call - convert from one addr to another (no nop)
  * @rec: the call site record (e.g. mcount/fentry)
-- 
cgit v1.2.3


From 2c92ca849fcc6ee7d0c358e9959abc9f58661aea Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
Date: Thu, 16 May 2024 13:34:54 -0400
Subject: tracing/treewide: Remove second parameter of __assign_str()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With the rework of how the __string() handles dynamic strings where it
saves off the source string in field in the helper structure[1], the
assignment of that value to the trace event field is stored in the helper
value and does not need to be passed in again.

This means that with:

  __string(field, mystring)

Which use to be assigned with __assign_str(field, mystring), no longer
needs the second parameter and it is unused. With this, __assign_str()
will now only get a single parameter.

There's over 700 users of __assign_str() and because coccinelle does not
handle the TRACE_EVENT() macro I ended up using the following sed script:

  git grep -l __assign_str | while read a ; do
      sed -e 's/\(__assign_str([^,]*[^ ,]\) *,[^;]*/\1)/' $a > /tmp/test-file;
      mv /tmp/test-file $a;
  done

I then searched for __assign_str() that did not end with ';' as those
were multi line assignments that the sed script above would fail to catch.

Note, the same updates will need to be done for:

  __assign_str_len()
  __assign_rel_str()
  __assign_rel_str_len()

I tested this with both an allmodconfig and an allyesconfig (build only for both).

[1] https://lore.kernel.org/linux-trace-kernel/20240222211442.634192653@goodmis.org/

Link: https://lore.kernel.org/linux-trace-kernel/20240516133454.681ba6a0@rorschach.local.home

Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Julia Lawall <Julia.Lawall@inria.fr>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: Christian König <christian.koenig@amd.com> for the amdgpu parts.
Acked-by: Thomas Hellström <thomas.hellstrom@linux.intel.com> #for
Acked-by: Rafael J. Wysocki <rafael@kernel.org> # for thermal
Acked-by: Takashi Iwai <tiwai@suse.de>
Acked-by: Darrick J. Wong <djwong@kernel.org>	# xfs
Tested-by: Guenter Roeck <linux@roeck-us.net>
---
 arch/arm64/kernel/trace-events-emulation.h         |   2 +-
 arch/powerpc/include/asm/trace.h                   |   4 +-
 arch/x86/kvm/trace.h                               |   2 +-
 drivers/base/regmap/trace.h                        |  18 ++--
 drivers/base/trace.h                               |   2 +-
 drivers/block/rnbd/rnbd-srv-trace.h                |  12 +--
 drivers/bus/mhi/host/trace.h                       |  12 +--
 drivers/cxl/core/trace.h                           |  32 +++---
 drivers/dma-buf/sync_trace.h                       |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h          |  16 +--
 .../drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h    |   2 +-
 drivers/gpu/drm/i915/display/intel_display_trace.h |  56 +++++-----
 drivers/gpu/drm/lima/lima_trace.h                  |   2 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h          |  12 +--
 drivers/gpu/drm/scheduler/gpu_scheduler_trace.h    |   4 +-
 drivers/gpu/drm/virtio/virtgpu_trace.h             |   2 +-
 drivers/infiniband/core/cma_trace.h                |   4 +-
 drivers/infiniband/hw/hfi1/hfi.h                   |   2 +-
 drivers/infiniband/hw/hfi1/trace_dbg.h             |   2 +-
 drivers/infiniband/hw/hfi1/trace_rx.h              |   2 +-
 drivers/infiniband/hw/hfi1/trace_tid.h             |   4 +-
 drivers/infiniband/hw/hfi1/trace_tx.h              |   4 +-
 drivers/infiniband/sw/rdmavt/trace.h               |   2 +-
 drivers/infiniband/sw/rdmavt/trace_rvt.h           |   2 +-
 drivers/interconnect/trace.h                       |  10 +-
 drivers/iommu/intel/trace.h                        |  14 +--
 drivers/media/platform/nvidia/tegra-vde/trace.h    |   2 +-
 drivers/misc/mei/mei-trace.h                       |   6 +-
 drivers/net/dsa/mv88e6xxx/trace.h                  |   4 +-
 .../net/ethernet/freescale/dpaa/dpaa_eth_trace.h   |   2 +-
 .../net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h |   4 +-
 .../net/ethernet/fungible/funeth/funeth_trace.h    |   6 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_trace.h   |   4 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_trace.h   |  12 +--
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h |  10 +-
 drivers/net/ethernet/intel/i40e/i40e_trace.h       |  10 +-
 drivers/net/ethernet/intel/iavf/iavf_trace.h       |   6 +-
 drivers/net/ethernet/intel/ice/ice_trace.h         |  12 +--
 .../net/ethernet/marvell/octeontx2/af/rvu_trace.h  |  12 +--
 .../mellanox/mlx5/core/diag/cmd_tracepoint.h       |   4 +-
 .../mellanox/mlx5/core/diag/en_rep_tracepoint.h    |   2 +-
 .../mellanox/mlx5/core/diag/en_tc_tracepoint.h     |   2 +-
 .../mellanox/mlx5/core/diag/fw_tracer_tracepoint.h |   5 +-
 .../mellanox/mlx5/core/esw/diag/qos_tracepoint.h   |   8 +-
 .../mlx5/core/sf/dev/diag/dev_tracepoint.h         |   2 +-
 .../mellanox/mlx5/core/sf/diag/sf_tracepoint.h     |  14 +--
 .../mellanox/mlx5/core/sf/diag/vhca_tracepoint.h   |   2 +-
 drivers/net/fjes/fjes_trace.h                      |  10 +-
 drivers/net/hyperv/netvsc_trace.h                  |   8 +-
 drivers/net/wireless/ath/ath10k/trace.h            |  64 +++++------
 drivers/net/wireless/ath/ath11k/trace.h            |  44 ++++----
 drivers/net/wireless/ath/ath12k/trace.h            |  16 +--
 drivers/net/wireless/ath/ath6kl/trace.h            |   4 +-
 drivers/net/wireless/ath/trace.h                   |   4 +-
 .../broadcom/brcm80211/brcmfmac/tracepoint.h       |   4 +-
 .../brcm80211/brcmsmac/brcms_trace_brcmsmac.h      |   2 +-
 .../brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h  |   2 +-
 .../brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h   |   6 +-
 .../net/wireless/intel/iwlwifi/iwl-devtrace-msg.h  |   2 +-
 drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h  |   2 +-
 drivers/soc/qcom/pmic_pdcharger_ulog.h             |   2 +-
 drivers/soc/qcom/trace-aoss.h                      |   4 +-
 drivers/soc/qcom/trace-rpmh.h                      |   4 +-
 drivers/thermal/thermal_trace.h                    |  10 +-
 drivers/usb/cdns3/cdns3-trace.h                    |  26 ++---
 drivers/usb/cdns3/cdnsp-trace.h                    |  10 +-
 drivers/usb/chipidea/trace.h                       |   4 +-
 drivers/usb/dwc3/trace.h                           |   8 +-
 drivers/usb/gadget/udc/cdns2/cdns2-trace.h         |  22 ++--
 drivers/usb/gadget/udc/trace.h                     |   4 +-
 drivers/usb/mtu3/mtu3_trace.h                      |   8 +-
 drivers/usb/musb/musb_trace.h                      |  12 +--
 fs/bcachefs/trace.h                                |   6 +-
 fs/nfs/nfs4trace.h                                 |  42 ++++----
 fs/nfs/nfstrace.h                                  |  41 ++++---
 fs/nfsd/trace.h                                    |  40 +++----
 fs/ocfs2/ocfs2_trace.h                             |  60 +++++------
 fs/smb/client/trace.h                              |  18 ++--
 fs/xfs/scrub/trace.h                               |  10 +-
 fs/xfs/xfs_trace.h                                 |  28 ++---
 include/ras/ras_event.h                            |  12 +--
 include/trace/events/asoc.h                        |  22 ++--
 include/trace/events/avc.h                         |   6 +-
 include/trace/events/bridge.h                      |  16 +--
 include/trace/events/btrfs.h                       |   6 +-
 include/trace/events/cgroup.h                      |  10 +-
 include/trace/events/clk.h                         |  18 ++--
 include/trace/events/cma.h                         |   8 +-
 include/trace/events/devfreq.h                     |   4 +-
 include/trace/events/devlink.h                     |  50 ++++-----
 include/trace/events/dma_fence.h                   |   4 +-
 include/trace/events/erofs.h                       |   2 +-
 include/trace/events/f2fs.h                        |  20 ++--
 include/trace/events/habanalabs.h                  |  10 +-
 include/trace/events/huge_memory.h                 |   4 +-
 include/trace/events/hwmon.h                       |   6 +-
 include/trace/events/initcall.h                    |   2 +-
 include/trace/events/intel_ish.h                   |   2 +-
 include/trace/events/io_uring.h                    |  14 +--
 include/trace/events/iocost.h                      |  14 +--
 include/trace/events/iommu.h                       |   8 +-
 include/trace/events/irq.h                         |   2 +-
 include/trace/events/iscsi.h                       |   2 +-
 include/trace/events/kmem.h                        |   2 +-
 include/trace/events/lock.h                        |   4 +-
 include/trace/events/mmap_lock.h                   |   4 +-
 include/trace/events/mmc.h                         |   4 +-
 include/trace/events/module.h                      |   8 +-
 include/trace/events/napi.h                        |   2 +-
 include/trace/events/neigh.h                       |   6 +-
 include/trace/events/net.h                         |  12 +--
 include/trace/events/netlink.h                     |   2 +-
 include/trace/events/oom.h                         |   2 +-
 include/trace/events/osnoise.h                     |   2 +-
 include/trace/events/power.h                       |  23 ++--
 include/trace/events/pwc.h                         |   4 +-
 include/trace/events/qdisc.h                       |  12 +--
 include/trace/events/qla.h                         |   2 +-
 include/trace/events/qrtr.h                        |   2 +-
 include/trace/events/regulator.h                   |   6 +-
 include/trace/events/rpcgss.h                      |  20 ++--
 include/trace/events/rpcrdma.h                     |  52 ++++-----
 include/trace/events/rpm.h                         |   6 +-
 include/trace/events/sched.h                       |   8 +-
 include/trace/events/sof.h                         |  12 +--
 include/trace/events/sof_intel.h                   |  16 +--
 include/trace/events/sunrpc.h                      | 118 ++++++++++-----------
 include/trace/events/swiotlb.h                     |   2 +-
 include/trace/events/target.h                      |   4 +-
 include/trace/events/tegra_apb_dma.h               |   6 +-
 include/trace/events/ufs.h                         |  24 ++---
 include/trace/events/workqueue.h                   |   2 +-
 include/trace/events/xdp.h                         |   2 +-
 include/trace/stages/stage6_event_callback.h       |   4 +-
 kernel/trace/bpf_trace.h                           |   2 +-
 net/batman-adv/trace.h                             |   4 +-
 net/dsa/trace.h                                    |  34 +++---
 net/ieee802154/trace.h                             |   2 +-
 net/mac80211/trace.h                               |   2 +-
 net/openvswitch/openvswitch_trace.h                |   8 +-
 net/smc/smc_tracepoint.h                           |   4 +-
 net/tipc/trace.h                                   |  16 +--
 net/wireless/trace.h                               |   2 +-
 samples/trace_events/trace-events-sample.h         |  19 ++--
 sound/core/pcm_trace.h                             |   2 +-
 sound/hda/trace.h                                  |   6 +-
 sound/soc/intel/avs/trace.h                        |   4 +-
 147 files changed, 794 insertions(+), 808 deletions(-)

(limited to 'include')

diff --git a/arch/arm64/kernel/trace-events-emulation.h b/arch/arm64/kernel/trace-events-emulation.h
index 6c40f58b844a..c51b547b583e 100644
--- a/arch/arm64/kernel/trace-events-emulation.h
+++ b/arch/arm64/kernel/trace-events-emulation.h
@@ -18,7 +18,7 @@ TRACE_EVENT(instruction_emulation,
 	),
 
 	TP_fast_assign(
-		__assign_str(instr, instr);
+		__assign_str(instr);
 		__entry->addr = addr;
 	),
 
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index d9ac3a4f46e1..a7b69b25296b 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -137,7 +137,7 @@ TRACE_EVENT(rtas_input,
 
 	TP_fast_assign(
 		__entry->nargs = be32_to_cpu(rtas_args->nargs);
-		__assign_str(name, name);
+		__assign_str(name);
 		be32_to_cpu_array(__get_dynamic_array(inputs), rtas_args->args, __entry->nargs);
 	),
 
@@ -162,7 +162,7 @@ TRACE_EVENT(rtas_output,
 	TP_fast_assign(
 		__entry->nr_other = be32_to_cpu(rtas_args->nret) - 1;
 		__entry->status = be32_to_cpu(rtas_args->rets[0]);
-		__assign_str(name, name);
+		__assign_str(name);
 		be32_to_cpu_array(__get_dynamic_array(other_outputs),
 				  &rtas_args->rets[1], __entry->nr_other);
 	),
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 9d0b02ef307e..e19fed438a67 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1678,7 +1678,7 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
 	),
 
 	TP_fast_assign(
-		__assign_str(msg, msg);
+		__assign_str(msg);
 		__entry->err = err;
 	),
 
diff --git a/drivers/base/regmap/trace.h b/drivers/base/regmap/trace.h
index 704e106e5dbd..bcc5a8b226a6 100644
--- a/drivers/base/regmap/trace.h
+++ b/drivers/base/regmap/trace.h
@@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(regmap_reg,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
+		__assign_str(name);
 		__entry->reg = reg;
 		__entry->val = val;
 	),
@@ -74,7 +74,7 @@ DECLARE_EVENT_CLASS(regmap_bulk,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
+		__assign_str(name);
 		__entry->reg = reg;
 		__entry->val_len = val_len;
 		memcpy(__get_dynamic_array(buf), val, val_len);
@@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(regmap_block,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
+		__assign_str(name);
 		__entry->reg = reg;
 		__entry->count = count;
 	),
@@ -163,9 +163,9 @@ TRACE_EVENT(regcache_sync,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
-		__assign_str(status, status);
-		__assign_str(type, type);
+		__assign_str(name);
+		__assign_str(status);
+		__assign_str(type);
 	),
 
 	TP_printk("%s type=%s status=%s", __get_str(name),
@@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(regmap_bool,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
+		__assign_str(name);
 		__entry->flag = flag;
 	),
 
@@ -216,7 +216,7 @@ DECLARE_EVENT_CLASS(regmap_async,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
+		__assign_str(name);
 	),
 
 	TP_printk("%s", __get_str(name))
@@ -264,7 +264,7 @@ TRACE_EVENT(regcache_drop_region,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, regmap_name(map));
+		__assign_str(name);
 		__entry->from = from;
 		__entry->to = to;
 	),
diff --git a/drivers/base/trace.h b/drivers/base/trace.h
index 3192e18f877e..e52b6eae060d 100644
--- a/drivers/base/trace.h
+++ b/drivers/base/trace.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(devres,
 		__field(size_t, size)
 	),
 	TP_fast_assign(
-		__assign_str(devname, dev_name(dev));
+		__assign_str(devname);
 		__entry->op = op;
 		__entry->node = node;
 		__entry->name = name;
diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h
index 8dedf73bdd28..89d0bcb17195 100644
--- a/drivers/block/rnbd/rnbd-srv-trace.h
+++ b/drivers/block/rnbd/rnbd-srv-trace.h
@@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(rnbd_srv_link_class,
 
 	TP_fast_assign(
 		__entry->qdepth = srv->queue_depth;
-		__assign_str(sessname, srv->sessname);
+		__assign_str(sessname);
 	),
 
 	TP_printk("sessname: %s qdepth: %d",
@@ -85,7 +85,7 @@ TRACE_EVENT(process_rdma,
 	),
 
 	TP_fast_assign(
-		__assign_str(sessname, srv->sessname);
+		__assign_str(sessname);
 		__entry->dir = id->dir;
 		__entry->ver = srv->ver;
 		__entry->device_id = le32_to_cpu(msg->device_id);
@@ -130,7 +130,7 @@ TRACE_EVENT(process_msg_sess_info,
 		__entry->proto_ver = srv->ver;
 		__entry->clt_ver = msg->ver;
 		__entry->srv_ver = RNBD_PROTO_VER_MAJOR;
-		__assign_str(sessname, srv->sessname);
+		__assign_str(sessname);
 	),
 
 	TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)",
@@ -165,8 +165,8 @@ TRACE_EVENT(process_msg_open,
 
 	TP_fast_assign(
 		__entry->access_mode = msg->access_mode;
-		__assign_str(sessname, srv->sessname);
-		__assign_str(dev_name, msg->dev_name);
+		__assign_str(sessname);
+		__assign_str(dev_name);
 	),
 
 	TP_printk("Open message received: session='%s' path='%s' access_mode=%s",
@@ -189,7 +189,7 @@ TRACE_EVENT(process_msg_close,
 
 	TP_fast_assign(
 		__entry->device_id = le32_to_cpu(msg->device_id);
-		__assign_str(sessname, srv->sessname);
+		__assign_str(sessname);
 	),
 
 	TP_printk("Close message received: session='%s' device id='%d'",
diff --git a/drivers/bus/mhi/host/trace.h b/drivers/bus/mhi/host/trace.h
index 368515dcb22d..95613c8ebe06 100644
--- a/drivers/bus/mhi/host/trace.h
+++ b/drivers/bus/mhi/host/trace.h
@@ -103,7 +103,7 @@ TRACE_EVENT(mhi_gen_tre,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mhi_cntrl->mhi_dev->name);
+		__assign_str(name);
 		__entry->ch_num = mhi_chan->chan;
 		__entry->wp = mhi_tre;
 		__entry->tre_ptr = mhi_tre->ptr;
@@ -131,7 +131,7 @@ TRACE_EVENT(mhi_intvec_states,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mhi_cntrl->mhi_dev->name);
+		__assign_str(name);
 		__entry->local_ee = mhi_cntrl->ee;
 		__entry->state = mhi_cntrl->dev_state;
 		__entry->dev_ee = dev_ee;
@@ -158,7 +158,7 @@ TRACE_EVENT(mhi_tryset_pm_state,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mhi_cntrl->mhi_dev->name);
+		__assign_str(name);
 		if (pm_state)
 			pm_state = __fls(pm_state);
 		__entry->pm_state = pm_state;
@@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(mhi_process_event_ring,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mhi_cntrl->mhi_dev->name);
+		__assign_str(name);
 		__entry->rp = rp;
 		__entry->ptr = rp->ptr;
 		__entry->dword0 = rp->dword[0];
@@ -226,7 +226,7 @@ DECLARE_EVENT_CLASS(mhi_update_channel_state,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mhi_cntrl->mhi_dev->name);
+		__assign_str(name);
 		__entry->ch_num = mhi_chan->chan;
 		__entry->state = state;
 		__entry->reason = reason;
@@ -265,7 +265,7 @@ TRACE_EVENT(mhi_pm_st_transition,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mhi_cntrl->mhi_dev->name);
+		__assign_str(name);
 		__entry->state = state;
 	),
 
diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h
index 07a0394b1d99..ee5cd4eb2f16 100644
--- a/drivers/cxl/core/trace.h
+++ b/drivers/cxl/core/trace.h
@@ -60,8 +60,8 @@ TRACE_EVENT(cxl_aer_uncorrectable_error,
 		__array(u32, header_log, CXL_HEADERLOG_SIZE_U32)
 	),
 	TP_fast_assign(
-		__assign_str(memdev, dev_name(&cxlmd->dev));
-		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__assign_str(memdev);
+		__assign_str(host);
 		__entry->serial = cxlmd->cxlds->serial;
 		__entry->status = status;
 		__entry->first_error = fe;
@@ -106,8 +106,8 @@ TRACE_EVENT(cxl_aer_correctable_error,
 		__field(u32, status)
 	),
 	TP_fast_assign(
-		__assign_str(memdev, dev_name(&cxlmd->dev));
-		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__assign_str(memdev);
+		__assign_str(host);
 		__entry->serial = cxlmd->cxlds->serial;
 		__entry->status = status;
 	),
@@ -142,8 +142,8 @@ TRACE_EVENT(cxl_overflow,
 	),
 
 	TP_fast_assign(
-		__assign_str(memdev, dev_name(&cxlmd->dev));
-		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__assign_str(memdev);
+		__assign_str(host);
 		__entry->serial = cxlmd->cxlds->serial;
 		__entry->log = log;
 		__entry->count = le16_to_cpu(payload->overflow_err_count);
@@ -200,8 +200,8 @@ TRACE_EVENT(cxl_overflow,
 	__field(u8, hdr_maint_op_class)
 
 #define CXL_EVT_TP_fast_assign(cxlmd, l, hdr)					\
-	__assign_str(memdev, dev_name(&(cxlmd)->dev));				\
-	__assign_str(host, dev_name((cxlmd)->dev.parent));			\
+	__assign_str(memdev);				\
+	__assign_str(host);			\
 	__entry->log = (l);							\
 	__entry->serial = (cxlmd)->cxlds->serial;				\
 	__entry->hdr_length = (hdr).length;					\
@@ -359,10 +359,10 @@ TRACE_EVENT(cxl_general_media,
 		__entry->validity_flags = get_unaligned_le16(&rec->validity_flags);
 		__entry->hpa = hpa;
 		if (cxlr) {
-			__assign_str(region_name, dev_name(&cxlr->dev));
+			__assign_str(region_name);
 			uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
 		} else {
-			__assign_str(region_name, "");
+			__assign_str(region_name);
 			uuid_copy(&__entry->region_uuid, &uuid_null);
 		}
 	),
@@ -462,10 +462,10 @@ TRACE_EVENT(cxl_dram,
 			CXL_EVENT_DER_CORRECTION_MASK_SIZE);
 		__entry->hpa = hpa;
 		if (cxlr) {
-			__assign_str(region_name, dev_name(&cxlr->dev));
+			__assign_str(region_name);
 			uuid_copy(&__entry->region_uuid, &cxlr->params.uuid);
 		} else {
-			__assign_str(region_name, "");
+			__assign_str(region_name);
 			uuid_copy(&__entry->region_uuid, &uuid_null);
 		}
 	),
@@ -692,8 +692,8 @@ TRACE_EVENT(cxl_poison,
 	    ),
 
 	TP_fast_assign(
-		__assign_str(memdev, dev_name(&cxlmd->dev));
-		__assign_str(host, dev_name(cxlmd->dev.parent));
+		__assign_str(memdev);
+		__assign_str(host);
 		__entry->serial = cxlmd->cxlds->serial;
 		__entry->overflow_ts = cxl_poison_overflow(flags, overflow_ts);
 		__entry->dpa = cxl_poison_record_dpa(record);
@@ -702,12 +702,12 @@ TRACE_EVENT(cxl_poison,
 		__entry->trace_type = trace_type;
 		__entry->flags = flags;
 		if (cxlr) {
-			__assign_str(region, dev_name(&cxlr->dev));
+			__assign_str(region);
 			memcpy(__entry->uuid, &cxlr->params.uuid, 16);
 			__entry->hpa = cxl_trace_hpa(cxlr, cxlmd,
 						     __entry->dpa);
 		} else {
-			__assign_str(region, "");
+			__assign_str(region);
 			memset(__entry->uuid, 0, 16);
 			__entry->hpa = ULLONG_MAX;
 		}
diff --git a/drivers/dma-buf/sync_trace.h b/drivers/dma-buf/sync_trace.h
index 06e468a218ff..d71dcf954b8d 100644
--- a/drivers/dma-buf/sync_trace.h
+++ b/drivers/dma-buf/sync_trace.h
@@ -20,7 +20,7 @@ TRACE_EVENT(sync_timeline,
 	),
 
 	TP_fast_assign(
-			__assign_str(name, timeline->name);
+			__assign_str(name);
 			__entry->value = timeline->value;
 	),
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index f539b1d00234..7aafeb763e5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -178,10 +178,10 @@ TRACE_EVENT(amdgpu_cs_ioctl,
 
 	    TP_fast_assign(
 			   __entry->sched_job_id = job->base.id;
-			   __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job));
+			   __assign_str(timeline);
 			   __entry->context = job->base.s_fence->finished.context;
 			   __entry->seqno = job->base.s_fence->finished.seqno;
-			   __assign_str(ring, to_amdgpu_ring(job->base.sched)->name);
+			   __assign_str(ring);
 			   __entry->num_ibs = job->num_ibs;
 			   ),
 	    TP_printk("sched_job=%llu, timeline=%s, context=%u, seqno=%u, ring_name=%s, num_ibs=%u",
@@ -203,10 +203,10 @@ TRACE_EVENT(amdgpu_sched_run_job,
 
 	    TP_fast_assign(
 			   __entry->sched_job_id = job->base.id;
-			   __assign_str(timeline, AMDGPU_JOB_GET_TIMELINE_NAME(job));
+			   __assign_str(timeline);
 			   __entry->context = job->base.s_fence->finished.context;
 			   __entry->seqno = job->base.s_fence->finished.seqno;
-			   __assign_str(ring, to_amdgpu_ring(job->base.sched)->name);
+			   __assign_str(ring);
 			   __entry->num_ibs = job->num_ibs;
 			   ),
 	    TP_printk("sched_job=%llu, timeline=%s, context=%u, seqno=%u, ring_name=%s, num_ibs=%u",
@@ -231,7 +231,7 @@ TRACE_EVENT(amdgpu_vm_grab_id,
 
 	    TP_fast_assign(
 			   __entry->pasid = vm->pasid;
-			   __assign_str(ring, ring->name);
+			   __assign_str(ring);
 			   __entry->vmid = job->vmid;
 			   __entry->vm_hub = ring->vm_hub,
 			   __entry->pd_addr = job->vm_pd_addr;
@@ -425,7 +425,7 @@ TRACE_EVENT(amdgpu_vm_flush,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(ring, ring->name);
+			   __assign_str(ring);
 			   __entry->vmid = vmid;
 			   __entry->vm_hub = ring->vm_hub;
 			   __entry->pd_addr = pd_addr;
@@ -526,7 +526,7 @@ TRACE_EVENT(amdgpu_ib_pipe_sync,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(ring, sched_job->base.sched->name);
+			   __assign_str(ring);
 			   __entry->id = sched_job->base.id;
 			   __entry->fence = fence;
 			   __entry->ctx = fence->context;
@@ -563,7 +563,7 @@ TRACE_EVENT(amdgpu_runpm_reference_dumps,
 			     ),
 	    TP_fast_assign(
 			   __entry->index = index;
-			   __assign_str(func, func);
+			   __assign_str(func);
 			   ),
 	    TP_printk("amdgpu runpm reference dump 0x%x: 0x%s\n",
 		      __entry->index,
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h
index 133af994a08c..4686d4b0cbad 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_trace.h
@@ -87,7 +87,7 @@ TRACE_EVENT(amdgpu_dc_performance,
 			__entry->writes = write_count;
 			__entry->read_delta = read_count - *last_read;
 			__entry->write_delta = write_count - *last_write;
-			__assign_str(func, func);
+			__assign_str(func);
 			__entry->line = line;
 			*last_read = read_count;
 			*last_write = write_count;
diff --git a/drivers/gpu/drm/i915/display/intel_display_trace.h b/drivers/gpu/drm/i915/display/intel_display_trace.h
index 7862e7cefe02..49a5e6d9dc0d 100644
--- a/drivers/gpu/drm/i915/display/intel_display_trace.h
+++ b/drivers/gpu/drm/i915/display/intel_display_trace.h
@@ -34,7 +34,7 @@ TRACE_EVENT(intel_pipe_enable,
 	    TP_fast_assign(
 			   struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
 			   struct intel_crtc *it__;
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   for_each_intel_crtc(&dev_priv->drm, it__) {
 				   __entry->frame[it__->pipe] = intel_crtc_get_vblank_counter(it__);
 				   __entry->scanline[it__->pipe] = intel_get_crtc_scanline(it__);
@@ -63,7 +63,7 @@ TRACE_EVENT(intel_pipe_disable,
 	    TP_fast_assign(
 			   struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
 			   struct intel_crtc *it__;
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   for_each_intel_crtc(&dev_priv->drm, it__) {
 				   __entry->frame[it__->pipe] = intel_crtc_get_vblank_counter(it__);
 				   __entry->scanline[it__->pipe] = intel_get_crtc_scanline(it__);
@@ -91,7 +91,7 @@ TRACE_EVENT(intel_pipe_crc,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -119,7 +119,7 @@ TRACE_EVENT(intel_cpu_fifo_underrun,
 
 	    TP_fast_assign(
 			    struct intel_crtc *crtc = intel_crtc_for_pipe(dev_priv, pipe);
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -144,7 +144,7 @@ TRACE_EVENT(intel_pch_fifo_underrun,
 	    TP_fast_assign(
 			   enum pipe pipe = pch_transcoder;
 			   struct intel_crtc *crtc = intel_crtc_for_pipe(dev_priv, pipe);
-			   __assign_str(dev, __dev_name_i915(dev_priv));
+			   __assign_str(dev);
 			   __entry->pipe = pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -169,7 +169,7 @@ TRACE_EVENT(intel_memory_cxsr,
 
 	    TP_fast_assign(
 			   struct intel_crtc *crtc;
-			   __assign_str(dev, __dev_name_i915(dev_priv));
+			   __assign_str(dev);
 			   for_each_intel_crtc(&dev_priv->drm, crtc) {
 				   __entry->frame[crtc->pipe] = intel_crtc_get_vblank_counter(crtc);
 				   __entry->scanline[crtc->pipe] = intel_get_crtc_scanline(crtc);
@@ -209,7 +209,7 @@ TRACE_EVENT(g4x_wm,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -256,7 +256,7 @@ TRACE_EVENT(vlv_wm,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -293,7 +293,7 @@ TRACE_EVENT(vlv_fifo_size,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -323,8 +323,8 @@ TRACE_EVENT(intel_plane_update_noarm,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(plane));
-			   __assign_str(name, plane->base.name);
+			   __assign_str(dev);
+			   __assign_str(name);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -354,8 +354,8 @@ TRACE_EVENT(intel_plane_update_arm,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(plane));
-			   __assign_str(name, plane->base.name);
+			   __assign_str(dev);
+			   __assign_str(name);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -383,8 +383,8 @@ TRACE_EVENT(intel_plane_disable_arm,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(plane));
-			   __assign_str(name, plane->base.name);
+			   __assign_str(dev);
+			   __assign_str(name);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -410,8 +410,8 @@ TRACE_EVENT(intel_fbc_activate,
 	    TP_fast_assign(
 			   struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev),
 									 plane->pipe);
-			   __assign_str(dev, __dev_name_kms(plane));
-			   __assign_str(name, plane->base.name);
+			   __assign_str(dev);
+			   __assign_str(name);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -437,8 +437,8 @@ TRACE_EVENT(intel_fbc_deactivate,
 	    TP_fast_assign(
 			   struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev),
 									 plane->pipe);
-			   __assign_str(dev, __dev_name_kms(plane));
-			   __assign_str(name, plane->base.name);
+			   __assign_str(dev);
+			   __assign_str(name);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -464,8 +464,8 @@ TRACE_EVENT(intel_fbc_nuke,
 	    TP_fast_assign(
 			   struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev),
 									 plane->pipe);
-			   __assign_str(dev, __dev_name_kms(plane));
-			   __assign_str(name, plane->base.name);
+			   __assign_str(dev);
+			   __assign_str(name);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -488,7 +488,7 @@ TRACE_EVENT(intel_crtc_vblank_work_start,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -511,7 +511,7 @@ TRACE_EVENT(intel_crtc_vblank_work_end,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -536,7 +536,7 @@ TRACE_EVENT(intel_pipe_update_start,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = intel_crtc_get_vblank_counter(crtc);
 			   __entry->scanline = intel_get_crtc_scanline(crtc);
@@ -564,7 +564,7 @@ TRACE_EVENT(intel_pipe_update_vblank_evaded,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = crtc->debug.start_vbl_count;
 			   __entry->scanline = crtc->debug.scanline_start;
@@ -590,7 +590,7 @@ TRACE_EVENT(intel_pipe_update_end,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_kms(crtc));
+			   __assign_str(dev);
 			   __entry->pipe = crtc->pipe;
 			   __entry->frame = frame;
 			   __entry->scanline = scanline_end;
@@ -613,7 +613,7 @@ TRACE_EVENT(intel_frontbuffer_invalidate,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_i915(i915));
+			   __assign_str(dev);
 			   __entry->frontbuffer_bits = frontbuffer_bits;
 			   __entry->origin = origin;
 			   ),
@@ -634,7 +634,7 @@ TRACE_EVENT(intel_frontbuffer_flush,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(dev, __dev_name_i915(i915));
+			   __assign_str(dev);
 			   __entry->frontbuffer_bits = frontbuffer_bits;
 			   __entry->origin = origin;
 			   ),
diff --git a/drivers/gpu/drm/lima/lima_trace.h b/drivers/gpu/drm/lima/lima_trace.h
index 494b9790b1da..3a349d10304e 100644
--- a/drivers/gpu/drm/lima/lima_trace.h
+++ b/drivers/gpu/drm/lima/lima_trace.h
@@ -24,7 +24,7 @@ DECLARE_EVENT_CLASS(lima_task,
 		__entry->task_id = task->base.id;
 		__entry->context = task->base.s_fence->finished.context;
 		__entry->seqno = task->base.s_fence->finished.seqno;
-		__assign_str(pipe, task->base.sched->name);
+		__assign_str(pipe);
 		),
 
 	TP_printk("task=%llu, context=%u seqno=%u pipe=%s",
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h
index bd92fb2979aa..0fdd41162e4b 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_trace.h
@@ -113,7 +113,7 @@ TRACE_EVENT(tracing_mark_write,
 	),
 	TP_fast_assign(
 			__entry->pid = pid;
-			__assign_str(trace_name, name);
+			__assign_str(trace_name);
 			__entry->trace_begin = trace_begin;
 	),
 	TP_printk("%s|%d|%s", __entry->trace_begin ? "B" : "E",
@@ -130,7 +130,7 @@ TRACE_EVENT(dpu_trace_counter,
 	),
 	TP_fast_assign(
 			__entry->pid = current->tgid;
-			__assign_str(counter_name, name);
+			__assign_str(counter_name);
 			__entry->value = value;
 	),
 	TP_printk("%d|%s|%d", __entry->pid,
@@ -379,7 +379,7 @@ TRACE_EVENT(dpu_enc_rc,
 		__entry->sw_event = sw_event;
 		__entry->idle_pc_supported = idle_pc_supported;
 		__entry->rc_state = rc_state;
-		__assign_str(stage_str, stage);
+		__assign_str(stage_str);
 	),
 	TP_printk("%s: id:%u, sw_event:%d, idle_pc_supported:%s, rc_state:%d",
 		  __get_str(stage_str), __entry->drm_id, __entry->sw_event,
@@ -401,7 +401,7 @@ TRACE_EVENT(dpu_enc_frame_done_cb_not_busy,
 	TP_fast_assign(
 		__entry->drm_id = drm_id;
 		__entry->event = event;
-		__assign_str(intf_mode_str, intf_mode);
+		__assign_str(intf_mode_str);
 		__entry->intf_idx = intf_idx;
 		__entry->wb_idx = wb_idx;
 	),
@@ -446,7 +446,7 @@ TRACE_EVENT(dpu_enc_trigger_flush,
 	),
 	TP_fast_assign(
 		__entry->drm_id = drm_id;
-		__assign_str(intf_mode_str, intf_mode);
+		__assign_str(intf_mode_str);
 		__entry->intf_idx = intf_idx;
 		__entry->wb_idx = wb_idx;
 		__entry->pending_kickoff_cnt = pending_kickoff_cnt;
@@ -946,7 +946,7 @@ TRACE_EVENT(dpu_core_perf_update_clk,
 		__field(	u64,			clk_rate	)
 	),
 	TP_fast_assign(
-		__assign_str(dev_name, dev->unique);
+		__assign_str(dev_name);
 		__entry->stop_req = stop_req;
 		__entry->clk_rate = clk_rate;
 	),
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
index f8ed093b7356..c75302ca3427 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler_trace.h
@@ -48,7 +48,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
 			   __entry->entity = entity;
 			   __entry->id = sched_job->id;
 			   __entry->fence = &sched_job->s_fence->finished;
-			   __assign_str(name, sched_job->sched->name);
+			   __assign_str(name);
 			   __entry->job_count = spsc_queue_count(&entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
 				   &sched_job->sched->credit_count);
@@ -94,7 +94,7 @@ TRACE_EVENT(drm_sched_job_wait_dep,
 			     ),
 
 	    TP_fast_assign(
-			   __assign_str(name, sched_job->sched->name);
+			   __assign_str(name);
 			   __entry->id = sched_job->id;
 			   __entry->fence = fence;
 			   __entry->ctx = fence->context;
diff --git a/drivers/gpu/drm/virtio/virtgpu_trace.h b/drivers/gpu/drm/virtio/virtgpu_trace.h
index 031bc77689d5..227bf0ae7ed5 100644
--- a/drivers/gpu/drm/virtio/virtgpu_trace.h
+++ b/drivers/gpu/drm/virtio/virtgpu_trace.h
@@ -25,7 +25,7 @@ DECLARE_EVENT_CLASS(virtio_gpu_cmd,
 	TP_fast_assign(
 		       __entry->dev = vq->vdev->index;
 		       __entry->vq = vq->index;
-		       __assign_str(name, vq->name);
+		       __assign_str(name);
 		       __entry->type = le32_to_cpu(hdr->type);
 		       __entry->flags = le32_to_cpu(hdr->flags);
 		       __entry->fence_id = le64_to_cpu(hdr->fence_id);
diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h
index 47f3c6e4be89..dc622f3778be 100644
--- a/drivers/infiniband/core/cma_trace.h
+++ b/drivers/infiniband/core/cma_trace.h
@@ -84,7 +84,7 @@ TRACE_EVENT(cm_id_attach,
 		       sizeof(struct sockaddr_in6));
 		memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr,
 		       sizeof(struct sockaddr_in6));
-		__assign_str(devname, device->name);
+		__assign_str(devname);
 	),
 
 	TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s",
@@ -334,7 +334,7 @@ DECLARE_EVENT_CLASS(cma_client_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, device->name);
+		__assign_str(name);
 	),
 
 	TP_printk("device name=%s",
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 4b3f1cb125fc..eb38f81aeeb1 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -2425,7 +2425,7 @@ static inline bool hfi1_need_drop(struct hfi1_devdata *dd)
 int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
 
 #define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
-#define DD_DEV_ASSIGN(dd)      __assign_str(dev, dev_name(&(dd)->pcidev->dev))
+#define DD_DEV_ASSIGN(dd)      __assign_str(dev)
 
 static inline void hfi1_update_ah_attr(struct ib_device *ibdev,
 				       struct rdma_ah_attr *attr)
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
index 75599d5168db..58304b91380f 100644
--- a/drivers/infiniband/hw/hfi1/trace_dbg.h
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(hfi1_trace_template,
 		    TP_STRUCT__entry(__string(function, function)
 				     __vstring(msg, vaf->fmt, vaf->va)
 				     ),
-		    TP_fast_assign(__assign_str(function, function);
+		    TP_fast_assign(__assign_str(function);
 				   __assign_vstr(msg, vaf->fmt, vaf->va);
 				   ),
 		    TP_printk("(%s) %s",
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
index e6904aa80c00..8d5e12fe88a5 100644
--- a/drivers/infiniband/hw/hfi1/trace_rx.h
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -90,7 +90,7 @@ TRACE_EVENT(hfi1_mmu_invalidate,
 	    TP_fast_assign(
 			__entry->ctxt = ctxt;
 			__entry->subctxt = subctxt;
-			__assign_str(type, type);
+			__assign_str(type);
 			__entry->start = start;
 			__entry->end = end;
 	    ),
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index d129b8195959..e358f5b885fa 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -358,7 +358,7 @@ DECLARE_EVENT_CLASS(/* msg */
 	),
 	TP_fast_assign(/* assign */
 		__entry->qpn = qp ? qp->ibqp.qp_num : 0;
-		__assign_str(msg, msg);
+		__assign_str(msg);
 		__entry->more = more;
 	),
 	TP_printk(/* print */
@@ -651,7 +651,7 @@ DECLARE_EVENT_CLASS(/* tid_node */
 	TP_fast_assign(/* assign */
 		DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
 		__entry->qpn = qp->ibqp.qp_num;
-		__assign_str(msg, msg);
+		__assign_str(msg);
 		__entry->index = index;
 		__entry->base = base;
 		__entry->map = map;
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c79856d4fdfb..c0ba6b0a2c4e 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -740,8 +740,8 @@ TRACE_EVENT(hfi1_sdma_state,
 		__string(newstate, nstate)
 	    ),
 	    TP_fast_assign(DD_DEV_ASSIGN(sde->dd);
-		__assign_str(curstate, cstate);
-		__assign_str(newstate, nstate);
+		__assign_str(curstate);
+		__assign_str(newstate);
 	    ),
 	    TP_printk("[%s] current state %s new state %s",
 		      __get_str(dev),
diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h
index 4341965a5ea7..bdb6b9326b64 100644
--- a/drivers/infiniband/sw/rdmavt/trace.h
+++ b/drivers/infiniband/sw/rdmavt/trace.h
@@ -4,7 +4,7 @@
  */
 
 #define RDI_DEV_ENTRY(rdi)   __string(dev, rvt_get_ibdev_name(rdi))
-#define RDI_DEV_ASSIGN(rdi)  __assign_str(dev, rvt_get_ibdev_name(rdi))
+#define RDI_DEV_ASSIGN(rdi)  __assign_str(dev)
 
 #include "trace_rvt.h"
 #include "trace_qp.h"
diff --git a/drivers/infiniband/sw/rdmavt/trace_rvt.h b/drivers/infiniband/sw/rdmavt/trace_rvt.h
index df33c2ca9710..a00489e66ddf 100644
--- a/drivers/infiniband/sw/rdmavt/trace_rvt.h
+++ b/drivers/infiniband/sw/rdmavt/trace_rvt.h
@@ -24,7 +24,7 @@ TRACE_EVENT(rvt_dbg,
 	),
 	TP_fast_assign(
 		RDI_DEV_ASSIGN(rdi);
-		__assign_str(msg, msg);
+		__assign_str(msg);
 	),
 	TP_printk("[%s]: %s", __get_str(dev), __get_str(msg))
 );
diff --git a/drivers/interconnect/trace.h b/drivers/interconnect/trace.h
index 3d668ff566bf..206373546528 100644
--- a/drivers/interconnect/trace.h
+++ b/drivers/interconnect/trace.h
@@ -32,9 +32,9 @@ TRACE_EVENT(icc_set_bw,
 	),
 
 	TP_fast_assign(
-		__assign_str(path_name, p->name);
-		__assign_str(dev, dev_name(p->reqs[i].dev));
-		__assign_str(node_name, n->name);
+		__assign_str(path_name);
+		__assign_str(dev);
+		__assign_str(node_name);
 		__entry->avg_bw = avg_bw;
 		__entry->peak_bw = peak_bw;
 		__entry->node_avg_bw = n->avg_bw;
@@ -64,8 +64,8 @@ TRACE_EVENT(icc_set_bw_end,
 	),
 
 	TP_fast_assign(
-		__assign_str(path_name, p->name);
-		__assign_str(dev, dev_name(p->reqs[0].dev));
+		__assign_str(path_name);
+		__assign_str(dev);
 		__entry->ret = ret;
 	),
 
diff --git a/drivers/iommu/intel/trace.h b/drivers/iommu/intel/trace.h
index 961ac1c1bc21..9defdae6ebae 100644
--- a/drivers/iommu/intel/trace.h
+++ b/drivers/iommu/intel/trace.h
@@ -32,7 +32,7 @@ TRACE_EVENT(qi_submit,
 	),
 
 	TP_fast_assign(
-		__assign_str(iommu, iommu->name);
+		__assign_str(iommu);
 		__entry->qw0 = qw0;
 		__entry->qw1 = qw1;
 		__entry->qw2 = qw2;
@@ -79,8 +79,8 @@ TRACE_EVENT(prq_report,
 		__entry->dw2 = dw2;
 		__entry->dw3 = dw3;
 		__entry->seq = seq;
-		__assign_str(iommu, iommu->name);
-		__assign_str(dev, dev_name(dev));
+		__assign_str(iommu);
+		__assign_str(dev);
 	),
 
 	TP_printk("%s/%s seq# %ld: %s",
@@ -102,8 +102,8 @@ DECLARE_EVENT_CLASS(cache_tag_log,
 		__field(u32, users)
 	),
 	TP_fast_assign(
-		__assign_str(iommu, tag->iommu->name);
-		__assign_str(dev, dev_name(tag->dev));
+		__assign_str(iommu);
+		__assign_str(dev);
 		__entry->type = tag->type;
 		__entry->domain_id = tag->domain_id;
 		__entry->pasid = tag->pasid;
@@ -152,8 +152,8 @@ DECLARE_EVENT_CLASS(cache_tag_flush,
 		__field(unsigned long, mask)
 	),
 	TP_fast_assign(
-		__assign_str(iommu, tag->iommu->name);
-		__assign_str(dev, dev_name(tag->dev));
+		__assign_str(iommu);
+		__assign_str(dev);
 		__entry->type = tag->type;
 		__entry->domain_id = tag->domain_id;
 		__entry->pasid = tag->pasid;
diff --git a/drivers/media/platform/nvidia/tegra-vde/trace.h b/drivers/media/platform/nvidia/tegra-vde/trace.h
index 7853ab095ca4..e8a75a7bd05d 100644
--- a/drivers/media/platform/nvidia/tegra-vde/trace.h
+++ b/drivers/media/platform/nvidia/tegra-vde/trace.h
@@ -20,7 +20,7 @@ DECLARE_EVENT_CLASS(register_access,
 		__field(u32, value)
 	),
 	TP_fast_assign(
-		__assign_str(hw_name, tegra_vde_reg_base_name(vde, base));
+		__assign_str(hw_name);
 		__entry->offset = offset;
 		__entry->value = value;
 	),
diff --git a/drivers/misc/mei/mei-trace.h b/drivers/misc/mei/mei-trace.h
index fe46ff2b9d69..5312edbf5190 100644
--- a/drivers/misc/mei/mei-trace.h
+++ b/drivers/misc/mei/mei-trace.h
@@ -26,7 +26,7 @@ TRACE_EVENT(mei_reg_read,
 		__field(u32, val)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		__entry->reg  = reg;
 		__entry->offs = offs;
 		__entry->val = val;
@@ -45,7 +45,7 @@ TRACE_EVENT(mei_reg_write,
 		__field(u32, val)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		__entry->reg = reg;
 		__entry->offs = offs;
 		__entry->val = val;
@@ -64,7 +64,7 @@ TRACE_EVENT(mei_pci_cfg_read,
 		__field(u32, val)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		__entry->reg  = reg;
 		__entry->offs = offs;
 		__entry->val = val;
diff --git a/drivers/net/dsa/mv88e6xxx/trace.h b/drivers/net/dsa/mv88e6xxx/trace.h
index f59ca04768e7..5bd015b2b97a 100644
--- a/drivers/net/dsa/mv88e6xxx/trace.h
+++ b/drivers/net/dsa/mv88e6xxx/trace.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(mv88e6xxx_atu_violation,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev_name(dev));
+		__assign_str(name);
 		__entry->spid = spid;
 		__entry->portvec = portvec;
 		memcpy(__entry->addr, addr, ETH_ALEN);
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(mv88e6xxx_vtu_violation,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev_name(dev));
+		__assign_str(name);
 		__entry->spid = spid;
 		__entry->vid = vid;
 	),
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h
index 889f89df9930..6f0e58a2a58a 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth_trace.h
@@ -57,7 +57,7 @@ DECLARE_EVENT_CLASS(dpaa_eth_fd,
 		__entry->fd_offset = qm_fd_get_offset(fd);
 		__entry->fd_length = qm_fd_get_length(fd);
 		__entry->fd_status = fd->status;
-		__assign_str(name, netdev->name);
+		__assign_str(name);
 	),
 
 	/* This is what gets printed when the trace event is triggered */
diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h
index 9b43fadb9b11..956767e0869c 100644
--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h
+++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-trace.h
@@ -48,7 +48,7 @@ DECLARE_EVENT_CLASS(dpaa2_eth_fd,
 				   __entry->fd_addr = dpaa2_fd_get_addr(fd);
 				   __entry->fd_len = dpaa2_fd_get_len(fd);
 				   __entry->fd_offset = dpaa2_fd_get_offset(fd);
-				   __assign_str(name, netdev->name);
+				   __assign_str(name);
 		    ),
 
 		    /* This is what gets printed when the trace event is
@@ -144,7 +144,7 @@ DECLARE_EVENT_CLASS(dpaa2_eth_buf,
 				   __entry->dma_addr = dma_addr;
 				   __entry->map_size = map_size;
 				   __entry->bpid = bpid;
-				   __assign_str(name, netdev->name);
+				   __assign_str(name);
 		    ),
 
 		    /* This is what gets printed when the trace event is
diff --git a/drivers/net/ethernet/fungible/funeth/funeth_trace.h b/drivers/net/ethernet/fungible/funeth/funeth_trace.h
index 9e58dfec19d5..b9985900f30b 100644
--- a/drivers/net/ethernet/fungible/funeth/funeth_trace.h
+++ b/drivers/net/ethernet/fungible/funeth/funeth_trace.h
@@ -32,7 +32,7 @@ TRACE_EVENT(funeth_tx,
 		__entry->len = len;
 		__entry->sqe_idx = sqe_idx;
 		__entry->ngle = ngle;
-		__assign_str(devname, txq->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk("%s: Txq %u, SQE idx %u, len %u, num GLEs %u",
@@ -62,7 +62,7 @@ TRACE_EVENT(funeth_tx_free,
 		__entry->sqe_idx = sqe_idx;
 		__entry->num_sqes = num_sqes;
 		__entry->hw_head = hw_head;
-		__assign_str(devname, txq->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk("%s: Txq %u, SQE idx %u, SQEs %u, HW head %u",
@@ -97,7 +97,7 @@ TRACE_EVENT(funeth_rx,
 		__entry->len = pkt_len;
 		__entry->hash = hash;
 		__entry->cls_vec = cls_vec;
-		__assign_str(devname, rxq->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk("%s: Rxq %u, CQ head %u, RQEs %u, len %u, hash %u, CV %#x",
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h
index b8a1ecb4b8fb..3362b8d14d4f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_trace.h
@@ -84,7 +84,7 @@ TRACE_EVENT(hns3_tx_desc,
 		__entry->desc_dma = ring->desc_dma_addr,
 		memcpy(__entry->desc, &ring->desc[cur_ntu],
 		       sizeof(struct hns3_desc));
-		__assign_str(devname, ring->tqp->handle->kinfo.netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
@@ -117,7 +117,7 @@ TRACE_EVENT(hns3_rx_desc,
 		__entry->buf_dma = ring->desc_cb[ring->next_to_clean].dma;
 		memcpy(__entry->desc, &ring->desc[ring->next_to_clean],
 		       sizeof(struct hns3_desc));
-		__assign_str(devname, ring->tqp->handle->kinfo.netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h
index 7e47f0c21d88..7103cf04bffc 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h
@@ -33,8 +33,8 @@ TRACE_EVENT(hclge_pf_mbx_get,
 		__entry->vfid = req->mbx_src_vfid;
 		__entry->code = req->msg.code;
 		__entry->subcode = req->msg.subcode;
-		__assign_str(pciname, pci_name(hdev->pdev));
-		__assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name);
+		__assign_str(pciname);
+		__assign_str(devname);
 		memcpy(__entry->mbx_data, req,
 		       sizeof(struct hclge_mbx_vf_to_pf_cmd));
 	),
@@ -64,8 +64,8 @@ TRACE_EVENT(hclge_pf_mbx_send,
 	TP_fast_assign(
 		__entry->vfid = req->dest_vfid;
 		__entry->code = le16_to_cpu(req->msg.code);
-		__assign_str(pciname, pci_name(hdev->pdev));
-		__assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name);
+		__assign_str(pciname);
+		__assign_str(devname);
 		memcpy(__entry->mbx_data, req,
 		       sizeof(struct hclge_mbx_pf_to_vf_cmd));
 	),
@@ -101,7 +101,7 @@ DECLARE_EVENT_CLASS(hclge_pf_cmd_template,
 			__entry->rsv = le16_to_cpu(desc->rsv);
 			__entry->index = index;
 			__entry->num = num;
-			__assign_str(pciname, pci_name(hw->cmq.csq.pdev));
+			__assign_str(pciname);
 			for (i = 0; i < HCLGE_DESC_DATA_LEN; i++)
 				__entry->data[i] = le32_to_cpu(desc->data[i]);),
 
@@ -144,7 +144,7 @@ DECLARE_EVENT_CLASS(hclge_pf_special_cmd_template,
 		    TP_fast_assign(int i;
 			__entry->index = index;
 			__entry->num = num;
-			__assign_str(pciname, pci_name(hw->cmq.csq.pdev));
+			__assign_str(pciname);
 			for (i = 0; i < PF_DESC_LEN; i++)
 				__entry->data[i] = le32_to_cpu(data[i]);
 		),
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h
index e2e3a2602b6a..66b084309c91 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h
@@ -30,8 +30,8 @@ TRACE_EVENT(hclge_vf_mbx_get,
 	TP_fast_assign(
 		__entry->vfid = req->dest_vfid;
 		__entry->code = le16_to_cpu(req->msg.code);
-		__assign_str(pciname, pci_name(hdev->pdev));
-		__assign_str(devname, hdev->nic.kinfo.netdev->name);
+		__assign_str(pciname);
+		__assign_str(devname);
 		memcpy(__entry->mbx_data, req,
 		       sizeof(struct hclge_mbx_pf_to_vf_cmd));
 	),
@@ -63,8 +63,8 @@ TRACE_EVENT(hclge_vf_mbx_send,
 		__entry->vfid = req->mbx_src_vfid;
 		__entry->code = req->msg.code;
 		__entry->subcode = req->msg.subcode;
-		__assign_str(pciname, pci_name(hdev->pdev));
-		__assign_str(devname, hdev->nic.kinfo.netdev->name);
+		__assign_str(pciname);
+		__assign_str(devname);
 		memcpy(__entry->mbx_data, req,
 		       sizeof(struct hclge_mbx_vf_to_pf_cmd));
 	),
@@ -101,7 +101,7 @@ DECLARE_EVENT_CLASS(hclge_vf_cmd_template,
 			__entry->rsv = le16_to_cpu(desc->rsv);
 			__entry->index = index;
 			__entry->num = num;
-			__assign_str(pciname, pci_name(hw->cmq.csq.pdev));
+			__assign_str(pciname);
 			for (i = 0; i < HCLGE_DESC_DATA_LEN; i++)
 				__entry->data[i] = le32_to_cpu(desc->data[i]);),
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_trace.h b/drivers/net/ethernet/intel/i40e/i40e_trace.h
index 33b4e30f5e00..759f3d1c4c8f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_trace.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_trace.h
@@ -89,8 +89,8 @@ TRACE_EVENT(i40e_napi_poll,
 		__entry->tx_clean_complete = tx_clean_complete;
 		__entry->irq_num = q->irq_num;
 		__entry->curr_cpu = get_cpu();
-		__assign_str(qname, q->name);
-		__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+		__assign_str(qname);
+		__assign_str(dev_name);
 		__assign_bitmask(irq_affinity, cpumask_bits(&q->affinity_mask),
 				 nr_cpumask_bits);
 	),
@@ -132,7 +132,7 @@ DECLARE_EVENT_CLASS(
 		__entry->ring = ring;
 		__entry->desc = desc;
 		__entry->buf = buf;
-		__assign_str(devname, ring->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
@@ -177,7 +177,7 @@ DECLARE_EVENT_CLASS(
 		__entry->ring = ring;
 		__entry->desc = desc;
 		__entry->xdp = xdp;
-		__assign_str(devname, ring->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
@@ -219,7 +219,7 @@ DECLARE_EVENT_CLASS(
 	TP_fast_assign(
 		__entry->skb = skb;
 		__entry->ring = ring;
-		__assign_str(devname, ring->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
diff --git a/drivers/net/ethernet/intel/iavf/iavf_trace.h b/drivers/net/ethernet/intel/iavf/iavf_trace.h
index 82fda6f5abf0..62212011c807 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_trace.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_trace.h
@@ -83,7 +83,7 @@ DECLARE_EVENT_CLASS(
 		__entry->ring = ring;
 		__entry->desc = desc;
 		__entry->buf = buf;
-		__assign_str(devname, ring->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
@@ -128,7 +128,7 @@ DECLARE_EVENT_CLASS(
 		__entry->ring = ring;
 		__entry->desc = desc;
 		__entry->skb = skb;
-		__assign_str(devname, ring->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
@@ -170,7 +170,7 @@ DECLARE_EVENT_CLASS(
 	TP_fast_assign(
 		__entry->skb = skb;
 		__entry->ring = ring;
-		__assign_str(devname, ring->netdev->name);
+		__assign_str(devname);
 	),
 
 	TP_printk(
diff --git a/drivers/net/ethernet/intel/ice/ice_trace.h b/drivers/net/ethernet/intel/ice/ice_trace.h
index b2f5c9fe0149..244cddd2a9ea 100644
--- a/drivers/net/ethernet/intel/ice/ice_trace.h
+++ b/drivers/net/ethernet/intel/ice/ice_trace.h
@@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(ice_rx_dim_template,
 
 		    TP_fast_assign(__entry->q_vector = q_vector;
 				   __entry->dim = dim;
-				   __assign_str(devname, q_vector->rx.rx_ring->netdev->name);),
+				   __assign_str(devname);),
 
 		    TP_printk("netdev: %s Rx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d",
 			      __get_str(devname),
@@ -96,7 +96,7 @@ DECLARE_EVENT_CLASS(ice_tx_dim_template,
 
 		    TP_fast_assign(__entry->q_vector = q_vector;
 				   __entry->dim = dim;
-				   __assign_str(devname, q_vector->tx.tx_ring->netdev->name);),
+				   __assign_str(devname);),
 
 		    TP_printk("netdev: %s Tx-Q: %d dim-state: %d dim-profile: %d dim-tune: %d dim-st-right: %d dim-st-left: %d dim-tired: %d",
 			      __get_str(devname),
@@ -128,7 +128,7 @@ DECLARE_EVENT_CLASS(ice_tx_template,
 		    TP_fast_assign(__entry->ring = ring;
 				   __entry->desc = desc;
 				   __entry->buf = buf;
-				   __assign_str(devname, ring->netdev->name);),
+				   __assign_str(devname);),
 
 		    TP_printk("netdev: %s ring: %pK desc: %pK buf %pK", __get_str(devname),
 			      __entry->ring, __entry->desc, __entry->buf)
@@ -156,7 +156,7 @@ DECLARE_EVENT_CLASS(ice_rx_template,
 
 		    TP_fast_assign(__entry->ring = ring;
 				   __entry->desc = desc;
-				   __assign_str(devname, ring->netdev->name);),
+				   __assign_str(devname);),
 
 		    TP_printk("netdev: %s ring: %pK desc: %pK", __get_str(devname),
 			      __entry->ring, __entry->desc)
@@ -180,7 +180,7 @@ DECLARE_EVENT_CLASS(ice_rx_indicate_template,
 		    TP_fast_assign(__entry->ring = ring;
 				   __entry->desc = desc;
 				   __entry->skb = skb;
-				   __assign_str(devname, ring->netdev->name);),
+				   __assign_str(devname);),
 
 		    TP_printk("netdev: %s ring: %pK desc: %pK skb %pK", __get_str(devname),
 			      __entry->ring, __entry->desc, __entry->skb)
@@ -203,7 +203,7 @@ DECLARE_EVENT_CLASS(ice_xmit_template,
 
 		    TP_fast_assign(__entry->ring = ring;
 				   __entry->skb = skb;
-				   __assign_str(devname, ring->netdev->name);),
+				   __assign_str(devname);),
 
 		    TP_printk("netdev: %s skb: %pK ring: %pK", __get_str(devname),
 			      __entry->skb, __entry->ring)
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h
index 28984d0e848a..5704520f9b02 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_trace.h
@@ -24,7 +24,7 @@ TRACE_EVENT(otx2_msg_alloc,
 			     __field(u16, id)
 			     __field(u64, size)
 	    ),
-	    TP_fast_assign(__assign_str(dev, pci_name(pdev));
+	    TP_fast_assign(__assign_str(dev);
 			   __entry->id = id;
 			   __entry->size = size;
 	    ),
@@ -39,7 +39,7 @@ TRACE_EVENT(otx2_msg_send,
 			     __field(u16, num_msgs)
 			     __field(u64, msg_size)
 	    ),
-	    TP_fast_assign(__assign_str(dev, pci_name(pdev));
+	    TP_fast_assign(__assign_str(dev);
 			   __entry->num_msgs = num_msgs;
 			   __entry->msg_size = msg_size;
 	    ),
@@ -55,7 +55,7 @@ TRACE_EVENT(otx2_msg_check,
 			     __field(u16, rspid)
 			     __field(int, rc)
 	    ),
-	    TP_fast_assign(__assign_str(dev, pci_name(pdev));
+	    TP_fast_assign(__assign_str(dev);
 			   __entry->reqid = reqid;
 			   __entry->rspid = rspid;
 			   __entry->rc = rc;
@@ -72,8 +72,8 @@ TRACE_EVENT(otx2_msg_interrupt,
 			     __string(str, msg)
 			     __field(u64, intr)
 	    ),
-	    TP_fast_assign(__assign_str(dev, pci_name(pdev));
-			   __assign_str(str, msg);
+	    TP_fast_assign(__assign_str(dev);
+			   __assign_str(str);
 			   __entry->intr = intr;
 	    ),
 	    TP_printk("[%s] mbox interrupt %s (0x%llx)\n", __get_str(dev),
@@ -87,7 +87,7 @@ TRACE_EVENT(otx2_msg_process,
 			     __field(u16, id)
 			     __field(int, err)
 	    ),
-	    TP_fast_assign(__assign_str(dev, pci_name(pdev));
+	    TP_fast_assign(__assign_str(dev);
 			   __entry->id = id;
 			   __entry->err = err;
 	    ),
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h
index 406ebe17405f..b4b3a43e56a0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/cmd_tracepoint.h
@@ -22,10 +22,10 @@ TRACE_EVENT(mlx5_cmd,
 			    __field(u32, syndrome)
 			    __field(int, err)
 			    ),
-	    TP_fast_assign(__assign_str(command_str, command_str);
+	    TP_fast_assign(__assign_str(command_str);
 			__entry->opcode = opcode;
 			__entry->op_mod = op_mod;
-			__assign_str(status_str, status_str);
+			__assign_str(status_str);
 			__entry->status = status;
 			__entry->syndrome = syndrome;
 			__entry->err = err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h
index f15718db5d0e..78e481b2c015 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_rep_tracepoint.h
@@ -25,7 +25,7 @@ TRACE_EVENT(mlx5e_rep_neigh_update,
 			struct in6_addr *pin6;
 			__be32 *p32;
 
-			__assign_str(devname, nhe->neigh_dev->name);
+			__assign_str(devname);
 			__entry->neigh_connected = neigh_connected;
 			memcpy(__entry->ha, ha, ETH_ALEN);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h
index ac52ef37f38a..4b1ca228012b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/en_tc_tracepoint.h
@@ -86,7 +86,7 @@ TRACE_EVENT(mlx5e_tc_update_neigh_used_value,
 			struct in6_addr *pin6;
 			__be32 *p32;
 
-			__assign_str(devname, nhe->neigh_dev->name);
+			__assign_str(devname);
 			__entry->neigh_used = neigh_used;
 
 			p32 = (__be32 *)__entry->v4;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h
index 3038be575923..50f8a7630f86 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer_tracepoint.h
@@ -55,12 +55,11 @@ TRACE_EVENT(mlx5_fw,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name,
-			     dev_name(tracer->dev->device));
+		__assign_str(dev_name);
 		__entry->trace_timestamp = trace_timestamp;
 		__entry->lost = lost;
 		__entry->event_id = event_id;
-		__assign_str(msg, msg);
+		__assign_str(msg);
 	),
 
 	TP_printk("%s [0x%llx] %d [0x%x] %s",
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h
index 458baf0c6415..1ce332f21ebe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/diag/qos_tracepoint.h
@@ -17,7 +17,7 @@ TRACE_EVENT(mlx5_esw_vport_qos_destroy,
 			     __field(unsigned short, vport_id)
 			     __field(unsigned int,   tsar_ix)
 			     ),
-	    TP_fast_assign(__assign_str(devname, dev_name(vport->dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->vport_id = vport->vport;
 		    __entry->tsar_ix = vport->qos.esw_tsar_ix;
 	    ),
@@ -36,7 +36,7 @@ DECLARE_EVENT_CLASS(mlx5_esw_vport_qos_template,
 				     __field(unsigned int, max_rate)
 				     __field(void *, group)
 				     ),
-		    TP_fast_assign(__assign_str(devname, dev_name(vport->dev->device));
+		    TP_fast_assign(__assign_str(devname);
 			    __entry->vport_id = vport->vport;
 			    __entry->tsar_ix = vport->qos.esw_tsar_ix;
 			    __entry->bw_share = bw_share;
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(mlx5_esw_group_qos_template,
 				     __field(const void *, group)
 				     __field(unsigned int, tsar_ix)
 				     ),
-		    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+		    TP_fast_assign(__assign_str(devname);
 			    __entry->group = group;
 			    __entry->tsar_ix = tsar_ix;
 		    ),
@@ -102,7 +102,7 @@ TRACE_EVENT(mlx5_esw_group_qos_config,
 			     __field(unsigned int, bw_share)
 			     __field(unsigned int, max_rate)
 			     ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->group = group;
 		    __entry->tsar_ix = tsar_ix;
 		    __entry->bw_share = bw_share;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h
index 7f7c9af5deed..0537de86f981 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/dev/diag/dev_tracepoint.h
@@ -22,7 +22,7 @@ DECLARE_EVENT_CLASS(mlx5_sf_dev_template,
 				     __field(u16, hw_fn_id)
 				     __field(u32, sfnum)
 		    ),
-		    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+		    TP_fast_assign(__assign_str(devname);
 				   __entry->sfdev = sfdev;
 				   __entry->aux_id = aux_id;
 				   __entry->hw_fn_id = sfdev->fn_id;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h
index 8bf1cd90930d..302ce00da5a9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/sf_tracepoint.h
@@ -24,7 +24,7 @@ TRACE_EVENT(mlx5_sf_add,
 			     __field(u16, hw_fn_id)
 			     __field(u32, sfnum)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->port_index = port_index;
 		    __entry->controller = controller;
 		    __entry->hw_fn_id = hw_fn_id;
@@ -46,7 +46,7 @@ TRACE_EVENT(mlx5_sf_free,
 			     __field(u32, controller)
 			     __field(u16, hw_fn_id)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->port_index = port_index;
 		    __entry->controller = controller;
 		    __entry->hw_fn_id = hw_fn_id;
@@ -67,7 +67,7 @@ TRACE_EVENT(mlx5_sf_hwc_alloc,
 			     __field(u16, hw_fn_id)
 			     __field(u32, sfnum)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->controller = controller;
 		    __entry->hw_fn_id = hw_fn_id;
 		    __entry->sfnum = sfnum;
@@ -84,7 +84,7 @@ TRACE_EVENT(mlx5_sf_hwc_free,
 	    TP_STRUCT__entry(__string(devname, dev_name(dev->device))
 			     __field(u16, hw_fn_id)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->hw_fn_id = hw_fn_id;
 	    ),
 	    TP_printk("(%s) hw_id=0x%x\n", __get_str(devname), __entry->hw_fn_id)
@@ -97,7 +97,7 @@ TRACE_EVENT(mlx5_sf_hwc_deferred_free,
 	    TP_STRUCT__entry(__string(devname, dev_name(dev->device))
 			     __field(u16, hw_fn_id)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->hw_fn_id = hw_fn_id;
 	    ),
 	    TP_printk("(%s) hw_id=0x%x\n", __get_str(devname), __entry->hw_fn_id)
@@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(mlx5_sf_state_template,
 				     __field(unsigned int, port_index)
 				     __field(u32, controller)
 				     __field(u16, hw_fn_id)),
-		    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+		    TP_fast_assign(__assign_str(devname);
 				   __entry->port_index = port_index;
 				   __entry->controller = controller;
 				   __entry->hw_fn_id = hw_fn_id;
@@ -152,7 +152,7 @@ TRACE_EVENT(mlx5_sf_update_state,
 			     __field(u16, hw_fn_id)
 			     __field(u8, state)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->port_index = port_index;
 		    __entry->controller = controller;
 		    __entry->hw_fn_id = hw_fn_id;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h
index fd814a190b8b..6352cb004a18 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/diag/vhca_tracepoint.h
@@ -20,7 +20,7 @@ TRACE_EVENT(mlx5_sf_vhca_event,
 			     __field(u32, sfnum)
 			     __field(u8, vhca_state)
 			    ),
-	    TP_fast_assign(__assign_str(devname, dev_name(dev->device));
+	    TP_fast_assign(__assign_str(devname);
 		    __entry->hw_fn_id = event->function_id;
 		    __entry->sfnum = event->sw_function_id;
 		    __entry->vhca_state = event->new_vhca_state;
diff --git a/drivers/net/fjes/fjes_trace.h b/drivers/net/fjes/fjes_trace.h
index 6437ddbd7842..166ef015262b 100644
--- a/drivers/net/fjes/fjes_trace.h
+++ b/drivers/net/fjes/fjes_trace.h
@@ -85,7 +85,7 @@ TRACE_EVENT(fjes_hw_request_info_err,
 		__string(err, err)
 	),
 	TP_fast_assign(
-		__assign_str(err, err);
+		__assign_str(err);
 	),
 	TP_printk("%s", __get_str(err))
 );
@@ -145,7 +145,7 @@ TRACE_EVENT(fjes_hw_register_buff_addr_err,
 		__string(err, err)
 	),
 	TP_fast_assign(
-		__assign_str(err, err);
+		__assign_str(err);
 	),
 	TP_printk("%s", __get_str(err))
 );
@@ -189,7 +189,7 @@ TRACE_EVENT(fjes_hw_unregister_buff_addr_err,
 		__string(err, err)
 	),
 	TP_fast_assign(
-		__assign_str(err, err);
+		__assign_str(err);
 	),
 	TP_printk("%s", __get_str(err))
 );
@@ -232,7 +232,7 @@ TRACE_EVENT(fjes_hw_start_debug_err,
 		 __string(err, err)
 	),
 	TP_fast_assign(
-		__assign_str(err, err);
+		__assign_str(err);
 	),
 	TP_printk("%s", __get_str(err))
 );
@@ -258,7 +258,7 @@ TRACE_EVENT(fjes_hw_stop_debug_err,
 		 __string(err, err)
 	),
 	TP_fast_assign(
-		__assign_str(err, err);
+		__assign_str(err);
 	),
 	TP_printk("%s", __get_str(err))
 );
diff --git a/drivers/net/hyperv/netvsc_trace.h b/drivers/net/hyperv/netvsc_trace.h
index f7585563dea5..05e620cbdd29 100644
--- a/drivers/net/hyperv/netvsc_trace.h
+++ b/drivers/net/hyperv/netvsc_trace.h
@@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(rndis_msg_class,
 	       __field(	 u32,  msg_len	   )
        ),
        TP_fast_assign(
-	       __assign_str(name, ndev->name);
+	       __assign_str(name);
 	       __entry->queue	 = q;
 	       __entry->req_id	 = msg->msg.init_req.req_id;
 	       __entry->msg_type = msg->ndis_msg_type;
@@ -121,7 +121,7 @@ TRACE_EVENT(nvsp_send,
 		__field(  u32,	msg_type    )
 	),
 	TP_fast_assign(
-		__assign_str(name, ndev->name);
+		__assign_str(name);
 		__entry->msg_type = msg->hdr.msg_type;
 	),
 	TP_printk("dev=%s type=%s",
@@ -142,7 +142,7 @@ TRACE_EVENT(nvsp_send_pkt,
 		__field(  u32,	section_size  )
 	),
 	TP_fast_assign(
-		__assign_str(name, ndev->name);
+		__assign_str(name);
 		__entry->qid = chan->offermsg.offer.sub_channel_index;
 		__entry->channel_type = rpkt->channel_type;
 		__entry->section_index = rpkt->send_buf_section_index;
@@ -165,7 +165,7 @@ TRACE_EVENT(nvsp_recv,
 		__field(  u32,	msg_type    )
 	),
 	TP_fast_assign(
-		__assign_str(name, ndev->name);
+		__assign_str(name);
 		__entry->qid = chan->offermsg.offer.sub_channel_index;
 		__entry->msg_type = msg->hdr.msg_type;
 	),
diff --git a/drivers/net/wireless/ath/ath10k/trace.h b/drivers/net/wireless/ath/ath10k/trace.h
index 64e7a767d963..68b78ca17eaa 100644
--- a/drivers/net/wireless/ath/ath10k/trace.h
+++ b/drivers/net/wireless/ath/ath10k/trace.h
@@ -55,8 +55,8 @@ DECLARE_EVENT_CLASS(ath10k_log_event,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk(
@@ -92,8 +92,8 @@ TRACE_EVENT(ath10k_log_dbg,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->level = level;
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
@@ -121,10 +121,10 @@ TRACE_EVENT(ath10k_log_dbg_dump,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
-		__assign_str(msg, msg);
-		__assign_str(prefix, prefix);
+		__assign_str(device);
+		__assign_str(driver);
+		__assign_str(msg);
+		__assign_str(prefix);
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
 	),
@@ -152,8 +152,8 @@ TRACE_EVENT(ath10k_wmi_cmd,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->id = id;
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
@@ -182,8 +182,8 @@ TRACE_EVENT(ath10k_wmi_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->id = id;
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
@@ -211,8 +211,8 @@ TRACE_EVENT(ath10k_htt_stats,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
 	),
@@ -239,8 +239,8 @@ TRACE_EVENT(ath10k_wmi_dbglog,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->hw_type = ar->hw_rev;
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
@@ -269,8 +269,8 @@ TRACE_EVENT(ath10k_htt_pktlog,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->hw_type = ar->hw_rev;
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(pktlog), buf, buf_len);
@@ -301,8 +301,8 @@ TRACE_EVENT(ath10k_htt_tx,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->msdu_id = msdu_id;
 		__entry->msdu_len = msdu_len;
 		__entry->vdev_id = vdev_id;
@@ -332,8 +332,8 @@ TRACE_EVENT(ath10k_txrx_tx_unref,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->msdu_id = msdu_id;
 	),
 
@@ -358,8 +358,8 @@ DECLARE_EVENT_CLASS(ath10k_hdr_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = ath10k_frm_hdr_len(data, len);
 		memcpy(__get_dynamic_array(data), data, __entry->len);
 	),
@@ -386,8 +386,8 @@ DECLARE_EVENT_CLASS(ath10k_payload_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len - ath10k_frm_hdr_len(data, len);
 		memcpy(__get_dynamic_array(payload),
 		       data + ath10k_frm_hdr_len(data, len), __entry->len);
@@ -435,8 +435,8 @@ TRACE_EVENT(ath10k_htt_rx_desc,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->hw_type = ar->hw_rev;
 		__entry->len = len;
 		memcpy(__get_dynamic_array(rxdesc), data, len);
@@ -472,8 +472,8 @@ TRACE_EVENT(ath10k_wmi_diag_container,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->type = type;
 		__entry->timestamp = timestamp;
 		__entry->code = code;
@@ -505,8 +505,8 @@ TRACE_EVENT(ath10k_wmi_diag,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->dev));
-		__assign_str(driver, dev_driver_string(ar->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		memcpy(__get_dynamic_array(data), data, len);
 	),
diff --git a/drivers/net/wireless/ath/ath11k/trace.h b/drivers/net/wireless/ath/ath11k/trace.h
index 235ab8ea715f..75246b0a82e3 100644
--- a/drivers/net/wireless/ath/ath11k/trace.h
+++ b/drivers/net/wireless/ath/ath11k/trace.h
@@ -48,8 +48,8 @@ TRACE_EVENT(ath11k_htt_pktlog,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->ab->dev));
-		__assign_str(driver, dev_driver_string(ar->ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->buf_len = buf_len;
 		__entry->pktlog_checksum = pktlog_checksum;
 		memcpy(__get_dynamic_array(pktlog), buf, buf_len);
@@ -77,8 +77,8 @@ TRACE_EVENT(ath11k_htt_ppdu_stats,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->ab->dev));
-		__assign_str(driver, dev_driver_string(ar->ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		memcpy(__get_dynamic_array(ppdu), data, len);
 	),
@@ -105,8 +105,8 @@ TRACE_EVENT(ath11k_htt_rxdesc,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->ab->dev));
-		__assign_str(driver, dev_driver_string(ar->ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		__entry->log_type = log_type;
 		memcpy(__get_dynamic_array(rxdesc), data, len);
@@ -130,8 +130,8 @@ DECLARE_EVENT_CLASS(ath11k_log_event,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk(
@@ -171,8 +171,8 @@ TRACE_EVENT(ath11k_wmi_cmd,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->id = id;
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
@@ -201,8 +201,8 @@ TRACE_EVENT(ath11k_wmi_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->id = id;
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
@@ -230,8 +230,8 @@ TRACE_EVENT(ath11k_log_dbg,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->level = level;
 		WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg),
 				       ATH11K_MSG_MAX, vaf->fmt,
@@ -262,10 +262,10 @@ TRACE_EVENT(ath11k_log_dbg_dump,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
-		__assign_str(msg, msg);
-		__assign_str(prefix, prefix);
+		__assign_str(device);
+		__assign_str(driver);
+		__assign_str(msg);
+		__assign_str(prefix);
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
 	),
@@ -292,8 +292,8 @@ TRACE_EVENT(ath11k_wmi_diag,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		memcpy(__get_dynamic_array(data), data, len);
 	),
@@ -318,8 +318,8 @@ TRACE_EVENT(ath11k_ps_timekeeper,
 			 __field(u32, peer_ps_timestamp)
 	),
 
-	TP_fast_assign(__assign_str(device, dev_name(ar->ab->dev));
-		       __assign_str(driver, dev_driver_string(ar->ab->dev));
+	TP_fast_assign(__assign_str(device);
+		       __assign_str(driver);
 		       memcpy(__get_dynamic_array(peer_addr), peer_addr,
 			      ETH_ALEN);
 		       __entry->peer_ps_state = peer_ps_state;
diff --git a/drivers/net/wireless/ath/ath12k/trace.h b/drivers/net/wireless/ath/ath12k/trace.h
index 240737e1542d..253c67accb0e 100644
--- a/drivers/net/wireless/ath/ath12k/trace.h
+++ b/drivers/net/wireless/ath/ath12k/trace.h
@@ -36,8 +36,8 @@ TRACE_EVENT(ath12k_htt_pktlog,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->ab->dev));
-		__assign_str(driver, dev_driver_string(ar->ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->buf_len = buf_len;
 		__entry->pktlog_checksum = pktlog_checksum;
 		memcpy(__get_dynamic_array(pktlog), buf, buf_len);
@@ -73,8 +73,8 @@ TRACE_EVENT(ath12k_htt_ppdu_stats,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->ab->dev));
-		__assign_str(driver, dev_driver_string(ar->ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		__entry->info = ar->pdev->timestamp.info;
 		__entry->sync_tstmp_lo_us = ar->pdev->timestamp.sync_timestamp_hi_us;
@@ -117,8 +117,8 @@ TRACE_EVENT(ath12k_htt_rxdesc,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ar->ab->dev));
-		__assign_str(driver, dev_driver_string(ar->ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		__entry->type = type;
 		__entry->info = ar->pdev->timestamp.info;
@@ -153,8 +153,8 @@ TRACE_EVENT(ath12k_wmi_diag,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(ab->dev));
-		__assign_str(driver, dev_driver_string(ab->dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->len = len;
 		memcpy(__get_dynamic_array(data), data, len);
 	),
diff --git a/drivers/net/wireless/ath/ath6kl/trace.h b/drivers/net/wireless/ath/ath6kl/trace.h
index 231a94769ddb..8577aa459c58 100644
--- a/drivers/net/wireless/ath/ath6kl/trace.h
+++ b/drivers/net/wireless/ath/ath6kl/trace.h
@@ -304,8 +304,8 @@ TRACE_EVENT(ath6kl_log_dbg_dump,
 	),
 
 	TP_fast_assign(
-		__assign_str(msg, msg);
-		__assign_str(prefix, prefix);
+		__assign_str(msg);
+		__assign_str(prefix);
 		__entry->buf_len = buf_len;
 		memcpy(__get_dynamic_array(buf), buf, buf_len);
 	),
diff --git a/drivers/net/wireless/ath/trace.h b/drivers/net/wireless/ath/trace.h
index 9935cf475b6d..82aac0a4baff 100644
--- a/drivers/net/wireless/ath/trace.h
+++ b/drivers/net/wireless/ath/trace.h
@@ -44,8 +44,8 @@ TRACE_EVENT(ath_log,
 	    ),
 
 	    TP_fast_assign(
-		    __assign_str(device, wiphy_name(wiphy));
-		    __assign_str(driver, KBUILD_MODNAME);
+		    __assign_str(device);
+		    __assign_str(driver);
 		    __assign_vstr(msg, vaf->fmt, vaf->va);
 	    ),
 
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h
index 5d66e94c806d..96032322b165 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/tracepoint.h
@@ -41,7 +41,7 @@ TRACE_EVENT(brcmf_err,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(func, func);
+		__assign_str(func);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s: %s", __get_str(func), __get_str(msg))
@@ -57,7 +57,7 @@ TRACE_EVENT(brcmf_dbg,
 	),
 	TP_fast_assign(
 		__entry->level = level;
-		__assign_str(func, func);
+		__assign_str(func);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s: %s", __get_str(func), __get_str(msg))
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h
index a0da3248b942..53b3dba50737 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac.h
@@ -81,7 +81,7 @@ TRACE_EVENT(brcms_macintstatus,
 		__field(u32, mask)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		__entry->in_isr = in_isr;
 		__entry->macintstatus = macintstatus;
 		__entry->mask = mask;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h
index 42b0a91656c4..908ce3c864fe 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_msg.h
@@ -71,7 +71,7 @@ TRACE_EVENT(brcms_dbg,
 	),
 	TP_fast_assign(
 		__entry->level = level;
-		__assign_str(func, func);
+		__assign_str(func);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s: %s", __get_str(func), __get_str(msg))
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h
index cf2cc070f1e5..24ac34fa0207 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/brcms_trace_brcmsmac_tx.h
@@ -31,7 +31,7 @@ TRACE_EVENT(brcms_txdesc,
 		__dynamic_array(u8, txh, txh_len)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		memcpy(__get_dynamic_array(txh), txh, txh_len);
 	),
 	TP_printk("[%s] txdesc", __get_str(dev))
@@ -54,7 +54,7 @@ TRACE_EVENT(brcms_txstatus,
 		__field(u16, ackphyrxsh)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		__entry->framelen = framelen;
 		__entry->frameid = frameid;
 		__entry->status = status;
@@ -85,7 +85,7 @@ TRACE_EVENT(brcms_ampdu_session,
 		__field(u16, dma_len)
 	),
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dev));
+		__assign_str(dev);
 		__entry->max_ampdu_len = max_ampdu_len;
 		__entry->max_ampdu_frames = max_ampdu_frames;
 		__entry->ampdu_len = ampdu_len;
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h
index 1d6c292cf545..0db1fa5477af 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-msg.h
@@ -57,7 +57,7 @@ TRACE_EVENT(iwlwifi_dbg,
 	),
 	TP_fast_assign(
 		__entry->level = level;
-		__assign_str(function, function);
+		__assign_str(function);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s", __get_str(msg))
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h
index c3e09f4fefeb..76166e1b10e5 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.h
@@ -87,7 +87,7 @@ static inline void trace_ ## name(proto) {}
 #endif
 
 #define DEV_ENTRY	__string(dev, dev_name(dev))
-#define DEV_ASSIGN	__assign_str(dev, dev_name(dev))
+#define DEV_ASSIGN	__assign_str(dev)
 
 #include "iwl-devtrace-io.h"
 #include "iwl-devtrace-ucode.h"
diff --git a/drivers/soc/qcom/pmic_pdcharger_ulog.h b/drivers/soc/qcom/pmic_pdcharger_ulog.h
index 152e3a6b5480..1cfa58f0e34c 100644
--- a/drivers/soc/qcom/pmic_pdcharger_ulog.h
+++ b/drivers/soc/qcom/pmic_pdcharger_ulog.h
@@ -18,7 +18,7 @@ TRACE_EVENT(pmic_pdcharger_ulog_msg,
 		__string(msg, msg)
 	),
 	TP_fast_assign(
-		__assign_str(msg, msg);
+		__assign_str(msg);
 	),
 	TP_printk("%s", __get_str(msg))
 );
diff --git a/drivers/soc/qcom/trace-aoss.h b/drivers/soc/qcom/trace-aoss.h
index 554029b33b44..fb5b0470c40d 100644
--- a/drivers/soc/qcom/trace-aoss.h
+++ b/drivers/soc/qcom/trace-aoss.h
@@ -18,7 +18,7 @@ TRACE_EVENT(aoss_send,
 		__string(msg, msg)
 	),
 	TP_fast_assign(
-		__assign_str(msg, msg);
+		__assign_str(msg);
 	),
 	TP_printk("%s", __get_str(msg))
 );
@@ -31,7 +31,7 @@ TRACE_EVENT(aoss_send_done,
 		__field(int, ret)
 	),
 	TP_fast_assign(
-		__assign_str(msg, msg);
+		__assign_str(msg);
 		__entry->ret = ret;
 	),
 	TP_printk("%s: %d", __get_str(msg), __entry->ret)
diff --git a/drivers/soc/qcom/trace-rpmh.h b/drivers/soc/qcom/trace-rpmh.h
index be6b42ecc1f8..593ec1d4e010 100644
--- a/drivers/soc/qcom/trace-rpmh.h
+++ b/drivers/soc/qcom/trace-rpmh.h
@@ -26,7 +26,7 @@ TRACE_EVENT(rpmh_tx_done,
 	),
 
 	TP_fast_assign(
-		       __assign_str(name, d->name);
+		       __assign_str(name);
 		       __entry->m = m;
 		       __entry->addr = r->cmds[0].addr;
 		       __entry->data = r->cmds[0].data;
@@ -55,7 +55,7 @@ TRACE_EVENT(rpmh_send_msg,
 	),
 
 	TP_fast_assign(
-		       __assign_str(name, d->name);
+		       __assign_str(name);
 		       __entry->m = m;
 		       __entry->state = state;
 		       __entry->n = n;
diff --git a/drivers/thermal/thermal_trace.h b/drivers/thermal/thermal_trace.h
index 88a962f560f2..df8f4edd6068 100644
--- a/drivers/thermal/thermal_trace.h
+++ b/drivers/thermal/thermal_trace.h
@@ -37,7 +37,7 @@ TRACE_EVENT(thermal_temperature,
 	),
 
 	TP_fast_assign(
-		__assign_str(thermal_zone, tz->type);
+		__assign_str(thermal_zone);
 		__entry->id = tz->id;
 		__entry->temp_prev = tz->last_temperature;
 		__entry->temp = tz->temperature;
@@ -60,7 +60,7 @@ TRACE_EVENT(cdev_update,
 	),
 
 	TP_fast_assign(
-		__assign_str(type, cdev->type);
+		__assign_str(type);
 		__entry->target = target;
 	),
 
@@ -82,7 +82,7 @@ TRACE_EVENT(thermal_zone_trip,
 	),
 
 	TP_fast_assign(
-		__assign_str(thermal_zone, tz->type);
+		__assign_str(thermal_zone);
 		__entry->id = tz->id;
 		__entry->trip = trip;
 		__entry->trip_type = trip_type;
@@ -156,7 +156,7 @@ TRACE_EVENT(thermal_power_devfreq_get_power,
 	),
 
 	TP_fast_assign(
-		__assign_str(type, cdev->type);
+		__assign_str(type);
 		__entry->freq = freq;
 		__entry->busy_time = status->busy_time;
 		__entry->total_time = status->total_time;
@@ -184,7 +184,7 @@ TRACE_EVENT(thermal_power_devfreq_limit,
 	),
 
 	TP_fast_assign(
-		__assign_str(type, cdev->type);
+		__assign_str(type);
 		__entry->freq = freq;
 		__entry->cdev_state = cdev_state;
 		__entry->power = power;
diff --git a/drivers/usb/cdns3/cdns3-trace.h b/drivers/usb/cdns3/cdns3-trace.h
index 40db89e3333c..c4e542f1b9b7 100644
--- a/drivers/usb/cdns3/cdns3-trace.h
+++ b/drivers/usb/cdns3/cdns3-trace.h
@@ -33,7 +33,7 @@ TRACE_EVENT(cdns3_halt,
 		__field(u8, flush)
 	),
 	TP_fast_assign(
-		__assign_str(name, ep_priv->name);
+		__assign_str(name);
 		__entry->halt = halt;
 		__entry->flush = flush;
 	),
@@ -49,8 +49,8 @@ TRACE_EVENT(cdns3_wa1,
 		__string(msg, msg)
 	),
 	TP_fast_assign(
-		__assign_str(ep_name, ep_priv->name);
-		__assign_str(msg, msg);
+		__assign_str(ep_name);
+		__assign_str(msg);
 	),
 	TP_printk("WA1: %s %s", __get_str(ep_name), __get_str(msg))
 );
@@ -63,8 +63,8 @@ TRACE_EVENT(cdns3_wa2,
 		__string(msg, msg)
 	),
 	TP_fast_assign(
-		__assign_str(ep_name, ep_priv->name);
-		__assign_str(msg, msg);
+		__assign_str(ep_name);
+		__assign_str(msg);
 	),
 	TP_printk("WA2: %s %s", __get_str(ep_name), __get_str(msg))
 );
@@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(cdns3_log_doorbell,
 		__field(u32, ep_trbaddr)
 	),
 	TP_fast_assign(
-		__assign_str(name, ep_name);
+		__assign_str(name);
 		__entry->ep_trbaddr = ep_trbaddr;
 	),
 	TP_printk("%s, ep_trbaddr %08x", __get_str(name),
@@ -125,7 +125,7 @@ DECLARE_EVENT_CLASS(cdns3_log_epx_irq,
 		__field(u32, use_streams)
 	),
 	TP_fast_assign(
-		__assign_str(ep_name, priv_ep->name);
+		__assign_str(ep_name);
 		__entry->ep_sts = readl(&priv_dev->regs->ep_sts);
 		__entry->ep_traddr = readl(&priv_dev->regs->ep_traddr);
 		__entry->ep_last_sid = priv_ep->last_stream_id;
@@ -214,7 +214,7 @@ DECLARE_EVENT_CLASS(cdns3_log_request,
 		__field(unsigned int, stream_id)
 	),
 	TP_fast_assign(
-		__assign_str(name, req->priv_ep->name);
+		__assign_str(name);
 		__entry->req = req;
 		__entry->buf = req->request.buf;
 		__entry->actual = req->request.actual;
@@ -294,7 +294,7 @@ DECLARE_EVENT_CLASS(cdns3_stream_split_transfer_len,
 		__field(unsigned int, stream_id)
 	),
 	TP_fast_assign(
-		__assign_str(name, req->priv_ep->name);
+		__assign_str(name);
 		__entry->req = req;
 		__entry->actual = req->request.length;
 		__entry->length = req->request.actual;
@@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(cdns3_log_aligned_request,
 		__field(u32, aligned_buf_size)
 	),
 	TP_fast_assign(
-		__assign_str(name, priv_req->priv_ep->name);
+		__assign_str(name);
 		__entry->req = &priv_req->request;
 		__entry->buf = priv_req->request.buf;
 		__entry->dma = priv_req->request.dma;
@@ -364,7 +364,7 @@ DECLARE_EVENT_CLASS(cdns3_log_map_request,
 		__field(dma_addr_t, dma)
 	),
 	TP_fast_assign(
-		__assign_str(name, priv_req->priv_ep->name);
+		__assign_str(name);
 		__entry->req = &priv_req->request;
 		__entry->buf = priv_req->request.buf;
 		__entry->dma = priv_req->request.dma;
@@ -395,7 +395,7 @@ DECLARE_EVENT_CLASS(cdns3_log_trb,
 		__field(unsigned int, last_stream_id)
 	),
 	TP_fast_assign(
-		__assign_str(name, priv_ep->name);
+		__assign_str(name);
 		__entry->trb = trb;
 		__entry->buffer = le32_to_cpu(trb->buffer);
 		__entry->length = le32_to_cpu(trb->length);
@@ -467,7 +467,7 @@ DECLARE_EVENT_CLASS(cdns3_log_ep,
 		__field(u8, dequeue)
 	),
 	TP_fast_assign(
-		__assign_str(name, priv_ep->name);
+		__assign_str(name);
 		__entry->maxpacket = priv_ep->endpoint.maxpacket;
 		__entry->maxpacket_limit = priv_ep->endpoint.maxpacket_limit;
 		__entry->max_streams = priv_ep->endpoint.max_streams;
diff --git a/drivers/usb/cdns3/cdnsp-trace.h b/drivers/usb/cdns3/cdnsp-trace.h
index 4b51011eb00b..f2bcf77a5d0a 100644
--- a/drivers/usb/cdns3/cdnsp-trace.h
+++ b/drivers/usb/cdns3/cdnsp-trace.h
@@ -48,7 +48,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_ep,
 		__field(u8, drbls_count)
 	),
 	TP_fast_assign(
-		__assign_str(name, pep->name);
+		__assign_str(name);
 		__entry->state = pep->ep_state;
 		__entry->stream_id = stream_id;
 		__entry->enabled = pep->ep_state & EP_HAS_STREAMS;
@@ -138,7 +138,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_simple,
 		__string(text, msg)
 	),
 	TP_fast_assign(
-		__assign_str(text, msg);
+		__assign_str(text);
 	),
 	TP_printk("%s", __get_str(text))
 );
@@ -303,7 +303,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_bounce,
 		__field(unsigned int, unalign)
 	),
 	TP_fast_assign(
-		__assign_str(name, preq->pep->name);
+		__assign_str(name);
 		__entry->new_buf_len = new_buf_len;
 		__entry->offset = offset;
 		__entry->dma = dma;
@@ -470,7 +470,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_request,
 
 	),
 	TP_fast_assign(
-		__assign_str(name, req->pep->name);
+		__assign_str(name);
 		__entry->request = &req->request;
 		__entry->preq = req;
 		__entry->buf = req->request.buf;
@@ -674,7 +674,7 @@ DECLARE_EVENT_CLASS(cdnsp_log_td_info,
 		__field(dma_addr_t, trb_dma)
 	),
 	TP_fast_assign(
-		__assign_str(name, preq->pep->name);
+		__assign_str(name);
 		__entry->request = &preq->request;
 		__entry->preq = preq;
 		__entry->first_trb = preq->td.first_trb;
diff --git a/drivers/usb/chipidea/trace.h b/drivers/usb/chipidea/trace.h
index ca0e65b48f0a..1875419cd17f 100644
--- a/drivers/usb/chipidea/trace.h
+++ b/drivers/usb/chipidea/trace.h
@@ -31,7 +31,7 @@ TRACE_EVENT(ci_log,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name(ci->dev));
+		__assign_str(name);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s: %s", __get_str(name), __get_str(msg))
@@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(ci_log_trb,
 		__field(u32, type)
 	),
 	TP_fast_assign(
-		__assign_str(name, hwep->name);
+		__assign_str(name);
 		__entry->req = &hwreq->req;
 		__entry->td = td;
 		__entry->dma = td->dma;
diff --git a/drivers/usb/dwc3/trace.h b/drivers/usb/dwc3/trace.h
index d2997d17cfbe..bdeb1aaf65d8 100644
--- a/drivers/usb/dwc3/trace.h
+++ b/drivers/usb/dwc3/trace.h
@@ -112,7 +112,7 @@ DECLARE_EVENT_CLASS(dwc3_log_request,
 		__field(int, no_interrupt)
 	),
 	TP_fast_assign(
-		__assign_str(name, req->dep->name);
+		__assign_str(name);
 		__entry->req = req;
 		__entry->actual = req->request.actual;
 		__entry->length = req->request.length;
@@ -193,7 +193,7 @@ DECLARE_EVENT_CLASS(dwc3_log_gadget_ep_cmd,
 		__field(int, cmd_status)
 	),
 	TP_fast_assign(
-		__assign_str(name, dep->name);
+		__assign_str(name);
 		__entry->cmd = cmd;
 		__entry->param0 = params->param0;
 		__entry->param1 = params->param1;
@@ -229,7 +229,7 @@ DECLARE_EVENT_CLASS(dwc3_log_trb,
 		__field(u32, dequeue)
 	),
 	TP_fast_assign(
-		__assign_str(name, dep->name);
+		__assign_str(name);
 		__entry->trb = trb;
 		__entry->bpl = trb->bpl;
 		__entry->bph = trb->bph;
@@ -301,7 +301,7 @@ DECLARE_EVENT_CLASS(dwc3_log_ep,
 		__field(u8, trb_dequeue)
 	),
 	TP_fast_assign(
-		__assign_str(name, dep->name);
+		__assign_str(name);
 		__entry->maxpacket = dep->endpoint.maxpacket;
 		__entry->maxpacket_limit = dep->endpoint.maxpacket_limit;
 		__entry->max_streams = dep->endpoint.max_streams;
diff --git a/drivers/usb/gadget/udc/cdns2/cdns2-trace.h b/drivers/usb/gadget/udc/cdns2/cdns2-trace.h
index 61f241634ea5..ade1752956b1 100644
--- a/drivers/usb/gadget/udc/cdns2/cdns2-trace.h
+++ b/drivers/usb/gadget/udc/cdns2/cdns2-trace.h
@@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(cdns2_log_simple,
 		__string(text, msg)
 	),
 	TP_fast_assign(
-		__assign_str(text, msg);
+		__assign_str(text);
 	),
 	TP_printk("%s", __get_str(text))
 );
@@ -103,7 +103,7 @@ TRACE_EVENT(cdns2_ep_halt,
 		__field(u8, flush)
 	),
 	TP_fast_assign(
-		__assign_str(name, ep_priv->name);
+		__assign_str(name);
 		__entry->halt = halt;
 		__entry->flush = flush;
 	),
@@ -119,8 +119,8 @@ TRACE_EVENT(cdns2_wa1,
 		__string(msg, msg)
 	),
 	TP_fast_assign(
-		__assign_str(ep_name, ep_priv->name);
-		__assign_str(msg, msg);
+		__assign_str(ep_name);
+		__assign_str(msg);
 	),
 	TP_printk("WA1: %s %s", __get_str(ep_name), __get_str(msg))
 );
@@ -134,7 +134,7 @@ DECLARE_EVENT_CLASS(cdns2_log_doorbell,
 		__field(u32, ep_trbaddr)
 	),
 	TP_fast_assign(
-		__assign_str(name, pep->name);
+		__assign_str(name);
 		__entry->ep_trbaddr = ep_trbaddr;
 	),
 	TP_printk("%s, ep_trbaddr %08x", __get_str(name),
@@ -196,7 +196,7 @@ DECLARE_EVENT_CLASS(cdns2_log_epx_irq,
 		__field(u32, ep_traddr)
 	),
 	TP_fast_assign(
-		__assign_str(ep_name, pep->name);
+		__assign_str(ep_name);
 		__entry->ep_sts = readl(&pdev->adma_regs->ep_sts);
 		__entry->ep_ists = readl(&pdev->adma_regs->ep_ists);
 		__entry->ep_traddr = readl(&pdev->adma_regs->ep_traddr);
@@ -288,7 +288,7 @@ DECLARE_EVENT_CLASS(cdns2_log_request,
 		__field(int, end_trb)
 	),
 	TP_fast_assign(
-		__assign_str(name, preq->pep->name);
+		__assign_str(name);
 		__entry->request = &preq->request;
 		__entry->preq = preq;
 		__entry->buf = preq->request.buf;
@@ -380,7 +380,7 @@ DECLARE_EVENT_CLASS(cdns2_log_map_request,
 		__field(dma_addr_t, dma)
 	),
 	TP_fast_assign(
-		__assign_str(name, priv_req->pep->name);
+		__assign_str(name);
 		__entry->req = &priv_req->request;
 		__entry->buf = priv_req->request.buf;
 		__entry->dma = priv_req->request.dma;
@@ -411,7 +411,7 @@ DECLARE_EVENT_CLASS(cdns2_log_trb,
 		__field(u32, type)
 	),
 	TP_fast_assign(
-		__assign_str(name, pep->name);
+		__assign_str(name);
 		__entry->trb = trb;
 		__entry->buffer = le32_to_cpu(trb->buffer);
 		__entry->length = le32_to_cpu(trb->length);
@@ -476,7 +476,7 @@ DECLARE_EVENT_CLASS(cdns2_log_ep,
 		__field(u8, dequeue)
 	),
 	TP_fast_assign(
-		__assign_str(name, pep->name);
+		__assign_str(name);
 		__entry->maxpacket = pep->endpoint.maxpacket;
 		__entry->maxpacket_limit = pep->endpoint.maxpacket_limit;
 		__entry->flags = pep->ep_state;
@@ -568,7 +568,7 @@ DECLARE_EVENT_CLASS(cdns2_log_epx_reg_config,
 		__field(u32, ep_cfg_reg)
 	),
 	TP_fast_assign(
-		__assign_str(ep_name, pep->name);
+		__assign_str(ep_name);
 		__entry->burst_size = pep->trb_burst_size;
 		__entry->maxpack_reg = pep->dir ? readw(&pdev->epx_regs->txmaxpack[pep->num - 1]) :
 						  readw(&pdev->epx_regs->rxmaxpack[pep->num - 1]);
diff --git a/drivers/usb/gadget/udc/trace.h b/drivers/usb/gadget/udc/trace.h
index a5ed26fbc2da..4e334298b0e8 100644
--- a/drivers/usb/gadget/udc/trace.h
+++ b/drivers/usb/gadget/udc/trace.h
@@ -157,7 +157,7 @@ DECLARE_EVENT_CLASS(udc_log_ep,
 		__field(int, ret)
 	),
 	TP_fast_assign(
-		__assign_str(name, ep->name);
+		__assign_str(name);
 		__entry->maxpacket = ep->maxpacket;
 		__entry->maxpacket_limit = ep->maxpacket_limit;
 		__entry->max_streams = ep->max_streams;
@@ -233,7 +233,7 @@ DECLARE_EVENT_CLASS(udc_log_req,
 		__field(struct usb_request *, req)
 	),
 	TP_fast_assign(
-		__assign_str(name, ep->name);
+		__assign_str(name);
 		__entry->length = req->length;
 		__entry->actual = req->actual;
 		__entry->num_sgs = req->num_sgs;
diff --git a/drivers/usb/mtu3/mtu3_trace.h b/drivers/usb/mtu3/mtu3_trace.h
index 03d2a9bac27e..89870175d635 100644
--- a/drivers/usb/mtu3/mtu3_trace.h
+++ b/drivers/usb/mtu3/mtu3_trace.h
@@ -26,7 +26,7 @@ TRACE_EVENT(mtu3_log,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name(dev));
+		__assign_str(name);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s: %s", __get_str(name), __get_str(msg))
@@ -127,7 +127,7 @@ DECLARE_EVENT_CLASS(mtu3_log_request,
 		__field(int, no_interrupt)
 	),
 	TP_fast_assign(
-		__assign_str(name, mreq->mep->name);
+		__assign_str(name);
 		__entry->mreq = mreq;
 		__entry->gpd = mreq->gpd;
 		__entry->actual = mreq->request.actual;
@@ -182,7 +182,7 @@ DECLARE_EVENT_CLASS(mtu3_log_gpd,
 		__field(u32, dw3)
 	),
 	TP_fast_assign(
-		__assign_str(name, mep->name);
+		__assign_str(name);
 		__entry->gpd = gpd;
 		__entry->dw0 = le32_to_cpu(gpd->dw0_info);
 		__entry->dw1 = le32_to_cpu(gpd->next_gpd);
@@ -226,7 +226,7 @@ DECLARE_EVENT_CLASS(mtu3_log_ep,
 		__field(struct mtu3_gpd_ring *, gpd_ring)
 	),
 	TP_fast_assign(
-		__assign_str(name, mep->name);
+		__assign_str(name);
 		__entry->type = mep->type;
 		__entry->slot = mep->slot;
 		__entry->maxp = mep->ep.maxpacket;
diff --git a/drivers/usb/musb/musb_trace.h b/drivers/usb/musb/musb_trace.h
index f246b14394c4..726e6697d475 100644
--- a/drivers/usb/musb/musb_trace.h
+++ b/drivers/usb/musb/musb_trace.h
@@ -31,7 +31,7 @@ TRACE_EVENT(musb_log,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name(musb->controller));
+		__assign_str(name);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 	TP_printk("%s: %s", __get_str(name), __get_str(msg))
@@ -46,9 +46,9 @@ TRACE_EVENT(musb_state,
 		__string(desc, desc)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name(musb->controller));
+		__assign_str(name);
 		__entry->devctl = devctl;
-		__assign_str(desc, desc);
+		__assign_str(desc);
 	),
 	TP_printk("%s: devctl: %02x %s", __get_str(name), __entry->devctl,
 		  __get_str(desc))
@@ -160,7 +160,7 @@ TRACE_EVENT(musb_isr,
 		__field(u16, int_rx)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name(musb->controller));
+		__assign_str(name);
 		__entry->int_usb = musb->int_usb;
 		__entry->int_tx = musb->int_tx;
 		__entry->int_rx = musb->int_rx;
@@ -184,7 +184,7 @@ DECLARE_EVENT_CLASS(musb_urb,
 		__field(u32, actual_len)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name(musb->controller));
+		__assign_str(name);
 		__entry->urb = urb;
 		__entry->pipe = urb->pipe;
 		__entry->status = urb->status;
@@ -325,7 +325,7 @@ DECLARE_EVENT_CLASS(musb_cppi41,
 	),
 	TP_fast_assign(
 		__entry->ch = ch;
-		__assign_str(name, dev_name(ch->hw_ep->musb->controller));
+		__assign_str(name);
 		__entry->hwep = ch->hw_ep->epnum;
 		__entry->port = ch->port_num;
 		__entry->is_tx = ch->is_tx;
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 362e1fc7ef6a..84fcf26e306e 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(fs_str,
 
 	TP_fast_assign(
 		__entry->dev		= c->dev;
-		__assign_str(str, str);
+		__assign_str(str);
 	),
 
 	TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
@@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(trans_str,
 		__entry->dev		= trans->c->dev;
 		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
 		__entry->caller_ip		= caller_ip;
-		__assign_str(str, str);
+		__assign_str(str);
 	),
 
 	TP_printk("%d,%d %s %pS %s",
@@ -85,7 +85,7 @@ DECLARE_EVENT_CLASS(trans_str_nocaller,
 	TP_fast_assign(
 		__entry->dev		= trans->c->dev;
 		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
-		__assign_str(str, str);
+		__assign_str(str);
 	),
 
 	TP_printk("%d,%d %s %s",
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 10985a4b8259..4de8780a7c48 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -47,7 +47,7 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event,
 
 		TP_fast_assign(
 			__entry->error = error < 0 ? -error : 0;
-			__assign_str(dstaddr, clp->cl_hostname);
+			__assign_str(dstaddr);
 		),
 
 		TP_printk(
@@ -94,8 +94,8 @@ TRACE_EVENT(nfs4_trunked_exchange_id,
 
 		TP_fast_assign(
 			__entry->error = error < 0 ? -error : 0;
-			__assign_str(main_addr, clp->cl_hostname);
-			__assign_str(trunk_addr, addr);
+			__assign_str(main_addr);
+			__assign_str(trunk_addr);
 		),
 
 		TP_printk(
@@ -365,7 +365,7 @@ TRACE_EVENT(nfs4_state_mgr,
 
 		TP_fast_assign(
 			__entry->state = clp->cl_state;
-			__assign_str(hostname, clp->cl_hostname);
+			__assign_str(hostname);
 		),
 
 		TP_printk(
@@ -393,8 +393,8 @@ TRACE_EVENT(nfs4_state_mgr_failed,
 		TP_fast_assign(
 			__entry->error = status < 0 ? -status : 0;
 			__entry->state = clp->cl_state;
-			__assign_str(hostname, clp->cl_hostname);
-			__assign_str(section, section);
+			__assign_str(hostname);
+			__assign_str(section);
 		),
 
 		TP_printk(
@@ -578,7 +578,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
 				__entry->fhandle = 0;
 			}
 			__entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent));
-			__assign_str(name, ctx->dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -1072,7 +1072,7 @@ DECLARE_EVENT_CLASS(nfs4_lookup_event,
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
 			__entry->error = -error;
-			__assign_str(name, name->name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -1156,8 +1156,8 @@ TRACE_EVENT(nfs4_rename,
 			__entry->olddir = NFS_FILEID(olddir);
 			__entry->newdir = NFS_FILEID(newdir);
 			__entry->error = error < 0 ? -error : 0;
-			__assign_str(oldname, oldname->name);
-			__assign_str(newname, newname->name);
+			__assign_str(oldname);
+			__assign_str(newname);
 		),
 
 		TP_printk(
@@ -1359,7 +1359,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
 				__entry->fileid = 0;
 				__entry->dev = 0;
 			}
-			__assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+			__assign_str(dstaddr);
 		),
 
 		TP_printk(
@@ -1416,7 +1416,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
 				__entry->fileid = 0;
 				__entry->dev = 0;
 			}
-			__assign_str(dstaddr, clp ? clp->cl_hostname : "unknown");
+			__assign_str(dstaddr);
 			__entry->stateid_seq =
 				be32_to_cpu(stateid->seqid);
 			__entry->stateid_hash =
@@ -1960,7 +1960,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_event,
 		),
 
 		TP_fast_assign(
-			__assign_str(dstaddr, clp->cl_hostname);
+			__assign_str(dstaddr);
 			memcpy(__entry->deviceid, deviceid->data,
 			       NFS4_DEVICEID4_SIZE);
 		),
@@ -1998,7 +1998,7 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status,
 		TP_fast_assign(
 			__entry->dev = server->s_dev;
 			__entry->status = status;
-			__assign_str(dstaddr, server->nfs_client->cl_hostname);
+			__assign_str(dstaddr);
 			memcpy(__entry->deviceid, deviceid->data,
 			       NFS4_DEVICEID4_SIZE);
 		),
@@ -2036,8 +2036,8 @@ TRACE_EVENT(fl_getdevinfo,
 		),
 
 		TP_fast_assign(
-			__assign_str(mds_addr, server->nfs_client->cl_hostname);
-			__assign_str(ds_ips, ds_remotestr);
+			__assign_str(mds_addr);
+			__assign_str(ds_ips);
 			memcpy(__entry->deviceid, deviceid->data,
 			       NFS4_DEVICEID4_SIZE);
 		),
@@ -2083,9 +2083,7 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
 				be32_to_cpu(hdr->args.stateid.seqid);
 			__entry->stateid_hash =
 				nfs_stateid_hash(&hdr->args.stateid);
-			__assign_str(dstaddr, hdr->ds_clp ?
-				rpc_peeraddr2str(hdr->ds_clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR) : "unknown");
+			__assign_str(dstaddr);
 		),
 
 		TP_printk(
@@ -2139,9 +2137,7 @@ TRACE_EVENT(ff_layout_commit_error,
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->offset = data->args.offset;
 			__entry->count = data->args.count;
-			__assign_str(dstaddr, data->ds_clp ?
-				rpc_peeraddr2str(data->ds_clp->cl_rpcclient,
-					RPC_DISPLAY_ADDR) : "unknown");
+			__assign_str(dstaddr);
 		),
 
 		TP_printk(
@@ -2579,7 +2575,7 @@ DECLARE_EVENT_CLASS(nfs4_xattr_event,
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
-			__assign_str(name, name);
+			__assign_str(name);
 		),
 
 		TP_printk(
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index afedb449b54f..1e710654af11 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
 			__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -457,7 +457,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
 			__entry->error = error < 0 ? -error : 0;
 			__entry->flags = flags;
 			__entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -512,7 +512,7 @@ TRACE_EVENT(nfs_atomic_open_enter,
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
 			__entry->fmode = (__force unsigned long)ctx->mode;
-			__assign_str(name, ctx->dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -551,7 +551,7 @@ TRACE_EVENT(nfs_atomic_open_exit,
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
 			__entry->fmode = (__force unsigned long)ctx->mode;
-			__assign_str(name, ctx->dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -587,7 +587,7 @@ TRACE_EVENT(nfs_create_enter,
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -623,7 +623,7 @@ TRACE_EVENT(nfs_create_exit,
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
 			__entry->flags = flags;
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -654,7 +654,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event,
 		TP_fast_assign(
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -693,7 +693,7 @@ DECLARE_EVENT_CLASS(nfs_directory_event_done,
 			__entry->dev = dir->i_sb->s_dev;
 			__entry->dir = NFS_FILEID(dir);
 			__entry->error = error < 0 ? -error : 0;
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -747,7 +747,7 @@ TRACE_EVENT(nfs_link_enter,
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->dir = NFS_FILEID(dir);
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -783,7 +783,7 @@ TRACE_EVENT(nfs_link_exit,
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->dir = NFS_FILEID(dir);
 			__entry->error = error < 0 ? -error : 0;
-			__assign_str(name, dentry->d_name.name);
+			__assign_str(name);
 		),
 
 		TP_printk(
@@ -819,8 +819,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event,
 			__entry->dev = old_dir->i_sb->s_dev;
 			__entry->old_dir = NFS_FILEID(old_dir);
 			__entry->new_dir = NFS_FILEID(new_dir);
-			__assign_str(old_name, old_dentry->d_name.name);
-			__assign_str(new_name, new_dentry->d_name.name);
+			__assign_str(old_name);
+			__assign_str(new_name);
 		),
 
 		TP_printk(
@@ -868,8 +868,8 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
 			__entry->error = -error;
 			__entry->old_dir = NFS_FILEID(old_dir);
 			__entry->new_dir = NFS_FILEID(new_dir);
-			__assign_str(old_name, old_dentry->d_name.name);
-			__assign_str(new_name, new_dentry->d_name.name);
+			__assign_str(old_name);
+			__assign_str(new_name);
 		),
 
 		TP_printk(
@@ -1636,8 +1636,8 @@ TRACE_EVENT(nfs_mount_assign,
 	),
 
 	TP_fast_assign(
-		__assign_str(option, option);
-		__assign_str(value, value);
+		__assign_str(option);
+		__assign_str(value);
 	),
 
 	TP_printk("option %s=%s",
@@ -1657,7 +1657,7 @@ TRACE_EVENT(nfs_mount_option,
 	),
 
 	TP_fast_assign(
-		__assign_str(option, param->key);
+		__assign_str(option);
 	),
 
 	TP_printk("option %s", __get_str(option))
@@ -1675,7 +1675,7 @@ TRACE_EVENT(nfs_mount_path,
 	),
 
 	TP_fast_assign(
-		__assign_str(path, path);
+		__assign_str(path);
 	),
 
 	TP_printk("path='%s'", __get_str(path))
@@ -1710,9 +1710,8 @@ DECLARE_EVENT_CLASS(nfs_xdr_event,
 			__entry->xid = be32_to_cpu(rqstp->rq_xid);
 			__entry->version = task->tk_client->cl_vers;
 			__entry->error = error;
-			__assign_str(program,
-				     task->tk_client->cl_program->name);
-			__assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+			__assign_str(program);
+			__assign_str(procedure);
 		),
 
 		TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index b5e48d504062..77bbd23aa150 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound,
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->opcnt = opcnt;
-		__assign_str(tag, tag);
+		__assign_str(tag);
 	),
 	TP_printk("xid=0x%08x opcnt=%u tag=%s",
 		__entry->xid, __entry->opcnt, __get_str(tag)
@@ -127,7 +127,7 @@ TRACE_EVENT(nfsd_compound_status,
 		__entry->args_opcnt = args_opcnt;
 		__entry->resp_opcnt = resp_opcnt;
 		__entry->status = be32_to_cpu(status);
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("op=%u/%u %s status=%d",
 		__entry->resp_opcnt, __entry->args_opcnt,
@@ -318,7 +318,7 @@ TRACE_EVENT(nfsd_exp_find_key,
 	TP_fast_assign(
 		__entry->fsidtype = key->ek_fsidtype;
 		memcpy(__entry->fsid, key->ek_fsid, 4*6);
-		__assign_str(auth_domain, key->ek_client->name);
+		__assign_str(auth_domain);
 		__entry->status = status;
 	),
 	TP_printk("fsid=%x::%s domain=%s status=%d",
@@ -342,8 +342,8 @@ TRACE_EVENT(nfsd_expkey_update,
 	TP_fast_assign(
 		__entry->fsidtype = key->ek_fsidtype;
 		memcpy(__entry->fsid, key->ek_fsid, 4*6);
-		__assign_str(auth_domain, key->ek_client->name);
-		__assign_str(path, exp_path);
+		__assign_str(auth_domain);
+		__assign_str(path);
 		__entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
 	),
 	TP_printk("fsid=%x::%s domain=%s path=%s cache=%s",
@@ -365,8 +365,8 @@ TRACE_EVENT(nfsd_exp_get_by_name,
 		__field(int, status)
 	),
 	TP_fast_assign(
-		__assign_str(path, key->ex_path.dentry->d_name.name);
-		__assign_str(auth_domain, key->ex_client->name);
+		__assign_str(path);
+		__assign_str(auth_domain);
 		__entry->status = status;
 	),
 	TP_printk("path=%s domain=%s status=%d",
@@ -385,8 +385,8 @@ TRACE_EVENT(nfsd_export_update,
 		__field(bool, cache)
 	),
 	TP_fast_assign(
-		__assign_str(path, key->ex_path.dentry->d_name.name);
-		__assign_str(auth_domain, key->ex_client->name);
+		__assign_str(path);
+		__assign_str(auth_domain);
 		__entry->cache = !test_bit(CACHE_NEGATIVE, &key->h.flags);
 	),
 	TP_printk("path=%s domain=%s cache=%s",
@@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent,
 	TP_fast_assign(
 		__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
 		__entry->ino = ino;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("fh_hash=0x%08x ino=%llu name=%s",
 		__entry->fh_hash, __entry->ino, __get_str(name)
@@ -1000,7 +1000,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
 		__entry->flavor = clp->cl_cred.cr_flavor;
 		memcpy(__entry->verifier, (void *)&clp->cl_verifier,
 		       NFS4_VERIFIER_SIZE);
-		__assign_str(name, clp->cl_name.data);
+		__assign_str(name);
 	),
 	TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
 		__entry->addr, __get_str(name),
@@ -1519,7 +1519,7 @@ TRACE_EVENT(nfsd_cb_setup,
 	TP_fast_assign(
 		__entry->cl_boot = clp->cl_clientid.cl_boot;
 		__entry->cl_id = clp->cl_clientid.cl_id;
-		__assign_str(netid, netid);
+		__assign_str(netid);
 		__entry->authflavor = authflavor;
 		__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
 				  clp->cl_cb_conn.cb_addrlen)
@@ -1864,7 +1864,7 @@ TRACE_EVENT(nfsd_ctl_unlock_ip,
 	),
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
-		__assign_str(address, address);
+		__assign_str(address);
 	),
 	TP_printk("address=%s",
 		__get_str(address)
@@ -1883,7 +1883,7 @@ TRACE_EVENT(nfsd_ctl_unlock_fs,
 	),
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
-		__assign_str(path, path);
+		__assign_str(path);
 	),
 	TP_printk("path=%s",
 		__get_str(path)
@@ -1907,8 +1907,8 @@ TRACE_EVENT(nfsd_ctl_filehandle,
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
 		__entry->maxsize = maxsize;
-		__assign_str(domain, domain);
-		__assign_str(path, path);
+		__assign_str(domain);
+		__assign_str(path);
 	),
 	TP_printk("domain=%s path=%s maxsize=%d",
 		__get_str(domain), __get_str(path), __entry->maxsize
@@ -1968,7 +1968,7 @@ TRACE_EVENT(nfsd_ctl_version,
 	),
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
-		__assign_str(mesg, mesg);
+		__assign_str(mesg);
 	),
 	TP_printk("%s",
 		__get_str(mesg)
@@ -2009,7 +2009,7 @@ TRACE_EVENT(nfsd_ctl_ports_addxprt,
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
 		__entry->port = port;
-		__assign_str(transport, transport);
+		__assign_str(transport);
 	),
 	TP_printk("transport=%s port=%d",
 		__get_str(transport), __entry->port
@@ -2070,7 +2070,7 @@ TRACE_EVENT(nfsd_ctl_time,
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
 		__entry->time = time;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("file=%s time=%d",
 		__get_str(name), __entry->time
@@ -2089,7 +2089,7 @@ TRACE_EVENT(nfsd_ctl_recoverydir,
 	),
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
-		__assign_str(recdir, recdir);
+		__assign_str(recdir);
 	),
 	TP_printk("recdir=%s",
 		__get_str(recdir)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 9898c11bdfa1..60e208b01c8d 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(ocfs2__string,
 		__string(name,name)
 	),
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("%s", __get_str(name))
 );
@@ -1289,7 +1289,7 @@ DECLARE_EVENT_CLASS(ocfs2__file_ops,
 		__entry->dentry = dentry;
 		__entry->ino = ino;
 		__entry->d_len = d_len;
-		__assign_str(d_name, d_name);
+		__assign_str(d_name);
 		__entry->para = para;
 	),
 	TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
@@ -1425,7 +1425,7 @@ TRACE_EVENT(ocfs2_setattr,
 		__entry->dentry = dentry;
 		__entry->ino = ino;
 		__entry->d_len = d_len;
-		__assign_str(d_name, d_name);
+		__assign_str(d_name);
 		__entry->ia_valid = ia_valid;
 		__entry->ia_mode = ia_mode;
 		__entry->ia_uid = ia_uid;
@@ -1683,7 +1683,7 @@ TRACE_EVENT(ocfs2_parse_options,
 	),
 	TP_fast_assign(
 		__entry->is_remount = is_remount;
-		__assign_str(options, options);
+		__assign_str(options);
 	),
 	TP_printk("%d %s", __entry->is_remount, __get_str(options))
 );
@@ -1718,8 +1718,8 @@ TRACE_EVENT(ocfs2_initialize_super,
 		__field(int, cluster_bits)
 	),
 	TP_fast_assign(
-		__assign_str(label, label);
-		__assign_str(uuid_str, uuid_str);
+		__assign_str(label);
+		__assign_str(uuid_str);
 		__entry->root_dir = root_dir;
 		__entry->system_dir = system_dir;
 		__entry->cluster_bits = cluster_bits;
@@ -1746,7 +1746,7 @@ TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
 		__field(int, credits)
 	),
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->meta = meta;
 		__entry->clusters = clusters;
 		__entry->credits = credits;
@@ -1770,7 +1770,7 @@ DECLARE_EVENT_CLASS(ocfs2__xattr_find,
 	),
 	TP_fast_assign(
 		__entry->ino = ino;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->name_index = name_index;
 		__entry->hash = hash;
 		__entry->location = location;
@@ -2019,7 +2019,7 @@ TRACE_EVENT(ocfs2_sync_dquot_helper,
 		__entry->dq_id = dq_id;
 		__entry->dq_type = dq_type;
 		__entry->type = type;
-		__assign_str(s_id, s_id);
+		__assign_str(s_id);
 	),
 	TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
 		  __entry->type, __get_str(s_id))
@@ -2060,7 +2060,7 @@ TRACE_EVENT(ocfs2_dx_dir_search,
 	TP_fast_assign(
 		__entry->ino = ino;
 		__entry->namelen = namelen;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->major_hash = major_hash;
 		__entry->minor_hash = minor_hash;
 		__entry->blkno = blkno;
@@ -2088,7 +2088,7 @@ TRACE_EVENT(ocfs2_find_files_on_disk,
 	),
 	TP_fast_assign(
 		__entry->namelen = namelen;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->blkno = blkno;
 		__entry->dir = dir;
 	),
@@ -2107,7 +2107,7 @@ TRACE_EVENT(ocfs2_check_dir_for_entry,
 	TP_fast_assign(
 		__entry->dir = dir;
 		__entry->namelen = namelen;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("%llu %.*s", __entry->dir,
 		  __entry->namelen, __get_str(name))
@@ -2135,7 +2135,7 @@ TRACE_EVENT(ocfs2_dx_dir_index_root_block,
 		__entry->major_hash = major_hash;
 		__entry->minor_hash = minor_hash;
 		__entry->namelen = namelen;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->num_used = num_used;
 	),
 	TP_printk("%llu %x %x %.*s %u", __entry->dir,
@@ -2171,7 +2171,7 @@ DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
 		__entry->dir = dir;
 		__entry->dentry = dentry;
 		__entry->name_len = name_len;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->dir_blkno = dir_blkno;
 		__entry->extra = extra;
 	),
@@ -2217,7 +2217,7 @@ TRACE_EVENT(ocfs2_mknod,
 		__entry->dir = dir;
 		__entry->dentry = dentry;
 		__entry->name_len = name_len;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->dir_blkno = dir_blkno;
 		__entry->dev = dev;
 		__entry->mode = mode;
@@ -2241,9 +2241,9 @@ TRACE_EVENT(ocfs2_link,
 	TP_fast_assign(
 		__entry->ino = ino;
 		__entry->old_len = old_len;
-		__assign_str(old_name, old_name);
+		__assign_str(old_name);
 		__entry->name_len = name_len;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("%llu %.*s %.*s", __entry->ino,
 		  __entry->old_len, __get_str(old_name),
@@ -2279,9 +2279,9 @@ TRACE_EVENT(ocfs2_rename,
 		__entry->new_dir = new_dir;
 		__entry->new_dentry = new_dentry;
 		__entry->old_len = old_len;
-		__assign_str(old_name, old_name);
+		__assign_str(old_name);
 		__entry->new_len = new_len;
-		__assign_str(new_name, new_name);
+		__assign_str(new_name);
 	),
 	TP_printk("%p %p %p %p %.*s %.*s",
 		  __entry->old_dir, __entry->old_dentry,
@@ -2301,7 +2301,7 @@ TRACE_EVENT(ocfs2_rename_target_exists,
 	),
 	TP_fast_assign(
 		__entry->new_len = new_len;
-		__assign_str(new_name, new_name);
+		__assign_str(new_name);
 	),
 	TP_printk("%.*s", __entry->new_len, __get_str(new_name))
 );
@@ -2344,7 +2344,7 @@ TRACE_EVENT(ocfs2_symlink_begin,
 		__entry->dentry = dentry;
 		__entry->symname = symname;
 		__entry->len = len;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
 		  __entry->symname, __entry->len, __get_str(name))
@@ -2360,7 +2360,7 @@ TRACE_EVENT(ocfs2_blkno_stringify,
 	),
 	TP_fast_assign(
 		__entry->blkno = blkno;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->namelen = namelen;
 	),
 	TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
@@ -2381,7 +2381,7 @@ TRACE_EVENT(ocfs2_orphan_del,
 	),
 	TP_fast_assign(
 		__entry->dir = dir;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->namelen = namelen;
 	),
 	TP_printk("%llu %s %d", __entry->dir, __get_str(name),
@@ -2403,7 +2403,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate,
 	TP_fast_assign(
 		__entry->dentry = dentry;
 		__entry->len = len;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
 );
@@ -2420,7 +2420,7 @@ TRACE_EVENT(ocfs2_dentry_revalidate_negative,
 	),
 	TP_fast_assign(
 		__entry->len = len;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->pgen = pgen;
 		__entry->gen = gen;
 	),
@@ -2445,7 +2445,7 @@ TRACE_EVENT(ocfs2_find_local_alias,
 	),
 	TP_fast_assign(
 		__entry->len = len;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 	TP_printk("%.*s", __entry->len, __get_str(name))
 );
@@ -2462,7 +2462,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock,
 	),
 	TP_fast_assign(
 		__entry->len = len;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->parent = parent;
 		__entry->fsdata = fsdata;
 	),
@@ -2480,7 +2480,7 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found,
 		__field(unsigned long long, ino)
 	),
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->parent = parent;
 		__entry->ino = ino;
 	),
@@ -2527,7 +2527,7 @@ TRACE_EVENT(ocfs2_get_parent,
 	TP_fast_assign(
 		__entry->child = child;
 		__entry->len = len;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->ino = ino;
 	),
 	TP_printk("%p %.*s %llu", __entry->child, __entry->len,
@@ -2551,7 +2551,7 @@ TRACE_EVENT(ocfs2_encode_fh_begin,
 	TP_fast_assign(
 		__entry->dentry = dentry;
 		__entry->name_len = name_len;
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->fh = fh;
 		__entry->len = len;
 		__entry->connectable = connectable;
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index af97389e983e..36d47ce59631 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -518,7 +518,7 @@ DECLARE_EVENT_CLASS(smb3_inf_compound_enter_class,
 		__entry->xid = xid;
 		__entry->tid = tid;
 		__entry->sesid = sesid;
-		__assign_str(path, full_path);
+		__assign_str(path);
 	),
 	TP_printk("xid=%u sid=0x%llx tid=0x%x path=%s",
 		__entry->xid, __entry->sesid, __entry->tid,
@@ -762,7 +762,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
 	),
 	TP_fast_assign(
 		__entry->xid = xid;
-		__assign_str(func_name, func_name);
+		__assign_str(func_name);
 		__entry->rc = rc;
 	),
 	TP_printk("\t%s: xid=%u rc=%d",
@@ -815,7 +815,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
 	),
 	TP_fast_assign(
 		__entry->xid = xid;
-		__assign_str(func_name, func_name);
+		__assign_str(func_name);
 	),
 	TP_printk("\t%s: xid=%u",
 		__get_str(func_name), __entry->xid)
@@ -852,7 +852,7 @@ DECLARE_EVENT_CLASS(smb3_tcon_class,
 		__entry->xid = xid;
 		__entry->tid = tid;
 		__entry->sesid = sesid;
-		__assign_str(name, unc_name);
+		__assign_str(name);
 		__entry->rc = rc;
 	),
 	TP_printk("xid=%u sid=0x%llx tid=0x%x unc_name=%s rc=%d",
@@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_open_enter_class,
 		__entry->xid = xid;
 		__entry->tid = tid;
 		__entry->sesid = sesid;
-		__assign_str(path, full_path);
+		__assign_str(path);
 		__entry->create_options = create_options;
 		__entry->desired_access = desired_access;
 	),
@@ -1098,7 +1098,7 @@ DECLARE_EVENT_CLASS(smb3_connect_class,
 		__entry->conn_id = conn_id;
 		pss = (struct sockaddr_storage *)__entry->dst_addr;
 		*pss = *dst_addr;
-		__assign_str(hostname, hostname);
+		__assign_str(hostname);
 	),
 	TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc",
 		__entry->conn_id,
@@ -1134,7 +1134,7 @@ DECLARE_EVENT_CLASS(smb3_connect_err_class,
 		__entry->rc = rc;
 		pss = (struct sockaddr_storage *)__entry->dst_addr;
 		*pss = *dst_addr;
-		__assign_str(hostname, hostname);
+		__assign_str(hostname);
 	),
 	TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc",
 		__entry->rc,
@@ -1166,7 +1166,7 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class,
 	TP_fast_assign(
 		__entry->currmid = currmid;
 		__entry->conn_id = conn_id;
-		__assign_str(hostname, hostname);
+		__assign_str(hostname);
 	),
 	TP_printk("conn_id=0x%llx server=%s current_mid=%llu",
 		__entry->conn_id,
@@ -1255,7 +1255,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class,
 	TP_fast_assign(
 		__entry->currmid = currmid;
 		__entry->conn_id = conn_id;
-		__assign_str(hostname, hostname);
+		__assign_str(hostname);
 		__entry->credits = credits;
 		__entry->credits_to_add = credits_to_add;
 		__entry->in_flight = in_flight;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index e27daa51cab6..92ef4cdc486e 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -561,7 +561,7 @@ TRACE_EVENT(xchk_btree_op_error,
 
 		__entry->dev = sc->mp->m_super->s_dev;
 		__entry->type = sc->sm->sm_type;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -604,7 +604,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
 		__entry->ino = sc->ip->i_ino;
 		__entry->whichfork = cur->bc_ino.whichfork;
 		__entry->type = sc->sm->sm_type;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->ptr = cur->bc_levels[level].ptr;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -644,7 +644,7 @@ TRACE_EVENT(xchk_btree_error,
 		xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
 		__entry->dev = sc->mp->m_super->s_dev;
 		__entry->type = sc->sm->sm_type;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -684,7 +684,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
 		__entry->ino = sc->ip->i_ino;
 		__entry->whichfork = cur->bc_ino.whichfork;
 		__entry->type = sc->sm->sm_type;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -723,7 +723,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
 
 		__entry->dev = sc->mp->m_super->s_dev;
 		__entry->type = sc->sm->sm_type;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
 		__entry->level = level;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 05cb59bd0b80..25ff6fe1eb6c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -168,7 +168,7 @@ TRACE_EVENT(xlog_intent_recovery_failed,
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
-		__assign_str(name, ops->name);
+		__assign_str(name);
 		__entry->error = error;
 	),
 	TP_printk("dev %d:%d optype %s error %d",
@@ -1913,7 +1913,7 @@ TRACE_EVENT(xfs_alloc_cur_check,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->bno = bno;
 		__entry->len = len;
 		__entry->diff = diff;
@@ -2473,7 +2473,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->ptr = cur->bc_levels[level].ptr;
@@ -2523,7 +2523,7 @@ TRACE_EVENT(xfs_btree_alloc_block,
 			__entry->ino = 0;
 			break;
 		}
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->error = error;
 		if (!error && stat) {
 			if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
@@ -2567,7 +2567,7 @@ TRACE_EVENT(xfs_btree_free_block,
 			__entry->ino = cur->bc_ino.ip->i_ino;
 		else
 			__entry->ino = 0;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
 							xfs_buf_daddr(bp));
 	),
@@ -2643,7 +2643,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
-		__assign_str(name, dfp->dfp_ops->name);
+		__assign_str(name);
 		__entry->intent = dfp->dfp_intent;
 		__entry->flags = dfp->dfp_flags;
 		__entry->committed = dfp->dfp_done != NULL;
@@ -2732,7 +2732,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
-		__assign_str(name, dfp->dfp_ops->name);
+		__assign_str(name);
 		__entry->intent = dfp->dfp_intent;
 		__entry->item = item;
 		__entry->committed = dfp->dfp_done != NULL;
@@ -4244,7 +4244,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->agbno = cur->bc_ag.afake->af_root;
 		__entry->levels = cur->bc_ag.afake->af_levels;
@@ -4273,7 +4273,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
 					cur->bc_ino.ip->i_ino);
 		__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4312,7 +4312,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->nr_this_level = nr_this_level;
@@ -4350,7 +4350,7 @@ TRACE_EVENT(xfs_btree_bload_block,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__assign_str(name, cur->bc_ops->name);
+		__assign_str(name);
 		__entry->level = level;
 		__entry->block_idx = block_idx;
 		__entry->nr_blocks = nr_blocks;
@@ -4573,7 +4573,7 @@ TRACE_EVENT(xfs_force_shutdown,
 		__entry->dev = mp->m_super->s_dev;
 		__entry->ptag = ptag;
 		__entry->flags = flags;
-		__assign_str(fname, fname);
+		__assign_str(fname);
 		__entry->line_num = line_num;
 	),
 	TP_printk("dev %d:%d tag %s flags %s file %s line_num %d",
@@ -4755,7 +4755,7 @@ DECLARE_EVENT_CLASS(xfbtree_freesp_class,
 	),
 	TP_fast_assign(
 		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
-		__assign_str(btname, cur->bc_ops->name);
+		__assign_str(btname);
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->fileoff = fileoff;
 	),
@@ -5122,7 +5122,7 @@ DECLARE_EVENT_CLASS(xfs_getparents_rec_class,
 		__entry->bufsize = ppi->gp_bufsize;
 		__entry->parent_ino = pptr->gpr_parent.ha_fid.fid_ino;
 		__entry->parent_gen = pptr->gpr_parent.ha_fid.fid_gen;
-		__assign_str(name, pptr->gpr_name);
+		__assign_str(name);
 	),
 	TP_printk("dev %d:%d ino 0x%llx firstu %u reclen %u bufsize %u parent_ino 0x%llx parent_gen 0x%x name '%s'",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index c011ea236e9b..7c47151d5c72 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -61,7 +61,7 @@ TRACE_EVENT(extlog_mem_event,
 		else
 			__entry->pa_mask_lsb = ~0;
 		__entry->fru_id = *fru_id;
-		__assign_str(fru_text, fru_text);
+		__assign_str(fru_text);
 		cper_mem_err_pack(mem, &__entry->data);
 	),
 
@@ -131,8 +131,8 @@ TRACE_EVENT(mc_event,
 
 	TP_fast_assign(
 		__entry->error_type		= err_type;
-		__assign_str(msg, error_msg);
-		__assign_str(label, label);
+		__assign_str(msg);
+		__assign_str(label);
 		__entry->error_count		= error_count;
 		__entry->mc_index		= mc_index;
 		__entry->top_layer		= top_layer;
@@ -141,7 +141,7 @@ TRACE_EVENT(mc_event,
 		__entry->address		= address;
 		__entry->grain_bits		= grain_bits;
 		__entry->syndrome		= syndrome;
-		__assign_str(driver_detail, driver_detail);
+		__assign_str(driver_detail);
 	),
 
 	TP_printk("%d %s error%s:%s%s on %s (mc:%d location:%d:%d:%d address:0x%08lx grain:%d syndrome:0x%08lx%s%s)",
@@ -239,7 +239,7 @@ TRACE_EVENT(non_standard_event,
 	TP_fast_assign(
 		memcpy(__entry->sec_type, sec_type, UUID_SIZE);
 		memcpy(__entry->fru_id, fru_id, UUID_SIZE);
-		__assign_str(fru_text, fru_text);
+		__assign_str(fru_text);
 		__entry->sev = sev;
 		__entry->len = len;
 		memcpy(__get_dynamic_array(buf), err, len);
@@ -313,7 +313,7 @@ TRACE_EVENT(aer_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
+		__assign_str(dev_name);
 		__entry->status		= status;
 		__entry->severity	= severity;
 		__entry->tlp_header_valid = tlp_header_valid;
diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h
index 517015ef36a8..202fc3680c36 100644
--- a/include/trace/events/asoc.h
+++ b/include/trace/events/asoc.h
@@ -32,8 +32,8 @@ DECLARE_EVENT_CLASS(snd_soc_dapm,
 	),
 
 	TP_fast_assign(
-		__assign_str(card_name, dapm->card->name);
-		__assign_str(comp_name, dapm->component ? dapm->component->name : "(none)");
+		__assign_str(card_name);
+		__assign_str(comp_name);
 		__entry->val = val;
 	),
 
@@ -69,7 +69,7 @@ DECLARE_EVENT_CLASS(snd_soc_dapm_basic,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, card->name);
+		__assign_str(name);
 		__entry->event = event;
 	),
 
@@ -104,7 +104,7 @@ DECLARE_EVENT_CLASS(snd_soc_dapm_widget,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, w->name);
+		__assign_str(name);
 		__entry->val = val;
 	),
 
@@ -150,7 +150,7 @@ TRACE_EVENT(snd_soc_dapm_walk_done,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, card->name);
+		__assign_str(name);
 		__entry->power_checks = card->dapm_stats.power_checks;
 		__entry->path_checks = card->dapm_stats.path_checks;
 		__entry->neighbour_checks = card->dapm_stats.neighbour_checks;
@@ -179,9 +179,9 @@ TRACE_EVENT(snd_soc_dapm_path,
 	),
 
 	TP_fast_assign(
-		__assign_str(wname, widget->name);
-		__assign_str(pname, path->name ? path->name : DAPM_DIRECT);
-		__assign_str(pnname, path->node[dir]->name);
+		__assign_str(wname);
+		__assign_str(pname);
+		__assign_str(pnname);
 		__entry->path_connect = path->connect;
 		__entry->path_node = (long)path->node[dir];
 		__entry->path_dir = dir;
@@ -226,7 +226,7 @@ TRACE_EVENT(snd_soc_jack_irq,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 
 	TP_printk("%s", __get_str(name))
@@ -245,7 +245,7 @@ TRACE_EVENT(snd_soc_jack_report,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, jack->jack->id);
+		__assign_str(name);
 		__entry->mask = mask;
 		__entry->val = val;
 	),
@@ -266,7 +266,7 @@ TRACE_EVENT(snd_soc_jack_notify,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, jack->jack->id);
+		__assign_str(name);
 		__entry->val = val;
 	),
 
diff --git a/include/trace/events/avc.h b/include/trace/events/avc.h
index b55fda2e0773..fed0f141d5f6 100644
--- a/include/trace/events/avc.h
+++ b/include/trace/events/avc.h
@@ -36,9 +36,9 @@ TRACE_EVENT(selinux_audited,
 		__entry->denied		= sad->denied;
 		__entry->audited	= sad->audited;
 		__entry->result		= sad->result;
-		__assign_str(tcontext, tcontext);
-		__assign_str(scontext, scontext);
-		__assign_str(tclass, tclass);
+		__assign_str(tcontext);
+		__assign_str(scontext);
+		__assign_str(tclass);
 	),
 
 	TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s",
diff --git a/include/trace/events/bridge.h b/include/trace/events/bridge.h
index a6b3a4e409f0..3fe4725c83ff 100644
--- a/include/trace/events/bridge.h
+++ b/include/trace/events/bridge.h
@@ -25,7 +25,7 @@ TRACE_EVENT(br_fdb_add,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev->name);
+		__assign_str(dev);
 		memcpy(__entry->addr, addr, ETH_ALEN);
 		__entry->vid = vid;
 		__entry->nlh_flags = nlh_flags;
@@ -54,8 +54,8 @@ TRACE_EVENT(br_fdb_external_learn_add,
 	),
 
 	TP_fast_assign(
-		__assign_str(br_dev, br->dev->name);
-		__assign_str(dev, p ? p->dev->name : "null");
+		__assign_str(br_dev);
+		__assign_str(dev);
 		memcpy(__entry->addr, addr, ETH_ALEN);
 		__entry->vid = vid;
 	),
@@ -80,8 +80,8 @@ TRACE_EVENT(fdb_delete,
 	),
 
 	TP_fast_assign(
-		__assign_str(br_dev, br->dev->name);
-		__assign_str(dev, f->dst ? f->dst->dev->name : "null");
+		__assign_str(br_dev);
+		__assign_str(dev);
 		memcpy(__entry->addr, f->key.addr.addr, ETH_ALEN);
 		__entry->vid = f->key.vlan_id;
 	),
@@ -108,8 +108,8 @@ TRACE_EVENT(br_fdb_update,
 	),
 
 	TP_fast_assign(
-		__assign_str(br_dev, br->dev->name);
-		__assign_str(dev, source->dev->name);
+		__assign_str(br_dev);
+		__assign_str(dev);
 		memcpy(__entry->addr, addr, ETH_ALEN);
 		__entry->vid = vid;
 		__entry->flags = flags;
@@ -141,7 +141,7 @@ TRACE_EVENT(br_mdb_full,
 	TP_fast_assign(
 		struct in6_addr *in6;
 
-		__assign_str(dev, dev->name);
+		__assign_str(dev);
 		__entry->vid = group->vid;
 
 		if (!group->proto) {
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index d2d94d7c3fb5..fadf406b5260 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1140,7 +1140,7 @@ TRACE_EVENT(btrfs_space_reservation,
 	),
 
 	TP_fast_assign_btrfs(fs_info,
-		__assign_str(type, type);
+		__assign_str(type);
 		__entry->val		= val;
 		__entry->bytes		= bytes;
 		__entry->reserve	= reserve;
@@ -1169,7 +1169,7 @@ TRACE_EVENT(btrfs_trigger_flush,
 		__entry->flags	= flags;
 		__entry->bytes	= bytes;
 		__entry->flush	= flush;
-		__assign_str(reason, reason);
+		__assign_str(reason);
 	),
 
 	TP_printk_btrfs("%s: flush=%d(%s) flags=%llu(%s) bytes=%llu",
@@ -1622,7 +1622,7 @@ DECLARE_EVENT_CLASS(btrfs_workqueue,
 
 	TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
 		__entry->wq		= wq;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 
 	TP_printk_btrfs("name=%s wq=%p", __get_str(name),
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index 0b95865a90f3..af2755bda6eb 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(cgroup_root,
 	TP_fast_assign(
 		__entry->root = root->hierarchy_id;
 		__entry->ss_mask = root->subsys_mask;
-		__assign_str(name, root->name);
+		__assign_str(name);
 	),
 
 	TP_printk("root=%d ss_mask=%#x name=%s",
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(cgroup,
 		__entry->root = cgrp->root->hierarchy_id;
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
-		__assign_str(path, path);
+		__assign_str(path);
 	),
 
 	TP_printk("root=%d id=%llu level=%d path=%s",
@@ -137,9 +137,9 @@ DECLARE_EVENT_CLASS(cgroup_migrate,
 		__entry->dst_root = dst_cgrp->root->hierarchy_id;
 		__entry->dst_id = cgroup_id(dst_cgrp);
 		__entry->dst_level = dst_cgrp->level;
-		__assign_str(dst_path, path);
+		__assign_str(dst_path);
 		__entry->pid = task->pid;
-		__assign_str(comm, task->comm);
+		__assign_str(comm);
 	),
 
 	TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
@@ -181,7 +181,7 @@ DECLARE_EVENT_CLASS(cgroup_event,
 		__entry->root = cgrp->root->hierarchy_id;
 		__entry->id = cgroup_id(cgrp);
 		__entry->level = cgrp->level;
-		__assign_str(path, path);
+		__assign_str(path);
 		__entry->val = val;
 	),
 
diff --git a/include/trace/events/clk.h b/include/trace/events/clk.h
index daed3c7a48c1..759f7371a6dc 100644
--- a/include/trace/events/clk.h
+++ b/include/trace/events/clk.h
@@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(clk,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, core->name);
+		__assign_str(name);
 	),
 
 	TP_printk("%s", __get_str(name))
@@ -97,7 +97,7 @@ DECLARE_EVENT_CLASS(clk_rate,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, core->name);
+		__assign_str(name);
 		__entry->rate = rate;
 	),
 
@@ -145,7 +145,7 @@ DECLARE_EVENT_CLASS(clk_rate_range,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, core->name);
+		__assign_str(name);
 		__entry->min = min;
 		__entry->max = max;
 	),
@@ -174,8 +174,8 @@ DECLARE_EVENT_CLASS(clk_parent,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, core->name);
-		__assign_str(pname, parent ? parent->name : "none");
+		__assign_str(name);
+		__assign_str(pname);
 	),
 
 	TP_printk("%s %s", __get_str(name), __get_str(pname))
@@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(clk_phase,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, core->name);
+		__assign_str(name);
 		__entry->phase = phase;
 	),
 
@@ -241,7 +241,7 @@ DECLARE_EVENT_CLASS(clk_duty_cycle,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, core->name);
+		__assign_str(name);
 		__entry->num = duty->num;
 		__entry->den = duty->den;
 	),
@@ -279,8 +279,8 @@ DECLARE_EVENT_CLASS(clk_rate_request,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, req->core ? req->core->name : "none");
-		__assign_str(pname, req->best_parent_hw ? clk_hw_get_name(req->best_parent_hw) : "none");
+		__assign_str(name);
+		__assign_str(pname);
 		__entry->min = req->min_rate;
 		__entry->max = req->max_rate;
 		__entry->prate = req->best_parent_rate;
diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h
index 25103e67737c..383c09f583ac 100644
--- a/include/trace/events/cma.h
+++ b/include/trace/events/cma.h
@@ -23,7 +23,7 @@ TRACE_EVENT(cma_release,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->pfn = pfn;
 		__entry->page = page;
 		__entry->count = count;
@@ -49,7 +49,7 @@ TRACE_EVENT(cma_alloc_start,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->count = count;
 		__entry->align = align;
 	),
@@ -77,7 +77,7 @@ TRACE_EVENT(cma_alloc_finish,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->pfn = pfn;
 		__entry->page = page;
 		__entry->count = count;
@@ -110,7 +110,7 @@ TRACE_EVENT(cma_alloc_busy_retry,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->pfn = pfn;
 		__entry->page = page;
 		__entry->count = count;
diff --git a/include/trace/events/devfreq.h b/include/trace/events/devfreq.h
index 7627c620bbda..6cbc4d59fd96 100644
--- a/include/trace/events/devfreq.h
+++ b/include/trace/events/devfreq.h
@@ -23,7 +23,7 @@ TRACE_EVENT(devfreq_frequency,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name(&devfreq->dev));
+		__assign_str(dev_name);
 		__entry->freq = freq;
 		__entry->prev_freq = prev_freq;
 		__entry->busy_time = devfreq->last_status.busy_time;
@@ -54,7 +54,7 @@ TRACE_EVENT(devfreq_monitor,
 		__entry->busy_time = devfreq->last_status.busy_time;
 		__entry->total_time = devfreq->last_status.total_time;
 		__entry->polling_ms = devfreq->profile->polling_ms;
-		__assign_str(dev_name, dev_name(&devfreq->dev));
+		__assign_str(dev_name);
 	),
 
 	TP_printk("dev_name=%-30s freq=%-12lu polling_ms=%-3u load=%-2lu",
diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h
index 77ff7cfc6049..f241e204fe6b 100644
--- a/include/trace/events/devlink.h
+++ b/include/trace/events/devlink.h
@@ -31,9 +31,9 @@ TRACE_EVENT(devlink_hwmsg,
 	),
 
 	TP_fast_assign(
-		__assign_str(bus_name, devlink_to_dev(devlink)->bus->name);
-		__assign_str(dev_name, dev_name(devlink_to_dev(devlink)));
-		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
+		__assign_str(bus_name);
+		__assign_str(dev_name);
+		__assign_str(driver_name);
 		__entry->incoming = incoming;
 		__entry->type = type;
 		memcpy(__get_dynamic_array(buf), buf, len);
@@ -63,11 +63,11 @@ TRACE_EVENT(devlink_hwerr,
 		),
 
 	TP_fast_assign(
-		__assign_str(bus_name, devlink_to_dev(devlink)->bus->name);
-		__assign_str(dev_name, dev_name(devlink_to_dev(devlink)));
-		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
+		__assign_str(bus_name);
+		__assign_str(dev_name);
+		__assign_str(driver_name);
 		__entry->err = err;
-		__assign_str(msg, msg);
+		__assign_str(msg);
 		),
 
 	TP_printk("bus_name=%s dev_name=%s driver_name=%s err=%d %s",
@@ -93,11 +93,11 @@ TRACE_EVENT(devlink_health_report,
 	),
 
 	TP_fast_assign(
-		__assign_str(bus_name, devlink_to_dev(devlink)->bus->name);
-		__assign_str(dev_name, dev_name(devlink_to_dev(devlink)));
-		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
-		__assign_str(reporter_name, reporter_name);
-		__assign_str(msg, msg);
+		__assign_str(bus_name);
+		__assign_str(dev_name);
+		__assign_str(driver_name);
+		__assign_str(reporter_name);
+		__assign_str(msg);
 	),
 
 	TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s",
@@ -125,10 +125,10 @@ TRACE_EVENT(devlink_health_recover_aborted,
 	),
 
 	TP_fast_assign(
-		__assign_str(bus_name, devlink_to_dev(devlink)->bus->name);
-		__assign_str(dev_name, dev_name(devlink_to_dev(devlink)));
-		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
-		__assign_str(reporter_name, reporter_name);
+		__assign_str(bus_name);
+		__assign_str(dev_name);
+		__assign_str(driver_name);
+		__assign_str(reporter_name);
 		__entry->health_state = health_state;
 		__entry->time_since_last_recover = time_since_last_recover;
 	),
@@ -158,10 +158,10 @@ TRACE_EVENT(devlink_health_reporter_state_update,
 	),
 
 	TP_fast_assign(
-		__assign_str(bus_name, devlink_to_dev(devlink)->bus->name);
-		__assign_str(dev_name, dev_name(devlink_to_dev(devlink)));
-		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
-		__assign_str(reporter_name, reporter_name);
+		__assign_str(bus_name);
+		__assign_str(dev_name);
+		__assign_str(driver_name);
+		__assign_str(reporter_name);
 		__entry->new_state = new_state;
 	),
 
@@ -192,11 +192,11 @@ TRACE_EVENT(devlink_trap_report,
 	TP_fast_assign(
 		struct net_device *input_dev = metadata->input_dev;
 
-		__assign_str(bus_name, devlink_to_dev(devlink)->bus->name);
-		__assign_str(dev_name, dev_name(devlink_to_dev(devlink)));
-		__assign_str(driver_name, devlink_to_dev(devlink)->driver->name);
-		__assign_str(trap_name, metadata->trap_name);
-		__assign_str(trap_group_name, metadata->trap_group_name);
+		__assign_str(bus_name);
+		__assign_str(dev_name);
+		__assign_str(driver_name);
+		__assign_str(trap_name);
+		__assign_str(trap_group_name);
 		strscpy(__entry->input_dev_name, input_dev ? input_dev->name : "NULL", IFNAMSIZ);
 	),
 
diff --git a/include/trace/events/dma_fence.h b/include/trace/events/dma_fence.h
index 3963e79ca7b4..a4de3df8500b 100644
--- a/include/trace/events/dma_fence.h
+++ b/include/trace/events/dma_fence.h
@@ -23,8 +23,8 @@ DECLARE_EVENT_CLASS(dma_fence,
 	),
 
 	TP_fast_assign(
-		__assign_str(driver, fence->ops->get_driver_name(fence));
-		__assign_str(timeline, fence->ops->get_timeline_name(fence));
+		__assign_str(driver);
+		__assign_str(timeline);
 		__entry->context = fence->context;
 		__entry->seqno = fence->seqno;
 	),
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index e18684b02c3d..b9bbfd855f2a 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -47,7 +47,7 @@ TRACE_EVENT(erofs_lookup,
 	TP_fast_assign(
 		__entry->dev	= dir->i_sb->s_dev;
 		__entry->nid	= EROFS_I(dir)->nid;
-		__assign_str(name, dentry->d_name.name);
+		__assign_str(name);
 		__entry->flags	= flags;
 	),
 
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 371ba28415f5..ed794b5fefbe 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -354,7 +354,7 @@ TRACE_EVENT(f2fs_unlink_enter,
 		__entry->ino	= dir->i_ino;
 		__entry->size	= dir->i_size;
 		__entry->blocks	= dir->i_blocks;
-		__assign_str(name, dentry->d_name.name);
+		__assign_str(name);
 	),
 
 	TP_printk("dev = (%d,%d), dir ino = %lu, i_size = %lld, "
@@ -843,7 +843,7 @@ TRACE_EVENT(f2fs_lookup_start,
 	TP_fast_assign(
 		__entry->dev	= dir->i_sb->s_dev;
 		__entry->ino	= dir->i_ino;
-		__assign_str(name, dentry->d_name.name);
+		__assign_str(name);
 		__entry->flags	= flags;
 	),
 
@@ -871,7 +871,7 @@ TRACE_EVENT(f2fs_lookup_end,
 	TP_fast_assign(
 		__entry->dev	= dir->i_sb->s_dev;
 		__entry->ino	= dir->i_ino;
-		__assign_str(name, dentry->d_name.name);
+		__assign_str(name);
 		__entry->cino	= ino;
 		__entry->err	= err;
 	),
@@ -903,9 +903,9 @@ TRACE_EVENT(f2fs_rename_start,
 	TP_fast_assign(
 		__entry->dev		= old_dir->i_sb->s_dev;
 		__entry->ino		= old_dir->i_ino;
-		__assign_str(old_name, old_dentry->d_name.name);
+		__assign_str(old_name);
 		__entry->new_pino	= new_dir->i_ino;
-		__assign_str(new_name, new_dentry->d_name.name);
+		__assign_str(new_name);
 		__entry->flags		= flags;
 	),
 
@@ -937,8 +937,8 @@ TRACE_EVENT(f2fs_rename_end,
 	TP_fast_assign(
 		__entry->dev		= old_dentry->d_sb->s_dev;
 		__entry->ino		= old_dentry->d_inode->i_ino;
-		__assign_str(old_name, old_dentry->d_name.name);
-		__assign_str(new_name, new_dentry->d_name.name);
+		__assign_str(old_name);
+		__assign_str(new_name);
 		__entry->flags		= flags;
 		__entry->ret		= ret;
 	),
@@ -1557,7 +1557,7 @@ TRACE_EVENT(f2fs_write_checkpoint,
 	TP_fast_assign(
 		__entry->dev		= sb->s_dev;
 		__entry->reason		= reason;
-		__assign_str(dest_msg, msg);
+		__assign_str(dest_msg);
 	),
 
 	TP_printk("dev = (%d,%d), checkpoint for %s, state = %s",
@@ -2333,12 +2333,12 @@ DECLARE_EVENT_CLASS(f2fs__rw_start,
 		 * because this screws up the tooling that parses
 		 * the traces.
 		 */
-		__assign_str(pathbuf, pathname);
+		__assign_str(pathbuf);
 		(void)strreplace(__get_str(pathbuf), ' ', '_');
 		__entry->offset = offset;
 		__entry->bytes = bytes;
 		__entry->i_size = i_size_read(inode);
-		__assign_str(cmdline, command);
+		__assign_str(cmdline);
 		(void)strreplace(__get_str(cmdline), ' ', '_');
 		__entry->pid = pid;
 		__entry->ino = inode->i_ino;
diff --git a/include/trace/events/habanalabs.h b/include/trace/events/habanalabs.h
index a78d21fa9f29..4a2bb2c896d1 100644
--- a/include/trace/events/habanalabs.h
+++ b/include/trace/events/habanalabs.h
@@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(habanalabs_mmu_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(dname, dev_name(dev));
+		__assign_str(dname);
 		__entry->virt_addr = virt_addr;
 		__entry->phys_addr = phys_addr;
 		__entry->page_size = page_size;
@@ -64,7 +64,7 @@ DECLARE_EVENT_CLASS(habanalabs_dma_alloc_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(dname, dev_name(dev));
+		__assign_str(dname);
 		__entry->cpu_addr = cpu_addr;
 		__entry->dma_addr = dma_addr;
 		__entry->size = size;
@@ -103,7 +103,7 @@ DECLARE_EVENT_CLASS(habanalabs_dma_map_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(dname, dev_name(dev));
+		__assign_str(dname);
 		__entry->phys_addr = phys_addr;
 		__entry->dma_addr = dma_addr;
 		__entry->len = len;
@@ -141,7 +141,7 @@ DECLARE_EVENT_CLASS(habanalabs_comms_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(dname, dev_name(dev));
+		__assign_str(dname);
 		__entry->op_str = op_str;
 	),
 
@@ -178,7 +178,7 @@ DECLARE_EVENT_CLASS(habanalabs_reg_access_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(dname, dev_name(dev));
+		__assign_str(dname);
 		__entry->addr = addr;
 		__entry->val = val;
 	),
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index ab576898a126..b5f5369b6300 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -191,7 +191,7 @@ TRACE_EVENT(mm_khugepaged_scan_file,
 	TP_fast_assign(
 		__entry->mm = mm;
 		__entry->pfn = folio ? folio_pfn(folio) : -1;
-		__assign_str(filename, file->f_path.dentry->d_iname);
+		__assign_str(filename);
 		__entry->present = present;
 		__entry->swap = swap;
 		__entry->result = result;
@@ -228,7 +228,7 @@ TRACE_EVENT(mm_khugepaged_collapse_file,
 		__entry->index = index;
 		__entry->addr = addr;
 		__entry->is_shmem = is_shmem;
-		__assign_str(filename, file->f_path.dentry->d_iname);
+		__assign_str(filename);
 		__entry->nr = nr;
 		__entry->result = result;
 	),
diff --git a/include/trace/events/hwmon.h b/include/trace/events/hwmon.h
index d7a1d0ffb679..d1ff560cd9b5 100644
--- a/include/trace/events/hwmon.h
+++ b/include/trace/events/hwmon.h
@@ -21,7 +21,7 @@ DECLARE_EVENT_CLASS(hwmon_attr_class,
 
 	TP_fast_assign(
 		__entry->index = index;
-		__assign_str(attr_name, attr_name);
+		__assign_str(attr_name);
 		__entry->val = val;
 	),
 
@@ -57,8 +57,8 @@ TRACE_EVENT(hwmon_attr_show_string,
 
 	TP_fast_assign(
 		__entry->index = index;
-		__assign_str(attr_name, attr_name);
-		__assign_str(label, s);
+		__assign_str(attr_name);
+		__assign_str(label);
 	),
 
 	TP_printk("index=%d, attr_name=%s, val=%s",
diff --git a/include/trace/events/initcall.h b/include/trace/events/initcall.h
index eb903c3f195f..5282afdf3ddf 100644
--- a/include/trace/events/initcall.h
+++ b/include/trace/events/initcall.h
@@ -18,7 +18,7 @@ TRACE_EVENT(initcall_level,
 	),
 
 	TP_fast_assign(
-		__assign_str(level, level);
+		__assign_str(level);
 	),
 
 	TP_printk("level=%s", __get_str(level))
diff --git a/include/trace/events/intel_ish.h b/include/trace/events/intel_ish.h
index e6d7ff55ee8c..64b6612c41bc 100644
--- a/include/trace/events/intel_ish.h
+++ b/include/trace/events/intel_ish.h
@@ -18,7 +18,7 @@ TRACE_EVENT(ishtp_dump,
 	),
 
 	TP_fast_assign(
-		__assign_str(message, message);
+		__assign_str(message);
 	),
 
 	TP_printk("%s", __get_str(message))
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index e948df7ce625..412c9c210a32 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -164,7 +164,7 @@ TRACE_EVENT(io_uring_queue_async_work,
 		__entry->work		= &req->work;
 		__entry->rw		= rw;
 
-		__assign_str(op_str, io_uring_get_opcode(req->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
@@ -202,7 +202,7 @@ TRACE_EVENT(io_uring_defer,
 		__entry->data	= req->cqe.user_data;
 		__entry->opcode	= req->opcode;
 
-		__assign_str(op_str, io_uring_get_opcode(req->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s",
@@ -303,7 +303,7 @@ TRACE_EVENT(io_uring_fail_link,
 		__entry->opcode		= req->opcode;
 		__entry->link		= link;
 
-		__assign_str(op_str, io_uring_get_opcode(req->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p",
@@ -392,7 +392,7 @@ TRACE_EVENT(io_uring_submit_req,
 		__entry->flags		= (__force unsigned long long) req->flags;
 		__entry->sq_thread	= req->ctx->flags & IORING_SETUP_SQPOLL;
 
-		__assign_str(op_str, io_uring_get_opcode(req->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, "
@@ -436,7 +436,7 @@ TRACE_EVENT(io_uring_poll_arm,
 		__entry->mask		= mask;
 		__entry->events		= events;
 
-		__assign_str(op_str, io_uring_get_opcode(req->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x",
@@ -475,7 +475,7 @@ TRACE_EVENT(io_uring_task_add,
 		__entry->opcode		= req->opcode;
 		__entry->mask		= mask;
 
-		__assign_str(op_str, io_uring_get_opcode(req->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x",
@@ -538,7 +538,7 @@ TRACE_EVENT(io_uring_req_failed,
 		__entry->addr3		= sqe->addr3;
 		__entry->error		= error;
 
-		__assign_str(op_str, io_uring_get_opcode(sqe->opcode));
+		__assign_str(op_str);
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, "
diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h
index af8bfed528fc..e772b1bc60d6 100644
--- a/include/trace/events/iocost.h
+++ b/include/trace/events/iocost.h
@@ -34,8 +34,8 @@ DECLARE_EVENT_CLASS(iocost_iocg_state,
 	),
 
 	TP_fast_assign(
-		__assign_str(devname, ioc_name(iocg->ioc));
-		__assign_str(cgroup, path);
+		__assign_str(devname);
+		__assign_str(cgroup);
 		__entry->now = now->now;
 		__entry->vnow = now->vnow;
 		__entry->vrate = iocg->ioc->vtime_base_rate;
@@ -93,8 +93,8 @@ DECLARE_EVENT_CLASS(iocg_inuse_update,
 	),
 
 	TP_fast_assign(
-		__assign_str(devname, ioc_name(iocg->ioc));
-		__assign_str(cgroup, path);
+		__assign_str(devname);
+		__assign_str(cgroup);
 		__entry->now = now->now;
 		__entry->old_inuse = old_inuse;
 		__entry->new_inuse = new_inuse;
@@ -159,7 +159,7 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
 	),
 
 	TP_fast_assign(
-		__assign_str(devname, ioc_name(ioc));
+		__assign_str(devname);
 		__entry->old_vrate = ioc->vtime_base_rate;
 		__entry->new_vrate = new_vrate;
 		__entry->busy_level = ioc->busy_level;
@@ -200,8 +200,8 @@ TRACE_EVENT(iocost_iocg_forgive_debt,
 	),
 
 	TP_fast_assign(
-		__assign_str(devname, ioc_name(iocg->ioc));
-		__assign_str(cgroup, path);
+		__assign_str(devname);
+		__assign_str(cgroup);
 		__entry->now = now->now;
 		__entry->vnow = now->vnow;
 		__entry->usage_pct = usage_pct;
diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h
index 70743db1fb75..373007e567cb 100644
--- a/include/trace/events/iommu.h
+++ b/include/trace/events/iommu.h
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(iommu_group_event,
 
 	TP_fast_assign(
 		__entry->gid = group_id;
-		__assign_str(device, dev_name(dev));
+		__assign_str(device);
 	),
 
 	TP_printk("IOMMU: groupID=%d device=%s",
@@ -62,7 +62,7 @@ DECLARE_EVENT_CLASS(iommu_device_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(dev));
+		__assign_str(device);
 	),
 
 	TP_printk("IOMMU: device=%s", __get_str(device)
@@ -138,8 +138,8 @@ DECLARE_EVENT_CLASS(iommu_error,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(dev));
-		__assign_str(driver, dev_driver_string(dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->iova = iova;
 		__entry->flags = flags;
 	),
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index a07b4607b663..837c1740d0d0 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -63,7 +63,7 @@ TRACE_EVENT(irq_handler_entry,
 
 	TP_fast_assign(
 		__entry->irq = irq;
-		__assign_str(name, action->name);
+		__assign_str(name);
 	),
 
 	TP_printk("irq=%d name=%s", __entry->irq, __get_str(name))
diff --git a/include/trace/events/iscsi.h b/include/trace/events/iscsi.h
index 8ff2a3ca5d75..990fd154f586 100644
--- a/include/trace/events/iscsi.h
+++ b/include/trace/events/iscsi.h
@@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(iscsi_log_msg,
 	),
 
 	TP_fast_assign(
-		__assign_str(dname, dev_name(dev));
+		__assign_str(dname);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 6e62cc64cd92..8a829e0f6e55 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -126,7 +126,7 @@ TRACE_EVENT(kmem_cache_free,
 	TP_fast_assign(
 		__entry->call_site	= call_site;
 		__entry->ptr		= ptr;
-		__assign_str(name, s->name);
+		__assign_str(name);
 	),
 
 	TP_printk("call_site=%pS ptr=%p name=%s",
diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index 9ebd081e057e..8e89baa3775f 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -37,7 +37,7 @@ TRACE_EVENT(lock_acquire,
 
 	TP_fast_assign(
 		__entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
-		__assign_str(name, lock->name);
+		__assign_str(name);
 		__entry->lockdep_addr = lock;
 	),
 
@@ -59,7 +59,7 @@ DECLARE_EVENT_CLASS(lock,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, lock->name);
+		__assign_str(name);
 		__entry->lockdep_addr = lock;
 	),
 
diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h
index 14db8044c1ff..f2827f98a44f 100644
--- a/include/trace/events/mmap_lock.h
+++ b/include/trace/events/mmap_lock.h
@@ -27,7 +27,7 @@ DECLARE_EVENT_CLASS(mmap_lock,
 
 	TP_fast_assign(
 		__entry->mm = mm;
-		__assign_str(memcg_path, memcg_path);
+		__assign_str(memcg_path);
 		__entry->write = write;
 	),
 
@@ -65,7 +65,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned,
 
 	TP_fast_assign(
 		__entry->mm = mm;
-		__assign_str(memcg_path, memcg_path);
+		__assign_str(memcg_path);
 		__entry->write = write;
 		__entry->success = success;
 	),
diff --git a/include/trace/events/mmc.h b/include/trace/events/mmc.h
index 7b706ff21335..f1c2e94f7f68 100644
--- a/include/trace/events/mmc.h
+++ b/include/trace/events/mmc.h
@@ -68,7 +68,7 @@ TRACE_EVENT(mmc_request_start,
 		__entry->need_retune = host->need_retune;
 		__entry->hold_retune = host->hold_retune;
 		__entry->retune_period = host->retune_period;
-		__assign_str(name, mmc_hostname(host));
+		__assign_str(name);
 		__entry->mrq = mrq;
 	),
 
@@ -156,7 +156,7 @@ TRACE_EVENT(mmc_request_done,
 		__entry->need_retune = host->need_retune;
 		__entry->hold_retune = host->hold_retune;
 		__entry->retune_period = host->retune_period;
-		__assign_str(name, mmc_hostname(host));
+		__assign_str(name);
 		__entry->mrq = mrq;
 	),
 
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 097485c73c01..e5a006be9dc6 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -41,7 +41,7 @@ TRACE_EVENT(module_load,
 
 	TP_fast_assign(
 		__entry->taints = mod->taints;
-		__assign_str(name, mod->name);
+		__assign_str(name);
 	),
 
 	TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints))
@@ -58,7 +58,7 @@ TRACE_EVENT(module_free,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, mod->name);
+		__assign_str(name);
 	),
 
 	TP_printk("%s", __get_str(name))
@@ -82,7 +82,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
 	TP_fast_assign(
 		__entry->ip	= ip;
 		__entry->refcnt	= atomic_read(&mod->refcnt);
-		__assign_str(name, mod->name);
+		__assign_str(name);
 	),
 
 	TP_printk("%s call_site=%ps refcnt=%d",
@@ -119,7 +119,7 @@ TRACE_EVENT(module_request,
 	TP_fast_assign(
 		__entry->ip	= ip;
 		__entry->wait	= wait;
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 
 	TP_printk("%s wait=%d call_site=%ps",
diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h
index dc03cf8e0369..b567b9ffedc1 100644
--- a/include/trace/events/napi.h
+++ b/include/trace/events/napi.h
@@ -26,7 +26,7 @@ TRACE_EVENT(napi_poll,
 
 	TP_fast_assign(
 		__entry->napi = napi;
-		__assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV);
+		__assign_str(dev_name);
 		__entry->work = work;
 		__entry->budget = budget;
 	),
diff --git a/include/trace/events/neigh.h b/include/trace/events/neigh.h
index 833143d0992e..12362c35dbc0 100644
--- a/include/trace/events/neigh.h
+++ b/include/trace/events/neigh.h
@@ -42,7 +42,7 @@ TRACE_EVENT(neigh_create,
 		__be32 *p32;
 
 		__entry->family = tbl->family;
-		__assign_str(dev, (dev ? dev->name : "NULL"));
+		__assign_str(dev);
 		__entry->entries = atomic_read(&tbl->gc_entries);
 		__entry->created = n != NULL;
 		__entry->gc_exempt = exempt_from_gc;
@@ -103,7 +103,7 @@ TRACE_EVENT(neigh_update,
 		__be32 *p32;
 
 		__entry->family = n->tbl->family;
-		__assign_str(dev, (n->dev ? n->dev->name : "NULL"));
+		__assign_str(dev);
 		__entry->lladdr_len = lladdr_len;
 		memcpy(__entry->lladdr, n->ha, lladdr_len);
 		__entry->flags = n->flags;
@@ -180,7 +180,7 @@ DECLARE_EVENT_CLASS(neigh__update,
 		__be32 *p32;
 
 		__entry->family = n->tbl->family;
-		__assign_str(dev, (n->dev ? n->dev->name : "NULL"));
+		__assign_str(dev);
 		__entry->lladdr_len = lladdr_len;
 		memcpy(__entry->lladdr, n->ha, lladdr_len);
 		__entry->flags = n->flags;
diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index f667c76a3b02..d55162c12f90 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -38,7 +38,7 @@ TRACE_EVENT(net_dev_start_xmit,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev->name);
+		__assign_str(name);
 		__entry->queue_mapping = skb->queue_mapping;
 		__entry->skbaddr = skb;
 		__entry->vlan_tagged = skb_vlan_tag_present(skb);
@@ -89,7 +89,7 @@ TRACE_EVENT(net_dev_xmit,
 		__entry->skbaddr = skb;
 		__entry->len = skb_len;
 		__entry->rc = rc;
-		__assign_str(name, dev->name);
+		__assign_str(name);
 	),
 
 	TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
@@ -110,8 +110,8 @@ TRACE_EVENT(net_dev_xmit_timeout,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev->name);
-		__assign_str(driver, netdev_drivername(dev));
+		__assign_str(name);
+		__assign_str(driver);
 		__entry->queue_index = queue_index;
 	),
 
@@ -134,7 +134,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
 	TP_fast_assign(
 		__entry->skbaddr = skb;
 		__entry->len = skb->len;
-		__assign_str(name, skb->dev->name);
+		__assign_str(name);
 	),
 
 	TP_printk("dev=%s skbaddr=%p len=%u",
@@ -191,7 +191,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, skb->dev->name);
+		__assign_str(name);
 #ifdef CONFIG_NET_RX_BUSY_POLL
 		__entry->napi_id = skb->napi_id;
 #else
diff --git a/include/trace/events/netlink.h b/include/trace/events/netlink.h
index 3b7be3b386a4..f036b8a20505 100644
--- a/include/trace/events/netlink.h
+++ b/include/trace/events/netlink.h
@@ -17,7 +17,7 @@ TRACE_EVENT(netlink_extack,
 	),
 
 	TP_fast_assign(
-		__assign_str(msg, msg);
+		__assign_str(msg);
 	),
 
 	TP_printk("msg=%s", __get_str(msg))
diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h
index b799f3bcba82..a42be4c8563b 100644
--- a/include/trace/events/oom.h
+++ b/include/trace/events/oom.h
@@ -92,7 +92,7 @@ TRACE_EVENT(mark_victim,
 
 	TP_fast_assign(
 		__entry->pid = task->pid;
-		__assign_str(comm, task->comm);
+		__assign_str(comm);
 		__entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm);
 		__entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES));
 		__entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES));
diff --git a/include/trace/events/osnoise.h b/include/trace/events/osnoise.h
index 82f741ec0f57..a2379a4f0684 100644
--- a/include/trace/events/osnoise.h
+++ b/include/trace/events/osnoise.h
@@ -75,7 +75,7 @@ TRACE_EVENT(irq_noise,
 	),
 
 	TP_fast_assign(
-		__assign_str(desc, desc);
+		__assign_str(desc);
 		__entry->vector = vector;
 		__entry->start = start;
 		__entry->duration = duration;
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
index 77f14f7a11d4..d2349b6b531a 100644
--- a/include/trace/events/power.h
+++ b/include/trace/events/power.h
@@ -76,7 +76,7 @@ TRACE_EVENT(powernv_throttle,
 
 	TP_fast_assign(
 		__entry->chip_id = chip_id;
-		__assign_str(reason, reason);
+		__assign_str(reason);
 		__entry->pmax = pmax;
 	),
 
@@ -210,11 +210,10 @@ TRACE_EVENT(device_pm_callback_start,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(dev));
-		__assign_str(driver, dev_driver_string(dev));
-		__assign_str(parent,
-			dev->parent ? dev_name(dev->parent) : "none");
-		__assign_str(pm_ops, pm_ops ? pm_ops : "none ");
+		__assign_str(device);
+		__assign_str(driver);
+		__assign_str(parent);
+		__assign_str(pm_ops);
 		__entry->event = event;
 	),
 
@@ -236,8 +235,8 @@ TRACE_EVENT(device_pm_callback_end,
 	),
 
 	TP_fast_assign(
-		__assign_str(device, dev_name(dev));
-		__assign_str(driver, dev_driver_string(dev));
+		__assign_str(device);
+		__assign_str(driver);
 		__entry->error = error;
 	),
 
@@ -279,7 +278,7 @@ DECLARE_EVENT_CLASS(wakeup_source,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->state = state;
 	),
 
@@ -318,7 +317,7 @@ DECLARE_EVENT_CLASS(clock,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->state = state;
 		__entry->cpu_id = cpu_id;
 	),
@@ -364,7 +363,7 @@ DECLARE_EVENT_CLASS(power_domain,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->state = state;
 		__entry->cpu_id = cpu_id;
 ),
@@ -486,7 +485,7 @@ DECLARE_EVENT_CLASS(dev_pm_qos_request,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->type = type;
 		__entry->new_value = new_value;
 	),
diff --git a/include/trace/events/pwc.h b/include/trace/events/pwc.h
index a2da764a3b41..0543702542d9 100644
--- a/include/trace/events/pwc.h
+++ b/include/trace/events/pwc.h
@@ -26,7 +26,7 @@ TRACE_EVENT(pwc_handler_enter,
 		__entry->urb__actual_length = urb->actual_length;
 		__entry->fbuf__filled = (pdev->fill_buf
 					 ? pdev->fill_buf->filled : 0);
-		__assign_str(name, pdev->v4l2_dev.name);
+		__assign_str(name);
 	),
 	TP_printk("dev=%s (fbuf=%p filled=%d) urb=%p (status=%d actual_length=%u)",
 		__get_str(name),
@@ -50,7 +50,7 @@ TRACE_EVENT(pwc_handler_exit,
 		__entry->urb = urb;
 		__entry->fbuf = pdev->fill_buf;
 		__entry->fbuf__filled = pdev->fill_buf->filled;
-		__assign_str(name, pdev->v4l2_dev.name);
+		__assign_str(name);
 	),
 	TP_printk(" dev=%s (fbuf=%p filled=%d) urb=%p",
 		__get_str(name),
diff --git a/include/trace/events/qdisc.h b/include/trace/events/qdisc.h
index 1f4258308b96..f1b5e816e7e5 100644
--- a/include/trace/events/qdisc.h
+++ b/include/trace/events/qdisc.h
@@ -88,8 +88,8 @@ TRACE_EVENT(qdisc_reset,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, qdisc_dev(q)->name);
-		__assign_str(kind, q->ops->id);
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->parent = q->parent;
 		__entry->handle = q->handle;
 	),
@@ -113,8 +113,8 @@ TRACE_EVENT(qdisc_destroy,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, qdisc_dev(q)->name);
-		__assign_str(kind, q->ops->id);
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->parent = q->parent;
 		__entry->handle = q->handle;
 	),
@@ -137,8 +137,8 @@ TRACE_EVENT(qdisc_create,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev->name);
-		__assign_str(kind, ops->id);
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->parent = parent;
 	),
 
diff --git a/include/trace/events/qla.h b/include/trace/events/qla.h
index e7fd55e7dc3d..8800c35525a1 100644
--- a/include/trace/events/qla.h
+++ b/include/trace/events/qla.h
@@ -25,7 +25,7 @@ DECLARE_EVENT_CLASS(qla_log_event,
 		__vstring(msg, vaf->fmt, vaf->va)
 	),
 	TP_fast_assign(
-		__assign_str(buf, buf);
+		__assign_str(buf);
 		__assign_vstr(msg, vaf->fmt, vaf->va);
 	),
 
diff --git a/include/trace/events/qrtr.h b/include/trace/events/qrtr.h
index 441132c67133..14f822983741 100644
--- a/include/trace/events/qrtr.h
+++ b/include/trace/events/qrtr.h
@@ -102,7 +102,7 @@ TRACE_EVENT(qrtr_ns_message,
 	),
 
 	TP_fast_assign(
-		__assign_str(ctrl_pkt_str, ctrl_pkt_str);
+		__assign_str(ctrl_pkt_str);
 		__entry->sq_node = sq_node;
 		__entry->sq_port = sq_port;
 	),
diff --git a/include/trace/events/regulator.h b/include/trace/events/regulator.h
index 72b3ba93b0a5..c58481a5d955 100644
--- a/include/trace/events/regulator.h
+++ b/include/trace/events/regulator.h
@@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(regulator_basic,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 	),
 
 	TP_printk("name=%s", __get_str(name))
@@ -119,7 +119,7 @@ DECLARE_EVENT_CLASS(regulator_range,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->min  = min;
 		__entry->max  = max;
 	),
@@ -152,7 +152,7 @@ DECLARE_EVENT_CLASS(regulator_value,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->val  = val;
 	),
 
diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h
index f50fcafc69de..7f0c1ceae726 100644
--- a/include/trace/events/rpcgss.h
+++ b/include/trace/events/rpcgss.h
@@ -154,7 +154,7 @@ DECLARE_EVENT_CLASS(rpcgss_ctx_class,
 	TP_fast_assign(
 		__entry->cred = gc;
 		__entry->service = gc->gc_service;
-		__assign_str(principal, gc->gc_principal);
+		__assign_str(principal);
 	),
 
 	TP_printk("cred=%p service=%s principal='%s'",
@@ -189,7 +189,7 @@ DECLARE_EVENT_CLASS(rpcgss_svc_gssapi_class,
 	TP_fast_assign(
 		__entry->xid = __be32_to_cpu(rqstp->rq_xid);
 		__entry->maj_stat = maj_stat;
-		__assign_str(addr, rqstp->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x maj_stat=%s",
@@ -225,7 +225,7 @@ TRACE_EVENT(rpcgss_svc_wrap_failed,
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqstp->rq_xid);
-		__assign_str(addr, rqstp->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid)
@@ -245,7 +245,7 @@ TRACE_EVENT(rpcgss_svc_unwrap_failed,
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqstp->rq_xid);
-		__assign_str(addr, rqstp->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid)
@@ -271,7 +271,7 @@ TRACE_EVENT(rpcgss_svc_seqno_bad,
 		__entry->expected = expected;
 		__entry->received = received;
 		__entry->xid = __be32_to_cpu(rqstp->rq_xid);
-		__assign_str(addr, rqstp->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x expected seqno %u, received seqno %u",
@@ -299,7 +299,7 @@ TRACE_EVENT(rpcgss_svc_accept_upcall,
 		__entry->minor_status = minor_status;
 		__entry->major_status = major_status;
 		__entry->xid = be32_to_cpu(rqstp->rq_xid);
-		__assign_str(addr, rqstp->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x major_status=%s (0x%08lx) minor_status=%u",
@@ -327,7 +327,7 @@ TRACE_EVENT(rpcgss_svc_authenticate,
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqstp->rq_xid);
 		__entry->seqno = gc->gc_seq;
-		__assign_str(addr, rqstp->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x seqno=%u", __get_str(addr),
@@ -563,7 +563,7 @@ TRACE_EVENT(rpcgss_upcall_msg,
 	),
 
 	TP_fast_assign(
-		__assign_str(msg, buf);
+		__assign_str(msg);
 	),
 
 	TP_printk("msg='%s'", __get_str(msg))
@@ -618,7 +618,7 @@ TRACE_EVENT(rpcgss_context,
 		__entry->timeout = timeout;
 		__entry->window_size = window_size;
 		__entry->len = len;
-		__assign_str(acceptor, data);
+		__assign_str(acceptor);
 	),
 
 	TP_printk("win_size=%u expiry=%lu now=%lu timeout=%u acceptor=%.*s",
@@ -677,7 +677,7 @@ TRACE_EVENT(rpcgss_oid_to_mech,
 	),
 
 	TP_fast_assign(
-		__assign_str(oid, oid);
+		__assign_str(oid);
 	),
 
 	TP_printk("mech for oid %s was not found", __get_str(oid))
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 027ac3ab457d..14392652273a 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -304,8 +304,8 @@ DECLARE_EVENT_CLASS(xprtrdma_reply_class,
 		__entry->xid = be32_to_cpu(rep->rr_xid);
 		__entry->version = be32_to_cpu(rep->rr_vers);
 		__entry->proc = be32_to_cpu(rep->rr_proc);
-		__assign_str(addr, rpcrdma_addrstr(rep->rr_rxprt));
-		__assign_str(port, rpcrdma_portstr(rep->rr_rxprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s xid=0x%08x version=%u proc=%u",
@@ -335,8 +335,8 @@ DECLARE_EVENT_CLASS(xprtrdma_rxprt,
 	),
 
 	TP_fast_assign(
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s",
@@ -369,8 +369,8 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
 	TP_fast_assign(
 		__entry->rc = rc;
 		__entry->connect_status = r_xprt->rx_ep->re_connect_status;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s rc=%d connection status=%d",
@@ -608,8 +608,8 @@ DECLARE_EVENT_CLASS(xprtrdma_callback_class,
 
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s xid=0x%08x",
@@ -687,8 +687,8 @@ TRACE_EVENT(xprtrdma_op_connect,
 
 	TP_fast_assign(
 		__entry->delay = delay;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s delay=%lu",
@@ -716,8 +716,8 @@ TRACE_EVENT(xprtrdma_op_set_cto,
 	TP_fast_assign(
 		__entry->connect = connect;
 		__entry->reconnect = reconnect;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s connect=%lu reconnect=%lu",
@@ -746,8 +746,8 @@ TRACE_EVENT(xprtrdma_createmrs,
 
 	TP_fast_assign(
 		__entry->count = count;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s created %u MRs",
@@ -775,8 +775,8 @@ TRACE_EVENT(xprtrdma_nomrs_err,
 
 		__entry->task_id = rqst->rq_task->tk_pid;
 		__entry->client_id = rqst->rq_task->tk_client->cl_clid;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " peer=[%s]:%s",
@@ -1001,8 +1001,8 @@ TRACE_EVENT(xprtrdma_post_recvs,
 		__entry->cq_id = ep->re_attr.recv_cq->res.id;
 		__entry->count = count;
 		__entry->posted = ep->re_receive_count;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s cq.id=%d %u new recvs, %d active",
@@ -1031,8 +1031,8 @@ TRACE_EVENT(xprtrdma_post_recvs_err,
 
 		__entry->cq_id = ep->re_attr.recv_cq->res.id;
 		__entry->status = status;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s cq.id=%d rc=%d",
@@ -1445,8 +1445,8 @@ TRACE_EVENT(xprtrdma_cb_setup,
 
 	TP_fast_assign(
 		__entry->reqs = reqs;
-		__assign_str(addr, rpcrdma_addrstr(r_xprt));
-		__assign_str(port, rpcrdma_portstr(r_xprt));
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s %u reqs",
@@ -1476,7 +1476,7 @@ DECLARE_EVENT_CLASS(svcrdma_accept_class,
 
 	TP_fast_assign(
 		__entry->status = status;
-		__assign_str(addr, rdma->sc_xprt.xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s status=%ld",
@@ -1962,7 +1962,7 @@ TRACE_EVENT(svcrdma_send_err,
 	TP_fast_assign(
 		__entry->status = status;
 		__entry->xid = __be32_to_cpu(rqst->rq_xid);
-		__assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x status=%d", __get_str(addr),
@@ -2025,7 +2025,7 @@ TRACE_EVENT(svcrdma_rq_post_err,
 
 	TP_fast_assign(
 		__entry->status = status;
-		__assign_str(addr, rdma->sc_xprt.xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s status=%d",
@@ -2138,7 +2138,7 @@ TRACE_EVENT(svcrdma_qp_error,
 
 	TP_fast_assign(
 		__entry->event = event->event;
-		__assign_str(device, event->device->name);
+		__assign_str(device);
 		snprintf(__entry->addr, sizeof(__entry->addr) - 1,
 			 "%pISpc", sap);
 	),
diff --git a/include/trace/events/rpm.h b/include/trace/events/rpm.h
index bd120e23ce12..2b0b4b6ef862 100644
--- a/include/trace/events/rpm.h
+++ b/include/trace/events/rpm.h
@@ -33,7 +33,7 @@ DECLARE_EVENT_CLASS(rpm_internal,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev_name(dev));
+		__assign_str(name);
 		__entry->flags = flags;
 		__entry->usage_count = atomic_read(
 			&dev->power.usage_count);
@@ -92,7 +92,7 @@ TRACE_EVENT(rpm_return_int,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev_name(dev));
+		__assign_str(name);
 		__entry->ip = ip;
 		__entry->ret = ret;
 	),
@@ -135,7 +135,7 @@ TRACE_EVENT(rpm_status,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, dev_name(dev));
+		__assign_str(name);
 		__entry->status = status;
 	),
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 68973f650c26..6df2b4685b08 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -411,7 +411,7 @@ TRACE_EVENT(sched_process_exec,
 	),
 
 	TP_fast_assign(
-		__assign_str(filename, bprm->filename);
+		__assign_str(filename);
 		__entry->pid		= p->pid;
 		__entry->old_pid	= old_pid;
 	),
@@ -445,10 +445,10 @@ TRACE_EVENT(sched_prepare_exec,
 	),
 
 	TP_fast_assign(
-		__assign_str(interp, bprm->interp);
-		__assign_str(filename, bprm->filename);
+		__assign_str(interp);
+		__assign_str(filename);
 		__entry->pid = task->pid;
-		__assign_str(comm, task->comm);
+		__assign_str(comm);
 	),
 
 	TP_printk("interp=%s filename=%s pid=%d comm=%s",
diff --git a/include/trace/events/sof.h b/include/trace/events/sof.h
index cd8e6844cca2..3061423c0667 100644
--- a/include/trace/events/sof.h
+++ b/include/trace/events/sof.h
@@ -23,7 +23,7 @@ DECLARE_EVENT_CLASS(sof_widget_template,
 		__field(int, use_count)
 	),
 	TP_fast_assign(
-		__assign_str(name, swidget->widget->name);
+		__assign_str(name);
 		__entry->use_count = swidget->use_count;
 	),
 	TP_printk("name=%s use_count=%d", __get_str(name), __entry->use_count)
@@ -49,7 +49,7 @@ TRACE_EVENT(sof_ipc3_period_elapsed_position,
 		__field(u64, wallclock)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->host_posn = posn->host_posn;
 		__entry->dai_posn = posn->dai_posn;
 		__entry->wallclock = posn->wallclock;
@@ -75,7 +75,7 @@ TRACE_EVENT(sof_pcm_pointer_position,
 		__field(unsigned long, dai_posn)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->pcm_id = le32_to_cpu(spcm->pcm.pcm_id);
 		__entry->stream = substream->stream;
 		__entry->dma_posn = dma_posn;
@@ -93,7 +93,7 @@ TRACE_EVENT(sof_stream_position_ipc_rx,
 		__string(device_name, dev_name(dev))
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(dev));
+		__assign_str(device_name);
 	),
 	TP_printk("device_name=%s", __get_str(device_name))
 );
@@ -107,8 +107,8 @@ TRACE_EVENT(sof_ipc4_fw_config,
 		__field(u32, value)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
-		__assign_str(key, key);
+		__assign_str(device_name);
+		__assign_str(key);
 		__entry->value = value;
 	),
 	TP_printk("device_name=%s key=%s value=%d",
diff --git a/include/trace/events/sof_intel.h b/include/trace/events/sof_intel.h
index 4cac5fef99a2..9e579e57b15c 100644
--- a/include/trace/events/sof_intel.h
+++ b/include/trace/events/sof_intel.h
@@ -22,8 +22,8 @@ TRACE_EVENT(sof_intel_hda_irq,
 		__string(source, source)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
-		__assign_str(source, source);
+		__assign_str(device_name);
+		__assign_str(source);
 	),
 	TP_printk("device_name=%s source=%s",
 		  __get_str(device_name), __get_str(source))
@@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(sof_intel_ipc_firmware_template,
 		__field(u32, msg_ext)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->msg = msg;
 		__entry->msg_ext = msg_ext;
 	),
@@ -64,7 +64,7 @@ TRACE_EVENT(sof_intel_D0I3C_updated,
 		__field(u8, reg)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->reg = reg;
 	),
 	TP_printk("device_name=%s register=%#x",
@@ -79,7 +79,7 @@ TRACE_EVENT(sof_intel_hda_irq_ipc_check,
 		__field(u32, irq_status)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->irq_status = irq_status;
 	),
 	TP_printk("device_name=%s irq_status=%#x",
@@ -100,7 +100,7 @@ TRACE_EVENT(sof_intel_hda_dsp_pcm,
 		__field(unsigned long, pos)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->hstream_index = hstream->index;
 		__entry->substream = substream->stream;
 		__entry->pos = pos;
@@ -119,7 +119,7 @@ TRACE_EVENT(sof_intel_hda_dsp_stream_status,
 		__field(u32, status)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(dev));
+		__assign_str(device_name);
 		__entry->stream = s->index;
 		__entry->status = status;
 	),
@@ -135,7 +135,7 @@ TRACE_EVENT(sof_intel_hda_dsp_check_stream_irq,
 		__field(u32, status)
 	),
 	TP_fast_assign(
-		__assign_str(device_name, dev_name(sdev->dev));
+		__assign_str(device_name);
 		__entry->status = status;
 	),
 	TP_printk("device_name=%s status=%#x",
diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
index ac05ed06a071..5e8495216689 100644
--- a/include/trace/events/sunrpc.h
+++ b/include/trace/events/sunrpc.h
@@ -188,10 +188,10 @@ TRACE_EVENT(rpc_clnt_new,
 		__entry->client_id = clnt->cl_clid;
 		__entry->xprtsec = args->xprtsec.policy;
 		__entry->flags = args->flags;
-		__assign_str(program, clnt->cl_program->name);
-		__assign_str(server, xprt->servername);
-		__assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]);
-		__assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]);
+		__assign_str(program);
+		__assign_str(server);
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("client=" SUNRPC_TRACE_CLID_SPECIFIER " peer=[%s]:%s"
@@ -220,8 +220,8 @@ TRACE_EVENT(rpc_clnt_new_err,
 
 	TP_fast_assign(
 		__entry->error = error;
-		__assign_str(program, program);
-		__assign_str(server, server);
+		__assign_str(program);
+		__assign_str(server);
 	),
 
 	TP_printk("program=%s server=%s error=%d",
@@ -325,8 +325,8 @@ TRACE_EVENT(rpc_request,
 		__entry->client_id = task->tk_client->cl_clid;
 		__entry->version = task->tk_client->cl_vers;
 		__entry->async = RPC_IS_ASYNC(task);
-		__assign_str(progname, task->tk_client->cl_program->name);
-		__assign_str(procname, rpc_proc_name(task));
+		__assign_str(progname);
+		__assign_str(procname);
 	),
 
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER " %sv%d %s (%ssync)",
@@ -439,7 +439,7 @@ DECLARE_EVENT_CLASS(rpc_task_queued,
 		__entry->runstate = task->tk_runstate;
 		__entry->status = task->tk_status;
 		__entry->flags = task->tk_flags;
-		__assign_str(q_name, rpc_qname(q));
+		__assign_str(q_name);
 		),
 
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
@@ -515,10 +515,10 @@ DECLARE_EVENT_CLASS(rpc_reply_event,
 		__entry->task_id = task->tk_pid;
 		__entry->client_id = task->tk_client->cl_clid;
 		__entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid);
-		__assign_str(progname, task->tk_client->cl_program->name);
+		__assign_str(progname);
 		__entry->version = task->tk_client->cl_vers;
-		__assign_str(procname, rpc_proc_name(task));
-		__assign_str(servername, task->tk_xprt->servername);
+		__assign_str(procname);
+		__assign_str(servername);
 	),
 
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
@@ -647,8 +647,8 @@ TRACE_EVENT(rpc_stats_latency,
 		__entry->task_id = task->tk_pid;
 		__entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid);
 		__entry->version = task->tk_client->cl_vers;
-		__assign_str(progname, task->tk_client->cl_program->name);
-		__assign_str(procname, rpc_proc_name(task));
+		__assign_str(progname);
+		__assign_str(procname);
 		__entry->backlog = ktime_to_us(backlog);
 		__entry->rtt = ktime_to_us(rtt);
 		__entry->execute = ktime_to_us(execute);
@@ -697,16 +697,15 @@ TRACE_EVENT(rpc_xdr_overflow,
 
 			__entry->task_id = task->tk_pid;
 			__entry->client_id = task->tk_client->cl_clid;
-			__assign_str(progname,
-				     task->tk_client->cl_program->name);
+			__assign_str(progname);
 			__entry->version = task->tk_client->cl_vers;
-			__assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+			__assign_str(procedure);
 		} else {
 			__entry->task_id = -1;
 			__entry->client_id = -1;
-			__assign_str(progname, "unknown");
+			__assign_str(progname);
 			__entry->version = 0;
-			__assign_str(procedure, "unknown");
+			__assign_str(procedure);
 		}
 		__entry->requested = requested;
 		__entry->end = xdr->end;
@@ -763,10 +762,9 @@ TRACE_EVENT(rpc_xdr_alignment,
 
 		__entry->task_id = task->tk_pid;
 		__entry->client_id = task->tk_client->cl_clid;
-		__assign_str(progname,
-			     task->tk_client->cl_program->name);
+		__assign_str(progname);
 		__entry->version = task->tk_client->cl_vers;
-		__assign_str(procedure, task->tk_msg.rpc_proc->p_name);
+		__assign_str(procedure);
 
 		__entry->offset = offset;
 		__entry->copied = copied;
@@ -1018,8 +1016,8 @@ DECLARE_EVENT_CLASS(rpc_xprt_lifetime_class,
 
 	TP_fast_assign(
 		__entry->state = xprt->state;
-		__assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]);
-		__assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]);
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s state=%s",
@@ -1061,8 +1059,8 @@ DECLARE_EVENT_CLASS(rpc_xprt_event,
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(xid);
 		__entry->status = status;
-		__assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]);
-		__assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]);
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s xid=0x%08x status=%d", __get_str(addr),
@@ -1140,10 +1138,9 @@ TRACE_EVENT(xprt_retransmit,
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->ntrans = rqst->rq_ntrans;
 		__entry->timeout = task->tk_timeout;
-		__assign_str(progname,
-			     task->tk_client->cl_program->name);
+		__assign_str(progname);
 		__entry->version = task->tk_client->cl_vers;
-		__assign_str(procname, rpc_proc_name(task));
+		__assign_str(procname);
 	),
 
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
@@ -1167,8 +1164,8 @@ TRACE_EVENT(xprt_ping,
 
 	TP_fast_assign(
 		__entry->status = status;
-		__assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]);
-		__assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]);
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s status=%d",
@@ -1315,8 +1312,8 @@ TRACE_EVENT(xs_data_ready,
 	),
 
 	TP_fast_assign(
-		__assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]);
-		__assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]);
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s", __get_str(addr), __get_str(port))
@@ -1339,10 +1336,8 @@ TRACE_EVENT(xs_stream_read_data,
 	TP_fast_assign(
 		__entry->err = err;
 		__entry->total = total;
-		__assign_str(addr, xprt ?
-			xprt->address_strings[RPC_DISPLAY_ADDR] : EVENT_NULL_STR);
-		__assign_str(port, xprt ?
-			xprt->address_strings[RPC_DISPLAY_PORT] : EVENT_NULL_STR);
+		__assign_str(addr);
+		__assign_str(port);
 	),
 
 	TP_printk("peer=[%s]:%s err=%zd total=%zu", __get_str(addr),
@@ -1364,8 +1359,8 @@ TRACE_EVENT(xs_stream_read_request,
 	),
 
 	TP_fast_assign(
-		__assign_str(addr, xs->xprt.address_strings[RPC_DISPLAY_ADDR]);
-		__assign_str(port, xs->xprt.address_strings[RPC_DISPLAY_PORT]);
+		__assign_str(addr);
+		__assign_str(port);
 		__entry->xid = be32_to_cpu(xs->recv.xid);
 		__entry->copied = xs->recv.copied;
 		__entry->reclen = xs->recv.len;
@@ -1403,7 +1398,7 @@ TRACE_EVENT(rpcb_getport,
 		__entry->version = clnt->cl_vers;
 		__entry->protocol = task->tk_xprt->prot;
 		__entry->bind_version = bind_version;
-		__assign_str(servername, task->tk_xprt->servername);
+		__assign_str(servername);
 	),
 
 	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
@@ -1493,8 +1488,8 @@ TRACE_EVENT(rpcb_register,
 	TP_fast_assign(
 		__entry->program = program;
 		__entry->version = version;
-		__assign_str(addr, addr);
-		__assign_str(netid, netid);
+		__assign_str(addr);
+		__assign_str(netid);
 	),
 
 	TP_printk("program=%u version=%u addr=%s netid=%s",
@@ -1521,7 +1516,7 @@ TRACE_EVENT(rpcb_unregister,
 	TP_fast_assign(
 		__entry->program = program;
 		__entry->version = version;
-		__assign_str(netid, netid);
+		__assign_str(netid);
 	),
 
 	TP_printk("program=%u version=%u netid=%s",
@@ -1551,8 +1546,8 @@ DECLARE_EVENT_CLASS(rpc_tls_class,
 	TP_fast_assign(
 		__entry->requested_policy = clnt->cl_xprtsec.policy;
 		__entry->version = clnt->cl_vers;
-		__assign_str(servername, xprt->servername);
-		__assign_str(progname, clnt->cl_program->name)
+		__assign_str(servername);
+		__assign_str(progname);
 	),
 
 	TP_printk("server=%s %sv%u requested_policy=%s",
@@ -1794,10 +1789,9 @@ TRACE_EVENT(svc_process,
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->vers = rqst->rq_vers;
 		__entry->proc = rqst->rq_proc;
-		__assign_str(service, name);
-		__assign_str(procedure, svc_proc_name(rqst));
-		__assign_str(addr, rqst->rq_xprt ?
-			     rqst->rq_xprt->xpt_remotebuf : EVENT_NULL_STR);
+		__assign_str(service);
+		__assign_str(procedure);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s",
@@ -1915,7 +1909,7 @@ TRACE_EVENT(svc_stats_latency,
 
 		__entry->execute = ktime_to_us(ktime_sub(ktime_get(),
 							 rqst->rq_stime));
-		__assign_str(procedure, svc_proc_name(rqst));
+		__assign_str(procedure);
 	),
 
 	TP_printk(SVC_RQST_ENDPOINT_FORMAT " proc=%s execute-us=%lu",
@@ -1980,8 +1974,8 @@ TRACE_EVENT(svc_xprt_create_err,
 
 	TP_fast_assign(
 		__entry->error = PTR_ERR(xprt);
-		__assign_str(program, program);
-		__assign_str(protocol, protocol);
+		__assign_str(program);
+		__assign_str(protocol);
 		__assign_sockaddr(addr, sap, salen);
 	),
 
@@ -2120,8 +2114,8 @@ TRACE_EVENT(svc_xprt_accept,
 	TP_fast_assign(
 		SVC_XPRT_ENDPOINT_ASSIGNMENTS(xprt);
 
-		__assign_str(protocol, xprt->xpt_class->xcl_name);
-		__assign_str(service, service);
+		__assign_str(protocol);
+		__assign_str(service);
 	),
 
 	TP_printk(SVC_XPRT_ENDPOINT_FORMAT " protocol=%s service=%s",
@@ -2260,7 +2254,7 @@ TRACE_EVENT(svcsock_marker,
 	TP_fast_assign(
 		__entry->length = be32_to_cpu(marker) & RPC_FRAGMENT_SIZE_MASK;
 		__entry->last = be32_to_cpu(marker) & RPC_LAST_STREAM_FRAGMENT;
-		__assign_str(addr, xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s length=%u%s", __get_str(addr),
@@ -2284,7 +2278,7 @@ DECLARE_EVENT_CLASS(svcsock_class,
 	TP_fast_assign(
 		__entry->result = result;
 		__entry->flags = xprt->xpt_flags;
-		__assign_str(addr, xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s result=%zd flags=%s", __get_str(addr),
@@ -2330,7 +2324,7 @@ TRACE_EVENT(svcsock_tcp_recv_short,
 		__entry->expected = expected;
 		__entry->received = received;
 		__entry->flags = xprt->xpt_flags;
-		__assign_str(addr, xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s flags=%s expected=%u received=%u",
@@ -2358,7 +2352,7 @@ TRACE_EVENT(svcsock_tcp_state,
 		__entry->socket_state = socket->state;
 		__entry->sock_state = socket->sk->sk_state;
 		__entry->flags = xprt->xpt_flags;
-		__assign_str(addr, xprt->xpt_remotebuf);
+		__assign_str(addr);
 	),
 
 	TP_printk("addr=%s state=%s sk_state=%s flags=%s", __get_str(addr),
@@ -2385,7 +2379,7 @@ DECLARE_EVENT_CLASS(svcsock_accept_class,
 
 	TP_fast_assign(
 		__entry->status = status;
-		__assign_str(service, service);
+		__assign_str(service);
 		__entry->netns_ino = xprt->xpt_net->ns.inum;
 	),
 
@@ -2421,7 +2415,7 @@ DECLARE_EVENT_CLASS(cache_event,
 
 	TP_fast_assign(
 		__entry->h = h;
-		__assign_str(name, cd->name);
+		__assign_str(name);
 	),
 
 	TP_printk("cache=%s entry=%p", __get_str(name), __entry->h)
@@ -2466,7 +2460,7 @@ DECLARE_EVENT_CLASS(register_class,
 		__entry->protocol = protocol;
 		__entry->port = port;
 		__entry->error = error;
-		__assign_str(program, program);
+		__assign_str(program);
 	),
 
 	TP_printk("program=%sv%u proto=%s port=%u family=%s error=%d",
@@ -2511,7 +2505,7 @@ TRACE_EVENT(svc_unregister,
 	TP_fast_assign(
 		__entry->version = version;
 		__entry->error = error;
-		__assign_str(program, program);
+		__assign_str(program);
 	),
 
 	TP_printk("program=%sv%u error=%d",
diff --git a/include/trace/events/swiotlb.h b/include/trace/events/swiotlb.h
index da05c9ebd224..3b6ddb136e4e 100644
--- a/include/trace/events/swiotlb.h
+++ b/include/trace/events/swiotlb.h
@@ -20,7 +20,7 @@ TRACE_EVENT(swiotlb_bounced,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name(dev));
+		__assign_str(dev_name);
 		__entry->dma_mask = (dev->dma_mask ? *dev->dma_mask : 0);
 		__entry->dev_addr = dev_addr;
 		__entry->size = size;
diff --git a/include/trace/events/target.h b/include/trace/events/target.h
index 67fad2677ed5..a13cbf2b3405 100644
--- a/include/trace/events/target.h
+++ b/include/trace/events/target.h
@@ -154,7 +154,7 @@ TRACE_EVENT(target_sequencer_start,
 		__entry->task_attribute	= cmd->sam_task_attr;
 		__entry->control	= scsi_command_control(cmd->t_task_cdb);
 		memcpy(__entry->cdb, cmd->t_task_cdb, TCM_MAX_COMMAND_SIZE);
-		__assign_str(initiator, cmd->se_sess->se_node_acl->initiatorname);
+		__assign_str(initiator);
 	),
 
 	TP_printk("%s -> LUN %03u tag %#llx %s data_length %6u  CDB %s  (TA:%s C:%02x)",
@@ -198,7 +198,7 @@ TRACE_EVENT(target_cmd_complete,
 			min(18, ((u8 *) cmd->sense_buffer)[SPC_ADD_SENSE_LEN_OFFSET] + 8) : 0;
 		memcpy(__entry->cdb, cmd->t_task_cdb, TCM_MAX_COMMAND_SIZE);
 		memcpy(__entry->sense_data, cmd->sense_buffer, __entry->sense_length);
-		__assign_str(initiator, cmd->se_sess->se_node_acl->initiatorname);
+		__assign_str(initiator);
 	),
 
 	TP_printk("%s <- LUN %03u tag %#llx status %s (sense len %d%s%s)  %s data_length %6u  CDB %s  (TA:%s C:%02x)",
diff --git a/include/trace/events/tegra_apb_dma.h b/include/trace/events/tegra_apb_dma.h
index 971cd02d2daf..6d9f5075baa3 100644
--- a/include/trace/events/tegra_apb_dma.h
+++ b/include/trace/events/tegra_apb_dma.h
@@ -16,7 +16,7 @@ TRACE_EVENT(tegra_dma_tx_status,
 		__field(__u32,	residue)
 	),
 	TP_fast_assign(
-		__assign_str(chan, dev_name(&dc->dev->device));
+		__assign_str(chan);
 		__entry->cookie = cookie;
 		__entry->residue = state ? state->residue : (u32)-1;
 	),
@@ -33,7 +33,7 @@ TRACE_EVENT(tegra_dma_complete_cb,
 		__field(void *,	ptr)
 		),
 	TP_fast_assign(
-		__assign_str(chan, dev_name(&dc->dev->device));
+		__assign_str(chan);
 		__entry->count = count;
 		__entry->ptr = ptr;
 		),
@@ -49,7 +49,7 @@ TRACE_EVENT(tegra_dma_isr,
 		__field(int,	irq)
 	),
 	TP_fast_assign(
-		__assign_str(chan, dev_name(&dc->dev->device));
+		__assign_str(chan);
 		__entry->irq = irq;
 	),
 	TP_printk("%s: irq %d\n",  __get_str(chan), __entry->irq)
diff --git a/include/trace/events/ufs.h b/include/trace/events/ufs.h
index b930669bd1f0..c4e209fbdfbb 100644
--- a/include/trace/events/ufs.h
+++ b/include/trace/events/ufs.h
@@ -92,7 +92,7 @@ TRACE_EVENT(ufshcd_clk_gating,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
+		__assign_str(dev_name);
 		__entry->state = state;
 	),
 
@@ -117,9 +117,9 @@ TRACE_EVENT(ufshcd_clk_scaling,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
-		__assign_str(state, state);
-		__assign_str(clk, clk);
+		__assign_str(dev_name);
+		__assign_str(state);
+		__assign_str(clk);
 		__entry->prev_state = prev_state;
 		__entry->curr_state = curr_state;
 	),
@@ -141,8 +141,8 @@ TRACE_EVENT(ufshcd_auto_bkops_state,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
-		__assign_str(state, state);
+		__assign_str(dev_name);
+		__assign_str(state);
 	),
 
 	TP_printk("%s: auto bkops - %s",
@@ -163,8 +163,8 @@ DECLARE_EVENT_CLASS(ufshcd_profiling_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
-		__assign_str(profile_info, profile_info);
+		__assign_str(dev_name);
+		__assign_str(profile_info);
 		__entry->time_us = time_us;
 		__entry->err = err;
 	),
@@ -206,7 +206,7 @@ DECLARE_EVENT_CLASS(ufshcd_template,
 	TP_fast_assign(
 		__entry->usecs = usecs;
 		__entry->err = err;
-		__assign_str(dev_name, dev_name);
+		__assign_str(dev_name);
 		__entry->dev_state = dev_state;
 		__entry->link_state = link_state;
 	),
@@ -326,7 +326,7 @@ TRACE_EVENT(ufshcd_uic_command,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
+		__assign_str(dev_name);
 		__entry->str_t = str_t;
 		__entry->cmd = cmd;
 		__entry->arg1 = arg1;
@@ -356,7 +356,7 @@ TRACE_EVENT(ufshcd_upiu,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
+		__assign_str(dev_name);
 		__entry->str_t = str_t;
 		memcpy(__entry->hdr, hdr, sizeof(__entry->hdr));
 		memcpy(__entry->tsf, tsf, sizeof(__entry->tsf));
@@ -384,7 +384,7 @@ TRACE_EVENT(ufshcd_exception_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev_name);
+		__assign_str(dev_name);
 		__entry->status = status;
 	),
 
diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
index 6ef5b7254070..b0de2bc9ed52 100644
--- a/include/trace/events/workqueue.h
+++ b/include/trace/events/workqueue.h
@@ -38,7 +38,7 @@ TRACE_EVENT(workqueue_queue_work,
 	TP_fast_assign(
 		__entry->work		= work;
 		__entry->function	= work->func;
-		__assign_str(workqueue, pwq->wq->name);
+		__assign_str(workqueue);
 		__entry->req_cpu	= req_cpu;
 		__entry->cpu		= pwq->pool->cpu;
 	),
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 9adc2bdf2f94..a7e5452b5d21 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -416,7 +416,7 @@ TRACE_EVENT(bpf_xdp_link_attach_failed,
 	),
 
 	TP_fast_assign(
-		__assign_str(msg, msg);
+		__assign_str(msg);
 	),
 
 	TP_printk("errmsg=%s", __get_str(msg))
diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h
index 3690e677263f..1691676fd858 100644
--- a/include/trace/stages/stage6_event_callback.h
+++ b/include/trace/stages/stage6_event_callback.h
@@ -31,12 +31,10 @@
 #define __vstring(item, fmt, ap) __dynamic_array(char, item, -1)
 
 #undef __assign_str
-#define __assign_str(dst, src)						\
+#define __assign_str(dst)						\
 	do {								\
 		char *__str__ = __get_str(dst);				\
 		int __len__ = __get_dynamic_array_len(dst) - 1;		\
-		WARN_ON_ONCE(!(void *)(src) != !(void *)__data_offsets.dst##_ptr_); \
-		WARN_ON_ONCE((src) && strcmp((src), __data_offsets.dst##_ptr_)); \
 		memcpy(__str__, __data_offsets.dst##_ptr_ ? :		\
 		       EVENT_NULL_STR, __len__);			\
 		__str__[__len__] = '\0';				\
diff --git a/kernel/trace/bpf_trace.h b/kernel/trace/bpf_trace.h
index 9acbc11ac7bb..c4075b56becc 100644
--- a/kernel/trace/bpf_trace.h
+++ b/kernel/trace/bpf_trace.h
@@ -19,7 +19,7 @@ TRACE_EVENT(bpf_trace_printk,
 	),
 
 	TP_fast_assign(
-		__assign_str(bpf_string, bpf_string);
+		__assign_str(bpf_string);
 	),
 
 	TP_printk("%s", __get_str(bpf_string))
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
index 5dd52bc5cabb..6b816cf1a953 100644
--- a/net/batman-adv/trace.h
+++ b/net/batman-adv/trace.h
@@ -40,8 +40,8 @@ TRACE_EVENT(batadv_dbg,
 	    ),
 
 	    TP_fast_assign(
-		    __assign_str(device, bat_priv->soft_iface->name);
-		    __assign_str(driver, KBUILD_MODNAME);
+		    __assign_str(device);
+		    __assign_str(driver);
 		    __assign_vstr(msg, vaf->fmt, vaf->va);
 	    ),
 
diff --git a/net/dsa/trace.h b/net/dsa/trace.h
index 567f29a39707..83f3e5f78491 100644
--- a/net/dsa/trace.h
+++ b/net/dsa/trace.h
@@ -39,8 +39,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_op_hw,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dp->ds->dev));
-		__assign_str(kind, dsa_port_kind(dp));
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->port = dp->index;
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
@@ -98,8 +98,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_op_refcount,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dp->ds->dev));
-		__assign_str(kind, dsa_port_kind(dp));
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->port = dp->index;
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
@@ -157,8 +157,8 @@ DECLARE_EVENT_CLASS(dsa_port_addr_del_not_found,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dp->ds->dev));
-		__assign_str(kind, dsa_port_kind(dp));
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->port = dp->index;
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
@@ -199,7 +199,7 @@ TRACE_EVENT(dsa_lag_fdb_add_hw,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, lag_dev->name);
+		__assign_str(dev);
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
 		dsa_db_print(db, __entry->db_buf);
@@ -227,7 +227,7 @@ TRACE_EVENT(dsa_lag_fdb_add_bump,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, lag_dev->name);
+		__assign_str(dev);
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
 		dsa_db_print(db, __entry->db_buf);
@@ -255,7 +255,7 @@ TRACE_EVENT(dsa_lag_fdb_del_hw,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, lag_dev->name);
+		__assign_str(dev);
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
 		dsa_db_print(db, __entry->db_buf);
@@ -283,7 +283,7 @@ TRACE_EVENT(dsa_lag_fdb_del_drop,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, lag_dev->name);
+		__assign_str(dev);
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
 		dsa_db_print(db, __entry->db_buf);
@@ -310,7 +310,7 @@ TRACE_EVENT(dsa_lag_fdb_del_not_found,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, lag_dev->name);
+		__assign_str(dev);
 		ether_addr_copy(__entry->addr, addr);
 		__entry->vid = vid;
 		dsa_db_print(db, __entry->db_buf);
@@ -338,8 +338,8 @@ DECLARE_EVENT_CLASS(dsa_vlan_op_hw,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dp->ds->dev));
-		__assign_str(kind, dsa_port_kind(dp));
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->port = dp->index;
 		__entry->vid = vlan->vid;
 		__entry->flags = vlan->flags;
@@ -383,8 +383,8 @@ DECLARE_EVENT_CLASS(dsa_vlan_op_refcount,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dp->ds->dev));
-		__assign_str(kind, dsa_port_kind(dp));
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->port = dp->index;
 		__entry->vid = vlan->vid;
 		__entry->flags = vlan->flags;
@@ -426,8 +426,8 @@ TRACE_EVENT(dsa_vlan_del_not_found,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev, dev_name(dp->ds->dev));
-		__assign_str(kind, dsa_port_kind(dp));
+		__assign_str(dev);
+		__assign_str(kind);
 		__entry->port = dp->index;
 		__entry->vid = vlan->vid;
 	),
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index 62aa6465253a..591ce0a16fc0 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -75,7 +75,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf,
 	),
 	TP_fast_assign(
 		WPAN_PHY_ASSIGN;
-		__assign_str(vir_intf_name, name ? name : "<noname>");
+		__assign_str(vir_intf_name);
 		__entry->type = type;
 		__entry->extended_addr = extended_addr;
 	),
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 8e758b5074bd..b26aacfbc622 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -33,7 +33,7 @@
 			__string(vif_name, sdata->name)
 #define VIF_ASSIGN	__entry->vif_type = sdata->vif.type; __entry->sdata = sdata;	\
 			__entry->p2p = sdata->vif.p2p;					\
-			__assign_str(vif_name, sdata->name)
+			__assign_str(vif_name)
 #define VIF_PR_FMT	" vif:%s(%d%s)"
 #define VIF_PR_ARG	__get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : ""
 
diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h
index 3eb35d9eb700..74d75aaebef4 100644
--- a/net/openvswitch/openvswitch_trace.h
+++ b/net/openvswitch/openvswitch_trace.h
@@ -43,8 +43,8 @@ TRACE_EVENT(ovs_do_execute_action,
 
 	TP_fast_assign(
 		__entry->dpaddr = dp;
-		__assign_str(dp_name, ovs_dp_name(dp));
-		__assign_str(dev_name, skb->dev->name);
+		__assign_str(dp_name);
+		__assign_str(dev_name);
 		__entry->skbaddr = skb;
 		__entry->len = skb->len;
 		__entry->data_len = skb->data_len;
@@ -113,8 +113,8 @@ TRACE_EVENT(ovs_dp_upcall,
 
 	TP_fast_assign(
 		__entry->dpaddr = dp;
-		__assign_str(dp_name, ovs_dp_name(dp));
-		__assign_str(dev_name, skb->dev->name);
+		__assign_str(dp_name);
+		__assign_str(dev_name);
 		__entry->skbaddr = skb;
 		__entry->len = skb->len;
 		__entry->data_len = skb->data_len;
diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h
index 9fc5e586d24a..a9a6e3c1113a 100644
--- a/net/smc/smc_tracepoint.h
+++ b/net/smc/smc_tracepoint.h
@@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(smc_msg_event,
 				   __entry->smc = smc;
 				   __entry->net_cookie = sock_net(sk)->net_cookie;
 				   __entry->len = len;
-				   __assign_str(name, smc->conn.lnk->ibname);
+				   __assign_str(name);
 		    ),
 
 		    TP_printk("smc=%p net=%llu len=%zu dev=%s",
@@ -104,7 +104,7 @@ TRACE_EVENT(smcr_link_down,
 			   __entry->lgr = lgr;
 			   __entry->net_cookie = lgr->net->net_cookie;
 			   __entry->state = lnk->state;
-			   __assign_str(name, lnk->ibname);
+			   __assign_str(name);
 			   __entry->location = location;
 	    ),
 
diff --git a/net/tipc/trace.h b/net/tipc/trace.h
index 04af83f0500c..865142ed0ab4 100644
--- a/net/tipc/trace.h
+++ b/net/tipc/trace.h
@@ -145,7 +145,7 @@ DECLARE_EVENT_CLASS(tipc_skb_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(header, header);
+		__assign_str(header);
 		tipc_skb_dump(skb, more, __get_str(buf));
 	),
 
@@ -172,7 +172,7 @@ DECLARE_EVENT_CLASS(tipc_list_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(header, header);
+		__assign_str(header);
 		tipc_list_dump(list, more, __get_str(buf));
 	),
 
@@ -200,7 +200,7 @@ DECLARE_EVENT_CLASS(tipc_sk_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(header, header);
+		__assign_str(header);
 		__entry->portid = tipc_sock_get_portid(sk);
 		tipc_sk_dump(sk, dqueues, __get_str(buf));
 		if (skb)
@@ -254,7 +254,7 @@ DECLARE_EVENT_CLASS(tipc_link_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(header, header);
+		__assign_str(header);
 		memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME);
 		tipc_link_dump(l, dqueues, __get_str(buf));
 	),
@@ -337,7 +337,7 @@ DECLARE_EVENT_CLASS(tipc_node_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(header, header);
+		__assign_str(header);
 		__entry->addr = tipc_node_get_addr(n);
 		tipc_node_dump(n, more, __get_str(buf));
 	),
@@ -374,7 +374,7 @@ DECLARE_EVENT_CLASS(tipc_fsm_class,
 	),
 
 	TP_fast_assign(
-		__assign_str(name, name);
+		__assign_str(name);
 		__entry->os = os;
 		__entry->ns = ns;
 		__entry->evt = evt;
@@ -409,8 +409,8 @@ TRACE_EVENT(tipc_l2_device_event,
 	),
 
 	TP_fast_assign(
-		__assign_str(dev_name, dev->name);
-		__assign_str(b_name, b->name);
+		__assign_str(dev_name);
+		__assign_str(b_name);
 		__entry->evt = evt;
 		__entry->b_up = test_bit(0, &b->up);
 		__entry->carrier = netif_carrier_ok(dev);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 9bf987519811..87986170d1b1 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -372,7 +372,7 @@ TRACE_EVENT(rdev_add_virtual_intf,
 	),
 	TP_fast_assign(
 		WIPHY_ASSIGN;
-		__assign_str(vir_intf_name, name ? name : "<noname>");
+		__assign_str(vir_intf_name);
 		__entry->type = type;
 	),
 	TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d",
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 500981eca74d..55f9a3da92d5 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -136,10 +136,11 @@
  *
  *         To assign a string, use the helper macro __assign_str().
  *
- *         __assign_str(foo, bar);
+ *         __assign_str(foo);
  *
- *         In most cases, the __assign_str() macro will take the same
- *         parameters as the __string() macro had to declare the string.
+ *	   The __string() macro saves off the string that is passed into
+ *         the second parameter, and the __assign_str() will store than
+ *         saved string into the "foo" field.
  *
  *   __vstring: This is similar to __string() but instead of taking a
  *         dynamic length, it takes a variable list va_list 'va' variable.
@@ -177,7 +178,7 @@
  *         The length is saved via the __string_len() and is retrieved in
  *         __assign_str().
  *
- *         __assign_str(foo, bar);
+ *         __assign_str(foo);
  *
  *         Then len + 1 is allocated to the ring buffer, and a nul terminating
  *         byte is added. This is similar to:
@@ -311,8 +312,8 @@ TRACE_EVENT(foo_bar,
 		__entry->bar	= bar;
 		memcpy(__get_dynamic_array(list), lst,
 		       __length_of(lst) * sizeof(int));
-		__assign_str(str, string);
-		__assign_str(lstr, foo);
+		__assign_str(str);
+		__assign_str(lstr);
 		__assign_vstr(vstr, fmt, va);
 		__assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus());
 		__assign_cpumask(cpum, cpumask_bits(mask));
@@ -418,7 +419,7 @@ TRACE_EVENT_CONDITION(foo_bar_with_cond,
 	),
 
 	TP_fast_assign(
-		__assign_str(foo, foo);
+		__assign_str(foo);
 		__entry->bar	= bar;
 	),
 
@@ -459,7 +460,7 @@ TRACE_EVENT_FN(foo_bar_with_fn,
 	),
 
 	TP_fast_assign(
-		__assign_str(foo, foo);
+		__assign_str(foo);
 		__entry->bar	= bar;
 	),
 
@@ -506,7 +507,7 @@ DECLARE_EVENT_CLASS(foo_template,
 	),
 
 	TP_fast_assign(
-		__assign_str(foo, foo);
+		__assign_str(foo);
 		__entry->bar	= bar;
 	),
 
diff --git a/sound/core/pcm_trace.h b/sound/core/pcm_trace.h
index 350b40b906ca..adb9b1f3bbfa 100644
--- a/sound/core/pcm_trace.h
+++ b/sound/core/pcm_trace.h
@@ -95,7 +95,7 @@ TRACE_EVENT(hw_ptr_error,
 		__entry->device = (substream)->pcm->device;
 		__entry->number = (substream)->number;
 		__entry->stream = (substream)->stream;
-		__assign_str(reason, why);
+		__assign_str(reason);
 	),
 	TP_printk("pcmC%dD%d%s/sub%d: ERROR: %s",
 		  __entry->card, __entry->device,
diff --git a/sound/hda/trace.h b/sound/hda/trace.h
index 2cc493434a8f..280c42f3eb75 100644
--- a/sound/hda/trace.h
+++ b/sound/hda/trace.h
@@ -24,7 +24,7 @@ TRACE_EVENT(hda_send_cmd,
 		__field(u32, cmd)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name((bus)->dev));
+		__assign_str(name);
 		__entry->cmd = cmd;
 	),
 	TP_printk("[%s:%d] val=0x%08x", __get_str(name), __entry->cmd >> 28, __entry->cmd)
@@ -39,7 +39,7 @@ TRACE_EVENT(hda_get_response,
 		__field(u32, res)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name((bus)->dev));
+		__assign_str(name);
 		__entry->addr = addr;
 		__entry->res = res;
 	),
@@ -55,7 +55,7 @@ TRACE_EVENT(hda_unsol_event,
 		__field(u32, res_ex)
 	),
 	TP_fast_assign(
-		__assign_str(name, dev_name((bus)->dev));
+		__assign_str(name);
 		__entry->res = res;
 		__entry->res_ex = res_ex;
 	),
diff --git a/sound/soc/intel/avs/trace.h b/sound/soc/intel/avs/trace.h
index 855b06bb14b0..c9eaa5a60ed3 100644
--- a/sound/soc/intel/avs/trace.h
+++ b/sound/soc/intel/avs/trace.h
@@ -24,7 +24,7 @@ TRACE_EVENT(avs_dsp_core_op,
 	TP_fast_assign(
 		__entry->reg = reg;
 		__entry->mask = mask;
-		__assign_str(op, op);
+		__assign_str(op);
 		__entry->flag = flag;
 	),
 
@@ -135,7 +135,7 @@ TRACE_EVENT(avs_d0ix,
 	),
 
 	TP_fast_assign(
-		__assign_str(op, op);
+		__assign_str(op);
 		__entry->proceed = proceed;
 		__entry->header = header;
 	),
-- 
cgit v1.2.3


From ff388fe5c481d39cc0a5940d1ad46f7920f1d646 Mon Sep 17 00:00:00 2001
From: Jeff Xu <jeffxu@chromium.org>
Date: Mon, 15 Apr 2024 16:35:20 +0000
Subject: mseal: wire up mseal syscall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Introduce mseal", v10.

This patchset proposes a new mseal() syscall for the Linux kernel.

In a nutshell, mseal() protects the VMAs of a given virtual memory range
against modifications, such as changes to their permission bits.

Modern CPUs support memory permissions, such as the read/write (RW) and
no-execute (NX) bits.  Linux has supported NX since the release of kernel
version 2.6.8 in August 2004 [1].  The memory permission feature improves
the security stance on memory corruption bugs, as an attacker cannot
simply write to arbitrary memory and point the code to it.  The memory
must be marked with the X bit, or else an exception will occur.
Internally, the kernel maintains the memory permissions in a data
structure called VMA (vm_area_struct).  mseal() additionally protects the
VMA itself against modifications of the selected seal type.

Memory sealing is useful to mitigate memory corruption issues where a
corrupted pointer is passed to a memory management system.  For example,
such an attacker primitive can break control-flow integrity guarantees
since read-only memory that is supposed to be trusted can become writable
or .text pages can get remapped.  Memory sealing can automatically be
applied by the runtime loader to seal .text and .rodata pages and
applications can additionally seal security critical data at runtime.  A
similar feature already exists in the XNU kernel with the
VM_FLAGS_PERMANENT [3] flag and on OpenBSD with the mimmutable syscall
[4].  Also, Chrome wants to adopt this feature for their CFI work [2] and
this patchset has been designed to be compatible with the Chrome use case.

Two system calls are involved in sealing the map:  mmap() and mseal().

The new mseal() is an syscall on 64 bit CPU, and with following signature:

int mseal(void addr, size_t len, unsigned long flags)
addr/len: memory range.
flags: reserved.

mseal() blocks following operations for the given memory range.

1> Unmapping, moving to another location, and shrinking the size,
   via munmap() and mremap(), can leave an empty space, therefore can
   be replaced with a VMA with a new set of attributes.

2> Moving or expanding a different VMA into the current location,
   via mremap().

3> Modifying a VMA via mmap(MAP_FIXED).

4> Size expansion, via mremap(), does not appear to pose any specific
   risks to sealed VMAs. It is included anyway because the use case is
   unclear. In any case, users can rely on merging to expand a sealed VMA.

5> mprotect() and pkey_mprotect().

6> Some destructive madvice() behaviors (e.g. MADV_DONTNEED) for anonymous
   memory, when users don't have write permission to the memory. Those
   behaviors can alter region contents by discarding pages, effectively a
   memset(0) for anonymous memory.

The idea that inspired this patch comes from Stephen Röttger’s work in
V8 CFI [5].  Chrome browser in ChromeOS will be the first user of this
API.

Indeed, the Chrome browser has very specific requirements for sealing,
which are distinct from those of most applications.  For example, in the
case of libc, sealing is only applied to read-only (RO) or read-execute
(RX) memory segments (such as .text and .RELRO) to prevent them from
becoming writable, the lifetime of those mappings are tied to the lifetime
of the process.

Chrome wants to seal two large address space reservations that are managed
by different allocators.  The memory is mapped RW- and RWX respectively
but write access to it is restricted using pkeys (or in the future ARM
permission overlay extensions).  The lifetime of those mappings are not
tied to the lifetime of the process, therefore, while the memory is
sealed, the allocators still need to free or discard the unused memory.
For example, with madvise(DONTNEED).

However, always allowing madvise(DONTNEED) on this range poses a security
risk.  For example if a jump instruction crosses a page boundary and the
second page gets discarded, it will overwrite the target bytes with zeros
and change the control flow.  Checking write-permission before the discard
operation allows us to control when the operation is valid.  In this case,
the madvise will only succeed if the executing thread has PKEY write
permissions and PKRU changes are protected in software by control-flow
integrity.

Although the initial version of this patch series is targeting the Chrome
browser as its first user, it became evident during upstream discussions
that we would also want to ensure that the patch set eventually is a
complete solution for memory sealing and compatible with other use cases.
The specific scenario currently in mind is glibc's use case of loading and
sealing ELF executables.  To this end, Stephen is working on a change to
glibc to add sealing support to the dynamic linker, which will seal all
non-writable segments at startup.  Once this work is completed, all
applications will be able to automatically benefit from these new
protections.

In closing, I would like to formally acknowledge the valuable
contributions received during the RFC process, which were instrumental in
shaping this patch:

Jann Horn: raising awareness and providing valuable insights on the
  destructive madvise operations.
Liam R. Howlett: perf optimization.
Linus Torvalds: assisting in defining system call signature and scope.
Theo de Raadt: sharing the experiences and insight gained from
  implementing mimmutable() in OpenBSD.

MM perf benchmarks
==================
This patch adds a loop in the mprotect/munmap/madvise(DONTNEED) to
check the VMAs’ sealing flag, so that no partial update can be made,
when any segment within the given memory range is sealed.

To measure the performance impact of this loop, two tests are developed.
[8]

The first is measuring the time taken for a particular system call,
by using clock_gettime(CLOCK_MONOTONIC). The second is using
PERF_COUNT_HW_REF_CPU_CYCLES (exclude user space). Both tests have
similar results.

The tests have roughly below sequence:
for (i = 0; i < 1000, i++)
    create 1000 mappings (1 page per VMA)
    start the sampling
    for (j = 0; j < 1000, j++)
        mprotect one mapping
    stop and save the sample
    delete 1000 mappings
calculates all samples.

Below tests are performed on Intel(R) Pentium(R) Gold 7505 @ 2.00GHz,
4G memory, Chromebook.

Based on the latest upstream code:
The first test (measuring time)
syscall__	vmas	t	t_mseal	delta_ns	per_vma	%
munmap__  	1	909	944	35	35	104%
munmap__  	2	1398	1502	104	52	107%
munmap__  	4	2444	2594	149	37	106%
munmap__  	8	4029	4323	293	37	107%
munmap__  	16	6647	6935	288	18	104%
munmap__  	32	11811	12398	587	18	105%
mprotect	1	439	465	26	26	106%
mprotect	2	1659	1745	86	43	105%
mprotect	4	3747	3889	142	36	104%
mprotect	8	6755	6969	215	27	103%
mprotect	16	13748	14144	396	25	103%
mprotect	32	27827	28969	1142	36	104%
madvise_	1	240	262	22	22	109%
madvise_	2	366	442	76	38	121%
madvise_	4	623	751	128	32	121%
madvise_	8	1110	1324	215	27	119%
madvise_	16	2127	2451	324	20	115%
madvise_	32	4109	4642	534	17	113%

The second test (measuring cpu cycle)
syscall__	vmas	cpu	cmseal	delta_cpu	per_vma	%
munmap__	1	1790	1890	100	100	106%
munmap__	2	2819	3033	214	107	108%
munmap__	4	4959	5271	312	78	106%
munmap__	8	8262	8745	483	60	106%
munmap__	16	13099	14116	1017	64	108%
munmap__	32	23221	24785	1565	49	107%
mprotect	1	906	967	62	62	107%
mprotect	2	3019	3203	184	92	106%
mprotect	4	6149	6569	420	105	107%
mprotect	8	9978	10524	545	68	105%
mprotect	16	20448	21427	979	61	105%
mprotect	32	40972	42935	1963	61	105%
madvise_	1	434	497	63	63	115%
madvise_	2	752	899	147	74	120%
madvise_	4	1313	1513	200	50	115%
madvise_	8	2271	2627	356	44	116%
madvise_	16	4312	4883	571	36	113%
madvise_	32	8376	9319	943	29	111%

Based on the result, for 6.8 kernel, sealing check adds
20-40 nano seconds, or around 50-100 CPU cycles, per VMA.

In addition, I applied the sealing to 5.10 kernel:
The first test (measuring time)
syscall__	vmas	t	tmseal	delta_ns	per_vma	%
munmap__	1	357	390	33	33	109%
munmap__	2	442	463	21	11	105%
munmap__	4	614	634	20	5	103%
munmap__	8	1017	1137	120	15	112%
munmap__	16	1889	2153	263	16	114%
munmap__	32	4109	4088	-21	-1	99%
mprotect	1	235	227	-7	-7	97%
mprotect	2	495	464	-30	-15	94%
mprotect	4	741	764	24	6	103%
mprotect	8	1434	1437	2	0	100%
mprotect	16	2958	2991	33	2	101%
mprotect	32	6431	6608	177	6	103%
madvise_	1	191	208	16	16	109%
madvise_	2	300	324	24	12	108%
madvise_	4	450	473	23	6	105%
madvise_	8	753	806	53	7	107%
madvise_	16	1467	1592	125	8	108%
madvise_	32	2795	3405	610	19	122%

The second test (measuring cpu cycle)
syscall__	nbr_vma	cpu	cmseal	delta_cpu	per_vma	%
munmap__	1	684	715	31	31	105%
munmap__	2	861	898	38	19	104%
munmap__	4	1183	1235	51	13	104%
munmap__	8	1999	2045	46	6	102%
munmap__	16	3839	3816	-23	-1	99%
munmap__	32	7672	7887	216	7	103%
mprotect	1	397	443	46	46	112%
mprotect	2	738	788	50	25	107%
mprotect	4	1221	1256	35	9	103%
mprotect	8	2356	2429	72	9	103%
mprotect	16	4961	4935	-26	-2	99%
mprotect	32	9882	10172	291	9	103%
madvise_	1	351	380	29	29	108%
madvise_	2	565	615	49	25	109%
madvise_	4	872	933	61	15	107%
madvise_	8	1508	1640	132	16	109%
madvise_	16	3078	3323	245	15	108%
madvise_	32	5893	6704	811	25	114%

For 5.10 kernel, sealing check adds 0-15 ns in time, or 10-30
CPU cycles, there is even decrease in some cases.

It might be interesting to compare 5.10 and 6.8 kernel
The first test (measuring time)
syscall__	vmas	t_5_10	t_6_8	delta_ns	per_vma	%
munmap__	1	357	909	552	552	254%
munmap__	2	442	1398	956	478	316%
munmap__	4	614	2444	1830	458	398%
munmap__	8	1017	4029	3012	377	396%
munmap__	16	1889	6647	4758	297	352%
munmap__	32	4109	11811	7702	241	287%
mprotect	1	235	439	204	204	187%
mprotect	2	495	1659	1164	582	335%
mprotect	4	741	3747	3006	752	506%
mprotect	8	1434	6755	5320	665	471%
mprotect	16	2958	13748	10790	674	465%
mprotect	32	6431	27827	21397	669	433%
madvise_	1	191	240	49	49	125%
madvise_	2	300	366	67	33	122%
madvise_	4	450	623	173	43	138%
madvise_	8	753	1110	357	45	147%
madvise_	16	1467	2127	660	41	145%
madvise_	32	2795	4109	1314	41	147%

The second test (measuring cpu cycle)
syscall__	vmas	cpu_5_10	c_6_8	delta_cpu	per_vma	%
munmap__	1	684	1790	1106	1106	262%
munmap__	2	861	2819	1958	979	327%
munmap__	4	1183	4959	3776	944	419%
munmap__	8	1999	8262	6263	783	413%
munmap__	16	3839	13099	9260	579	341%
munmap__	32	7672	23221	15549	486	303%
mprotect	1	397	906	509	509	228%
mprotect	2	738	3019	2281	1140	409%
mprotect	4	1221	6149	4929	1232	504%
mprotect	8	2356	9978	7622	953	423%
mprotect	16	4961	20448	15487	968	412%
mprotect	32	9882	40972	31091	972	415%
madvise_	1	351	434	82	82	123%
madvise_	2	565	752	186	93	133%
madvise_	4	872	1313	442	110	151%
madvise_	8	1508	2271	763	95	151%
madvise_	16	3078	4312	1234	77	140%
madvise_	32	5893	8376	2483	78	142%

From 5.10 to 6.8
munmap: added 250-550 ns in time, or 500-1100 in cpu cycle, per vma.
mprotect: added 200-750 ns in time, or 500-1200 in cpu cycle, per vma.
madvise: added 33-50 ns in time, or 70-110 in cpu cycle, per vma.

In comparison to mseal, which adds 20-40 ns or 50-100 CPU cycles, the
increase from 5.10 to 6.8 is significantly larger, approximately ten times
greater for munmap and mprotect.

When I discuss the mm performance with Brian Makin, an engineer who worked
on performance, it was brought to my attention that such performance
benchmarks, which measuring millions of mm syscall in a tight loop, may
not accurately reflect real-world scenarios, such as that of a database
service.  Also this is tested using a single HW and ChromeOS, the data
from another HW or distribution might be different.  It might be best to
take this data with a grain of salt.


This patch (of 5):

Wire up mseal syscall for all architectures.

Link: https://lkml.kernel.org/r/20240415163527.626541-1-jeffxu@chromium.org
Link: https://lkml.kernel.org/r/20240415163527.626541-2-jeffxu@chromium.org
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Jann Horn <jannh@google.com> [Bug #2]
Cc: Jeff Xu <jeffxu@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Stephen Röttger <sroettger@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Amer Al Shanawany <amer.shanawany@gmail.com>
Cc: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/alpha/kernel/syscalls/syscall.tbl      | 1 +
 arch/arm/tools/syscall.tbl                  | 1 +
 arch/arm64/include/asm/unistd.h             | 2 +-
 arch/arm64/include/asm/unistd32.h           | 2 ++
 arch/m68k/kernel/syscalls/syscall.tbl       | 1 +
 arch/microblaze/kernel/syscalls/syscall.tbl | 1 +
 arch/mips/kernel/syscalls/syscall_n32.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_n64.tbl   | 1 +
 arch/mips/kernel/syscalls/syscall_o32.tbl   | 1 +
 arch/parisc/kernel/syscalls/syscall.tbl     | 1 +
 arch/powerpc/kernel/syscalls/syscall.tbl    | 1 +
 arch/s390/kernel/syscalls/syscall.tbl       | 1 +
 arch/sh/kernel/syscalls/syscall.tbl         | 1 +
 arch/sparc/kernel/syscalls/syscall.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_32.tbl      | 1 +
 arch/x86/entry/syscalls/syscall_64.tbl      | 1 +
 arch/xtensa/kernel/syscalls/syscall.tbl     | 1 +
 include/uapi/asm-generic/unistd.h           | 5 ++++-
 kernel/sys_ni.c                             | 1 +
 19 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 26cce7e7f70b..74720667fe09 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -501,3 +501,4 @@
 569	common	lsm_get_self_attr		sys_lsm_get_self_attr
 570	common	lsm_set_self_attr		sys_lsm_set_self_attr
 571	common	lsm_list_modules		sys_lsm_list_modules
+572	common  mseal				sys_mseal
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index b6c9e01e14f5..2ed7d229c8f9 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -475,3 +475,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal				sys_mseal
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 491b2b9bd553..1346579f802f 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -39,7 +39,7 @@
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		462
+#define __NR_compat_syscalls		463
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 7118282d1c79..266b96acc014 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -929,6 +929,8 @@ __SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr)
 __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
 #define __NR_lsm_list_modules 461
 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
+#define __NR_mseal 462
+__SYSCALL(__NR_mseal, sys_mseal)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index 7fd43fd4c9f2..22a3cbd4c602 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -461,3 +461,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal				sys_mseal
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index b00ab2cabab9..2b81a6bd78b2 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -467,3 +467,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal				sys_mseal
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 83cfc9eb6b88..cc869f5d5693 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -400,3 +400,4 @@
 459	n32	lsm_get_self_attr		sys_lsm_get_self_attr
 460	n32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n32	lsm_list_modules		sys_lsm_list_modules
+462	n32	mseal				sys_mseal
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index 532b855df589..1464c6be6eb3 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -376,3 +376,4 @@
 459	n64	lsm_get_self_attr		sys_lsm_get_self_attr
 460	n64	lsm_set_self_attr		sys_lsm_set_self_attr
 461	n64	lsm_list_modules		sys_lsm_list_modules
+462	n64	mseal				sys_mseal
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index f45c9530ea93..008ebe60263e 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -449,3 +449,4 @@
 459	o32	lsm_get_self_attr		sys_lsm_get_self_attr
 460	o32	lsm_set_self_attr		sys_lsm_set_self_attr
 461	o32	lsm_list_modules		sys_lsm_list_modules
+462	o32	mseal				sys_mseal
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index b236a84c4e12..b13c21373974 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -460,3 +460,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal				sys_mseal
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 17173b82ca21..3656f1ca7a21 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -548,3 +548,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal				sys_mseal
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index 095bb86339a7..bd0fee24ad10 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -464,3 +464,4 @@
 459  common	lsm_get_self_attr	sys_lsm_get_self_attr		sys_lsm_get_self_attr
 460  common	lsm_set_self_attr	sys_lsm_set_self_attr		sys_lsm_set_self_attr
 461  common	lsm_list_modules	sys_lsm_list_modules		sys_lsm_list_modules
+462  common	mseal			sys_mseal			sys_mseal
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index 86fe269f0220..bbf83a2db986 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -464,3 +464,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal				sys_mseal
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index b23d59313589..ac6c281ccfe0 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -507,3 +507,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal 				sys_mseal
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 5f8591ce7f25..7fd1f57ad3d3 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -466,3 +466,4 @@
 459	i386	lsm_get_self_attr	sys_lsm_get_self_attr
 460	i386	lsm_set_self_attr	sys_lsm_set_self_attr
 461	i386	lsm_list_modules	sys_lsm_list_modules
+462	i386	mseal 			sys_mseal
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index cc78226ffc35..a396f6e6ab5b 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -383,6 +383,7 @@
 459	common	lsm_get_self_attr	sys_lsm_get_self_attr
 460	common	lsm_set_self_attr	sys_lsm_set_self_attr
 461	common	lsm_list_modules	sys_lsm_list_modules
+462 	common  mseal			sys_mseal
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index dd116598fb25..67083fc1b2f5 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -432,3 +432,4 @@
 459	common	lsm_get_self_attr		sys_lsm_get_self_attr
 460	common	lsm_set_self_attr		sys_lsm_set_self_attr
 461	common	lsm_list_modules		sys_lsm_list_modules
+462	common	mseal 				sys_mseal
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 75f00965ab15..d983c48a3b6a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr)
 #define __NR_lsm_list_modules 461
 __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 
+#define __NR_mseal 462
+__SYSCALL(__NR_mseal, sys_mseal)
+
 #undef __NR_syscalls
-#define __NR_syscalls 462
+#define __NR_syscalls 463
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index faad00cce269..d7eee421d4bc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -196,6 +196,7 @@ COND_SYSCALL(migrate_pages);
 COND_SYSCALL(move_pages);
 COND_SYSCALL(set_mempolicy_home_node);
 COND_SYSCALL(cachestat);
+COND_SYSCALL(mseal);
 
 COND_SYSCALL(perf_event_open);
 COND_SYSCALL(accept4);
-- 
cgit v1.2.3


From 8be7258aad44b5e25977a98db136f677fa6f4370 Mon Sep 17 00:00:00 2001
From: Jeff Xu <jeffxu@chromium.org>
Date: Mon, 15 Apr 2024 16:35:21 +0000
Subject: mseal: add mseal syscall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new mseal() is an syscall on 64 bit CPU, and with following signature:

int mseal(void addr, size_t len, unsigned long flags)
addr/len: memory range.
flags: reserved.

mseal() blocks following operations for the given memory range.

1> Unmapping, moving to another location, and shrinking the size,
   via munmap() and mremap(), can leave an empty space, therefore can
   be replaced with a VMA with a new set of attributes.

2> Moving or expanding a different VMA into the current location,
   via mremap().

3> Modifying a VMA via mmap(MAP_FIXED).

4> Size expansion, via mremap(), does not appear to pose any specific
   risks to sealed VMAs. It is included anyway because the use case is
   unclear. In any case, users can rely on merging to expand a sealed VMA.

5> mprotect() and pkey_mprotect().

6> Some destructive madvice() behaviors (e.g. MADV_DONTNEED) for anonymous
   memory, when users don't have write permission to the memory. Those
   behaviors can alter region contents by discarding pages, effectively a
   memset(0) for anonymous memory.

Following input during RFC are incooperated into this patch:

Jann Horn: raising awareness and providing valuable insights on the
destructive madvise operations.
Linus Torvalds: assisting in defining system call signature and scope.
Liam R. Howlett: perf optimization.
Theo de Raadt: sharing the experiences and insight gained from
  implementing mimmutable() in OpenBSD.

Finally, the idea that inspired this patch comes from Stephen Röttger's
work in Chrome V8 CFI.

[jeffxu@chromium.org: add branch prediction hint, per Pedro]
  Link: https://lkml.kernel.org/r/20240423192825.1273679-2-jeffxu@chromium.org
Link: https://lkml.kernel.org/r/20240415163527.626541-3-jeffxu@chromium.org
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Guenter Roeck <groeck@chromium.org>
Cc: Jann Horn <jannh@google.com>
Cc: Jeff Xu <jeffxu@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Jorge Lucangeli Obes <jorgelo@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Muhammad Usama Anjum <usama.anjum@collabora.com>
Cc: Pedro Falcato <pedro.falcato@gmail.com>
Cc: Stephen Röttger <sroettger@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Amer Al Shanawany <amer.shanawany@gmail.com>
Cc: Javier Carrasco <javier.carrasco.cruz@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/syscalls.h |   1 +
 mm/Makefile              |   4 +
 mm/internal.h            |  37 ++++++
 mm/madvise.c             |  12 ++
 mm/mmap.c                |  31 ++++-
 mm/mprotect.c            |  10 ++
 mm/mremap.c              |  31 +++++
 mm/mseal.c               | 307 +++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 432 insertions(+), 1 deletion(-)
 create mode 100644 mm/mseal.c

(limited to 'include')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e619ac10cd23..9104952d323d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -821,6 +821,7 @@ asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 			unsigned long prot, unsigned long pgoff,
 			unsigned long flags);
+asmlinkage long sys_mseal(unsigned long start, size_t len, unsigned long flags);
 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 				unsigned long mode,
 				const unsigned long __user *nmask,
diff --git a/mm/Makefile b/mm/Makefile
index 85f29ef7bedd..8fb85acda1b1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -43,6 +43,10 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
 mmu-$(CONFIG_MMU)	+= process_vm_access.o
 endif
 
+ifdef CONFIG_64BIT
+mmu-$(CONFIG_MMU)	+= mseal.o
+endif
+
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page-writeback.o folio-compat.o \
 			   readahead.o swap.o truncate.o vmscan.o shrinker.o \
diff --git a/mm/internal.h b/mm/internal.h
index 2adabe369403..b2c75b12014e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1435,6 +1435,43 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
 unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
 			  int priority);
 
+#ifdef CONFIG_64BIT
+/* VM is sealed, in vm_flags */
+#define VM_SEALED	_BITUL(63)
+#endif
+
+#ifdef CONFIG_64BIT
+static inline int can_do_mseal(unsigned long flags)
+{
+	if (flags)
+		return -EINVAL;
+
+	return 0;
+}
+
+bool can_modify_mm(struct mm_struct *mm, unsigned long start,
+		unsigned long end);
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+		unsigned long end, int behavior);
+#else
+static inline int can_do_mseal(unsigned long flags)
+{
+	return -EPERM;
+}
+
+static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
+		unsigned long end)
+{
+	return true;
+}
+
+static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+		unsigned long end, int behavior)
+{
+	return true;
+}
+#endif
+
 #ifdef CONFIG_SHRINKER_DEBUG
 static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
 			struct shrinker *shrinker, const char *fmt, va_list ap)
diff --git a/mm/madvise.c b/mm/madvise.c
index c8ba3f3eb54d..a77893462b92 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1401,6 +1401,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
  *  -EIO    - an I/O error occurred while paging in data.
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
+ *  -EPERM  - memory is sealed.
  */
 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
 {
@@ -1444,6 +1445,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 	start = untagged_addr_remote(mm, start);
 	end = start + len;
 
+	/*
+	 * Check if the address range is sealed for do_madvise().
+	 * can_modify_mm_madv assumes we have acquired the lock on MM.
+	 */
+	if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
+		error = -EPERM;
+		goto out;
+	}
+
 	blk_start_plug(&plug);
 	switch (behavior) {
 	case MADV_POPULATE_READ:
@@ -1456,6 +1466,8 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 		break;
 	}
 	blk_finish_plug(&plug);
+
+out:
 	if (write)
 		mmap_write_unlock(mm);
 	else
diff --git a/mm/mmap.c b/mm/mmap.c
index d6d8ab119b72..83b4682ec85c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1255,6 +1255,16 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	/*
+	 * addr is returned from get_unmapped_area,
+	 * There are two cases:
+	 * 1> MAP_FIXED == false
+	 *	unallocated memory, no need to check sealing.
+	 * 1> MAP_FIXED == true
+	 *	sealing is checked inside mmap_region when
+	 *	do_vmi_munmap is called.
+	 */
+
 	if (prot == PROT_EXEC) {
 		pkey = execute_only_pkey(mm);
 		if (pkey < 0)
@@ -2727,6 +2737,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
+	/*
+	 * Check if memory is sealed before arch_unmap.
+	 * Prevent unmapping a sealed VMA.
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (unlikely(!can_modify_mm(mm, start, end)))
+		return -EPERM;
+
 	 /* arch_unmap() might do unmaps itself.  */
 	arch_unmap(mm, start, end);
 
@@ -2789,7 +2807,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	}
 
 	/* Unmap any existing mapping in the area */
-	if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
+	error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
+	if (error == -EPERM)
+		return error;
+	else if (error)
 		return -ENOMEM;
 
 	/*
@@ -3139,6 +3160,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 {
 	struct mm_struct *mm = vma->vm_mm;
 
+	/*
+	 * Check if memory is sealed before arch_unmap.
+	 * Prevent unmapping a sealed VMA.
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (unlikely(!can_modify_mm(mm, start, end)))
+		return -EPERM;
+
 	arch_unmap(mm, start, end);
 	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
 }
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94878c39ee32..8c6cd8825273 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -32,6 +32,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/memory-tiers.h>
+#include <uapi/linux/mman.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
@@ -744,6 +745,15 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
 		}
 	}
 
+	/*
+	 * checking if memory is sealed.
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (unlikely(!can_modify_mm(current->mm, start, end))) {
+		error = -EPERM;
+		goto out;
+	}
+
 	prev = vma_prev(&vmi);
 	if (start > vma->vm_start)
 		prev = vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index f5aba752d35f..5f96bc5ee918 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -902,7 +902,25 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
 	if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
 		return -ENOMEM;
 
+	/*
+	 * In mremap_to().
+	 * Move a VMA to another location, check if src addr is sealed.
+	 *
+	 * Place can_modify_mm here because mremap_to()
+	 * does its own checking for address range, and we only
+	 * check the sealing after passing those checks.
+	 *
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (unlikely(!can_modify_mm(mm, addr, addr + old_len)))
+		return -EPERM;
+
 	if (flags & MREMAP_FIXED) {
+		/*
+		 * In mremap_to().
+		 * VMA is moved to dst address, and munmap dst first.
+		 * do_munmap will check if dst is sealed.
+		 */
 		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
 		if (ret)
 			goto out;
@@ -1061,6 +1079,19 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 		goto out;
 	}
 
+	/*
+	 * Below is shrink/expand case (not mremap_to())
+	 * Check if src address is sealed, if so, reject.
+	 * In other words, prevent shrinking or expanding a sealed VMA.
+	 *
+	 * Place can_modify_mm here so we can keep the logic related to
+	 * shrink/expand together.
+	 */
+	if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	/*
 	 * Always allow a shrinking remap: that just unmaps
 	 * the unnecessary pages..
diff --git a/mm/mseal.c b/mm/mseal.c
new file mode 100644
index 000000000000..bf783bba8ed0
--- /dev/null
+++ b/mm/mseal.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  Implement mseal() syscall.
+ *
+ *  Copyright (c) 2023,2024 Google, Inc.
+ *
+ *  Author: Jeff Xu <jeffxu@chromium.org>
+ */
+
+#include <linux/mempolicy.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mmu_context.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
+{
+	return (vma->vm_flags & VM_SEALED);
+}
+
+static inline void set_vma_sealed(struct vm_area_struct *vma)
+{
+	vm_flags_set(vma, VM_SEALED);
+}
+
+/*
+ * check if a vma is sealed for modification.
+ * return true, if modification is allowed.
+ */
+static bool can_modify_vma(struct vm_area_struct *vma)
+{
+	if (unlikely(vma_is_sealed(vma)))
+		return false;
+
+	return true;
+}
+
+static bool is_madv_discard(int behavior)
+{
+	return	behavior &
+		(MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED |
+		 MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK);
+}
+
+static bool is_ro_anon(struct vm_area_struct *vma)
+{
+	/* check anonymous mapping. */
+	if (vma->vm_file || vma->vm_flags & VM_SHARED)
+		return false;
+
+	/*
+	 * check for non-writable:
+	 * PROT=RO or PKRU is not writeable.
+	 */
+	if (!(vma->vm_flags & VM_WRITE) ||
+		!arch_vma_access_permitted(vma, true, false, false))
+		return true;
+
+	return false;
+}
+
+/*
+ * Check if the vmas of a memory range are allowed to be modified.
+ * the memory ranger can have a gap (unallocated memory).
+ * return true, if it is allowed.
+ */
+bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	VMA_ITERATOR(vmi, mm, start);
+
+	/* going through each vma to check. */
+	for_each_vma_range(vmi, vma, end) {
+		if (unlikely(!can_modify_vma(vma)))
+			return false;
+	}
+
+	/* Allow by default. */
+	return true;
+}
+
+/*
+ * Check if the vmas of a memory range are allowed to be modified by madvise.
+ * the memory ranger can have a gap (unallocated memory).
+ * return true, if it is allowed.
+ */
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
+		int behavior)
+{
+	struct vm_area_struct *vma;
+
+	VMA_ITERATOR(vmi, mm, start);
+
+	if (!is_madv_discard(behavior))
+		return true;
+
+	/* going through each vma to check. */
+	for_each_vma_range(vmi, vma, end)
+		if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
+			return false;
+
+	/* Allow by default. */
+	return true;
+}
+
+static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
+		struct vm_area_struct **prev, unsigned long start,
+		unsigned long end, vm_flags_t newflags)
+{
+	int ret = 0;
+	vm_flags_t oldflags = vma->vm_flags;
+
+	if (newflags == oldflags)
+		goto out;
+
+	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
+	if (IS_ERR(vma)) {
+		ret = PTR_ERR(vma);
+		goto out;
+	}
+
+	set_vma_sealed(vma);
+out:
+	*prev = vma;
+	return ret;
+}
+
+/*
+ * Check for do_mseal:
+ * 1> start is part of a valid vma.
+ * 2> end is part of a valid vma.
+ * 3> No gap (unallocated address) between start and end.
+ * 4> map is sealable.
+ */
+static int check_mm_seal(unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+	unsigned long nstart = start;
+
+	VMA_ITERATOR(vmi, current->mm, start);
+
+	/* going through each vma to check. */
+	for_each_vma_range(vmi, vma, end) {
+		if (vma->vm_start > nstart)
+			/* unallocated memory found. */
+			return -ENOMEM;
+
+		if (vma->vm_end >= end)
+			return 0;
+
+		nstart = vma->vm_end;
+	}
+
+	return -ENOMEM;
+}
+
+/*
+ * Apply sealing.
+ */
+static int apply_mm_seal(unsigned long start, unsigned long end)
+{
+	unsigned long nstart;
+	struct vm_area_struct *vma, *prev;
+
+	VMA_ITERATOR(vmi, current->mm, start);
+
+	vma = vma_iter_load(&vmi);
+	/*
+	 * Note: check_mm_seal should already checked ENOMEM case.
+	 * so vma should not be null, same for the other ENOMEM cases.
+	 */
+	prev = vma_prev(&vmi);
+	if (start > vma->vm_start)
+		prev = vma;
+
+	nstart = start;
+	for_each_vma_range(vmi, vma, end) {
+		int error;
+		unsigned long tmp;
+		vm_flags_t newflags;
+
+		newflags = vma->vm_flags | VM_SEALED;
+		tmp = vma->vm_end;
+		if (tmp > end)
+			tmp = end;
+		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
+		if (error)
+			return error;
+		nstart = vma_iter_end(&vmi);
+	}
+
+	return 0;
+}
+
+/*
+ * mseal(2) seals the VM's meta data from
+ * selected syscalls.
+ *
+ * addr/len: VM address range.
+ *
+ *  The address range by addr/len must meet:
+ *   start (addr) must be in a valid VMA.
+ *   end (addr + len) must be in a valid VMA.
+ *   no gap (unallocated memory) between start and end.
+ *   start (addr) must be page aligned.
+ *
+ *  len: len will be page aligned implicitly.
+ *
+ *   Below VMA operations are blocked after sealing.
+ *   1> Unmapping, moving to another location, and shrinking
+ *	the size, via munmap() and mremap(), can leave an empty
+ *	space, therefore can be replaced with a VMA with a new
+ *	set of attributes.
+ *   2> Moving or expanding a different vma into the current location,
+ *	via mremap().
+ *   3> Modifying a VMA via mmap(MAP_FIXED).
+ *   4> Size expansion, via mremap(), does not appear to pose any
+ *	specific risks to sealed VMAs. It is included anyway because
+ *	the use case is unclear. In any case, users can rely on
+ *	merging to expand a sealed VMA.
+ *   5> mprotect and pkey_mprotect.
+ *   6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
+ *      for anonymous memory, when users don't have write permission to the
+ *	memory. Those behaviors can alter region contents by discarding pages,
+ *	effectively a memset(0) for anonymous memory.
+ *
+ *  flags: reserved.
+ *
+ * return values:
+ *  zero: success.
+ *  -EINVAL:
+ *   invalid input flags.
+ *   start address is not page aligned.
+ *   Address arange (start + len) overflow.
+ *  -ENOMEM:
+ *   addr is not a valid address (not allocated).
+ *   end (start + len) is not a valid address.
+ *   a gap (unallocated memory) between start and end.
+ *  -EPERM:
+ *  - In 32 bit architecture, sealing is not supported.
+ * Note:
+ *  user can call mseal(2) multiple times, adding a seal on an
+ *  already sealed memory is a no-action (no error).
+ *
+ *  unseal() is not supported.
+ */
+static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
+{
+	size_t len;
+	int ret = 0;
+	unsigned long end;
+	struct mm_struct *mm = current->mm;
+
+	ret = can_do_mseal(flags);
+	if (ret)
+		return ret;
+
+	start = untagged_addr(start);
+	if (!PAGE_ALIGNED(start))
+		return -EINVAL;
+
+	len = PAGE_ALIGN(len_in);
+	/* Check to see whether len was rounded up from small -ve to zero. */
+	if (len_in && !len)
+		return -EINVAL;
+
+	end = start + len;
+	if (end < start)
+		return -EINVAL;
+
+	if (end == start)
+		return 0;
+
+	if (mmap_write_lock_killable(mm))
+		return -EINTR;
+
+	/*
+	 * First pass, this helps to avoid
+	 * partial sealing in case of error in input address range,
+	 * e.g. ENOMEM error.
+	 */
+	ret = check_mm_seal(start, end);
+	if (ret)
+		goto out;
+
+	/*
+	 * Second pass, this should success, unless there are errors
+	 * from vma_modify_flags, e.g. merge/split error, or process
+	 * reaching the max supported VMAs, however, those cases shall
+	 * be rare.
+	 */
+	ret = apply_mm_seal(start, end);
+
+out:
+	mmap_write_unlock(current->mm);
+	return ret;
+}
+
+SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
+		flags)
+{
+	return do_mseal(start, len, flags);
+}
-- 
cgit v1.2.3


From 2e577732e8d28b9183df701fb90cb7943aa4ed16 Mon Sep 17 00:00:00 2001
From: Andrey Konovalov <andreyknvl@gmail.com>
Date: Fri, 17 May 2024 15:01:18 +0200
Subject: kasan, fortify: properly rename memintrinsics

After commit 69d4c0d32186 ("entry, kasan, x86: Disallow overriding mem*()
functions") and the follow-up fixes, with CONFIG_FORTIFY_SOURCE enabled,
even though the compiler instruments meminstrinsics by generating calls to
__asan/__hwasan_ prefixed functions, FORTIFY_SOURCE still uses
uninstrumented memset/memmove/memcpy as the underlying functions.

As a result, KASAN cannot detect bad accesses in memset/memmove/memcpy.
This also makes KASAN tests corrupt kernel memory and cause crashes.

To fix this, use __asan_/__hwasan_memset/memmove/memcpy as the underlying
functions whenever appropriate.  Do this only for the instrumented code
(as indicated by __SANITIZE_ADDRESS__).

Link: https://lkml.kernel.org/r/20240517130118.759301-1-andrey.konovalov@linux.dev
Fixes: 69d4c0d32186 ("entry, kasan, x86: Disallow overriding mem*() functions")
Fixes: 51287dcb00cc ("kasan: emit different calls for instrumentable memintrinsics")
Fixes: 36be5cba99f6 ("kasan: treat meminstrinsic as builtins in uninstrumented files")
Signed-off-by: Andrey Konovalov <andreyknvl@gmail.com>
Reported-by: Erhard Furtner <erhard_f@mailbox.org>
Reported-by: Nico Pache <npache@redhat.com>
Closes: https://lore.kernel.org/all/20240501144156.17e65021@outsider.home/
Reviewed-by: Marco Elver <elver@google.com>
Tested-by: Nico Pache <npache@redhat.com>
Acked-by: Nico Pache <npache@redhat.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Daniel Axtens <dja@axtens.net>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/fortify-string.h | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h
index d658ae729a02..7e0f340bf363 100644
--- a/include/linux/fortify-string.h
+++ b/include/linux/fortify-string.h
@@ -75,17 +75,30 @@ void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("
 	__ret;							\
 })
 
-#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+#if defined(__SANITIZE_ADDRESS__)
+
+#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY)
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
+#elif defined(CONFIG_KASAN_GENERIC)
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy);
+#else /* CONFIG_KASAN_SW_TAGS */
+extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset);
+extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove);
+extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy);
+#endif
+
 extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
 extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
-extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
-extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
-extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
 extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
 extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
 extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
 extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
 extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);
+
 #else
 
 #if defined(__SANITIZE_MEMORY__)
@@ -110,6 +123,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size)
 #define __underlying_strlen	__builtin_strlen
 #define __underlying_strncat	__builtin_strncat
 #define __underlying_strncpy	__builtin_strncpy
+
 #endif
 
 /**
-- 
cgit v1.2.3


From 9b0abe7948364bd35fff2b202ee7f30a2fa73c53 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 24 May 2024 11:42:09 -0400
Subject: mm: percpu: Include smp.h in alloc_tag.h

percpu.h depends on smp.h, but doesn't include it directly because of
circular header dependency issues; percpu.h is needed in a bunch of low
level headers.

This fixes a randconfig build error on mips:

  include/linux/alloc_tag.h: In function '__alloc_tag_ref_set':
  include/asm-generic/percpu.h:31:40: error: implicit declaration of function 'raw_smp_processor_id' [-Werror=implicit-function-declaration]

Reported-by: kernel test robot <lkp@intel.com>
Fixes: 24e44cc22aa3 ("mm: percpu: enable per-cpu allocation tagging")
Closes: https://lore.kernel.org/oe-kbuild-all/202405210052.DIrMXJNz-lkp@intel.com/
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/alloc_tag.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index afc9e259a2d3..abd24016a900 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -11,6 +11,7 @@
 #include <linux/preempt.h>
 #include <asm/percpu.h>
 #include <linux/cpumask.h>
+#include <linux/smp.h>
 #include <linux/static_key.h>
 #include <linux/irqflags.h>
 
-- 
cgit v1.2.3